summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.circleci/config.yml136
-rw-r--r--.cirrus.star9
-rw-r--r--.devcontainer/Dockerfile18
-rw-r--r--.devcontainer/add-notice.sh18
-rw-r--r--.devcontainer/devcontainer.json56
-rwxr-xr-x.devcontainer/setup.sh13
-rw-r--r--.gitattributes1
-rw-r--r--.github/ISSUE_TEMPLATE/bug-report.yml8
-rw-r--r--.github/dependabot.yml10
-rw-r--r--.github/workflows/build_test.yml178
-rw-r--r--.github/workflows/circleci.yml13
-rw-r--r--.github/workflows/codeql.yml73
-rw-r--r--.github/workflows/cygwin.yml47
-rw-r--r--.github/workflows/dependency-review.yml20
-rw-r--r--.github/workflows/docker.yml60
-rw-r--r--.github/workflows/emscripten.yml16
-rw-r--r--.github/workflows/gitpod.yml60
-rw-r--r--.github/workflows/labeler.yml12
-rw-r--r--.github/workflows/linux_meson.yml15
-rw-r--r--.github/workflows/linux_musl.yml66
-rw-r--r--.github/workflows/scorecards.yml8
-rw-r--r--.github/workflows/wheels.yml76
-rw-r--r--.github/workflows/windows_meson.yml88
-rw-r--r--.gitignore11
-rw-r--r--.gitmodules3
-rw-r--r--.gitpod.yml65
-rw-r--r--.hadolint.yaml7
-rw-r--r--.mailmap33
-rw-r--r--.travis.yml4
-rw-r--r--22137.new_feature.rst8
-rw-r--r--INSTALL.rst2
-rw-r--r--LICENSE.txt2
-rw-r--r--README.md4
-rw-r--r--azure-pipelines.yml63
-rw-r--r--azure-steps-windows.yml27
-rw-r--r--benchmarks/README.rst37
-rw-r--r--benchmarks/asv.conf.json3
-rw-r--r--benchmarks/asv_compare.conf.json.tpl3
-rw-r--r--benchmarks/benchmarks/__init__.py2
-rw-r--r--benchmarks/benchmarks/bench_core.py53
-rw-r--r--benchmarks/benchmarks/bench_creation.py81
-rw-r--r--benchmarks/benchmarks/bench_function_base.py2
-rw-r--r--benchmarks/benchmarks/bench_io.py13
-rw-r--r--benchmarks/benchmarks/bench_itemselection.py22
-rw-r--r--benchmarks/benchmarks/bench_lib.py21
-rw-r--r--benchmarks/benchmarks/bench_linalg.py24
-rw-r--r--benchmarks/benchmarks/bench_manipulate.py107
-rw-r--r--benchmarks/benchmarks/bench_reduce.py36
-rw-r--r--benchmarks/benchmarks/bench_scalar.py12
-rw-r--r--benchmarks/benchmarks/bench_ufunc.py223
-rw-r--r--benchmarks/benchmarks/bench_ufunc_strides.py285
-rw-r--r--benchmarks/benchmarks/common.py157
-rw-r--r--build_requirements.txt6
-rw-r--r--building_with_meson.md22
-rwxr-xr-xdev.py19
-rw-r--r--doc/changelog/1.24.0-changelog.rst634
-rw-r--r--doc/changelog/1.24.1-changelog.rst43
-rw-r--r--doc/changelog/1.24.2-changelog.rst44
-rw-r--r--doc/changelog/1.24.3-changelog.rst42
-rw-r--r--doc/make.bat74
-rw-r--r--doc/neps/nep-0013-ufunc-overrides.rst11
-rw-r--r--doc/neps/nep-0019-rng-policy.rst2
-rw-r--r--doc/neps/nep-0021-advanced-indexing.rst2
-rw-r--r--doc/neps/nep-0029-deprecation_policy.rst11
-rw-r--r--doc/neps/nep-0046-sponsorship-guidelines.rst12
-rw-r--r--doc/neps/nep-0051-scalar-representation.rst12
-rwxr-xr-xdoc/postprocess.py6
-rwxr-xr-xdoc/preprocess.py4
-rw-r--r--doc/records.rst2
-rw-r--r--doc/release/numpy2_changes/23089.change.rst2
-rw-r--r--doc/release/numpy2_changes/README.md4
-rw-r--r--doc/release/upcoming_changes/10615.deprecation.rst14
-rw-r--r--doc/release/upcoming_changes/18053.new_feature.rst4
-rw-r--r--doc/release/upcoming_changes/21120.new_feature.rst21
-rw-r--r--doc/release/upcoming_changes/22315.performance.rst7
-rw-r--r--doc/release/upcoming_changes/22539.change.rst19
-rw-r--r--doc/release/upcoming_changes/22539.deprecation.rst29
-rw-r--r--doc/release/upcoming_changes/22769.improvement.rst5
-rw-r--r--doc/release/upcoming_changes/22776.improvement.rst6
-rw-r--r--doc/release/upcoming_changes/22863.new_feature.rst4
-rw-r--r--doc/release/upcoming_changes/22963.new_feature.rst7
-rw-r--r--doc/release/upcoming_changes/22982.new_feature.rst13
-rw-r--r--doc/release/upcoming_changes/22997.improvement.rst5
-rw-r--r--doc/release/upcoming_changes/22998.expired.rst2
-rw-r--r--doc/release/upcoming_changes/23011.deprecation.rst1
-rw-r--r--doc/release/upcoming_changes/23019.expired.rst2
-rw-r--r--doc/release/upcoming_changes/23020.change.rst9
-rw-r--r--doc/release/upcoming_changes/23020.performance.rst5
-rw-r--r--doc/release/upcoming_changes/23041.expired.rst27
-rw-r--r--doc/release/upcoming_changes/23060.expired.rst5
-rw-r--r--doc/release/upcoming_changes/23105.compatibility.rst5
-rw-r--r--doc/release/upcoming_changes/23113.improvement.rst3
-rw-r--r--doc/release/upcoming_changes/23136.performance.rst18
-rw-r--r--doc/release/upcoming_changes/23195.improvement.rst20
-rw-r--r--doc/release/upcoming_changes/23229.compatibility.rst3
-rw-r--r--doc/release/upcoming_changes/23240.compatibility.rst10
-rw-r--r--doc/release/upcoming_changes/23275.improvement.rst4
-rw-r--r--doc/release/upcoming_changes/23302.deprecation.rst1
-rw-r--r--doc/release/upcoming_changes/23314.deprecation.rst4
-rw-r--r--doc/release/upcoming_changes/23322.improvement.rst4
-rw-r--r--doc/release/upcoming_changes/23357.improvement.rst9
-rw-r--r--doc/release/upcoming_changes/23358.improvement.rst5
-rw-r--r--doc/release/upcoming_changes/23376.expired.rst9
-rw-r--r--doc/release/upcoming_changes/23403.expired.rst4
-rw-r--r--doc/release/upcoming_changes/23480.expired.rst1
-rw-r--r--doc/release/upcoming_changes/23528.compatibility.rst16
-rw-r--r--doc/release/upcoming_changes/23601.change.rst6
-rw-r--r--doc/release/upcoming_changes/23660.expired.rst2
-rw-r--r--doc/release/upcoming_changes/23661.performance.rst4
-rw-r--r--doc/release/upcoming_changes/23666.expired.rst5
-rw-r--r--doc/release/upcoming_changes/23713.improvement.rst9
-rw-r--r--doc/source/conf.py35
-rw-r--r--doc/source/dev/alignment.rst2
-rw-r--r--doc/source/dev/depending_on_numpy.rst106
-rw-r--r--doc/source/dev/development_environment.rst27
-rw-r--r--doc/source/dev/development_gitpod.rst271
-rw-r--r--doc/source/dev/development_workflow.rst60
-rw-r--r--doc/source/dev/gitpod-imgs/NumPy-github.pngbin5368 -> 0 bytes
-rw-r--r--doc/source/dev/gitpod-imgs/NumPy-gitpod-branches.pngbin138394 -> 0 bytes
-rw-r--r--doc/source/dev/gitpod-imgs/gitpod-dashboard-stop.pngbin14185 -> 0 bytes
-rw-r--r--doc/source/dev/gitpod-imgs/gitpod-edit-permissions-gh.pngbin23308 -> 0 bytes
-rw-r--r--doc/source/dev/gitpod-imgs/gitpod-edit-permissions-repo.pngbin5505 -> 0 bytes
-rw-r--r--doc/source/dev/gitpod-imgs/gitpod-workspace.pngbin201069 -> 0 bytes
-rw-r--r--doc/source/dev/gitpod-imgs/installing-gitpod-io.pngbin33186 -> 0 bytes
-rw-r--r--doc/source/dev/gitpod-imgs/rst-rendering.pngbin228437 -> 0 bytes
-rw-r--r--doc/source/dev/gitpod-imgs/vscode-rst.pngbin16443 -> 0 bytes
-rw-r--r--doc/source/dev/gitpod-imgs/vscode-statusbar.pngbin4492 -> 0 bytes
-rw-r--r--doc/source/dev/gitwash/development_setup.rst4
-rw-r--r--doc/source/dev/gitwash/git_resources.rst2
-rw-r--r--doc/source/dev/howto_build_docs.rst26
-rw-r--r--doc/source/dev/index.rst19
-rw-r--r--doc/source/dev/reviewer_guidelines.rst31
-rw-r--r--doc/source/f2py/buildtools/cmake.rst2
-rw-r--r--doc/source/f2py/buildtools/meson.rst6
-rw-r--r--doc/source/f2py/code/ftype.f2
-rw-r--r--doc/source/f2py/f2py.getting-started.rst8
-rw-r--r--doc/source/f2py/windows/index.rst2
-rw-r--r--doc/source/f2py/windows/pgi.rst2
-rw-r--r--doc/source/index.rst1
-rw-r--r--doc/source/reference/arrays.classes.rst14
-rw-r--r--doc/source/reference/arrays.dtypes.rst18
-rw-r--r--doc/source/reference/arrays.nditer.rst22
-rw-r--r--doc/source/reference/arrays.scalars.rst6
-rw-r--r--doc/source/reference/c-api/array.rst89
-rw-r--r--doc/source/reference/c-api/data_memory.rst84
-rw-r--r--doc/source/reference/c-api/deprecations.rst4
-rw-r--r--doc/source/reference/c-api/dtype.rst194
-rw-r--r--doc/source/reference/c-api/iterator.rst4
-rw-r--r--doc/source/reference/c-api/types-and-structures.rst201
-rw-r--r--doc/source/reference/distutils_guide.rst2
-rw-r--r--doc/source/reference/distutils_status_migration.rst2
-rw-r--r--doc/source/reference/figures/dtype-hierarchy.diabin4728 -> 5042 bytes
-rw-r--r--doc/source/reference/figures/dtype-hierarchy.pdfbin61763 -> 43078 bytes
-rw-r--r--doc/source/reference/figures/dtype-hierarchy.pngbin18746 -> 33398 bytes
-rw-r--r--doc/source/reference/global_state.rst40
-rw-r--r--doc/source/reference/index.rst2
-rw-r--r--doc/source/reference/internals.code-explanations.rst2
-rw-r--r--doc/source/reference/maskedarray.baseclass.rst14
-rw-r--r--doc/source/reference/random/bit_generators/index.rst14
-rw-r--r--doc/source/reference/random/compatibility.rst87
-rw-r--r--doc/source/reference/random/extending.rst4
-rw-r--r--doc/source/reference/random/generator.rst5
-rw-r--r--doc/source/reference/random/index.rst309
-rw-r--r--doc/source/reference/random/legacy.rst15
-rw-r--r--doc/source/reference/random/new-or-different.rst56
-rw-r--r--doc/source/reference/random/parallel.rst27
-rw-r--r--doc/source/reference/routines.ctypeslib.rst2
-rw-r--r--doc/source/reference/routines.datetime.rst4
-rw-r--r--doc/source/reference/routines.dual.rst47
-rw-r--r--doc/source/reference/routines.io.rst2
-rw-r--r--doc/source/reference/routines.ma.rst15
-rw-r--r--doc/source/reference/routines.math.rst7
-rw-r--r--doc/source/reference/routines.other.rst2
-rw-r--r--doc/source/reference/routines.rst1
-rw-r--r--doc/source/reference/routines.testing.rst7
-rw-r--r--doc/source/reference/simd/build-options.rst19
-rw-r--r--doc/source/reference/simd/gen_features.py4
-rw-r--r--doc/source/reference/simd/generated_tables/compilers-diff.inc42
-rw-r--r--doc/source/reference/simd/generated_tables/cpu_features.inc53
-rw-r--r--doc/source/reference/simd/how-it-works.rst4
-rw-r--r--doc/source/release.rst3
-rw-r--r--doc/source/release/1.15.0-notes.rst2
-rw-r--r--doc/source/release/1.23.0-notes.rst4
-rw-r--r--doc/source/release/1.24.0-notes.rst494
-rw-r--r--doc/source/release/1.24.1-notes.rst50
-rw-r--r--doc/source/release/1.24.2-notes.rst51
-rw-r--r--doc/source/release/1.24.3-notes.rst49
-rw-r--r--doc/source/user/absolute_beginners.rst4
-rw-r--r--doc/source/user/basics.creation.rst6
-rw-r--r--doc/source/user/basics.dispatch.rst4
-rw-r--r--doc/source/user/basics.indexing.rst13
-rw-r--r--doc/source/user/basics.rec.rst5
-rw-r--r--doc/source/user/building.rst29
-rw-r--r--doc/source/user/how-to-index.rst26
-rw-r--r--doc/source/user/how-to-verify-bug.rst2
-rw-r--r--doc/source/user/howtos_index.rst2
-rw-r--r--doc/source/user/quickstart.rst2
-rw-r--r--doc/source/user/theory.broadcasting.rst2
-rw-r--r--doc/source/user/troubleshooting-importerror.rst68
-rw-r--r--doc_requirements.txt4
-rw-r--r--environment.yml5
-rw-r--r--linter_requirements.txt2
-rw-r--r--meson.build3
-rw-r--r--numpy/__config__.py.in264
-rw-r--r--numpy/__init__.cython-30.pxd99
-rw-r--r--numpy/__init__.pxd97
-rw-r--r--numpy/__init__.py85
-rw-r--r--numpy/__init__.pyi142
-rw-r--r--numpy/_pyinstaller/hook-numpy.py7
-rw-r--r--numpy/_typing/__init__.py6
-rw-r--r--numpy/_typing/_add_docstring.py2
-rw-r--r--numpy/_typing/_array_like.py58
-rw-r--r--numpy/_typing/_callable.pyi2
-rw-r--r--numpy/_typing/_dtype_like.py103
-rw-r--r--numpy/_typing/_extended_precision.py40
-rw-r--r--numpy/_typing/_generic_alias.py245
-rw-r--r--numpy/_typing/_nested_sequence.py2
-rw-r--r--numpy/_typing/_scalars.py14
-rw-r--r--numpy/_typing/_shape.py5
-rw-r--r--numpy/array_api/__init__.py4
-rw-r--r--numpy/array_api/_array_object.py16
-rw-r--r--numpy/array_api/_indexing_functions.py18
-rw-r--r--numpy/array_api/_typing.py31
-rw-r--r--numpy/array_api/tests/test_indexing_functions.py24
-rw-r--r--numpy/compat/py3k.py10
-rw-r--r--numpy/compat/tests/test_compat.py3
-rw-r--r--numpy/conftest.py4
-rw-r--r--numpy/core/__init__.py14
-rw-r--r--numpy/core/_add_newdocs.py23
-rw-r--r--numpy/core/_add_newdocs_scalars.py10
-rw-r--r--numpy/core/_asarray.py10
-rw-r--r--numpy/core/_asarray.pyi8
-rw-r--r--numpy/core/_dtype.py10
-rw-r--r--numpy/core/_methods.py71
-rw-r--r--numpy/core/_type_aliases.pyi10
-rw-r--r--numpy/core/arrayprint.py50
-rw-r--r--numpy/core/code_generators/cversions.txt4
-rw-r--r--numpy/core/code_generators/genapi.py70
-rw-r--r--numpy/core/code_generators/generate_numpy_api.py17
-rw-r--r--numpy/core/code_generators/generate_ufunc_api.py15
-rw-r--r--numpy/core/code_generators/generate_umath.py331
-rw-r--r--numpy/core/code_generators/numpy_api.py12
-rw-r--r--numpy/core/code_generators/ufunc_docstrings.py14
-rw-r--r--numpy/core/config.h.in35
-rw-r--r--numpy/core/defchararray.py149
-rw-r--r--numpy/core/einsumfunc.py2
-rw-r--r--numpy/core/einsumfunc.pyi53
-rw-r--r--numpy/core/fromnumeric.py259
-rw-r--r--numpy/core/fromnumeric.pyi4
-rw-r--r--numpy/core/function_base.py26
-rw-r--r--numpy/core/getlimits.py47
-rw-r--r--numpy/core/include/meson.build1
-rw-r--r--numpy/core/include/numpy/_dtype_api.h408
-rw-r--r--numpy/core/include/numpy/arrayscalars.h4
-rw-r--r--numpy/core/include/numpy/experimental_dtype_api.h161
-rw-r--r--numpy/core/include/numpy/ndarraytypes.h33
-rw-r--r--numpy/core/include/numpy/npy_3kcompat.h26
-rw-r--r--numpy/core/include/numpy/npy_common.h60
-rw-r--r--numpy/core/include/numpy/npy_cpu.h2
-rw-r--r--numpy/core/include/numpy/npy_math.h14
-rw-r--r--numpy/core/include/numpy/numpyconfig.h137
-rw-r--r--numpy/core/include/numpy/ufuncobject.h6
-rw-r--r--numpy/core/meson.build113
-rw-r--r--numpy/core/multiarray.py13
-rw-r--r--numpy/core/multiarray.pyi1
-rw-r--r--numpy/core/numeric.py64
-rw-r--r--numpy/core/numerictypes.py19
-rw-r--r--numpy/core/numerictypes.pyi5
-rw-r--r--numpy/core/overrides.py112
-rw-r--r--numpy/core/setup.py152
-rw-r--r--numpy/core/setup_common.py78
-rw-r--r--numpy/core/shape_base.py31
-rw-r--r--numpy/core/src/_simd/_simd.c24
-rw-r--r--numpy/core/src/_simd/_simd.dispatch.c.src174
-rw-r--r--numpy/core/src/_simd/_simd_easyintrin.inc33
-rw-r--r--numpy/core/src/common/common.hpp14
-rw-r--r--numpy/core/src/common/dlpack/dlpack.h151
-rw-r--r--numpy/core/src/common/float_status.hpp134
-rw-r--r--numpy/core/src/common/half.hpp264
-rw-r--r--numpy/core/src/common/half_private.hpp330
-rw-r--r--numpy/core/src/common/meta.hpp54
-rw-r--r--numpy/core/src/common/npdef.hpp28
-rw-r--r--numpy/core/src/common/npstd.hpp57
-rw-r--r--numpy/core/src/common/npy_config.h1
-rw-r--r--numpy/core/src/common/npy_cpu_features.c152
-rw-r--r--numpy/core/src/common/npy_cpu_features.h21
-rw-r--r--numpy/core/src/common/numpyos.c22
-rw-r--r--numpy/core/src/common/simd/avx2/arithmetic.h11
-rw-r--r--numpy/core/src/common/simd/avx2/avx2.h1
-rw-r--r--numpy/core/src/common/simd/avx2/memory.h374
-rw-r--r--numpy/core/src/common/simd/avx2/operators.h2
-rw-r--r--numpy/core/src/common/simd/avx2/reorder.h87
-rw-r--r--numpy/core/src/common/simd/avx512/arithmetic.h4
-rw-r--r--numpy/core/src/common/simd/avx512/avx512.h1
-rw-r--r--numpy/core/src/common/simd/avx512/maskop.h13
-rw-r--r--numpy/core/src/common/simd/avx512/memory.h317
-rw-r--r--numpy/core/src/common/simd/avx512/reorder.h152
-rw-r--r--numpy/core/src/common/simd/emulate_maskop.h34
-rw-r--r--numpy/core/src/common/simd/neon/arithmetic.h13
-rw-r--r--numpy/core/src/common/simd/neon/math.h149
-rw-r--r--numpy/core/src/common/simd/neon/memory.h293
-rw-r--r--numpy/core/src/common/simd/neon/neon.h1
-rw-r--r--numpy/core/src/common/simd/neon/operators.h29
-rw-r--r--numpy/core/src/common/simd/neon/reorder.h120
-rw-r--r--numpy/core/src/common/simd/simd.h3
-rw-r--r--numpy/core/src/common/simd/sse/arithmetic.h30
-rw-r--r--numpy/core/src/common/simd/sse/math.h182
-rw-r--r--numpy/core/src/common/simd/sse/memory.h314
-rw-r--r--numpy/core/src/common/simd/sse/reorder.h87
-rw-r--r--numpy/core/src/common/simd/sse/sse.h1
-rw-r--r--numpy/core/src/common/simd/vec/arithmetic.h17
-rw-r--r--numpy/core/src/common/simd/vec/conversion.h11
-rw-r--r--numpy/core/src/common/simd/vec/math.h2
-rw-r--r--numpy/core/src/common/simd/vec/memory.h271
-rw-r--r--numpy/core/src/common/simd/vec/misc.h28
-rw-r--r--numpy/core/src/common/simd/vec/reorder.h99
-rw-r--r--numpy/core/src/common/simd/vec/vec.h2
-rw-r--r--numpy/core/src/common/ucsnarrow.h2
-rw-r--r--numpy/core/src/common/utils.hpp51
-rw-r--r--numpy/core/src/multiarray/abstractdtypes.h3
-rw-r--r--numpy/core/src/multiarray/argfunc.dispatch.c.src4
-rw-r--r--numpy/core/src/multiarray/array_assign_array.c4
-rw-r--r--numpy/core/src/multiarray/array_coercion.c210
-rw-r--r--numpy/core/src/multiarray/array_coercion.h10
-rw-r--r--numpy/core/src/multiarray/array_method.c56
-rw-r--r--numpy/core/src/multiarray/array_method.h164
-rw-r--r--numpy/core/src/multiarray/arrayfunction_override.c651
-rw-r--r--numpy/core/src/multiarray/arrayfunction_override.h4
-rw-r--r--numpy/core/src/multiarray/arrayobject.c15
-rw-r--r--numpy/core/src/multiarray/arrayobject.h7
-rw-r--r--numpy/core/src/multiarray/arraytypes.c.src30
-rw-r--r--numpy/core/src/multiarray/arraytypes.h.src98
-rw-r--r--numpy/core/src/multiarray/buffer.c14
-rw-r--r--numpy/core/src/multiarray/calculation.c17
-rw-r--r--numpy/core/src/multiarray/common.c47
-rw-r--r--numpy/core/src/multiarray/common.h24
-rw-r--r--numpy/core/src/multiarray/compiled_base.c60
-rw-r--r--numpy/core/src/multiarray/compiled_base.h10
-rw-r--r--numpy/core/src/multiarray/convert.c80
-rw-r--r--numpy/core/src/multiarray/convert_datatype.c25
-rw-r--r--numpy/core/src/multiarray/convert_datatype.h2
-rw-r--r--numpy/core/src/multiarray/ctors.c477
-rw-r--r--numpy/core/src/multiarray/ctors.h31
-rw-r--r--numpy/core/src/multiarray/datetime.c8
-rw-r--r--numpy/core/src/multiarray/datetime_busday.c5
-rw-r--r--numpy/core/src/multiarray/descriptor.c153
-rw-r--r--numpy/core/src/multiarray/descriptor.h23
-rw-r--r--numpy/core/src/multiarray/dlpack.c10
-rw-r--r--numpy/core/src/multiarray/dtype_transfer.c401
-rw-r--r--numpy/core/src/multiarray/dtype_transfer.h1
-rw-r--r--numpy/core/src/multiarray/dtype_traversal.c651
-rw-r--r--numpy/core/src/multiarray/dtype_traversal.h98
-rw-r--r--numpy/core/src/multiarray/dtypemeta.c147
-rw-r--r--numpy/core/src/multiarray/dtypemeta.h84
-rw-r--r--numpy/core/src/multiarray/einsum.c.src42
-rw-r--r--numpy/core/src/multiarray/einsum_sumprod.c.src57
-rw-r--r--numpy/core/src/multiarray/experimental_public_dtype_api.c127
-rw-r--r--numpy/core/src/multiarray/getset.c19
-rw-r--r--numpy/core/src/multiarray/item_selection.c495
-rw-r--r--numpy/core/src/multiarray/mapping.c10
-rw-r--r--numpy/core/src/multiarray/methods.c82
-rw-r--r--numpy/core/src/multiarray/multiarraymodule.c426
-rw-r--r--numpy/core/src/multiarray/multiarraymodule.h3
-rw-r--r--numpy/core/src/multiarray/nditer_api.c71
-rw-r--r--numpy/core/src/multiarray/nditer_constr.c42
-rw-r--r--numpy/core/src/multiarray/nditer_impl.h3
-rw-r--r--numpy/core/src/multiarray/number.c119
-rw-r--r--numpy/core/src/multiarray/refcount.c119
-rw-r--r--numpy/core/src/multiarray/refcount.h8
-rw-r--r--numpy/core/src/multiarray/scalartypes.c.src210
-rw-r--r--numpy/core/src/multiarray/shape.c2
-rw-r--r--numpy/core/src/multiarray/textreading/rows.c11
-rw-r--r--numpy/core/src/multiarray/textreading/tokenize.cpp11
-rw-r--r--numpy/core/src/multiarray/textreading/tokenize.h5
-rw-r--r--numpy/core/src/multiarray/usertypes.c50
-rw-r--r--numpy/core/src/npymath/arm64_exports.c9
-rw-r--r--numpy/core/src/npymath/halffloat.c555
-rw-r--r--numpy/core/src/npymath/halffloat.cpp238
-rw-r--r--numpy/core/src/npymath/npy_math_complex.c.src458
-rw-r--r--numpy/core/src/npymath/npy_math_private.h4
-rw-r--r--numpy/core/src/npysort/heapsort.cpp5
-rw-r--r--numpy/core/src/npysort/mergesort.cpp3
-rw-r--r--numpy/core/src/npysort/npysort_heapsort.h4
-rw-r--r--numpy/core/src/npysort/quicksort.cpp127
-rw-r--r--numpy/core/src/npysort/selection.cpp4
-rw-r--r--numpy/core/src/npysort/simd_qsort.dispatch.cpp44
-rw-r--r--numpy/core/src/npysort/simd_qsort.hpp19
-rw-r--r--numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp39
-rw-r--r--numpy/core/src/npysort/timsort.cpp2
-rw-r--r--numpy/core/src/npysort/x86-qsort.dispatch.cpp835
-rw-r--r--numpy/core/src/npysort/x86-qsort.h28
m---------numpy/core/src/npysort/x86-simd-sort0
-rw-r--r--numpy/core/src/umath/_scaled_float_dtype.c109
-rw-r--r--numpy/core/src/umath/_umath_tests.c.src97
-rw-r--r--numpy/core/src/umath/dispatching.c41
-rw-r--r--numpy/core/src/umath/fast_loop_macros.h30
-rw-r--r--numpy/core/src/umath/legacy_array_method.c121
-rw-r--r--numpy/core/src/umath/loops.c.src594
-rw-r--r--numpy/core/src/umath/loops.h.src286
-rw-r--r--numpy/core/src/umath/loops_arithm_fp.dispatch.c.src1299
-rw-r--r--numpy/core/src/umath/loops_arithmetic.dispatch.c.src42
-rw-r--r--numpy/core/src/umath/loops_autovec.dispatch.c.src287
-rw-r--r--numpy/core/src/umath/loops_comparison.dispatch.c.src36
-rw-r--r--numpy/core/src/umath/loops_exponent_log.dispatch.c.src2
-rw-r--r--numpy/core/src/umath/loops_logical.dispatch.c.src377
-rw-r--r--numpy/core/src/umath/loops_minmax.dispatch.c.src18
-rw-r--r--numpy/core/src/umath/loops_trigonometric.dispatch.c.src251
-rw-r--r--numpy/core/src/umath/loops_umath_fp.dispatch.c.src64
-rw-r--r--numpy/core/src/umath/loops_unary_complex.dispatch.c.src139
-rw-r--r--numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src565
-rw-r--r--numpy/core/src/umath/override.c16
-rw-r--r--numpy/core/src/umath/override.h2
-rw-r--r--numpy/core/src/umath/reduction.c132
-rw-r--r--numpy/core/src/umath/reduction.h6
-rw-r--r--numpy/core/src/umath/scalarmath.c.src31
-rw-r--r--numpy/core/src/umath/simd.inc.src1148
-rw-r--r--numpy/core/src/umath/ufunc_object.c756
-rw-r--r--numpy/core/src/umath/ufunc_object.h7
-rw-r--r--numpy/core/src/umath/ufunc_type_resolution.c35
-rw-r--r--numpy/core/src/umath/wrapping_array_method.c42
-rw-r--r--numpy/core/tests/test_api.py14
-rw-r--r--numpy/core/tests/test_argparse.py4
-rw-r--r--numpy/core/tests/test_array_coercion.py61
-rw-r--r--numpy/core/tests/test_arraymethod.py8
-rw-r--r--numpy/core/tests/test_arrayprint.py84
-rw-r--r--numpy/core/tests/test_cpu_features.py226
-rw-r--r--numpy/core/tests/test_custom_dtypes.py52
-rw-r--r--numpy/core/tests/test_datetime.py55
-rw-r--r--numpy/core/tests/test_defchararray.py88
-rw-r--r--numpy/core/tests/test_deprecations.py409
-rw-r--r--numpy/core/tests/test_dlpack.py3
-rw-r--r--numpy/core/tests/test_dtype.py88
-rw-r--r--numpy/core/tests/test_einsum.py143
-rw-r--r--numpy/core/tests/test_function_base.py37
-rw-r--r--numpy/core/tests/test_getlimits.py60
-rw-r--r--numpy/core/tests/test_indexing.py2
-rw-r--r--numpy/core/tests/test_item_selection.py79
-rw-r--r--numpy/core/tests/test_longdouble.py53
-rw-r--r--numpy/core/tests/test_mem_overlap.py2
-rw-r--r--numpy/core/tests/test_mem_policy.py1
-rw-r--r--numpy/core/tests/test_memmap.py4
-rw-r--r--numpy/core/tests/test_multiarray.py314
-rw-r--r--numpy/core/tests/test_nditer.py34
-rw-r--r--numpy/core/tests/test_nep50_promotions.py54
-rw-r--r--numpy/core/tests/test_numeric.py69
-rw-r--r--numpy/core/tests/test_numerictypes.py20
-rw-r--r--numpy/core/tests/test_overrides.py153
-rw-r--r--numpy/core/tests/test_print.py4
-rw-r--r--numpy/core/tests/test_regression.py57
-rw-r--r--numpy/core/tests/test_scalar_methods.py14
-rw-r--r--numpy/core/tests/test_scalarbuffer.py4
-rw-r--r--numpy/core/tests/test_scalarinherit.py8
-rw-r--r--numpy/core/tests/test_scalarmath.py80
-rw-r--r--numpy/core/tests/test_scalarprint.py6
-rw-r--r--numpy/core/tests/test_shape_base.py15
-rw-r--r--numpy/core/tests/test_simd.py354
-rw-r--r--numpy/core/tests/test_strings.py14
-rw-r--r--numpy/core/tests/test_ufunc.py217
-rw-r--r--numpy/core/tests/test_umath.py409
-rw-r--r--numpy/ctypeslib.pyi4
-rw-r--r--numpy/distutils/ccompiler.py20
-rw-r--r--numpy/distutils/ccompiler_opt.py50
-rw-r--r--numpy/distutils/checks/cpu_avx512_spr.c26
-rw-r--r--numpy/distutils/command/build_clib.py1
-rw-r--r--numpy/distutils/command/build_ext.py7
-rw-r--r--numpy/distutils/command/build_src.py4
-rw-r--r--numpy/distutils/command/install.py2
-rw-r--r--numpy/distutils/fcompiler/__init__.py9
-rw-r--r--numpy/distutils/fcompiler/absoft.py4
-rw-r--r--numpy/distutils/fcompiler/ibm.py2
-rw-r--r--numpy/distutils/fujitsuccompiler.py28
-rw-r--r--numpy/distutils/mingw32ccompiler.py9
-rw-r--r--numpy/distutils/misc_util.py4
-rw-r--r--numpy/distutils/system_info.py89
-rw-r--r--numpy/distutils/tests/test_ccompiler_opt.py6
-rw-r--r--numpy/distutils/tests/test_system_info.py4
-rw-r--r--numpy/dtypes.py77
-rw-r--r--numpy/dtypes.pyi43
-rw-r--r--numpy/dual.py83
-rw-r--r--numpy/exceptions.py10
-rw-r--r--numpy/exceptions.pyi24
-rw-r--r--numpy/f2py/capi_maps.py8
-rw-r--r--numpy/f2py/cfuncs.py9
-rwxr-xr-xnumpy/f2py/crackfortran.py142
-rw-r--r--numpy/f2py/diagnose.py12
-rw-r--r--numpy/f2py/func2subr.py6
-rw-r--r--numpy/f2py/symbolic.py4
-rw-r--r--numpy/f2py/tests/src/crackfortran/gh23533.f5
-rw-r--r--numpy/f2py/tests/src/crackfortran/gh23598.f904
-rw-r--r--numpy/f2py/tests/src/crackfortran/gh23598Warn.f9011
-rw-r--r--numpy/f2py/tests/src/crackfortran/unicode_comment.f904
-rw-r--r--numpy/f2py/tests/src/string/scalar_string.f909
-rw-r--r--numpy/f2py/tests/test_character.py29
-rw-r--r--numpy/f2py/tests/test_crackfortran.py78
-rw-r--r--numpy/f2py/tests/test_f2py2e.py22
-rw-r--r--numpy/f2py/tests/test_kind.py27
-rw-r--r--numpy/f2py/tests/util.py20
-rw-r--r--numpy/fft/_pocketfft.c21
-rw-r--r--numpy/fft/helper.pyi8
-rw-r--r--numpy/lib/__init__.py19
-rw-r--r--numpy/lib/__init__.pyi1
-rw-r--r--numpy/lib/_iotools.py4
-rw-r--r--numpy/lib/arraysetops.py20
-rw-r--r--numpy/lib/format.py36
-rw-r--r--numpy/lib/function_base.py172
-rw-r--r--numpy/lib/histograms.py2
-rw-r--r--numpy/lib/index_tricks.py103
-rw-r--r--numpy/lib/index_tricks.pyi2
-rw-r--r--numpy/lib/mixins.py1
-rw-r--r--numpy/lib/nanfunctions.py24
-rw-r--r--numpy/lib/npyio.py79
-rw-r--r--numpy/lib/npyio.pyi5
-rw-r--r--numpy/lib/polynomial.py4
-rw-r--r--numpy/lib/shape_base.py8
-rw-r--r--numpy/lib/tests/test_arraysetops.py39
-rw-r--r--numpy/lib/tests/test_format.py3
-rw-r--r--numpy/lib/tests/test_function_base.py187
-rw-r--r--numpy/lib/tests/test_io.py64
-rw-r--r--numpy/lib/tests/test_loadtxt.py37
-rw-r--r--numpy/lib/tests/test_nanfunctions.py2
-rw-r--r--numpy/lib/tests/test_shape_base.py4
-rw-r--r--numpy/lib/tests/test_twodim_base.py11
-rw-r--r--numpy/lib/tests/test_type_check.py2
-rw-r--r--numpy/lib/tests/test_ufunclike.py6
-rw-r--r--numpy/lib/twodim_base.py93
-rw-r--r--numpy/lib/type_check.py12
-rw-r--r--numpy/lib/ufunclike.py60
-rw-r--r--numpy/lib/utils.py67
-rw-r--r--numpy/linalg/lapack_lite/clapack_scrub.py2
-rwxr-xr-xnumpy/linalg/lapack_lite/make_lite.py6
-rw-r--r--numpy/linalg/linalg.py32
-rw-r--r--numpy/linalg/setup.py2
-rw-r--r--numpy/linalg/umath_linalg.cpp81
-rw-r--r--numpy/ma/__init__.pyi1
-rw-r--r--numpy/ma/core.py377
-rw-r--r--numpy/ma/core.pyi9
-rw-r--r--numpy/ma/extras.py96
-rw-r--r--numpy/ma/tests/test_core.py159
-rw-r--r--numpy/ma/tests/test_extras.py45
-rw-r--r--numpy/ma/tests/test_old_ma.py8
-rw-r--r--numpy/ma/tests/test_regression.py6
-rw-r--r--numpy/ma/tests/test_subclassing.py10
-rw-r--r--numpy/ma/testutils.py2
-rw-r--r--numpy/meson.build75
-rw-r--r--numpy/polynomial/_polybase.py24
-rw-r--r--numpy/polynomial/chebyshev.py6
-rw-r--r--numpy/polynomial/hermite.py6
-rw-r--r--numpy/polynomial/hermite_e.py6
-rw-r--r--numpy/polynomial/laguerre.py6
-rw-r--r--numpy/polynomial/legendre.py6
-rw-r--r--numpy/polynomial/polynomial.py6
-rw-r--r--numpy/polynomial/tests/test_printing.py4
-rw-r--r--numpy/random/_bounded_integers.pyx.in88
-rw-r--r--numpy/random/_common.pxd40
-rw-r--r--numpy/random/_common.pyx2
-rw-r--r--numpy/random/_generator.pyi99
-rw-r--r--numpy/random/_generator.pyx94
-rw-r--r--numpy/random/_mt19937.pyx8
-rw-r--r--numpy/random/_pcg64.pyx17
-rw-r--r--numpy/random/_philox.pyx10
-rw-r--r--numpy/random/_sfc64.pyx6
-rw-r--r--numpy/random/bit_generator.pyi3
-rw-r--r--numpy/random/bit_generator.pyx67
-rw-r--r--numpy/random/c_distributions.pxd12
-rw-r--r--numpy/random/meson.build2
-rw-r--r--numpy/random/mtrand.pyx18
-rw-r--r--numpy/random/src/distributions/distributions.c10
-rw-r--r--numpy/random/tests/test_direct.py40
-rw-r--r--numpy/random/tests/test_generator_mt19937.py2
-rw-r--r--numpy/random/tests/test_generator_mt19937_regressions.py50
-rw-r--r--numpy/random/tests/test_randomstate.py4
-rw-r--r--numpy/testing/__init__.py10
-rw-r--r--numpy/testing/__init__.pyi6
-rw-r--r--numpy/testing/_private/decorators.py331
-rw-r--r--numpy/testing/_private/noseclasses.py364
-rw-r--r--numpy/testing/_private/nosetester.py545
-rw-r--r--numpy/testing/_private/parameterized.py432
-rw-r--r--numpy/testing/_private/utils.py210
-rw-r--r--numpy/testing/overrides.py11
-rw-r--r--numpy/testing/tests/test_doctesting.py57
-rw-r--r--numpy/testing/tests/test_utils.py40
-rw-r--r--numpy/testing/utils.py29
-rw-r--r--numpy/tests/test_numpy_config.py44
-rw-r--r--numpy/tests/test_public_api.py13
-rw-r--r--numpy/typing/tests/data/fail/einsumfunc.pyi3
-rw-r--r--numpy/typing/tests/data/fail/numerictypes.pyi2
-rw-r--r--numpy/typing/tests/data/pass/dtype.py2
-rw-r--r--numpy/typing/tests/data/pass/ndarray_misc.py2
-rw-r--r--numpy/typing/tests/data/pass/numerictypes.py9
-rw-r--r--numpy/typing/tests/data/pass/simple.py4
-rw-r--r--numpy/typing/tests/data/reveal/einsumfunc.pyi5
-rw-r--r--numpy/typing/tests/data/reveal/modules.pyi2
-rw-r--r--numpy/typing/tests/data/reveal/npyio.pyi10
-rw-r--r--numpy/typing/tests/data/reveal/numerictypes.pyi2
-rw-r--r--numpy/typing/tests/data/reveal/scalars.pyi6
-rw-r--r--numpy/typing/tests/data/reveal/testing.pyi2
-rw-r--r--numpy/typing/tests/test_generic_alias.py188
-rw-r--r--numpy/typing/tests/test_runtime.py6
-rw-r--r--numpy/typing/tests/test_typing.py2
-rw-r--r--pavement.py2
-rw-r--r--pyproject.toml24
-rw-r--r--release_requirements.txt2
-rwxr-xr-xruntests.py17
-rwxr-xr-xsetup.py146
-rw-r--r--site.cfg.example15
-rw-r--r--test_requirements.txt6
-rw-r--r--tools/allocation_tracking/README.md7
-rw-r--r--tools/ci/cirrus_general.yml99
-rw-r--r--tools/ci/cirrus_macosx_arm64.yml55
-rw-r--r--tools/ci/cirrus_wheels.yml157
-rwxr-xr-xtools/ci/push_docs_to_repo.py7
-rwxr-xr-xtools/cythonize.py10
-rw-r--r--tools/gitpod/Dockerfile101
-rw-r--r--tools/gitpod/gitpod.Dockerfile49
-rw-r--r--tools/gitpod/settings.json8
-rw-r--r--tools/gitpod/workspace_config58
-rw-r--r--tools/list_installed_dll_dependencies_cygwin.sh10
-rw-r--r--tools/openblas_support.py60
-rw-r--r--tools/rebase_installed_dlls_cygwin.sh4
-rw-r--r--tools/refguide_check.py6
-rw-r--r--tools/swig/pyfragments.swg2
-rwxr-xr-xtools/travis-test.sh2
-rw-r--r--tools/wheels/check_license.py3
-rw-r--r--tools/wheels/cibw_before_build.sh12
-rw-r--r--tools/wheels/gfortran_utils.sh6
-rw-r--r--tools/wheels/upload_wheels.sh2
626 files changed, 24345 insertions, 14718 deletions
diff --git a/.circleci/config.yml b/.circleci/config.yml
index a75f669af..f5bd44798 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,23 +1,27 @@
-# Python CircleCI 2.0 configuration file
+# Python CircleCI 2.1 configuration file
#
-# Check https://circleci.com/docs/2.0/language-python/ for more details
+# Check https://circleci.com/docs/2.1/language-python/ for more details
#
-version: 2
-jobs:
- build:
- docker:
- # CircleCI maintains a library of pre-built images
- # documented at https://circleci.com/docs/2.0/circleci-images/
- # circleci/python3.8 images come with old versions of Doxygen(1.6.x),
- # therefore a new base image chose instead to guarantee to
- # have a newer version >= 1.8.10 to avoid warnings
- # that related to the default behaviors or non-exist config options
- - image: cimg/base:2021.05
+version: 2.1
+
+# Aliases to reuse
+_defaults: &defaults
+ docker:
+ # CircleCI maintains a library of pre-built images
+ # documented at https://circleci.com/docs/2.1/circleci-images/
+ # circleci/python3.8 images come with old versions of Doxygen(1.6.x),
+ # therefore a new base image chose instead to guarantee to
+ # have a newer version >= 1.8.10 to avoid warnings
+ # that related to the default behaviors or non-exist config options
+ - image: cimg/python:3.9
+ working_directory: ~/repo
- working_directory: ~/repo
+jobs:
+ build:
+ <<: *defaults
steps:
- - checkout:
+ - checkout
- run:
name: Check-skip
@@ -47,9 +51,10 @@ jobs:
name: create virtual environment, install dependencies
command: |
sudo apt-get update
- sudo apt-get install -y python3.8 python3.8-dev python3-venv graphviz texlive-fonts-recommended texlive-latex-recommended \
- texlive-latex-extra latexmk texlive-xetex doxygen
- python3.8 -m venv venv
+ #sudo apt-get install -y python3.9 python3.9-dev python3-venv graphviz texlive-fonts-recommended texlive-latex-recommended \
+ sudo apt-get install -y graphviz texlive-fonts-recommended texlive-latex-recommended \
+ texlive-latex-extra latexmk texlive-xetex texlive-lang-chinese doxygen
+ python3.9 -m venv venv
. venv/bin/activate
- run:
@@ -82,77 +87,92 @@ jobs:
. venv/bin/activate
cd doc
# Don't use -q, show warning summary"
- SPHINXOPTS="-j4 -n" make -e html || echo "ignoring errors for now, see gh-13114"
-
- - run:
- name: build devdocs
- no_output_timeout: 30m
- command: |
- . venv/bin/activate
- cd doc
- make clean
- SPHINXOPTS="-j4 -q" make -e html
+ SPHINXOPTS="-j2 -n" make -e html || echo "ignoring errors for now, see gh-13114"
+ if [[ $(find build/html -type f | wc -l) -lt 1000 ]]; then
+ echo "doc build failed: build/html is empty"
+ exit 1
+ fi
- run:
name: build neps
command: |
. venv/bin/activate
cd doc/neps
- SPHINXOPTS="-j4 -q" make -e html
+ SPHINXOPTS="-j2 -q" make -e html
- store_artifacts:
path: doc/build/html/
-
- store_artifacts:
path: doc/neps/_build/html/
# destination: neps
+ - persist_to_workspace:
+ root: ~/repo
+ paths:
+ - doc/build/html
+ - doc/neps/_build
+ - tools/ci/push_docs_to_repo.py
+
+ deploy:
+ <<: *defaults
+ steps:
+ - checkout
+
+ - attach_workspace:
+ at: ~/repo
+
- add_ssh_keys:
fingerprints:
- - "9f:8c:e5:3f:53:40:0b:ee:c9:c3:0f:fd:0f:3c:cc:55"
+ - "45:d8:d1:d6:f7:53:47:c5:d0:9e:35:19:79:e7:ff:24"
- run:
name: deploy devdocs
command: |
- if [ "${CIRCLE_BRANCH}" == "main" ]; then
- touch doc/build/html/.nojekyll
-
- ./tools/ci/push_docs_to_repo.py doc/build/html \
- git@github.com:numpy/devdocs.git \
- --committer "numpy-circleci-bot" \
- --email "numpy-circleci-bot@nomail" \
- --message "Docs build of $CIRCLE_SHA1" \
- --force
- else
- echo "Not on the main branch; skipping deployment"
- fi
+ touch doc/build/html/.nojekyll
+
+ ./tools/ci/push_docs_to_repo.py doc/build/html \
+ --committer "numpy-circleci-bot" \
+ --email "numpy-circleci-bot@nomail" \
+ --message "Docs build of $CIRCLE_SHA1" \
+ --count 5 \
+ --force \
+ git@github.com:numpy/devdocs.git
- add_ssh_keys:
fingerprints:
- - "11:fb:19:69:80:3a:6d:37:9c:d1:ac:20:17:cd:c8:17"
+ - "df:8b:fb:34:2d:38:7d:49:fc:1b:e8:44:4f:bd:2c:0e"
- run:
name: select SSH key for neps repo
command: |
- cat <<\EOF > ~/.ssh/config
+ cat \<<\EOF > ~/.ssh/config
Host github.com
IdentitiesOnly yes
- IdentityFile /home/circleci/.ssh/id_rsa_11fb1969803a6d379cd1ac2017cdc817
+ IdentityFile /home/circleci/.ssh/id_rsa_df8bfb342d387d49fc1be8444fbd2c0e
EOF
- run:
name: deploy neps
command: |
- if [ "${CIRCLE_BRANCH}" == "main" ]; then
- touch doc/neps/_build/html/.nojekyll
-
- ./tools/ci/push_docs_to_repo.py doc/neps/_build/html \
- git@github.com:numpy/neps.git \
- --committer "numpy-circleci-bot" \
- --email "numpy-circleci-bot@nomail" \
- --message "Docs build of $CIRCLE_SHA1" \
- --force
- else
- echo "Not on the main branch; skipping deployment"
- fi
+ touch doc/neps/_build/html/.nojekyll
+
+ ./tools/ci/push_docs_to_repo.py doc/neps/_build/html \
+ --committer "numpy-circleci-bot" \
+ --email "numpy-circleci-bot@nomail" \
+ --message "Docs build of $CIRCLE_SHA1" \
+ --count 5 \
+ --force \
+ git@github.com:numpy/neps.git
+
+workflows:
+ version: 2
+ default:
+ jobs:
+ - build
+ - deploy:
+ requires:
+ - build
+ filters:
+ branches:
+ only: main
diff --git a/.cirrus.star b/.cirrus.star
index 20460b8b2..25c7b7dfd 100644
--- a/.cirrus.star
+++ b/.cirrus.star
@@ -16,8 +16,9 @@ def main(ctx):
if env.get("CIRRUS_REPO_FULL_NAME") != "numpy/numpy":
return []
- # if env.get("CIRRUS_CRON", "") == "nightly":
- # return fs.read("ci/cirrus_wheels.yml")
+ # only run the wheels entry on a cron job
+ if env.get("CIRRUS_CRON", "") == "nightly":
+ return fs.read("tools/ci/cirrus_wheels.yml")
# Obtain commit message for the event. Unfortunately CIRRUS_CHANGE_MESSAGE
# only contains the actual commit message on a non-PR trigger event.
@@ -31,8 +32,8 @@ def main(ctx):
if "[skip cirrus]" in dct["message"] or "[skip ci]" in dct["message"]:
return []
- config = fs.read("tools/ci/cirrus_general.yml")
-
# add extra jobs to the cirrus run by += adding to config
+ config = fs.read("tools/ci/cirrus_wheels.yml")
+ config += fs.read("tools/ci/cirrus_macosx_arm64.yml")
return config
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
deleted file mode 100644
index e09631a59..000000000
--- a/.devcontainer/Dockerfile
+++ /dev/null
@@ -1,18 +0,0 @@
-# See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.245.2/containers/python-3-anaconda/.devcontainer/base.Dockerfile
-
-FROM mcr.microsoft.com/vscode/devcontainers/anaconda:0-3
-
-# [Choice] Node.js version: none, lts/*, 16, 14, 12, 10
-ARG NODE_VERSION="none"
-RUN if [ "${NODE_VERSION}" != "none" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi
-
-
-# Copy environment.yml (if found) to a temp location so we update the environment. Also
-# copy "noop.txt" so the COPY instruction does not fail if no environment.yml exists.
-COPY environment.yml* /tmp/conda-tmp/
-RUN if [ -f "/tmp/conda-tmp/environment.yml" ]; then umask 0002 && /opt/conda/bin/conda env create -f /tmp/conda-tmp/environment.yml; fi \
- && rm -rf /tmp/conda-tmp
-
-# [Optional] Uncomment this section to install additional OS packages.
-# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
-# && apt-get -y install --no-install-recommends <your-package-list-here>
diff --git a/.devcontainer/add-notice.sh b/.devcontainer/add-notice.sh
deleted file mode 100644
index bd6d6f340..000000000
--- a/.devcontainer/add-notice.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-# Display a notice when not running in GitHub Codespaces
-
-cat << 'EOF' > /usr/local/etc/vscode-dev-containers/conda-notice.txt
-When using "conda" from outside of GitHub Codespaces, note the Anaconda repository
-contains restrictions on commercial use that may impact certain organizations. See
-https://aka.ms/vscode-remote/conda/anaconda
-EOF
-
-notice_script="$(cat << 'EOF'
-if [ -t 1 ] && [ "${IGNORE_NOTICE}" != "true" ] && [ "${TERM_PROGRAM}" = "vscode" ] && [ "${CODESPACES}" != "true" ] && [ ! -f "$HOME/.config/vscode-dev-containers/conda-notice-already-displayed" ]; then
- cat "/usr/local/etc/vscode-dev-containers/conda-notice.txt"
- mkdir -p "$HOME/.config/vscode-dev-containers"
- ((sleep 10s; touch "$HOME/.config/vscode-dev-containers/conda-notice-already-displayed") &)
-fi
-EOF
-)"
-
-echo "${notice_script}" | tee -a /etc/bash.bashrc >> /etc/zsh/zshrc \ No newline at end of file
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 6011b2008..a31be7931 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,55 +1,17 @@
-// For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
-// https://github.com/microsoft/vscode-dev-containers/tree/v0.245.2/containers/python-3-anaconda
{
- "name": "Numpy (devcontainer)",
- "build": {
- "context": "..",
- "dockerfile": "Dockerfile",
- "args": {
- "NODE_VERSION": "lts/*"
- }
- },
+ // More info about Features: https://containers.dev/features
+ "image": "mcr.microsoft.com/devcontainers/universal:2",
+ "features": {},
+
+ "onCreateCommand": ".devcontainer/setup.sh",
+ "postCreateCommand": "",
- // Configure tool-specific properties.
"customizations": {
- // Configure properties specific to VS Code.
"vscode": {
- // Set *default* container specific settings.json values on container create.
- "settings": {
- "python.defaultInterpreterPath": "/opt/conda/bin/python",
- "python.linting.enabled": true,
- "python.linting.pylintEnabled": true,
- "python.formatting.autopep8Path": "/opt/conda/bin/autopep8",
- "python.formatting.yapfPath": "/opt/conda/bin/yapf",
- "python.linting.flake8Path": "/opt/conda/bin/flake8",
- "python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle",
- "python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle",
- "python.linting.pylintPath": "/opt/conda/bin/pylint"
- },
-
- // Add the IDs of extensions you want installed when the container is created.
"extensions": [
- "ms-python.python",
- "ms-python.vscode-pylance",
- "njpwerner.autodocstring",
- "ms-toolsai.jupyter",
- "donjayamanne.python-environment-manager",
- "ms-azuretools.vscode-docker"
- ]
- }
- },
-
- // Use 'forwardPorts' to make a list of ports inside the container available locally.
- // "forwardPorts": [],
-
- // Use 'postCreateCommand' to run commands after the container is created.
- "postCreateCommand": "conda init",
-
- // Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
- "remoteUser": "vscode",
- "features": {
- "ghcr.io/devcontainers/features/rust:1": {
- "version": "latest"
+ "ms-python.python"
+ ],
+ "settings": {}
}
}
}
diff --git a/.devcontainer/setup.sh b/.devcontainer/setup.sh
new file mode 100755
index 000000000..4ea718ec9
--- /dev/null
+++ b/.devcontainer/setup.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+set -e
+
+curl -L micro.mamba.pm/install.sh | bash
+
+conda init --all
+micromamba shell init -s bash
+micromamba env create -f environment.yml --yes
+# Note that `micromamba activate numpy-dev` doesn't work, it must be run by the
+# user (same applies to `conda activate`)
+
+git submodule update --init
diff --git a/.gitattributes b/.gitattributes
index 911db2b72..537650d39 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -12,6 +12,7 @@ numpy/linalg/lapack_lite/f2c.h linguist-vendored
tools/npy_tempita/* linguist-vendored
numpy/core/include/numpy/libdivide/* linguist-vendored
numpy/core/src/umath/svml/* linguist-vendored
+numpy/core/src/common/dlpack/dlpack.h linguist-vendored
# Mark some files as generated
numpy/linalg/lapack_lite/f2c_*.c linguist-generated
diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml
index cfb07f450..a15a3a144 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -45,8 +45,10 @@ body:
- type: textarea
attributes:
- label: "NumPy/Python version information:"
- description: Output from `import sys, numpy; print(numpy.__version__, sys.version)`.
+ label: "Runtime information:"
+ description: >
+ Output from `import sys, numpy; print(numpy.__version__); print(sys.version)`
+ If you are running NumPy 1.24+, also show the output of `numpy.show_runtime()`
validations:
required: true
@@ -58,4 +60,4 @@ body:
placeholder: |
<< your explanation here >>
validations:
- required: false \ No newline at end of file
+ required: false
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 000000000..b59fe181d
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,10 @@
+version: 2
+updates:
+ - package-ecosystem: github-actions
+ directory: /
+ schedule:
+ interval: daily
+ commit-message:
+ prefix: "MAINT"
+ labels:
+ - "03 - Maintenance"
diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml
index fd38b04cc..70e11a4a4 100644
--- a/.github/workflows/build_test.yml
+++ b/.github/workflows/build_test.yml
@@ -3,8 +3,8 @@ name: Build_Test
on:
push:
branches:
+ # coverage comparison in the "full" step needs to run on main after merges
- main
- - maintenance/**
pull_request:
branches:
- main
@@ -16,7 +16,7 @@ defaults:
env:
DOWNLOAD_OPENBLAS: 1
- PYTHON_VERSION: 3.8
+ PYTHON_VERSION: 3.9
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -27,15 +27,15 @@ permissions:
jobs:
lint:
- if: "github.repository == 'numpy/numpy' && github.ref != 'refs/heads/main' && !contains(github.event.head_commit.message, '[ci skip]') && !contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, '[skip github]')"
+ if: github.repository == 'numpy/numpy' && github.event_name != 'push'
runs-on: ubuntu-latest
continue-on-error: true
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install linter requirements
@@ -46,16 +46,16 @@ jobs:
python tools/linter.py --branch origin/${{ github.base_ref }}
smoke_test:
- if: "github.repository == 'numpy/numpy' && !contains(github.event.head_commit.message, '[ci skip]') && !contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, '[skip github]')"
+ if: "github.repository == 'numpy/numpy'"
runs-on: ubuntu-latest
env:
WITHOUT_SIMD: 1
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: ./.github/actions
@@ -63,17 +63,18 @@ jobs:
basic:
needs: [smoke_test]
runs-on: ubuntu-latest
+ if: github.event_name != 'push'
strategy:
matrix:
- python-version: ["3.9", "3.10", "3.11-dev"]
+ python-version: ["3.9", "3.10", "3.11", "pypy3.9-v7.3.11"]
env:
EXPECT_CPU_FEATURES: "SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F AVX512CD AVX512_KNL AVX512_KNM AVX512_SKX AVX512_CLX AVX512_CNL AVX512_ICL"
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ matrix.python-version }}
- uses: ./.github/actions
@@ -82,20 +83,21 @@ jobs:
needs: [smoke_test]
# provides GCC 7, 8
runs-on: ubuntu-20.04
+ if: github.event_name != 'push'
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
# comes with python3.6
- - name: Install Python3.8
+ - name: Install Python3.9
run: |
sudo apt update
# for add-apt-repository
sudo apt install software-properties-common -y
sudo add-apt-repository ppa:deadsnakes/ppa -y
- sudo apt install python3.8-dev -y
- sudo ln -s /usr/bin/python3.8 /usr/bin/pythonx
+ sudo apt install python3.9-dev -y
+ sudo ln -s /usr/bin/python3.9 /usr/bin/pythonx
pythonx -m pip install --upgrade pip setuptools wheel
pythonx -m pip install -r test_requirements.txt
- name: Install Compilers
@@ -118,14 +120,15 @@ jobs:
without_optimizations:
needs: [smoke_test]
runs-on: ubuntu-latest
+ if: github.event_name != 'push'
env:
WITHOUT_OPTIMIZATIONS: 1
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: ./.github/actions
@@ -133,14 +136,15 @@ jobs:
with_baseline_only:
needs: [smoke_test]
runs-on: ubuntu-latest
+ if: github.event_name != 'push'
env:
CPU_DISPATCH: "none"
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: ./.github/actions
@@ -148,14 +152,15 @@ jobs:
without_avx512:
needs: [smoke_test]
runs-on: ubuntu-latest
+ if: github.event_name != 'push'
env:
CPU_DISPATCH: "max -xop -fma4 -avx512f -avx512cd -avx512_knl -avx512_knm -avx512_skx -avx512_clx -avx512_cnl -avx512_icl"
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: ./.github/actions
@@ -163,29 +168,31 @@ jobs:
without_avx512_avx2_fma3:
needs: [smoke_test]
runs-on: ubuntu-latest
+ if: github.event_name != 'push'
env:
CPU_DISPATCH: "SSSE3 SSE41 POPCNT SSE42 AVX F16C"
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: ./.github/actions
debug:
needs: [smoke_test]
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-latest
+ if: github.event_name != 'push'
env:
USE_DEBUG: 1
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: ./.github/actions
@@ -193,14 +200,15 @@ jobs:
blas64:
needs: [smoke_test]
runs-on: ubuntu-latest
+ if: github.event_name != 'push'
env:
NPY_USE_BLAS_ILP64: 1
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: ./.github/actions
@@ -214,11 +222,11 @@ jobs:
RUN_COVERAGE: 1
INSTALL_PICKLE5: 1
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: ./.github/actions
@@ -226,6 +234,7 @@ jobs:
benchmark:
needs: [smoke_test]
runs-on: ubuntu-latest
+ if: github.event_name != 'push'
env:
PYTHONOPTIMIZE: 2
BLAS: None
@@ -235,11 +244,11 @@ jobs:
NPY_LAPACK_ORDER: MKL,OPENBLAS,ATLAS,LAPACK
USE_ASV: 1
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: ./.github/actions
@@ -247,16 +256,17 @@ jobs:
relaxed_strides_debug:
needs: [smoke_test]
runs-on: ubuntu-latest
+ if: github.event_name != 'push'
env:
CHECK_BLAS: 1
NPY_USE_BLAS_ILP64: 1
NPY_RELAXED_STRIDES_DEBUG: 1
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: ./.github/actions
@@ -264,29 +274,35 @@ jobs:
use_wheel:
needs: [smoke_test]
runs-on: ubuntu-latest
+ if: github.event_name != 'push'
env:
USE_WHEEL: 1
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: ./.github/actions
- no_array_func:
+ numpy2_flag:
needs: [smoke_test]
runs-on: ubuntu-latest
+ if: github.event_name != 'push'
env:
- NUMPY_EXPERIMENTAL_ARRAY_FUNCTION: 0
+ # Test for numpy-2.0 feature-flagged behavior.
+ NPY_NUMPY_2_BEHAVIOR: 1
+ # Using the future "weak" state doesn't pass tests
+ # currently unfortunately
+ NPY_PROMOTION_STATE: legacy
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: ./.github/actions
@@ -294,55 +310,45 @@ jobs:
no_openblas:
needs: [smoke_test]
runs-on: ubuntu-latest
+ if: github.event_name != 'push'
env:
BLAS: None
LAPACK: None
ATLAS: None
DOWNLOAD_OPENBLAS: ''
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: ./.github/actions
- pypy38:
- needs: [smoke_test]
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v3
- with:
- submodules: recursive
- fetch-depth: 0
- - uses: actions/setup-python@v4
- with:
- python-version: pypy-3.8-v7.3.9
- - uses: ./.github/actions
-
sdist:
needs: [smoke_test]
runs-on: ubuntu-latest
+ if: github.event_name != 'push'
env:
USE_SDIST: 1
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: ./.github/actions
armv7_simd_test:
needs: [smoke_test]
- # make sure this (20.04) matches the base docker image (focal) below
- runs-on: ubuntu-20.04
+ # make sure this matches the base docker image below
+ runs-on: ubuntu-22.04
+ if: github.event_name != 'push'
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
@@ -353,19 +359,20 @@ jobs:
run: |
# use x86_64 cross-compiler to speed up the build
sudo apt update
- sudo apt install -y gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf
+ sudo apt install -y gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf gfortran-arm-linux-gnueabihf
# Keep the `test_requirements.txt` dependency-subset synced
- docker run --name the_container --interactive -v /:/host arm32v7/ubuntu:focal /bin/bash -c "
+ docker run --name the_container --interactive -v /:/host arm32v7/ubuntu:22.04 /bin/bash -c "
apt update &&
- apt install -y git python3 python3-dev python3-pip &&
- pip3 install cython==0.29.30 setuptools\<49.2.0 hypothesis==6.23.3 pytest==6.2.5 'typing_extensions>=4.2.0' &&
+ apt install -y git python3 python3-dev python3-pip &&
+ python3 -m pip install cython==0.29.34 setuptools\<49.2.0 hypothesis==6.23.3 pytest==6.2.5 'typing_extensions>=4.2.0' &&
ln -s /host/lib64 /lib64 &&
ln -s /host/lib/x86_64-linux-gnu /lib/x86_64-linux-gnu &&
ln -s /host/usr/arm-linux-gnueabihf /usr/arm-linux-gnueabihf &&
rm -rf /usr/lib/gcc/arm-linux-gnueabihf && ln -s /host/usr/lib/gcc-cross/arm-linux-gnueabihf /usr/lib/gcc/arm-linux-gnueabihf &&
rm -f /usr/bin/arm-linux-gnueabihf-gcc && ln -s /host/usr/bin/arm-linux-gnueabihf-gcc /usr/bin/arm-linux-gnueabihf-gcc &&
rm -f /usr/bin/arm-linux-gnueabihf-g++ && ln -s /host/usr/bin/arm-linux-gnueabihf-g++ /usr/bin/arm-linux-gnueabihf-g++ &&
+ rm -f /usr/bin/arm-linux-gnueabihf-gfortran && ln -s /host/usr/bin/arm-linux-gnueabihf-gfortran /usr/bin/arm-linux-gnueabihf-gfortran &&
rm -f /usr/bin/arm-linux-gnueabihf-ar && ln -s /host/usr/bin/arm-linux-gnueabihf-ar /usr/bin/arm-linux-gnueabihf-ar &&
rm -f /usr/bin/arm-linux-gnueabihf-as && ln -s /host/usr/bin/arm-linux-gnueabihf-as /usr/bin/arm-linux-gnueabihf-as &&
rm -f /usr/bin/arm-linux-gnueabihf-ld && ln -s /host/usr/bin/arm-linux-gnueabihf-ld /usr/bin/arm-linux-gnueabihf-ld &&
@@ -378,6 +385,7 @@ jobs:
uname -a &&
gcc --version &&
g++ --version &&
+ arm-linux-gnueabihf-gfortran --version &&
python3 --version &&
git config --global --add safe.directory /numpy
cd /numpy &&
@@ -387,7 +395,7 @@ jobs:
- name: Run SIMD Tests
run: |
docker run --rm --interactive -v $(pwd):/numpy the_build /bin/bash -c "
- cd /numpy && python3 runtests.py -n -v -- -k test_simd
+ cd /numpy && F90=arm-linux-gnueabihf-gfortran python3 runtests.py -n -v -- -k 'test_simd or test_kind'
"
sde_simd_avx512_test:
@@ -397,11 +405,11 @@ jobs:
needs: [smoke_test]
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install Intel SDE
@@ -421,3 +429,39 @@ jobs:
# ICL implies SKX, CLX and CNL
- name: Run SIMD tests (Ice Lake)
run: sde -icl -- python runtests.py -n -v -- -k test_simd
+
+ intel_spr_sde_test:
+ needs: [smoke_test]
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
+ with:
+ submodules: recursive
+ fetch-depth: 0
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
+ with:
+ python-version: ${{ env.PYTHON_VERSION }}
+ - name: Install Intel SDE
+ run: |
+ curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/751535/sde-external-9.14.0-2022-10-25-lin.tar.xz
+ mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
+ sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
+ - name: Install dependencies
+ run: |
+ python -m pip install -r test_requirements.txt
+ sudo apt install gcc-12 g++-12
+ - name: Build and install NumPy
+ run: |
+ export CC=/usr/bin/gcc-12
+ export CXX=/usr/bin/g++-12
+ python -m pip install -e .
+ - name: Show config
+ run: |
+ python -c "import numpy as np; np.show_config()"
+ # Run only a few tests, running everything in an SDE takes a long time
+ # Using pytest directly, unable to use python runtests.py -n -t ...
+ # Disabled running in the SDE because of an SDE bug
+ - name: Run linalg/ufunc/umath tests
+ run: |
+ python -m pytest numpy/core/tests/test_umath* numpy/core/tests/test_ufunc.py numpy/linalg/tests/test_*
+ #sde -spr -- python -m pytest numpy/core/tests/test_umath* numpy/core/tests/test_ufunc.py numpy/linalg/tests/test_*
diff --git a/.github/workflows/circleci.yml b/.github/workflows/circleci.yml
index de191d068..75638f6b6 100644
--- a/.github/workflows/circleci.yml
+++ b/.github/workflows/circleci.yml
@@ -4,21 +4,22 @@
name: CircleCI artifact redirector
-permissions:
- contents: read # to fetch code (actions/checkout)
-
on: [status]
+
+permissions: read-all
+
jobs:
circleci_artifacts_redirector_job:
runs-on: ubuntu-latest
+ if: "github.repository == 'numpy/numpy' && !contains(github.event.head_commit.message, '[circle skip]') && !contains(github.event.head_commit.message, '[skip circle]') && github.event.context == 'ci/circleci: build'"
name: Run CircleCI artifacts redirector
permissions:
- pull-requests: write
- # if: github.repository == 'numpy/numpy'
+ statuses: write
steps:
- name: GitHub Action step
- uses: larsoner/circleci-artifacts-redirector-action@master
+ uses: larsoner/circleci-artifacts-redirector-action@0a7552bf8cf99cbd40a8928fa48e858e205b98c8 # master
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
+ api-token: ${{ secrets.CIRCLE_TOKEN }}
artifact-path: 0/doc/build/html/index.html
circleci-jobs: build
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644
index 000000000..05eca6116
--- /dev/null
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,73 @@
+# For most projects, this workflow file will not need changing; you simply need
+# to commit it to your repository.
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# ******** NOTE ********
+# We have attempted to detect the languages in your repository. Please check
+# the `language` matrix defined below to confirm you have the correct set of
+# supported CodeQL languages.
+#
+name: "CodeQL"
+
+on:
+ push:
+ branches: ["main"]
+ pull_request:
+ # The branches below must be a subset of the branches above
+ branches: ["main"]
+ schedule:
+ - cron: "0 0 * * 1"
+
+permissions:
+ contents: read
+
+jobs:
+ analyze:
+ name: Analyze
+ runs-on: ubuntu-latest
+ permissions:
+ actions: read
+ contents: read
+ security-events: write
+
+ strategy:
+ fail-fast: false
+ matrix:
+ language: ["python"]
+ # CodeQL supports [ $supported-codeql-languages ]
+ # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
+
+ # Initializes the CodeQL tools for scanning.
+ - name: Initialize CodeQL
+ uses: github/codeql-action/init@29b1f65c5e92e24fe6b6647da1eaabe529cec70f # v2.3.3
+ with:
+ languages: ${{ matrix.language }}
+ # If you wish to specify custom queries, you can do so here or in a config file.
+ # By default, queries listed here will override any specified in a config file.
+ # Prefix the list here with "+" to use these queries and those in the config file.
+
+ # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
+ # If this step fails, then you should remove it and run the build manually (see below)
+ - name: Autobuild
+ uses: github/codeql-action/autobuild@29b1f65c5e92e24fe6b6647da1eaabe529cec70f # v2.3.3
+
+ # ℹ️ Command-line programs to run using the OS shell.
+ # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
+
+ # If the Autobuild fails above, remove it and uncomment the following three lines.
+ # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
+
+ # - run: |
+ # echo "Run, Build Application using script"
+ # ./location_of_script_within_repo/buildscript.sh
+
+ - name: Perform CodeQL Analysis
+ uses: github/codeql-action/analyze@29b1f65c5e92e24fe6b6647da1eaabe529cec70f # v2.3.3
+ with:
+ category: "/language:${{matrix.language}}"
diff --git a/.github/workflows/cygwin.yml b/.github/workflows/cygwin.yml
index c028b2974..1865825c4 100644
--- a/.github/workflows/cygwin.yml
+++ b/.github/workflows/cygwin.yml
@@ -3,10 +3,6 @@
# if: github.repository == 'numpy/numpy'
name: Test on Cygwin
on:
- push:
- branches:
- - main
- - maintenance/**
pull_request:
branches:
- main
@@ -22,25 +18,25 @@ permissions:
jobs:
cygwin_build_test:
runs-on: windows-latest
- if: github.repository == 'numpy/numpy'
+ if: "github.repository == 'numpy/numpy'"
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- name: Install Cygwin
- uses: cygwin/cygwin-install-action@v2
+ uses: cygwin/cygwin-install-action@006ad0b0946ca6d0a3ea2d4437677fa767392401 # v4
with:
platform: x86_64
install-dir: 'C:\tools\cygwin'
packages: >-
- python38-devel python38-zipp python38-importlib-metadata
- python38-cython python38-pip python38-wheel python38-cffi
- python38-pytz python38-setuptools python38-pytest
- python38-hypothesis liblapack-devel
+ python39-devel python39-zipp python39-importlib-metadata
+ python39-cython python39-pip python39-wheel python39-cffi
+ python39-pytz python39-setuptools python39-pytest
+ python39-hypothesis liblapack-devel
gcc-fortran gcc-g++ git dash
- name: Set Windows PATH
- uses: egor-tensin/cleanup-path@v1
+ uses: egor-tensin/cleanup-path@8469525c8ee3eddabbd3487658621a6235b3c581 # v3
with:
dirs: 'C:\tools\cygwin\bin;C:\tools\cygwin\lib\lapack'
- name: Verify that bash is Cygwin bash
@@ -53,34 +49,35 @@ jobs:
- name: Verify python version
# Make sure it's the Cygwin one, not a Windows one
run: |
- dash -c "which python3.8; /usr/bin/python3.8 --version -V"
+ dash -c "which python3.9; /usr/bin/python3.9 --version -V"
- name: Build NumPy wheel
run: |
- dash -c "/usr/bin/python3.8 -m pip install 'setuptools<49.2.0' pytest pytz cffi pickle5 importlib_metadata typing_extensions"
- dash -c "/usr/bin/python3.8 -m pip install -r test_requirements.txt"
- dash -c "/usr/bin/python3.8 setup.py bdist_wheel"
+ dash -c "/usr/bin/python3.9 -m pip install 'setuptools<49.2.0' pytest pytz cffi pickle5 importlib_metadata typing_extensions"
+ dash -c "/usr/bin/python3.9 -m pip install -r test_requirements.txt"
+ dash -c "/usr/bin/python3.9 setup.py bdist_wheel"
- name: Install new NumPy
run: |
- bash -c "/usr/bin/python3.8 -m pip install dist/numpy-*cp38*.whl"
+ bash -c "/usr/bin/python3.9 -m pip install dist/numpy-*cp39*.whl"
- name: Rebase NumPy compiled extensions
run: |
- dash "tools/rebase_installed_dlls_cygwin.sh" 3.8
+ dash "tools/rebase_installed_dlls_cygwin.sh" 3.9
- name: Run NumPy test suite
- run: >-
- dash -c "/usr/bin/python3.8 runtests.py -n -vv"
+ shell: "C:\\tools\\cygwin\\bin\\bash.exe -o igncr -eo pipefail {0}"
+ run: |
+ /usr/bin/python3.9 runtests.py -n
- name: Upload wheel if tests fail
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
if: failure()
with:
name: numpy-cygwin-wheel
- path: dist/numpy-*cp38*.whl
+ path: dist/numpy-*cp39*.whl
- name: Check the extension modules on failure
if: failure()
run: |
- dash -c "/usr/bin/python3.8 -m pip show numpy"
- dash -c "/usr/bin/python3.8 -m pip show -f numpy | grep .dll"
+ dash -c "/usr/bin/python3.9 -m pip show numpy"
+ dash -c "/usr/bin/python3.9 -m pip show -f numpy | grep .dll"
dash -c "/bin/tr -d '\r' <tools/list_installed_dll_dependencies_cygwin.sh >list_dlls_unix.sh"
- dash "list_dlls_unix.sh" 3.8
+ dash "list_dlls_unix.sh" 3.9
- name: Print installed package versions on failure
if: failure()
run: |
diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml
new file mode 100644
index 000000000..cc5530fd6
--- /dev/null
+++ b/.github/workflows/dependency-review.yml
@@ -0,0 +1,20 @@
+# Dependency Review Action
+#
+# This Action will scan dependency manifest files that change as part of a Pull Request, surfacing known-vulnerable versions of the packages declared or updated in the PR. Once installed, if the workflow run is marked as required, PRs introducing known-vulnerable packages will be blocked from merging.
+#
+# Source repository: https://github.com/actions/dependency-review-action
+# Public documentation: https://docs.github.com/en/code-security/supply-chain-security/understanding-your-software-supply-chain/about-dependency-review#dependency-review-enforcement
+name: 'Dependency Review'
+on: [pull_request]
+
+permissions:
+ contents: read
+
+jobs:
+ dependency-review:
+ runs-on: ubuntu-latest
+ steps:
+ - name: 'Checkout Repository'
+ uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
+ - name: 'Dependency Review'
+ uses: actions/dependency-review-action@f46c48ed6d4f1227fb2d9ea62bf6bcbed315589e # v3.0.4
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
deleted file mode 100644
index 694483ed7..000000000
--- a/.github/workflows/docker.yml
+++ /dev/null
@@ -1,60 +0,0 @@
-name: Build Base Docker Image
-
-on:
- push:
- branches:
- - main
- paths:
- - "environment.yml"
-
-permissions:
- contents: read # to fetch code (actions/checkout)
-
-jobs:
- build_docker:
- name: Build base Docker image
- runs-on: ubuntu-latest
- environment: numpy-dev
- if: "github.repository_owner == 'numpy' && !contains(github.event.head_commit.message, '[ci skip]') && !contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, '[skip github]')"
- steps:
- - name: Clone repository
- uses: actions/checkout@v3
- - name: Lint Docker
- uses: brpaz/hadolint-action@v1.2.1
- with:
- dockerfile: ./tools/gitpod/Dockerfile
- - name: Get refs
- shell: bash
- run: |
- export raw_branch=${GITHUB_REF#refs/heads/}
- echo "branch=${raw_branch//\//-}" >> $GITHUB_OUTPUT
- echo "date=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
- echo "sha8=$(echo ${GITHUB_SHA} | cut -c1-8)" >> $GITHUB_OUTPUT
- id: getrefs
- - name: Set up Docker Buildx
- uses: docker/setup-buildx-action@v1
- - name: Cache Docker layers
- uses: actions/cache@v3
- with:
- path: /tmp/.buildx-cache
- key: ${{ runner.os }}-buildx-${{ github.sha }}
- restore-keys: ${{ runner.os }}-buildx-
- - name: Login to Docker Hub
- uses: docker/login-action@v1
- with:
- username: ${{ secrets.DOCKERHUB_USERNAME }}
- password: ${{ secrets.DOCKERHUB_TOKEN }}
- - name: Build and push
- id: docker_build
- uses: docker/build-push-action@v2
- with:
- context: "."
- file: "./tools/gitpod/Dockerfile"
- push: ${{ github.event_name != 'pull_request' }}
- cache-from: type=local,src=/tmp/.buildx-cache
- cache-to: type=local,dest=/tmp/.buildx-cache
- tags: |
- numpy/numpy-dev:${{ steps.getrefs.outputs.date }}-${{ steps.getrefs.outputs.branch}}-${{ steps.getrefs.outputs.sha8 }}, numpy/numpy-dev:latest
- - name: Image digest
- # Return details of the image build: sha and shell
- run: echo ${{ steps.docker_build.outputs.digest }}
diff --git a/.github/workflows/emscripten.yml b/.github/workflows/emscripten.yml
index 4fb48e6e6..5f01e72b2 100644
--- a/.github/workflows/emscripten.yml
+++ b/.github/workflows/emscripten.yml
@@ -19,19 +19,19 @@ permissions:
jobs:
build-wasm-emscripten:
runs-on: ubuntu-22.04
- if: github.repository == 'numpy/numpy'
+ if: "github.repository == 'numpy/numpy'"
env:
- PYODIDE_VERSION: 0.22.0a3
+ PYODIDE_VERSION: 0.23.1
# PYTHON_VERSION and EMSCRIPTEN_VERSION are determined by PYODIDE_VERSION.
# The appropriate versions can be found in the Pyodide repodata.json
# "info" field, or in Makefile.envs:
# https://github.com/pyodide/pyodide/blob/main/Makefile.envs#L2
- PYTHON_VERSION: 3.10.7
- EMSCRIPTEN_VERSION: 3.1.24
+ PYTHON_VERSION: 3.11.2
+ EMSCRIPTEN_VERSION: 3.1.32
NODE_VERSION: 18
steps:
- name: Checkout numpy
- uses: actions/checkout@v3
+ uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: true
# versioneer.py requires the latest tag to be reachable. Here we
@@ -42,11 +42,11 @@ jobs:
- name: set up python
id: setup-python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- - uses: mymindstorm/setup-emsdk@v11
+ - uses: mymindstorm/setup-emsdk@ab889da2abbcbb280f91ec4c215d3bb4f3a8f775 # v12
with:
version: ${{ env.EMSCRIPTEN_VERSION }}
actions-cache-folder: emsdk-cache
@@ -58,7 +58,7 @@ jobs:
run: CFLAGS=-g2 LDFLAGS=-g2 pyodide build
- name: set up node
- uses: actions/setup-node@v3
+ uses: actions/setup-node@64ed1c7eab4cce3362f8c340dee64e5eaeef8f7c # v3.6.0
with:
node-version: ${{ env.NODE_VERSION }}
diff --git a/.github/workflows/gitpod.yml b/.github/workflows/gitpod.yml
deleted file mode 100644
index 1b521fc63..000000000
--- a/.github/workflows/gitpod.yml
+++ /dev/null
@@ -1,60 +0,0 @@
-name: Build Gitpod Docker image
-
-on:
- push:
- branches:
- - main
-
-permissions:
- contents: read # to fetch code (actions/checkout)
-
-jobs:
- build_gitpod:
- name: Build Gitpod Docker image
- runs-on: ubuntu-latest
- environment: numpy-dev
- if: "github.repository_owner == 'numpy' && !contains(github.event.head_commit.message, '[ci skip]') && !contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, '[skip github]')"
- steps:
- - name: Clone repository
- uses: actions/checkout@v3
- with:
- fetch-depth: 0
- - name: Lint Docker
- uses: brpaz/hadolint-action@v1.2.1
- with:
- dockerfile: ./tools/gitpod/gitpod.Dockerfile
- - name: Get refs
- shell: bash
- run: |
- export raw_branch=${GITHUB_REF#refs/heads/}
- echo "branch=${raw_branch//\//-}" >> $GITHUB_OUTPUT
- echo "date=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
- echo "sha8=$(echo ${GITHUB_SHA} | cut -c1-8)" >> $GITHUB_OUTPUT
- id: getrefs
- - name: Set up Docker Buildx
- uses: docker/setup-buildx-action@v1
- - name: Cache Docker layers
- uses: actions/cache@v3
- with:
- path: /tmp/.buildx-cache
- key: ${{ runner.os }}-buildx-${{ github.sha }}
- restore-keys: ${{ runner.os }}-buildx-
- - name: Login to Docker Hub
- uses: docker/login-action@v1
- with:
- username: ${{ secrets.DOCKERHUB_USERNAME }}
- password: ${{ secrets.DOCKERHUB_TOKEN }}
- - name: Build and push
- id: docker_build
- uses: docker/build-push-action@v2
- with:
- context: "."
- file: "./tools/gitpod/gitpod.Dockerfile"
- push: ${{ github.event_name != 'pull_request' }}
- cache-from: type=local,src=/tmp/.buildx-cache
- cache-to: type=local,dest=/tmp/.buildx-cache
- tags: |
- numpy/numpy-gitpod:${{ steps.getrefs.outputs.date }}-${{ steps.getrefs.outputs.branch}}-${{ steps.getrefs.outputs.sha8 }}, numpy/numpy-gitpod:latest
- - name: Image digest
- # Return details of the image build: sha and shell
- run: echo ${{ steps.docker_build.outputs.digest }}
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
index d673b2faf..9ceafebb7 100644
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -1,19 +1,19 @@
name: "Pull Request Labeler"
on:
pull_request_target:
- types: [opened, synchronize, reopened, edited]
+ types: [opened]
permissions: {}
jobs:
pr-labeler:
- permissions:
- contents: read # to read a configuration file
- pull-requests: write # to add labels to pull requests
-
runs-on: ubuntu-latest
+ permissions:
+ pull-requests: write # to add labels
steps:
- name: Label the PR
- uses: gerrymanoim/pr-prefix-labeler@v3
+ uses: gerrymanoim/pr-prefix-labeler@c8062327f6de59a9ae1c19f7f07cacd0b976b6fa # v3
+ continue-on-error: true
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ if: github.repository == 'numpy/numpy'
diff --git a/.github/workflows/linux_meson.yml b/.github/workflows/linux_meson.yml
index 9b21820fb..126eb8f29 100644
--- a/.github/workflows/linux_meson.yml
+++ b/.github/workflows/linux_meson.yml
@@ -1,10 +1,6 @@
name: Test Meson build (Linux)
on:
- push:
- branches:
- - main
- - maintenance/**
pull_request:
branches:
- main
@@ -25,14 +21,15 @@ permissions:
contents: read # to fetch code (actions/checkout)
jobs:
- meson_devpy:
+ meson_spin:
+ if: "github.repository == 'numpy/numpy'"
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: recursive
fetch-depth: 0
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install dependencies
@@ -44,7 +41,7 @@ jobs:
env:
TERM: xterm-256color
run:
- ./dev.py build -- --werror
+ spin build -- --werror
- name: Check build-internal dependencies
run:
ninja -C build -t missingdeps
@@ -57,4 +54,4 @@ jobs:
TERM: xterm-256color
run: |
pip install pytest hypothesis typing_extensions
- ./dev.py test
+ spin test
diff --git a/.github/workflows/linux_musl.yml b/.github/workflows/linux_musl.yml
new file mode 100644
index 000000000..7d90c20ed
--- /dev/null
+++ b/.github/workflows/linux_musl.yml
@@ -0,0 +1,66 @@
+name: Test musllinux_x86_64
+
+on:
+ pull_request:
+ branches:
+ - main
+ - maintenance/**
+
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+
+permissions:
+ contents: read # to fetch code (actions/checkout)
+
+
+jobs:
+ musllinux_x86_64:
+ runs-on: ubuntu-latest
+ if: "github.repository == 'numpy/numpy'"
+ container:
+ # Use container used for building musllinux wheels
+ # it has git installed, all the pythons, etc
+ image: quay.io/pypa/musllinux_1_1_x86_64
+
+ steps:
+ - name: setup
+ run: |
+ apk update --quiet
+
+ # using git commands to clone because versioneer doesn't work when
+ # actions/checkout is used for the clone step in a container
+
+ git config --global --add safe.directory $PWD
+
+ if [ $GITHUB_EVENT_NAME != pull_request ]; then
+ git clone --recursive --branch=$GITHUB_REF_NAME https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE
+ git reset --hard $GITHUB_SHA
+ else
+ git clone --recursive https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE
+ git fetch origin $GITHUB_REF:my_ref_name
+ git checkout $GITHUB_BASE_REF
+ git -c user.email="you@example.com" merge --no-commit my_ref_name
+ fi
+
+ ln -s /usr/local/bin/python3.10 /usr/local/bin/python
+
+ - name: test musllinux_x86_64
+ run: |
+ python -m venv test_env
+ source test_env/bin/activate
+
+ # required for figuring out the system tags in openblas_support
+ pip install packaging
+
+ # install openblas by co-opting the CIBW setup script
+ RUNNER_OS=Linux sh tools/wheels/cibw_before_build.sh .
+
+ pip install -r build_requirements.txt
+ pip install pytest hypothesis typing_extensions
+
+ # use meson to build and test
+ spin build
+ spin test
diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
index 0cef4f7a9..55fd3ba38 100644
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@@ -25,12 +25,12 @@ jobs:
steps:
- name: "Checkout code"
- uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0
+ uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.1.0
with:
persist-credentials: false
- name: "Run analysis"
- uses: ossf/scorecard-action@99c53751e09b9529366343771cc321ec74e9bd3d # v2.0.6
+ uses: ossf/scorecard-action@80e868c13c90f172d68d1f4501dee99e2479f7af # v2.1.3
with:
results_file: results.sarif
results_format: sarif
@@ -42,7 +42,7 @@ jobs:
# Upload the results as artifacts (optional). Commenting out will disable
# uploads of run results in SARIF format to the repository Actions tab.
- name: "Upload artifact"
- uses: actions/upload-artifact@3cea5372237819ed00197afe530f5a7ea3e805c8 # v3.1.0
+ uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
with:
name: SARIF file
path: results.sarif
@@ -50,6 +50,6 @@ jobs:
# Upload the results to GitHub's code scanning dashboard.
- name: "Upload to code-scanning"
- uses: github/codeql-action/upload-sarif@807578363a7869ca324a79039e6db9c843e0e100 # v2.1.27
+ uses: github/codeql-action/upload-sarif@29b1f65c5e92e24fe6b6647da1eaabe529cec70f # v2.1.27
with:
sarif_file: results.sarif
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index a716139c4..f8f91b49d 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -38,12 +38,12 @@ jobs:
get_commit_message:
name: Get commit message
runs-on: ubuntu-latest
- if: github.repository == 'numpy/numpy'
+ if: "github.repository == 'numpy/numpy'"
outputs:
message: ${{ steps.commit_message.outputs.message }}
steps:
- name: Checkout numpy
- uses: actions/checkout@v3
+ uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
# Gets the correct commit message for pull request
with:
ref: ${{ github.event.pull_request.head.sha }}
@@ -75,18 +75,16 @@ jobs:
# https://github.com/github/feedback/discussions/7835#discussioncomment-1769026
buildplat:
- [ubuntu-20.04, manylinux_x86_64]
- - [macos-12, macosx_*]
+ - [ubuntu-20.04, musllinux_x86_64]
+ - [macos-12, macosx_x86_64]
- [windows-2019, win_amd64]
- [windows-2019, win32]
- # TODO: uncomment PyPy 3.9 builds once PyPy
- # re-releases a new minor version
- # NOTE: This needs a bump of cibuildwheel version, also, once that happens.
- python: ["cp38", "cp39", "cp310", "cp311", "pp38"] #, "pp39"]
+ python: ["cp39", "cp310", "cp311", "pp39"]
exclude:
# Don't build PyPy 32-bit windows
- buildplat: [windows-2019, win32]
- python: "pp38"
- - buildplat: [windows-2019, win32]
+ python: "pp39"
+ - buildplat: [ ubuntu-20.04, musllinux_x86_64 ]
python: "pp39"
env:
IS_32_BIT: ${{ matrix.buildplat[1] == 'win32' }}
@@ -94,7 +92,7 @@ jobs:
IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
steps:
- name: Checkout numpy
- uses: actions/checkout@v3
+ uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: true
# versioneer.py requires the latest tag to be reachable. Here we
@@ -104,35 +102,51 @@ jobs:
fetch-depth: 0
# Used to push the built wheels
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
python-version: "3.x"
+ # We need rtools 4.0 to have 32 bit support on windows
+ - if: runner.os == 'windows'
+ uses: r-windows/install-rtools@13886bb4048f1b862d33869a18b73cdd446a3961 # main
+
- name: setup rtools for 32-bit
run: |
echo "PLAT=i686" >> $env:GITHUB_ENV
- echo "MSYSTEM=MINGW32" >> $env:GITHUB_ENV
- echo "PATH=$env:RTOOLS40_HOME\mingw32\bin;$env:PATH" >> $env:GITHUB_ENV
+ echo "PATH=c:\rtools40\mingw32\bin;$env:PATH" >> $env:GITHUB_ENV
gfortran --version
if: ${{ matrix.buildplat[1] == 'win32' }}
- name: Build wheels
- uses: pypa/cibuildwheel@v2.11.2
+ uses: pypa/cibuildwheel@5e15bb25b428e1bf2daf2215f173d2b40135f56f # v2.12.3
env:
CIBW_BUILD: ${{ matrix.python }}-${{ matrix.buildplat[1] }}
- - uses: actions/upload-artifact@v3
+ - uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
with:
name: ${{ matrix.python }}-${{ startsWith(matrix.buildplat[1], 'macosx') && 'macosx' || matrix.buildplat[1] }}
path: ./wheelhouse/*.whl
+ - uses: conda-incubator/setup-miniconda@3b0f2504dd76ef23b6d31f291f4913fb60ab5ff3 # v2.2.0
+ with:
+ # for installation of anaconda-client, required for upload to
+ # anaconda.org
+ # default (and activated) environment name is test
+ # Note that this step is *after* specific pythons have been used to
+ # build and test the wheel
+ auto-update-conda: true
+ python-version: "3.10"
+
- name: Upload wheels
if: success()
- shell: bash
+ shell: bash -el {0}
+ # see https://github.com/marketplace/actions/setup-miniconda for why
+ # `-el {0}` is required.
env:
NUMPY_STAGING_UPLOAD_TOKEN: ${{ secrets.NUMPY_STAGING_UPLOAD_TOKEN }}
NUMPY_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.NUMPY_NIGHTLY_UPLOAD_TOKEN }}
run: |
+ conda install -y anaconda-client
source tools/wheels/upload_wheels.sh
set_upload_vars
# trigger an upload to
@@ -142,6 +156,7 @@ jobs:
# https://anaconda.org/multibuild-wheels-staging/numpy
# The tokens were originally generated at anaconda.org
upload_wheels
+
build_sdist:
name: Build sdist
needs: get_commit_message
@@ -156,10 +171,11 @@ jobs:
runs-on: ubuntu-latest
env:
IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
- IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
+ # commented out so the sdist doesn't upload to nightly
+ # IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
steps:
- name: Checkout numpy
- uses: actions/checkout@v3
+ uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
with:
submodules: true
# versioneer.py requires the latest tag to be reachable. Here we
@@ -168,10 +184,10 @@ jobs:
# https://github.com/actions/checkout/issues/338
fetch-depth: 0
# Used to push the built wheels
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
with:
# Build sdist on lowest supported Python
- python-version: "3.8"
+ python-version: "3.9"
- name: Build sdist
run: |
python setup.py sdist
@@ -182,25 +198,37 @@ jobs:
python -m pip install dist/*.gz
pip install -r test_requirements.txt
cd .. # Can't import numpy within numpy src directory
- python -c "import numpy; print(numpy.__version__); numpy.test();"
+ python -c "import numpy, sys; print(numpy.__version__); sys.exit(numpy.test() is False)"
- name: Check README rendering for PyPI
run: |
python -mpip install twine
twine check dist/*
- - uses: actions/upload-artifact@v3
+ - uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
with:
name: sdist
path: ./dist/*
+ - uses: conda-incubator/setup-miniconda@3b0f2504dd76ef23b6d31f291f4913fb60ab5ff3 # v2.2.0
+ with:
+ # for installation of anaconda-client, required for upload to
+ # anaconda.org
+ # default (and activated) environment name is test
+ # Note that this step is *after* specific pythons have been used to
+ # build and test
+ auto-update-conda: true
+ python-version: "3.10"
+
- name: Upload sdist
if: success()
- shell: bash
+ shell: bash -el {0}
env:
NUMPY_STAGING_UPLOAD_TOKEN: ${{ secrets.NUMPY_STAGING_UPLOAD_TOKEN }}
- NUMPY_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.NUMPY_NIGHTLY_UPLOAD_TOKEN }}
+ # commented out so the sdist doesn't upload to nightly
+ # NUMPY_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.NUMPY_NIGHTLY_UPLOAD_TOKEN }}
run: |
+ conda install -y anaconda-client
source tools/wheels/upload_wheels.sh
set_upload_vars
# trigger an upload to
diff --git a/.github/workflows/windows_meson.yml b/.github/workflows/windows_meson.yml
new file mode 100644
index 000000000..d4372bd3c
--- /dev/null
+++ b/.github/workflows/windows_meson.yml
@@ -0,0 +1,88 @@
+name: Test Meson build (Windows)
+
+on:
+ pull_request:
+ branches:
+ - main
+ - maintenance/**
+
+env:
+ PYTHON_VERSION: 3.11
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read # to fetch code (actions/checkout)
+
+jobs:
+ meson:
+ name: Meson windows build/test
+ runs-on: windows-2019
+ # if: "github.repository == 'numpy/numpy'"
+ steps:
+ - name: Checkout
+ uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
+ with:
+ submodules: recursive
+ fetch-depth: 0
+ - name: Setup Python
+ uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
+ with:
+ python-version: ${{ env.PYTHON_VERSION }}
+
+ - name: Install dependencies
+ run: |
+ pip install -r build_requirements.txt
+ - name: openblas-libs
+ run: |
+ # Download and install pre-built OpenBLAS library
+ # with 32-bit interfaces
+ # Unpack it in the pkg-config hardcoded path
+ choco install unzip -y
+ choco install wget -y
+ choco install -y --checksum 6004DF17818F5A6DBF19CB335CC92702 pkgconfiglite
+ wget https://anaconda.org/multibuild-wheels-staging/openblas-libs/v0.3.21/download/openblas-v0.3.21-win_amd64-gcc_10_3_0.zip
+ unzip -d c:\opt openblas-v0.3.21-win_amd64-gcc_10_3_0.zip
+ echo "PKG_CONFIG_PATH=c:\opt\64\lib\pkgconfig;" >> $env:GITHUB_ENV
+ - name: meson-configure
+ run: |
+ meson setup build --prefix=$PWD\build-install -Ddebug=false --optimization 2 --vsenv
+ - name: meson-build
+ run: |
+ meson compile -C build -v
+
+ - name: meson-install
+ run: |
+ cd build
+ meson install --no-rebuild
+ - name: build-path
+ run: |
+ echo "installed_path=$PWD\build-install\Lib\site-packages" >> $env:GITHUB_ENV
+ - name: post-install
+ run: |
+ $numpy_path = "${env:installed_path}\numpy"
+ $libs_path = "${numpy_path}\.libs"
+ mkdir ${libs_path}
+ $ob_path = "C:/opt/64/bin/"
+ cp $ob_path/*.dll $libs_path
+ # Write _distributor_init.py to load .libs DLLs.
+ python -c "from tools import openblas_support; openblas_support.make_init(r'${numpy_path}')"
+
+ - name: prep-test
+ run: |
+ echo "PYTHONPATH=${env:installed_path}" >> $env:GITHUB_ENV
+ python -m pip install -r test_requirements.txt
+ python -m pip install threadpoolctl
+
+ - name: test
+ run: |
+ mkdir tmp
+ cd tmp
+ echo "============================================"
+ python -c "import numpy; print(numpy.show_runtime())"
+ echo "============================================"
+ echo "LASTEXITCODE is '$LASTEXITCODE'"
+ python -c "import numpy, sys; sys.exit(numpy.test(verbose=3) is False)"
+ echo "LASTEXITCODE is '$LASTEXITCODE'"
diff --git a/.gitignore b/.gitignore
index 9851fcc77..e5784971e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -132,8 +132,6 @@ numpy/core/include/numpy/__ufunc_api.c
numpy/core/include/numpy/__umath_generated.c
numpy/core/include/numpy/_umath_doc_generated.h
numpy/core/include/numpy/config.h
-numpy/core/include/numpy/multiarray_api.txt
-numpy/core/include/numpy/ufunc_api.txt
numpy/core/lib/
numpy/core/src/common/npy_sort.h
numpy/core/src/common/templ_common.h
@@ -181,6 +179,7 @@ benchmarks/html
benchmarks/env
benchmarks/numpy
benchmarks/_asv_compare.conf.json
+test.obj
# cythonized files
cythonize.dat
numpy/random/_mtrand/_mtrand.c
@@ -211,6 +210,7 @@ tools/swig/test/Array.py
*.dispatch.h
# wrapped sources of dispatched targets, e.g. *.dispatch.avx2.c
*.dispatch.*.c
+*.dispatch.*.cpp
# _simd module
numpy/core/src/_simd/_simd.dispatch.c
numpy/core/src/_simd/_simd_data.inc
@@ -218,8 +218,10 @@ numpy/core/src/_simd/_simd_inc.h
# umath module
numpy/core/src/umath/loops_unary.dispatch.c
numpy/core/src/umath/loops_unary_fp.dispatch.c
+numpy/core/src/umath/loops_unary_fp_le.dispatch.c
numpy/core/src/umath/loops_arithm_fp.dispatch.c
numpy/core/src/umath/loops_arithmetic.dispatch.c
+numpy/core/src/umath/loops_logical.dispatch.c
numpy/core/src/umath/loops_minmax.dispatch.c
numpy/core/src/umath/loops_trigonometric.dispatch.c
numpy/core/src/umath/loops_exponent_log.dispatch.c
@@ -227,9 +229,8 @@ numpy/core/src/umath/loops_umath_fp.dispatch.c
numpy/core/src/umath/loops_hyperbolic.dispatch.c
numpy/core/src/umath/loops_modulo.dispatch.c
numpy/core/src/umath/loops_comparison.dispatch.c
-# npysort module
-numpy/core/src/npysort/x86-qsort.dispatch.c
-numpy/core/src/npysort/x86-qsort.dispatch.*.cpp
+numpy/core/src/umath/loops_unary_complex.dispatch.c
+numpy/core/src/umath/loops_autovec.dispatch.c
# multiarray module
numpy/core/src/multiarray/argfunc.dispatch.c
numpy/core/src/multiarray/arraytypes.h
diff --git a/.gitmodules b/.gitmodules
index 1ea274daf..d849a3caf 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
[submodule "numpy/core/src/umath/svml"]
path = numpy/core/src/umath/svml
url = https://github.com/numpy/SVML.git
+[submodule "numpy/core/src/npysort/x86-simd-sort"]
+ path = numpy/core/src/npysort/x86-simd-sort
+ url = https://github.com/intel/x86-simd-sort
diff --git a/.gitpod.yml b/.gitpod.yml
deleted file mode 100644
index c46752f10..000000000
--- a/.gitpod.yml
+++ /dev/null
@@ -1,65 +0,0 @@
-# Rebuilding NumPy on init - rather than on prebuild: this ensures
-# that even forks do have a usable freshly built NumPy
-# Might delegate this later to prebuild with Q2 improvements on gitpod
-# https://www.gitpod.io/docs/config-start-tasks/#configuring-the-terminal
-# -------------------------------------------------------------------------
-
-image: numpy/numpy-gitpod:latest
-tasks:
- - name: Prepare development environment
- init: |
- mkdir -p .vscode
- cp tools/gitpod/settings.json .vscode/settings.json
- rm -f /workspace/numpy/.git/shallow.lock
- conda activate numpy-dev
- git pull --unshallow # need to force this else the prebuild fails
- git fetch --tags
- python setup.py build_ext --inplace
- echo "🛠 Completed rebuilding NumPy!! 🛠 "
- echo "📖 Building docs 📖 "
- cd doc
- make html
- echo "✨ Pre-build complete! You can close this terminal ✨ "
-
-# --------------------------------------------------------
-# exposing ports for liveserve
-ports:
- - port: 5500
- onOpen: notify
-
-# --------------------------------------------------------
-# some useful extensions to have
-vscode:
- extensions:
- - eamodio.gitlens
- - njpwerner.autodocstring
- - lextudio.restructuredtext
- - ritwickdey.liveserver
- - ms-python.python
- - yzhang.markdown-all-in-one
- - bungcip.better-toml
- - mhutchie.git-graph
-
-# --------------------------------------------------------
-# using prebuilds for the container - note: atm this only
-# works for the NumPy repo
-# With this configuration the prebuild will happen on push to master
-github:
- prebuilds:
- # enable for main/default branch
- master: true
- # enable for other branches (defaults to false)
- branches: false
- # enable for pull requests coming from this repo (defaults to true)
- pullRequests: false
- # enable for pull requests coming from forks (defaults to false)
- pullRequestsFromForks: false
- # add a check to pull requests (defaults to true)
- addCheck: false
- # add a "Review in Gitpod" button as a comment to pull requests (defaults to false)
- addComment: false
- # add a "Review in Gitpod" button to the pull request's description (defaults to false)
- addBadge: false
- # add a label once the prebuild is ready to pull requests (defaults to false)
- addLabel: false
- \ No newline at end of file
diff --git a/.hadolint.yaml b/.hadolint.yaml
deleted file mode 100644
index 0188ba2cf..000000000
--- a/.hadolint.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
----
-ignored:
- - DL3006
- - DL3008
- - SC2016
- - DL3004
- - DL3007 \ No newline at end of file
diff --git a/.mailmap b/.mailmap
index 8ff3c21e8..fd89ecd65 100644
--- a/.mailmap
+++ b/.mailmap
@@ -8,11 +8,14 @@
# This file is up-to-date if the command git log --format="%aN <%aE>" | sort -u
# gives no duplicates.
+@amagicmuffin <2014wcheng@gmail.com>
@8bitmp3 <19637339+8bitmp3@users.noreply.github.com>
+@dg3192 <113710955+dg3192@users.noreply.github.com>
@DWesl <22566757+DWesl@users.noreply.github.com>
@Endolith <endolith@gmail.com>
@GalaxySnail <ylc991@163.com>
@Illviljan <14371165+Illviljan@users.noreply.github.com>
+@juztamau5 <juztamau5@gmail.com>
@LSchroefl <65246829+LSchroefl@users.noreply.github.com>
@Lbogula <bogulala7@gmail.com>
@Lisa <34400837+lyzlisa@users.noreply.github.com>
@@ -34,6 +37,7 @@
@yetanothercheer <yetanothercheer@protonmail.com>
Aaron Baecker <abaecker@localhost>
Aarthi Agurusa <agurusa@gmail.com>
+Adarsh Singh <adarshsng090@gmail.com> ADARSH SINGH <adarshsng090@gmail.com>
Andrei Batomunkuev <abatomunkuev@myseneca.ca>
Ajay DS <turingman@protonmail.com>
Ajay DS <turingman@protonmail.com> <ajayds2001@gmail.com>
@@ -49,13 +53,16 @@ Aerik Pawson <45904740+aerikpawson@users.noreply.github.com>
Ahmet Can Solak <asolak14@ku.edu.tr>
Amrit Krishnan <amrit110@gmail.com>
Amrit Krishnan <amrit110@gmail.com> <amritk@vectorinstitute.ai>
+Alban Desmaison <desmaison.alban@gmail.com>
Albert Jornet Puig <albert.jornet@ic3.cat>
Alberto Rubiales <arubiales11@gmail.com>
+Ales Crocaro <alescrocaro@gmail.com>
Alex Rockhill <aprockhill206@gmail.com>
Alex Griffing <argriffi@ncsu.edu>
Alex Griffing <argriffi@ncsu.edu> <argriffing@gmail.com>
Alex Griffing <argriffi@ncsu.edu> <argriffing@users.noreply.github.com>
Alex Henrie <alexhenrie24@gmail.com> <alex.henrie@utah.edu>
+Alex Low <aleksandrosansan@gmail.com>
Alex Rogozhnikov <iamfullofspam@gmail.com> <arogozhnikov@users.noreply.github.com>
Alex Thomas <alexthomas93@users.noreply.github.com>
Alexander Belopolsky <abalkin@enlnt.com>
@@ -115,6 +122,9 @@ Bertrand Lefebvre <bertrand.l3f@gmail.com>
Bharat Raghunathan <bharatraghunthan9767@gmail.com>
Bharat Raghunathan <bharatraghunthan9767@gmail.com> <bharatr@symphonyai.com>
Bob Eldering <eldering@jive.eu>
+Bram Ton <bram@cbbg.nl>
+Bram Ton <bram@cbbg.nl> <b.t.ton@saxion.nl>
+Bram Ton <bram@cbbg.nl> <github@cbbg.nl>
Brent Brewington <brent.brewington@gmail.com>
Brett R Murphy <bmurphy@enthought.com>
Brian Soto <iambriansoto@gmail.com>
@@ -122,6 +132,7 @@ Brian Soto <iambriansoto@gmail.com> <Iamsoto@users.noreply.github.com>
Brian Soto <iambriansoto@gmail.com> <theintrocode@gmail.com>
Brigitta Sipőcz <bsipocz@gmail.com>
Brigitta Sipőcz <bsipocz@gmail.com> <b.sipocz@gmail.com>
+Brock Mendel <jbrockmendel@gmail.com>
Bryan Van de Ven <bryanv@continuum.io> Bryan Van de Ven <bryan@Laptop-3.local>
Bryan Van de Ven <bryanv@continuum.io> Bryan Van de Ven <bryan@laptop.local>
Brénainn Woodsend <bwoodsend@gmail.com>
@@ -163,6 +174,7 @@ Daniel Rasmussen <daniel.rasmussen@appliedbrainresearch.com>
Daniel G. A. Smith <dgasmith@icloud.com>
Daniel G. A. Smith <dgasmith@icloud.com> <malorian@me.com>
Dario Mory <daaawx@gmail.com>
+Dave Goel <deego3@gmail.com>
David Badnar <bdvd001@gmail.com>
David Cortes <david.cortes.rivera@gmail.com>
David Huard <david.huard@gmail.com> dhuard <dhuard@localhost>
@@ -179,6 +191,7 @@ Derek Homeier <derek@astro.physik.uni-goettingen.de> <dhomeie@gwdg.de>
Derek Homeier <derek@astro.physik.uni-goettingen.de> <derek@astro.phsik.uni-goettingen.de>
Derrick Williams <myutat@gmail.com>
Devin Shanahan <dshanahan88@gmail.com>
+Digya Acharya <digyaacharyaa@gmail.com>
Dima Pasechnik <dima@pasechnik.info>
Dima Pasechnik <dima@pasechnik.info> <dimpase@gmail.com>
Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com>
@@ -227,6 +240,7 @@ Gregory R. Lee <grlee77@gmail.com> <gregory.lee@cchmc.org>
Guo Ci <zguoci@gmail.com> guoci <zguoci@gmail.com>
Guo Shuai <gs0801@foxmail.com>
Hameer Abbasi <einstein.edison@gmail.com> <hameerabbasi@yahoo.com>
+Hannah Aizenman <story645@gmail.com>
Han Genuit <hangenuit@gmail.com>
Hanno Klemm <hanno.klemm@maerskoil.com> hklemm <hanno.klemm@maerskoil.com>
Harsh Mishra <erbeusgriffincasper@gmail.com>
@@ -243,6 +257,7 @@ Irina Maria Mocan <28827042+IrinaMaria@users.noreply.github.com>
Irvin Probst <irvin.probst@ensta-bretagne.fr>
Ivan Meleshko <vn.mlshk@gmail.com>
Isabela Presedo-Floyd <irpf.design@gmail.com> <ipresedo@calpoly.edu>
+Ganesh Kathiresan <ganesh3597@gmail.com>
Gerhard Hobler <gerhard.hobler@tuwien.ac.at>
Giannis Zapantis <sdi1900059@di.uoa.gr>
Guillaume Peillex <guillaume.peillex@gmail.com>
@@ -250,6 +265,7 @@ Jack J. Woehr <jwoehr@softwoehr.com>
Jaime Fernandez <jaime.frio@gmail.com>
Jaime Fernandez <jaime.frio@gmail.com> <jaime.fernandez@hp.com>
Jaime Fernandez <jaime.frio@gmail.com> <jaimefrio@google.com>
+Jake Close <jsclose@umich.edu>
Jakob Jakobson <jakobjakobson13@posteo.de>
Jakob Jakobson <jakobjakobson13@posteo.de> <43045863+jakobjakobson13@users.noreply.github.com>
James Bourbeau <jrbourbeau@gmail.com> <jrbourbeau@users.noreply.github.com>
@@ -269,7 +285,10 @@ Jérémie du Boisberranger <jeremie.du-boisberranger@inria.fr> jeremiedbb <34657
Jérome Eertmans <jeertmans@icloud.com>
Jérôme Richard <jeromerichard111@msn.com> <ubuntu@ip-172-31-17-195.eu-west-3.compute.internal>
Jerome Kelleher <jerome.kelleher@ed.ac.uk>
+Jessé Pires <jesserocha@alunos.utfpr.edu.br>
Jessi J Zhao <35235453+jessijzhao@users.noreply.github.com>
+João Fontes Gonçalves <jfontesgoncalves@gmail.com>
+Johnathon Cusick <jonathan.cusick09@gmail.com>
Jhong-Ken Chen (陳仲肯) <kenny.kuo.fs@gmail.com>
Jhong-Ken Chen (陳仲肯) <kenny.kuo.fs@gmail.com> <37182101+kennychenfs@users.noreply.github.com>
Johannes Hampp <johannes.hampp@zeu.uni-giessen.de> <42553970+euronion@users.noreply.github.com>
@@ -319,9 +338,11 @@ Lars Buitinck <larsmans@gmail.com> Lars Buitinck <l.buitinck@esciencecenter.nl>
Lars Buitinck <larsmans@gmail.com> Lars Buitinck <L.J.Buitinck@uva.nl>
Lars Grüter <lagru@mailbox.org>
Lars Grüter <lagru@mailbox.org> <lagru@users.noreply.github.com>
+Leona Taric <92495067+LeonaTaric@users.noreply.github.com>
Leonardus Chen <leonardus.chen@gmail.com>
Licht Takeuchi <licht-t@outlook.jp> <licht-t@math.dis.titech.ac.jp>
Lorenzo Mammana <mammanalorenzo@outlook.it> <lorenzom96@hotmail.it>
+Lillian Zha <lillianzha@gmail.com>
Luis Pedro Coelho <luis@luispedro.org> <lpc@cmu.edu>
Luke Zoltan Kelley <lkelley@cfa.harvard.edu>
Madhulika Jain Chambers <madhulikajain@gmail.com> <53166646+madhulikajc@users.noreply.github.com>
@@ -346,6 +367,7 @@ Martin Teichmann <martin.teichmann@xfel.eu> <lkb.teichmann@gmail.com>
Mary Conley <sleeplessinseattle.dev@gmail.com>
Masashi Kishimoto <drehbleistift@gmail.com>
Matheus Vieira Portela <matheus.v.portela@gmail.com>
+Matheus Santana Patriarca <matheussantanapatriarca2019@outlook.com>
Mathieu Lamarre <mlamarre@ea.com> <mathieu@vlam3d.com>
Matías Ríos <riosm@dickinson.edu>
Matt Ord <Matthew.ord1@gmail.com>
@@ -375,6 +397,7 @@ Michael Schnaitter <schnaitterm@knights.ucf.edu> <schnaitterm@users.noreply.gith
Michael Seifert <michaelseifert04@yahoo.de>
Michel Fruchart <michel.fruchart@ens-lyon.org> <fruchart@users.noreply.github.com>
Mike Toews <mwtoews@gmail.com>
+Miles Cranmer <miles.cranmer@gmail.com>
Mircea Akos Bruma <bruma.mircea.a@gmail.com>
Mircea Akos Bruma <bruma.mircea.a@gmail.com> <akos@debian-gnu-linux-vm.localdomain>
Mitchell Faas <Faas.Mitchell@gmail.com> <35742861+Mitchell-Faas@users.noreply.github.com>
@@ -391,6 +414,9 @@ Nicholas A. Del Grosso <delgrosso@bio.lmu.de> nickdg <delgrosso@bio.lmu.de>
Nicholas McKibben <nicholas.bgp@gmail.com>
Nick Minkyu Lee <mknicklee@protonmail.com> fivemok <9394929+fivemok@users.noreply.github.com>
Oliver Eberle <oliver_eberle@web.de>
+Oleksiy Kononenko <Oleksiy.S.Kononenko@gmail.com>
+Oleksiy Kononenko <Oleksiy.S.Kononenko@gmail.com> <35204136+oleksiyskononenko@users.noreply.github.com>
+Omar Ali <Omarh90@gmail.com>
Omid Rajaei <rajaei.net@gmail.com>
Omid Rajaei <rajaei.net@gmail.com> <89868505+rajaeinet@users.noreply.github.com>
Ondřej Čertík <ondrej.certik@gmail.com>
@@ -403,6 +429,8 @@ Paul Ivanov <pivanov5@bloomberg.net> <pi@berkeley.edu>
Paul Ivanov <pivanov5@bloomberg.net> <paul.ivanov@local>
Paul YS Lee <leeyspaul@gmail.com> Paul <leeyspaul@users.noreply.github.com>
Paul Jacobson <hpj3@myuw.net>
+Pey Lian Lim <lim@stsci.edu>
+Pey Lian Lim <lim@stsci.edu> <2090236+pllim@users.noreply.github.com>
Pearu Peterson <pearu.peterson@gmail.com> <pearu@pearu-laptop.(none)>
Pete Peeradej Tanruangporn <pete.tanru@gmail.com>
Peter Bell <peterbell10@live.co.uk>
@@ -447,18 +475,22 @@ Samesh Lakhotia <samesh.lakhotia@gmail.com>
Samesh Lakhotia <samesh.lakhotia@gmail.com> <43701530+sameshl@users.noreply.github.com>
Sami Salonen <ssalonen@gmail.com> <sami.salonen@eniram.fi>
Sanchez Gonzalez Alvaro <as12513@imperial.ac.uk>
+Sanya Sinha <83265366+ssanya942@users.noreply.github.com>
Saransh Chopra <saransh0701@gmail.com>
Saullo Giovani <saullogiovani@gmail.com>
Saurabh Mehta <e.samehta@gmail.com>
Sayantika Banik <sayantikabanik122@gmail.com>
+Schrijvers Luc <begasus@gmail.com>
Sebastian Berg <sebastian@sipsolutions.net>
Sebastian Schleehauf <slepton@posteo.de>
Serge Guelton <serge.guelton@telecom-bretagne.eu>
Sergei Vorfolomeev <svorfolomeev@vmssoftware.com> <39548292+vorfol@users.noreply.github.com>
+Shuangchi He <yulvchi@qq.com>
Shubham Gupta <mastershubham@gmail.com>
Shubham Gupta <mastershubham@gmail.com> <63910248+shubham11941140@users.noreply.github.com>
Shekhar Prasad Rajak <shekharrajak@live.com>
Shen Zhou <shen_zhou@u.nus.edu>
+Shreya Singh <shreyas1696@gmail.com>
Shota Kawabuchi <shota.kawabuchi+GitHub@gmail.com>
Siavash Eliasi <siavashserver@gmail.com>
Simon Conseil <contact@saimon.org> <simon.conseil@univ-lyon1.fr>
@@ -503,6 +535,7 @@ Travis Oliphant <teoliphant@gmail.com> <oliphant@enthought.com>
Travis Oliphant <teoliphant@gmail.com> <travis@continuum.io>
Valentin Haenel <valentin@haenel.co> <valentin.haenel@gmx.de>
Valentin Haenel <valentin@haenel.co> <vhaenel@anaconda.com>
+Vardhaman Kalloli <83634399+cyai@users.noreply.github.com>
Varun Nayyar <nayyarv@gmail.com> <nayyarv@users.noreply.github.com>
Vinith Kishore <vinith.kishore@unibas.ch>
Vinith Kishore <vinith.kishore@unibas.ch> <85550536+vinith2@users.noreply.github.com>
diff --git a/.travis.yml b/.travis.yml
index e67099f84..c10e20483 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -26,7 +26,7 @@ cache:
jobs:
include:
- - python: "3.8"
+ - python: "3.9"
os: linux
arch: ppc64le
env:
@@ -37,7 +37,7 @@ jobs:
# VSX4 still not supported by ubuntu/gcc-11
- EXPECT_CPU_FEATURES="VSX VSX2 VSX3"
- - python: "3.8"
+ - python: "3.9"
os: linux
arch: s390x
# fixes VX assembler ambiguous errors
diff --git a/22137.new_feature.rst b/22137.new_feature.rst
new file mode 100644
index 000000000..a095115ec
--- /dev/null
+++ b/22137.new_feature.rst
@@ -0,0 +1,8 @@
+Added `NPY_ENABLE_CPU_FEATURES` environment variable
+----------------------------------------------------
+
+Users may now choose to enable only a subset of the built CPU features at
+runtime by specifying the `NPY_ENABLE_CPU_FEATURES` environment variable.
+Note that these specified features must be outside the baseline, since those
+are always assumed. Errors will be raised if attempting to enable a feature
+that is either not supported by your CPU, or that NumPy was not built with. \ No newline at end of file
diff --git a/INSTALL.rst b/INSTALL.rst
index f136b7cfc..9ac3aa526 100644
--- a/INSTALL.rst
+++ b/INSTALL.rst
@@ -14,7 +14,7 @@ Prerequisites
Building NumPy requires the following installed software:
-1) Python__ 3.8.x or newer.
+1) Python__ 3.9.x or newer.
Please note that the Python development headers also need to be installed,
e.g., on Debian/Ubuntu one needs to install both `python3` and
diff --git a/LICENSE.txt b/LICENSE.txt
index e1fb614d6..014d51c98 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,4 @@
-Copyright (c) 2005-2022, NumPy Developers.
+Copyright (c) 2005-2023, NumPy Developers.
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/README.md b/README.md
index 02f8f17fb..c0e0b7388 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
<h1 align="center">
-<img src="/branding/logo/primary/numpylogo.svg" width="300">
+<img src="https://raw.githubusercontent.com/numpy/numpy/main/branding/logo/primary/numpylogo.svg" width="300">
</h1><br>
@@ -37,7 +37,7 @@ Testing:
NumPy requires `pytest` and `hypothesis`. Tests can then be run after installation with:
- python -c 'import numpy; numpy.test()'
+ python -c "import numpy, sys; sys.exit(numpy.test() is False)"
Code of Conduct
----------------------
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 1ceaf9a7a..e99ee1002 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -49,14 +49,16 @@ stages:
if ! `gcc 2>/dev/null`; then
sudo apt install gcc
fi
- sudo apt install python3
- sudo apt install python3-dev
+ sudo add-apt-repository ppa:deadsnakes/ppa -y
+ sudo apt install python3.9
+ sudo apt install python3.9-dev
+ sudo apt install python3.9-distutils
# python3 has no setuptools, so install one to get us going
- python3 -m pip install --user --upgrade pip 'setuptools<49.2.0'
- python3 -m pip install --user -r test_requirements.txt
+ python3.9 -m pip install --user --upgrade pip 'setuptools<49.2.0'
+ python3.9 -m pip install --user -r test_requirements.txt
displayName: 'install python/requirements'
- script: |
- python3 runtests.py --show-build-log --cpu-baseline=native --cpu-dispatch=none \
+ python3.9 runtests.py --show-build-log --cpu-baseline=native --cpu-dispatch=none \
--debug-info --mode=full -- -rsx --junitxml=junit/test-results.xml
displayName: 'Run native baseline Build / Tests'
- task: PublishTestResults@2
@@ -78,7 +80,7 @@ stages:
steps:
- task: UsePythonVersion@0
inputs:
- versionSpec: '3.8'
+ versionSpec: '3.9'
addToPath: true
architecture: 'x64'
- script: >-
@@ -91,7 +93,7 @@ stages:
displayName: 'Run Lint Checks'
failOnStderr: true
- - job: Linux_Python_38_32bit_full_with_asserts
+ - job: Linux_Python_39_32bit_full_with_asserts
pool:
vmImage: 'ubuntu-20.04'
steps:
@@ -104,7 +106,7 @@ stages:
/bin/bash -xc " \
git config --global --add safe.directory /numpy && \
cd /numpy && \
- /opt/python/cp38-cp38/bin/python -mvenv venv && \
+ /opt/python/cp39-cp39/bin/python -mvenv venv && \
source venv/bin/activate && \
target=\$(python3 tools/openblas_support.py) && \
cp -r \$target/lib/* /usr/lib && \
@@ -121,7 +123,7 @@ stages:
inputs:
testResultsFiles: '**/test-*.xml'
failTaskOnFailedTests: true
- testRunTitle: 'Publish test results for Python 3.8-32 bit full Linux'
+ testRunTitle: 'Publish test results for Python 3.9-32 bit full Linux'
- job: macOS
@@ -130,14 +132,17 @@ stages:
strategy:
maxParallel: 3
matrix:
- Python38:
- PYTHON_VERSION: '3.8'
+ Python39:
+ PYTHON_VERSION: '3.9'
USE_OPENBLAS: '1'
- Python38-ILP64:
- PYTHON_VERSION: '3.8'
+ Python39-ILP64:
+ PYTHON_VERSION: '3.9'
NPY_USE_BLAS_ILP64: '1'
USE_OPENBLAS: '1'
steps:
+ - script: |
+ git submodule update --init
+ displayName: 'Fetch submodules'
# the @0 refers to the (major) version of the *task* on Microsoft's
# end, not the order in the build matrix nor anything to do
# with version of Python selected
@@ -182,6 +187,9 @@ stages:
- script: /bin/bash -c "! vulture . --min-confidence 100 --exclude doc/,numpy/distutils/ | grep 'unreachable'"
displayName: 'Check for unreachable code paths in Python modules'
+ - script: git submodule update --init
+ displayName: 'Fetch submodules'
+
# prefer usage of clang over gcc proper
# to match likely scenario on many user mac machines
- script: python setup.py build -j 4 build_src --verbose-cfg install
@@ -234,38 +242,39 @@ stages:
inputs:
testResultsFiles: '**/test-*.xml'
failTaskOnFailedTests: true
- testRunTitle: 'Publish test results for Python 3.8 64-bit full Mac OS'
+ testRunTitle: 'Publish test results for Python 3.9 64-bit full Mac OS'
- job: Windows
pool:
vmImage: 'windows-2019'
strategy:
- maxParallel: 6
+ maxParallel: 5
matrix:
- Python38-32bit-fast:
- PYTHON_VERSION: '3.8'
+ Python39-32bit-full:
+ PYTHON_VERSION: '3.9'
PYTHON_ARCH: 'x86'
- TEST_MODE: fast
+ TEST_MODE: full
BITS: 32
- Python38-64bit-full:
- PYTHON_VERSION: '3.8'
+ Python310-64bit-fast:
+ PYTHON_VERSION: '3.10'
PYTHON_ARCH: 'x64'
- TEST_MODE: full
+ TEST_MODE: fast
BITS: 64
- Python39-32bit-fast:
- PYTHON_VERSION: '3.9'
+ Python311-32bit-fast:
+ PYTHON_VERSION: '3.11'
PYTHON_ARCH: 'x86'
TEST_MODE: fast
BITS: 32
- Python39-64bit-full:
- PYTHON_VERSION: '3.9'
+ Python311-64bit-full:
+ PYTHON_VERSION: '3.11'
PYTHON_ARCH: 'x64'
TEST_MODE: full
BITS: 64
NPY_USE_BLAS_ILP64: '1'
- PyPy38-64bit-fast:
- PYTHON_VERSION: 'PyPy'
+
+ PyPy39-64bit-fast:
+ PYTHON_VERSION: 'pypy3.9'
PYTHON_ARCH: 'x64'
TEST_MODE: fast
BITS: 64
diff --git a/azure-steps-windows.yml b/azure-steps-windows.yml
index 8fe677eab..0e2cd6cba 100644
--- a/azure-steps-windows.yml
+++ b/azure-steps-windows.yml
@@ -1,29 +1,11 @@
steps:
+- script: git submodule update --init
+ displayName: 'Fetch submodules'
- task: UsePythonVersion@0
inputs:
versionSpec: $(PYTHON_VERSION)
addToPath: true
architecture: $(PYTHON_ARCH)
- condition: not(contains(variables['PYTHON_VERSION'], 'PyPy'))
-- powershell: |
- # UsePythonVersion only supports pypy3.6, we need 3.8
- # https://github.com/microsoft/azure-pipelines-tasks/pull/15553
- $url = "https://downloads.python.org/pypy/pypy3.8-v7.3.9-win64.zip"
- $output = "pypy.zip"
- $wc = New-Object System.Net.WebClient
- $wc.DownloadFile($url, $output)
- echo "downloaded $url to $output"
- mkdir pypy3
- Expand-Archive $output -DestinationPath pypy3
- # move pypy3/pypy-c-*/* pypy3
- move pypy3/pypy*/* pypy3
- $pypypath = Join-Path (Get-Item .).FullName pypy3
- $env:Path = $pypypath + ";" + $env:Path
- setx PATH $env:Path
- python -mensurepip
- echo "##vso[task.prependpath]$pypypath"
- condition: contains(variables['PYTHON_VERSION'], 'PyPy')
- displayName: "Install PyPy3.8 "
- script: python -m pip install --upgrade pip wheel
displayName: 'Install tools'
@@ -32,8 +14,9 @@ steps:
displayName: 'Install dependencies; some are optional to avoid test skips'
- powershell: |
- choco install -y rtools
- refreshenv
+ # rtools 42+ does not support 32 bits builds.
+ choco install --confirm --no-progress --side-by-side rtools --version=4.0.0.20220206
+ echo "##vso[task.setvariable variable=RTOOLS40_HOME]c:\rtools40"
displayName: 'Install rtools'
- powershell: |
diff --git a/benchmarks/README.rst b/benchmarks/README.rst
index 2700e95e7..ef841a818 100644
--- a/benchmarks/README.rst
+++ b/benchmarks/README.rst
@@ -22,8 +22,8 @@ By default, `asv` ships with support for anaconda and virtualenv::
pip install asv
pip install virtualenv
-After contributing new benchmarks, you should test them locally
-before submitting a pull request.
+After contributing new benchmarks, you should test them locally before
+submitting a pull request.
To run all benchmarks, navigate to the root NumPy directory at
the command line and execute::
@@ -31,11 +31,21 @@ the command line and execute::
python runtests.py --bench
where ``--bench`` activates the benchmark suite instead of the
-test suite. This builds NumPy and runs all available benchmarks
+test suite. This builds NumPy and runs all available benchmarks
defined in ``benchmarks/``. (Note: this could take a while. Each
benchmark is run multiple times to measure the distribution in
execution times.)
+For **testing** benchmarks locally, it may be better to run these without
+replications::
+
+ cd benchmarks/
+ export REGEXP="bench.*Ufunc"
+ asv run --dry-run --show-stderr --python=same --quick -b $REGEXP
+
+Where the regular expression used to match benchmarks is stored in ``$REGEXP``,
+and ``--quick`` is used to avoid repetitions.
+
To run benchmarks from a particular benchmark module, such as
``bench_core.py``, simply append the filename without the extension::
@@ -69,6 +79,27 @@ Command-line help is available as usual via ``asv --help`` and
.. _ASV documentation: https://asv.readthedocs.io/
+Benchmarking versions
+---------------------
+
+To benchmark or visualize only releases on different machines locally, first generate the tags with their commits, then run them with ``asv``, that is::
+
+ cd benchmarks
+ # Get commits for tags
+ # delete tag_commits.txt before re-runs
+ for gtag in $(git tag --list --sort taggerdate | grep "^v"); do
+ git log $gtag --oneline -n1 --decorate=no | awk '{print $1;}' >> tag_commits.txt
+ done
+ # Use the last 20
+ tail --lines=20 tag_commits.txt > 20_vers.txt
+ asv run HASHFILE:20_vers.txt
+ # Publish and view
+ asv publish
+ asv preview
+
+For details on contributing these, see the `benchmark results repository`_.
+
+.. _benchmark results repository: https://github.com/HaoZeke/asv-numpy
Writing benchmarks
------------------
diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
index b60135524..267450448 100644
--- a/benchmarks/asv.conf.json
+++ b/benchmarks/asv.conf.json
@@ -43,7 +43,8 @@
// version.
"matrix": {
"Cython": [],
- "setuptools": ["59.2.0"]
+ "setuptools": ["59.2.0"],
+ "packaging": []
},
// The directory (relative to the current directory) that benchmarks are
diff --git a/benchmarks/asv_compare.conf.json.tpl b/benchmarks/asv_compare.conf.json.tpl
index 01f4e41de..f0ef0bf49 100644
--- a/benchmarks/asv_compare.conf.json.tpl
+++ b/benchmarks/asv_compare.conf.json.tpl
@@ -47,7 +47,8 @@
// version.
"matrix": {
"Cython": [],
- "setuptools": ["59.2.0"]
+ "setuptools": ["59.2.0"],
+ "packaging": []
},
// The directory (relative to the current directory) that benchmarks are
diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py
index 7b9f1d3e6..35fc87eac 100644
--- a/benchmarks/benchmarks/__init__.py
+++ b/benchmarks/benchmarks/__init__.py
@@ -26,7 +26,7 @@ def dirty_lock(lock_name, lock_on_count=1):
lock_path = os.path.abspath(os.path.join(
os.path.dirname(__file__), "..", "env", lock_name)
)
- # ASV load the 'benchmark_dir' to discovering the available benchmarks
+ # ASV loads the 'benchmark_dir' to discover the available benchmarks
# the issue here is ASV doesn't capture any strings from stdout or stderr
# during this stage so we escape it and lock on the second increment
try:
diff --git a/benchmarks/benchmarks/bench_core.py b/benchmarks/benchmarks/bench_core.py
index 4fcd7ace5..fe1cd37b6 100644
--- a/benchmarks/benchmarks/bench_core.py
+++ b/benchmarks/benchmarks/bench_core.py
@@ -45,6 +45,12 @@ class Core(Benchmark):
def time_array_l_view(self):
np.array(self.l_view)
+ def time_can_cast(self):
+ np.can_cast(self.l10x10, self.float64_dtype)
+
+ def time_can_cast_same_kind(self):
+ np.can_cast(self.l10x10, self.float64_dtype, casting="same_kind")
+
def time_vstack_l(self):
np.vstack(self.l)
@@ -66,6 +72,9 @@ class Core(Benchmark):
def time_empty_100(self):
np.empty(100)
+ def time_empty_like(self):
+ np.empty_like(self.l10x10)
+
def time_eye_100(self):
np.eye(100)
@@ -206,13 +215,41 @@ class Indices(Benchmark):
def time_indices(self):
np.indices((1000, 500))
-class VarComplex(Benchmark):
- params = [10**n for n in range(0, 9)]
- def setup(self, n):
- self.arr = np.random.randn(n) + 1j * np.random.randn(n)
- def teardown(self, n):
- del self.arr
+class StatsMethods(Benchmark):
+ # Not testing, but in array_api (redundant)
+ # 8, 16, 32 bit variants, and 128 complexes
+ params = [['int64', 'uint64', 'float64', 'intp',
+ 'complex64', 'bool', 'float', 'int',
+ 'complex', 'complex256'],
+ [100**n for n in range(0, 2)]]
+ param_names = ['dtype', 'size']
+
+ def setup(self, dtype, size):
+ try:
+ self.data = np.ones(size, dtype=getattr(np, dtype))
+ except AttributeError: # builtins throw AttributeError after 1.20
+ self.data = np.ones(size, dtype=dtype)
+ if dtype.startswith('complex'):
+ self.data = np.random.randn(size) + 1j * np.random.randn(size)
+
+ def time_min(self, dtype, size):
+ self.data.min()
+
+ def time_max(self, dtype, size):
+ self.data.max()
+
+ def time_mean(self, dtype, size):
+ self.data.mean()
+
+ def time_std(self, dtype, size):
+ self.data.std()
+
+ def time_prod(self, dtype, size):
+ self.data.prod()
+
+ def time_var(self, dtype, size):
+ self.data.var()
- def time_var(self, n):
- self.arr.var()
+ def time_sum(self, dtype, size):
+ self.data.sum()
diff --git a/benchmarks/benchmarks/bench_creation.py b/benchmarks/benchmarks/bench_creation.py
new file mode 100644
index 000000000..3a577df7a
--- /dev/null
+++ b/benchmarks/benchmarks/bench_creation.py
@@ -0,0 +1,81 @@
+from .common import Benchmark, TYPES1
+
+import numpy as np
+
+
+class MeshGrid(Benchmark):
+ """ Benchmark meshgrid generation
+ """
+ params = [[16, 32],
+ [2, 3, 4],
+ ['ij', 'xy'], TYPES1]
+ param_names = ['size', 'ndims', 'ind', 'ndtype']
+ timeout = 10
+
+ def setup(self, size, ndims, ind, ndtype):
+ self.grid_dims = [(np.random.ranf(size)).astype(ndtype) for
+ x in range(ndims)]
+
+ def time_meshgrid(self, size, ndims, ind, ndtype):
+ np.meshgrid(*self.grid_dims, indexing=ind)
+
+
+class Create(Benchmark):
+ """ Benchmark for creation functions
+ """
+ # (64, 64), (128, 128), (256, 256)
+ # , (512, 512), (1024, 1024)
+ params = [[16, 32, 128, 256, 512,
+ (16, 16), (32, 32)],
+ ['C', 'F'],
+ TYPES1]
+ param_names = ['shape', 'order', 'npdtypes']
+ timeout = 10
+
+ def setup(self, shape, order, npdtypes):
+ values = get_squares_()
+ self.xarg = values.get(npdtypes)[0]
+
+ def time_full(self, shape, order, npdtypes):
+ np.full(shape, self.xarg[1], dtype=npdtypes, order=order)
+
+ def time_full_like(self, shape, order, npdtypes):
+ np.full_like(self.xarg, self.xarg[0], order=order)
+
+ def time_ones(self, shape, order, npdtypes):
+ np.ones(shape, dtype=npdtypes, order=order)
+
+ def time_ones_like(self, shape, order, npdtypes):
+ np.ones_like(self.xarg, order=order)
+
+ def time_zeros(self, shape, order, npdtypes):
+ np.zeros(shape, dtype=npdtypes, order=order)
+
+ def time_zeros_like(self, shape, order, npdtypes):
+ np.zeros_like(self.xarg, order=order)
+
+ def time_empty(self, shape, order, npdtypes):
+ np.empty(shape, dtype=npdtypes, order=order)
+
+ def time_empty_like(self, shape, order, npdtypes):
+ np.empty_like(self.xarg, order=order)
+
+
+class UfuncsFromDLP(Benchmark):
+ """ Benchmark for creation functions
+ """
+ params = [[16, 32, (16, 16),
+ (32, 32), (64, 64)],
+ TYPES1]
+ param_names = ['shape', 'npdtypes']
+ timeout = 10
+
+ def setup(self, shape, npdtypes):
+ if npdtypes in ['longdouble', 'clongdouble']:
+ raise NotImplementedError(
+ 'Only IEEE dtypes are supported')
+ values = get_squares_()
+ self.xarg = values.get(npdtypes)[0]
+
+ def time_from_dlpack(self, shape, npdtypes):
+ np.from_dlpack(self.xarg)
diff --git a/benchmarks/benchmarks/bench_function_base.py b/benchmarks/benchmarks/bench_function_base.py
index 2e44ff76b..cc37bef39 100644
--- a/benchmarks/benchmarks/bench_function_base.py
+++ b/benchmarks/benchmarks/bench_function_base.py
@@ -248,7 +248,7 @@ class Sort(Benchmark):
# In NumPy 1.17 and newer, 'merge' can be one of several
# stable sorts, it isn't necessarily merge sort.
['quick', 'merge', 'heap'],
- ['float64', 'int64', 'float32', 'uint32', 'int32', 'int16'],
+ ['float64', 'int64', 'float32', 'uint32', 'int32', 'int16', 'float16'],
[
('random',),
('ordered',),
diff --git a/benchmarks/benchmarks/bench_io.py b/benchmarks/benchmarks/bench_io.py
index 357adbb87..e316d07f3 100644
--- a/benchmarks/benchmarks/bench_io.py
+++ b/benchmarks/benchmarks/bench_io.py
@@ -1,7 +1,7 @@
-from .common import Benchmark, get_squares
+from .common import Benchmark, get_squares, get_squares_
import numpy as np
-from io import StringIO
+from io import SEEK_SET, StringIO, BytesIO
class Copy(Benchmark):
@@ -67,6 +67,15 @@ class Savez(Benchmark):
np.savez('tmp.npz', **self.squares)
+class LoadNpyOverhead(Benchmark):
+ def setup(self):
+ self.buffer = BytesIO()
+ np.save(self.buffer, get_squares_()['float32'])
+
+ def time_loadnpy_overhead(self):
+ self.buffer.seek(0, SEEK_SET)
+ np.load(self.buffer)
+
class LoadtxtCSVComments(Benchmark):
# benchmarks for np.loadtxt comment handling
# when reading in CSV files
diff --git a/benchmarks/benchmarks/bench_itemselection.py b/benchmarks/benchmarks/bench_itemselection.py
index 518258a8f..46a39372c 100644
--- a/benchmarks/benchmarks/bench_itemselection.py
+++ b/benchmarks/benchmarks/bench_itemselection.py
@@ -7,7 +7,7 @@ class Take(Benchmark):
params = [
[(1000, 1), (1000, 2), (2, 1000, 1), (1000, 3)],
["raise", "wrap", "clip"],
- TYPES1]
+ TYPES1 + ["O", "i,O"]]
param_names = ["shape", "mode", "dtype"]
def setup(self, shape, mode, dtype):
@@ -21,7 +21,7 @@ class Take(Benchmark):
class PutMask(Benchmark):
params = [
[True, False],
- TYPES1]
+ TYPES1 + ["O", "i,O"]]
param_names = ["values_is_scalar", "dtype"]
def setup(self, values_is_scalar, dtype):
@@ -41,3 +41,21 @@ class PutMask(Benchmark):
def time_sparse(self, values_is_scalar, dtype):
np.putmask(self.arr, self.sparse_mask, self.vals)
+
+class Put(Benchmark):
+ params = [
+ [True, False],
+ TYPES1 + ["O", "i,O"]]
+ param_names = ["values_is_scalar", "dtype"]
+
+ def setup(self, values_is_scalar, dtype):
+ if values_is_scalar:
+ self.vals = np.array(1., dtype=dtype)
+ else:
+ self.vals = np.ones(1000, dtype=dtype)
+
+ self.arr = np.ones(1000, dtype=dtype)
+ self.indx = np.arange(1000, dtype=np.intp)
+
+ def time_ordered(self, values_is_scalar, dtype):
+ np.put(self.arr, self.indx, self.vals)
diff --git a/benchmarks/benchmarks/bench_lib.py b/benchmarks/benchmarks/bench_lib.py
index b64f8ab17..f792116a6 100644
--- a/benchmarks/benchmarks/bench_lib.py
+++ b/benchmarks/benchmarks/bench_lib.py
@@ -132,11 +132,26 @@ class Unique(Benchmark):
# produce a randomly shuffled array with the
# approximate desired percentage np.nan content
base_array = np.random.uniform(size=array_size)
- base_array[base_array < percent_nans / 100.] = np.nan
+ n_nan = int(percent_nans * array_size)
+ nan_indices = np.random.choice(np.arange(array_size), size=n_nan)
+ base_array[nan_indices] = np.nan
self.arr = base_array
- def time_unique(self, array_size, percent_nans):
- np.unique(self.arr)
+ def time_unique_values(self, array_size, percent_nans):
+ np.unique(self.arr, return_index=False,
+ return_inverse=False, return_counts=False)
+
+ def time_unique_counts(self, array_size, percent_nans):
+ np.unique(self.arr, return_index=False,
+ return_inverse=False, return_counts=True)
+
+ def time_unique_inverse(self, array_size, percent_nans):
+ np.unique(self.arr, return_index=False,
+ return_inverse=True, return_counts=False)
+
+ def time_unique_all(self, array_size, percent_nans):
+ np.unique(self.arr, return_index=True,
+ return_inverse=True, return_counts=True)
class Isin(Benchmark):
diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py
index a94ba1139..b4e39b084 100644
--- a/benchmarks/benchmarks/bench_linalg.py
+++ b/benchmarks/benchmarks/bench_linalg.py
@@ -190,3 +190,27 @@ class Einsum(Benchmark):
# sum_of_products_contig_outstride0_one:non_contiguous arrays
def time_einsum_noncon_contig_outstride0(self, dtype):
np.einsum("i->", self.non_contiguous_dim1, optimize=True)
+
+
+class LinAlgTransposeVdot(Benchmark):
+ # Smaller for speed
+ # , (128, 128), (256, 256), (512, 512),
+ # (1024, 1024)
+ params = [[(16, 16), (32, 32),
+ (64, 64)], TYPES1]
+ param_names = ['shape', 'npdtypes']
+
+ def setup(self, shape, npdtypes):
+ self.xarg = np.random.uniform(-1, 1, np.dot(*shape)).reshape(shape)
+ self.xarg = self.xarg.astype(npdtypes)
+ self.x2arg = np.random.uniform(-1, 1, np.dot(*shape)).reshape(shape)
+ self.x2arg = self.x2arg.astype(npdtypes)
+ if npdtypes.startswith('complex'):
+ self.xarg += self.xarg.T*1j
+ self.x2arg += self.x2arg.T*1j
+
+ def time_transpose(self, shape, npdtypes):
+ np.transpose(self.xarg)
+
+ def time_vdot(self, shape, npdtypes):
+ np.vdot(self.xarg, self.x2arg)
diff --git a/benchmarks/benchmarks/bench_manipulate.py b/benchmarks/benchmarks/bench_manipulate.py
new file mode 100644
index 000000000..0a312479c
--- /dev/null
+++ b/benchmarks/benchmarks/bench_manipulate.py
@@ -0,0 +1,107 @@
+from .common import Benchmark, get_squares_, TYPES1, DLPACK_TYPES
+
+import numpy as np
+from collections import deque
+
+class BroadcastArrays(Benchmark):
+ params = [[(16, 32), (32, 64),
+ (64, 128), (128, 256),
+ (256, 512), (512, 1024)],
+ TYPES1]
+ param_names = ['shape', 'ndtype']
+ timeout = 10
+
+ def setup(self, shape, ndtype):
+ self.xarg = np.random.ranf(shape[0]*shape[1]).reshape(shape)
+ self.xarg = self.xarg.astype(ndtype)
+ if ndtype.startswith('complex'):
+ self.xarg += np.random.ranf(1)*1j
+
+ def time_broadcast_arrays(self, shape, ndtype):
+ np.broadcast_arrays(self.xarg, np.ones(1))
+
+
+class BroadcastArraysTo(Benchmark):
+ params = [[16, 32, 64, 128, 256, 512],
+ TYPES1]
+ param_names = ['size', 'ndtype']
+ timeout = 10
+
+ def setup(self, size, ndtype):
+ self.rng = np.random.default_rng()
+ self.xarg = self.rng.random(size)
+ self.xarg = self.xarg.astype(ndtype)
+ if ndtype.startswith('complex'):
+ self.xarg += self.rng.random(1)*1j
+
+ def time_broadcast_to(self, size, ndtype):
+ np.broadcast_to(self.xarg, (size, size))
+
+
+class ConcatenateStackArrays(Benchmark):
+ # (64, 128), (128, 256), (256, 512)
+ params = [[(16, 32), (32, 64)],
+ [2, 3, 4, 5],
+ TYPES1]
+ param_names = ['shape', 'narrays', 'ndtype']
+ timeout = 10
+
+ def setup(self, shape, narrays, ndtype):
+ self.xarg = [np.random.ranf(shape[0]*shape[1]).reshape(shape)
+ for x in range(narrays)]
+ self.xarg = [x.astype(ndtype) for x in self.xarg]
+ if ndtype.startswith('complex'):
+ [x + np.random.ranf(1)*1j for x in self.xarg]
+
+ def time_concatenate_ax0(self, size, narrays, ndtype):
+ np.concatenate(self.xarg, axis=0)
+
+ def time_concatenate_ax1(self, size, narrays, ndtype):
+ np.concatenate(self.xarg, axis=1)
+
+ def time_stack_ax0(self, size, narrays, ndtype):
+ np.stack(self.xarg, axis=0)
+
+ def time_stack_ax1(self, size, narrays, ndtype):
+ np.stack(self.xarg, axis=1)
+
+
+class DimsManipulations(Benchmark):
+ params = [
+ [(2, 1, 4), (2, 1), (5, 2, 3, 1)],
+ ]
+ param_names = ['shape']
+ timeout = 10
+
+ def setup(self, shape):
+ self.xarg = np.ones(shape=shape)
+ self.reshaped = deque(shape)
+ self.reshaped.rotate(1)
+ self.reshaped = tuple(self.reshaped)
+
+ def time_expand_dims(self, shape):
+ np.expand_dims(self.xarg, axis=1)
+
+ def time_expand_dims_neg(self, shape):
+ np.expand_dims(self.xarg, axis=-1)
+
+ def time_squeeze_dims(self, shape):
+ np.squeeze(self.xarg)
+
+ def time_flip_all(self, shape):
+ np.flip(self.xarg, axis=None)
+
+ def time_flip_one(self, shape):
+ np.flip(self.xarg, axis=1)
+
+ def time_flip_neg(self, shape):
+ np.flip(self.xarg, axis=-1)
+
+ def time_moveaxis(self, shape):
+ np.moveaxis(self.xarg, [0, 1], [-1, -2])
+
+ def time_roll(self, shape):
+ np.roll(self.xarg, 3)
+
+ def time_reshape(self, shape):
+ np.reshape(self.xarg, self.reshaped)
diff --git a/benchmarks/benchmarks/bench_reduce.py b/benchmarks/benchmarks/bench_reduce.py
index ca07bd180..040b5ca73 100644
--- a/benchmarks/benchmarks/bench_reduce.py
+++ b/benchmarks/benchmarks/bench_reduce.py
@@ -45,19 +45,40 @@ class AnyAll(Benchmark):
self.zeros.any()
-class MinMax(Benchmark):
- params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32,
- np.int64, np.uint64, np.float32, np.float64, np.intp]
+class StatsReductions(Benchmark):
+ # Not testing, but in array_api (redundant)
+ # 8, 16, 32 bit variants, and 128 complexes
+ params = ['int64', 'uint64', 'float64', 'intp',
+ 'complex64', 'bool', 'float', 'int',
+ 'complex', 'complex256'],
param_names = ['dtype']
def setup(self, dtype):
- self.d = np.ones(20000, dtype=dtype)
+ try:
+ self.data = np.ones(200, dtype=getattr(np, dtype))
+ except AttributeError: # builtins throw AttributeError after 1.20
+ self.data = np.ones(200, dtype=dtype)
+ if dtype.startswith('complex'):
+ self.data = self.data * self.data.T*1j
def time_min(self, dtype):
- np.min(self.d)
+ np.min(self.data)
def time_max(self, dtype):
- np.max(self.d)
+ np.max(self.data)
+
+ def time_mean(self, dtype):
+ np.mean(self.data)
+
+ def time_std(self, dtype):
+ np.std(self.data)
+
+ def time_prod(self, dtype):
+ np.prod(self.data)
+
+ def time_var(self, dtype):
+ np.var(self.data)
+
class FMinMax(Benchmark):
params = [np.float32, np.float64]
@@ -72,6 +93,7 @@ class FMinMax(Benchmark):
def time_max(self, dtype):
np.fmax.reduce(self.d)
+
class ArgMax(Benchmark):
params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32,
np.int64, np.uint64, np.float32, np.float64, bool]
@@ -83,6 +105,7 @@ class ArgMax(Benchmark):
def time_argmax(self, dtype):
np.argmax(self.d)
+
class ArgMin(Benchmark):
params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32,
np.int64, np.uint64, np.float32, np.float64, bool]
@@ -94,6 +117,7 @@ class ArgMin(Benchmark):
def time_argmin(self, dtype):
np.argmin(self.d)
+
class SmallReduction(Benchmark):
def setup(self):
self.d = np.ones(100, dtype=np.float32)
diff --git a/benchmarks/benchmarks/bench_scalar.py b/benchmarks/benchmarks/bench_scalar.py
index 650daa89d..638f66df5 100644
--- a/benchmarks/benchmarks/bench_scalar.py
+++ b/benchmarks/benchmarks/bench_scalar.py
@@ -65,3 +65,15 @@ class ScalarMath(Benchmark):
other + int32
other + int32
other + int32
+
+
+class ScalarStr(Benchmark):
+ # Test scalar to str conversion
+ params = [TYPES1]
+ param_names = ["type"]
+
+ def setup(self, typename):
+ self.a = np.array([100] * 100, dtype=typename)
+
+ def time_str_repr(self, typename):
+ res = [str(x) for x in self.a]
diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py
index 36d8621e8..f7c77d90c 100644
--- a/benchmarks/benchmarks/bench_ufunc.py
+++ b/benchmarks/benchmarks/bench_ufunc.py
@@ -1,6 +1,9 @@
-from .common import Benchmark, get_squares_
+from .common import Benchmark, get_squares_, TYPES1, DLPACK_TYPES
import numpy as np
+import itertools
+from packaging import version
+import operator
ufuncs = ['abs', 'absolute', 'add', 'arccos', 'arccosh', 'arcsin', 'arcsinh',
@@ -13,11 +16,13 @@ ufuncs = ['abs', 'absolute', 'add', 'arccos', 'arccosh', 'arcsin', 'arcsinh',
'isinf', 'isnan', 'isnat', 'lcm', 'ldexp', 'left_shift', 'less',
'less_equal', 'log', 'log10', 'log1p', 'log2', 'logaddexp',
'logaddexp2', 'logical_and', 'logical_not', 'logical_or',
- 'logical_xor', 'matmul', 'maximum', 'minimum', 'mod', 'modf', 'multiply',
- 'negative', 'nextafter', 'not_equal', 'positive', 'power',
- 'rad2deg', 'radians', 'reciprocal', 'remainder', 'right_shift',
- 'rint', 'sign', 'signbit', 'sin', 'sinh', 'spacing', 'sqrt',
- 'square', 'subtract', 'tan', 'tanh', 'true_divide', 'trunc']
+ 'logical_xor', 'matmul', 'maximum', 'minimum', 'mod', 'modf',
+ 'multiply', 'negative', 'nextafter', 'not_equal', 'positive',
+ 'power', 'rad2deg', 'radians', 'reciprocal', 'remainder',
+ 'right_shift', 'rint', 'sign', 'signbit', 'sin',
+ 'sinh', 'spacing', 'sqrt', 'square', 'subtract', 'tan', 'tanh',
+ 'true_divide', 'trunc']
+arrayfuncdisp = ['real', 'round']
for name in dir(np):
@@ -25,6 +30,30 @@ for name in dir(np):
print("Missing ufunc %r" % (name,))
+class ArrayFunctionDispatcher(Benchmark):
+ params = [arrayfuncdisp]
+ param_names = ['func']
+ timeout = 10
+
+ def setup(self, ufuncname):
+ np.seterr(all='ignore')
+ try:
+ self.afdn = getattr(np, ufuncname)
+ except AttributeError:
+ raise NotImplementedError()
+ self.args = []
+ for _, aarg in get_squares_().items():
+ arg = (aarg,) * 1 # no nin
+ try:
+ self.afdn(*arg)
+ except TypeError:
+ continue
+ self.args.append(arg)
+
+ def time_afdn_types(self, ufuncname):
+ [self.afdn(*arg) for arg in self.args]
+
+
class Broadcast(Benchmark):
def setup(self):
self.d = np.ones((50000, 100), dtype=np.float64)
@@ -34,6 +63,20 @@ class Broadcast(Benchmark):
self.d - self.e
+class At(Benchmark):
+ def setup(self):
+ rng = np.random.default_rng(1)
+ self.vals = rng.random(10_000_000, dtype=np.float64)
+ self.idx = rng.integers(1000, size=10_000_000).astype(np.intp)
+ self.res = np.zeros(1000, dtype=self.vals.dtype)
+
+ def time_sum_at(self):
+ np.add.at(self.res, self.idx, self.vals)
+
+ def time_maximum_at(self):
+ np.maximum.at(self.res, self.idx, self.vals)
+
+
class UFunc(Benchmark):
params = [ufuncs]
param_names = ['ufunc']
@@ -42,23 +85,179 @@ class UFunc(Benchmark):
def setup(self, ufuncname):
np.seterr(all='ignore')
try:
- self.f = getattr(np, ufuncname)
+ self.ufn = getattr(np, ufuncname)
except AttributeError:
raise NotImplementedError()
self.args = []
- for t, a in get_squares_().items():
- arg = (a,) * self.f.nin
+ for _, aarg in get_squares_().items():
+ arg = (aarg,) * self.ufn.nin
try:
- self.f(*arg)
+ self.ufn(*arg)
except TypeError:
continue
self.args.append(arg)
def time_ufunc_types(self, ufuncname):
- [self.f(*arg) for arg in self.args]
+ [self.ufn(*arg) for arg in self.args]
+
+
+class MethodsV0(Benchmark):
+ """ Benchmark for the methods which do not take any arguments
+ """
+ params = [['__abs__', '__neg__', '__pos__'], TYPES1]
+ param_names = ['methods', 'npdtypes']
+ timeout = 10
+
+ def setup(self, methname, npdtypes):
+ values = get_squares_()
+ self.xarg = values.get(npdtypes)[0]
+
+ def time_ndarray_meth(self, methname, npdtypes):
+ getattr(operator, methname)(self.xarg)
+
+
+class NDArrayLRShifts(Benchmark):
+ """ Benchmark for the shift methods
+ """
+ params = [['__lshift__', '__rshift__'],
+ ['intp', 'int8', 'int16',
+ 'int32', 'int64', 'uint8',
+ 'uint16', 'uint32', 'uint64']]
+ param_names = ['methods', 'npdtypes']
+ timeout = 10
+
+ def setup(self, methname, npdtypes):
+ self.vals = np.ones(1000,
+ dtype=getattr(np, npdtypes)) * \
+ np.random.randint(9)
+
+ def time_ndarray_meth(self, methname, npdtypes):
+ getattr(operator, methname)(*[self.vals, 2])
+
+
+class Methods0D(Benchmark):
+ """Zero dimension array methods
+ """
+ params = [['__bool__', '__complex__', '__invert__',
+ '__float__', '__int__'], TYPES1]
+ param_names = ['methods', 'npdtypes']
+ timeout = 10
+
+ def setup(self, methname, npdtypes):
+ self.xarg = np.array(3, dtype=npdtypes)
+ if (npdtypes.startswith('complex') and
+ methname in ['__float__', '__int__']) or \
+ (not npdtypes.startswith('int') and methname == '__invert__'):
+ # Skip
+ raise NotImplementedError
+
+ def time_ndarray__0d__(self, methname, npdtypes):
+ meth = getattr(self.xarg, methname)
+ meth()
+
+
+class MethodsV1(Benchmark):
+ """ Benchmark for the methods which take an argument
+ """
+ params = [['__and__', '__add__', '__eq__', '__floordiv__', '__ge__',
+ '__gt__', '__le__', '__lt__', '__matmul__',
+ '__mod__', '__mul__', '__ne__', '__or__',
+ '__pow__', '__sub__', '__truediv__', '__xor__'],
+ TYPES1]
+ param_names = ['methods', 'npdtypes']
+ timeout = 10
+
+ def setup(self, methname, npdtypes):
+ if (
+ npdtypes.startswith("complex")
+ and methname in ["__floordiv__", "__mod__"]
+ ) or (
+ not npdtypes.startswith("int")
+ and methname in ["__and__", "__or__", "__xor__"]
+ ):
+ raise NotImplementedError # skip
+ values = get_squares_().get(npdtypes)
+ self.xargs = [values[0], values[1]]
+
+ def time_ndarray_meth(self, methname, npdtypes):
+ getattr(operator, methname)(*self.xargs)
+
+
+class NDArrayGetItem(Benchmark):
+ param_names = ['margs', 'msize']
+ params = [[0, (0, 0), (-1, 0), [0, -1]],
+ ['small', 'big']]
+
+ def setup(self, margs, msize):
+ self.xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+ self.xl = np.random.uniform(-1, 1, 50*50).reshape(50, 50)
+
+ def time_methods_getitem(self, margs, msize):
+ if msize == 'small':
+ mdat = self.xs
+ elif msize == 'big':
+ mdat = self.xl
+ getattr(mdat, '__getitem__')(margs)
+
+
+class NDArraySetItem(Benchmark):
+ param_names = ['margs', 'msize']
+ params = [[0, (0, 0), (-1, 0), [0, -1]],
+ ['small', 'big']]
+
+ def setup(self, margs, msize):
+ self.xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+ self.xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+
+ def time_methods_setitem(self, margs, msize):
+ if msize == 'small':
+ mdat = self.xs
+ elif msize == 'big':
+ mdat = self.xl
+ mdat[margs] = 17
+
+
+class DLPMethods(Benchmark):
+ """ Benchmark for DLPACK helpers
+ """
+ params = [['__dlpack__', '__dlpack_device__'], DLPACK_TYPES]
+ param_names = ['methods', 'npdtypes']
+ timeout = 10
+
+ def setup(self, methname, npdtypes):
+ values = get_squares_()
+ if npdtypes == 'bool':
+ if version.parse(np.__version__) > version.parse("1.25"):
+ self.xarg = values.get('int16')[0].astype('bool')
+ else:
+ raise NotImplementedError("Not supported before v1.25")
+ else:
+ self.xarg = values.get(npdtypes)[0]
+
+ def time_ndarray_dlp(self, methname, npdtypes):
+ meth = getattr(self.xarg, methname)
+ meth()
+
+
+class NDArrayAsType(Benchmark):
+ """ Benchmark for type conversion
+ """
+ params = [list(itertools.combinations(TYPES1, 2))]
+ param_names = ['typeconv']
+ timeout = 10
+
+ def setup(self, typeconv):
+ if typeconv[0] == typeconv[1]:
+ raise NotImplementedError(
+ "Skipping test for converting to the same dtype")
+ self.xarg = get_squares_().get(typeconv[0])
+
+ def time_astype(self, typeconv):
+ self.xarg.astype(typeconv[1])
+
class UFuncSmall(Benchmark):
- """ Benchmark for a selection of ufuncs on a small arrays and scalars
+ """ Benchmark for a selection of ufuncs on small arrays and scalars
Since the arrays and scalars are small, we are benchmarking the overhead
of the numpy ufunc functionality
diff --git a/benchmarks/benchmarks/bench_ufunc_strides.py b/benchmarks/benchmarks/bench_ufunc_strides.py
index f80bf90f9..898cc0818 100644
--- a/benchmarks/benchmarks/bench_ufunc_strides.py
+++ b/benchmarks/benchmarks/bench_ufunc_strides.py
@@ -1,156 +1,181 @@
-from .common import Benchmark
+from .common import Benchmark, get_data
import numpy as np
-UNARY_UFUNCS = [obj for obj in np.core.umath.__dict__.values() if
- isinstance(obj, np.ufunc)]
-UNARY_OBJECT_UFUNCS = [uf for uf in UNARY_UFUNCS if "O->O" in uf.types]
-UNARY_OBJECT_UFUNCS.remove(getattr(np, 'invert'))
+UFUNCS = [obj for obj in np.core.umath.__dict__.values() if
+ isinstance(obj, np.ufunc)]
+UFUNCS_UNARY = [uf for uf in UFUNCS if "O->O" in uf.types]
-stride = [1, 2, 4]
-stride_out = [1, 2, 4]
-dtype = ['e', 'f', 'd']
-
-class Unary(Benchmark):
- params = [UNARY_OBJECT_UFUNCS, stride, stride_out, dtype]
- param_names = ['ufunc', 'stride_in', 'stride_out', 'dtype']
- timeout = 10
-
- def setup(self, ufuncname, stride, stride_out, dtype):
- np.seterr(all='ignore')
- try:
- self.f = ufuncname
- except AttributeError:
- raise NotImplementedError(f"No ufunc {ufuncname} found") from None
- N = 100000
- self.arr_out = np.empty(stride_out*N, dtype)
- self.arr = np.random.rand(stride*N).astype(dtype)
- if (ufuncname.__name__ == 'arccosh'):
- self.arr = 1.0 + self.arr
-
- def time_ufunc(self, ufuncname, stride, stride_out, dtype):
- self.f(self.arr[::stride], self.arr_out[::stride_out])
-
-class AVX_UFunc_log(Benchmark):
- params = [stride, dtype]
- param_names = ['stride', 'dtype']
- timeout = 10
-
- def setup(self, stride, dtype):
- np.seterr(all='ignore')
- N = 10000
- self.arr = np.array(np.random.random_sample(stride*N), dtype=dtype)
-
- def time_log(self, stride, dtype):
- np.log(self.arr[::stride])
-
-
-binary_ufuncs = [
- 'maximum', 'minimum', 'fmax', 'fmin'
-]
-binary_dtype = ['f', 'd']
-
-class Binary(Benchmark):
- param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype']
- params = [binary_ufuncs, stride, stride, stride_out, binary_dtype]
+class _AbstractBinary(Benchmark):
+ params = []
+ param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype']
timeout = 10
+ arrlen = 10000
+ data_finite = True
+ data_denormal = False
+ data_zeros = False
+
+ def setup(self, ufunc, stride_in0, stride_in1, stride_out, dtype):
+ ufunc_insig = f'{dtype}{dtype}->'
+ if ufunc_insig+dtype not in ufunc.types:
+ for st_sig in (ufunc_insig, dtype):
+ test = [sig for sig in ufunc.types if sig.startswith(st_sig)]
+ if test:
+ break
+ if not test:
+ raise NotImplementedError(
+ f"Ufunc {ufunc} doesn't support "
+ f"binary input of dtype {dtype}"
+ ) from None
+ tin, tout = test[0].split('->')
+ else:
+ tin = dtype + dtype
+ tout = dtype
+
+ self.ufunc_args = []
+ for i, (dt, stride) in enumerate(zip(tin, (stride_in0, stride_in1))):
+ self.ufunc_args += [get_data(
+ self.arrlen*stride, dt, i,
+ zeros=self.data_zeros,
+ finite=self.data_finite,
+ denormal=self.data_denormal,
+ )[::stride]]
+ for dt in tout:
+ self.ufunc_args += [
+ np.empty(stride_out*self.arrlen, dt)[::stride_out]
+ ]
- def setup(self, ufuncname, stride_in0, stride_in1, stride_out, dtype):
np.seterr(all='ignore')
- try:
- self.f = getattr(np, ufuncname)
- except AttributeError:
- raise NotImplementedError(f"No ufunc {ufuncname} found") from None
- N = 100000
- self.arr1 = np.array(np.random.rand(stride_in0*N), dtype=dtype)
- self.arr2 = np.array(np.random.rand(stride_in1*N), dtype=dtype)
- self.arr_out = np.empty(stride_out*N, dtype)
-
- def time_ufunc(self, ufuncname, stride_in0, stride_in1, stride_out, dtype):
- self.f(self.arr1[::stride_in0], self.arr2[::stride_in1],
- self.arr_out[::stride_out])
-
-binary_int_ufuncs = ['maximum', 'minimum']
-binary_int_dtype = ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q']
+ def time_binary(self, ufunc, stride_in0, stride_in1, stride_out,
+ dtype):
+ ufunc(*self.ufunc_args)
-class BinaryInt(Binary):
+ def time_binary_scalar_in0(self, ufunc, stride_in0, stride_in1,
+ stride_out, dtype):
+ ufunc(self.ufunc_args[0][0], *self.ufunc_args[1:])
- param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype']
- params = [binary_int_ufuncs, stride, stride, stride_out, binary_int_dtype]
-
-class AVX_ldexp(Benchmark):
-
- params = [dtype, stride]
- param_names = ['dtype', 'stride']
- timeout = 10
+ def time_binary_scalar_in1(self, ufunc, stride_in0, stride_in1,
+ stride_out, dtype):
+ ufunc(self.ufunc_args[0], self.ufunc_args[1][0], *self.ufunc_args[2:])
- def setup(self, dtype, stride):
- np.seterr(all='ignore')
- self.f = getattr(np, 'ldexp')
- N = 10000
- self.arr1 = np.array(np.random.rand(stride*N), dtype=dtype)
- self.arr2 = np.array(np.random.rand(stride*N), dtype='i')
-
- def time_ufunc(self, dtype, stride):
- self.f(self.arr1[::stride], self.arr2[::stride])
-
-cmplx_bfuncs = ['add',
- 'subtract',
- 'multiply',
- 'divide']
-cmplxstride = [1, 2, 4]
-cmplxdtype = ['F', 'D']
-
-class AVX_cmplx_arithmetic(Benchmark):
- params = [cmplx_bfuncs, cmplxstride, cmplxdtype]
- param_names = ['bfunc', 'stride', 'dtype']
- timeout = 10
-
- def setup(self, bfuncname, stride, dtype):
- np.seterr(all='ignore')
- try:
- self.f = getattr(np, bfuncname)
- except AttributeError:
- raise NotImplementedError(f"No bfunc {bfuncname} found") from None
- N = 10000
- self.arr1 = np.ones(stride*N, dtype)
- self.arr2 = np.ones(stride*N, dtype)
-
- def time_ufunc(self, bfuncname, stride, dtype):
- self.f(self.arr1[::stride], self.arr2[::stride])
-
-cmplx_ufuncs = ['reciprocal',
- 'absolute',
- 'square',
- 'conjugate']
-
-class AVX_cmplx_funcs(Benchmark):
- params = [cmplx_ufuncs, cmplxstride, cmplxdtype]
- param_names = ['bfunc', 'stride', 'dtype']
+class _AbstractUnary(Benchmark):
+ params = []
+ param_names = ['ufunc', 'stride_in', 'stride_out', 'dtype']
timeout = 10
+ arrlen = 10000
+ data_finite = True
+ data_denormal = False
+ data_zeros = False
+
+ def setup(self, ufunc, stride_in, stride_out, dtype):
+ arr_in = get_data(
+ stride_in*self.arrlen, dtype,
+ zeros=self.data_zeros,
+ finite=self.data_finite,
+ denormal=self.data_denormal,
+ )
+ self.ufunc_args = [arr_in[::stride_in]]
+
+ ufunc_insig = f'{dtype}->'
+ if ufunc_insig+dtype not in ufunc.types:
+ test = [sig for sig in ufunc.types if sig.startswith(ufunc_insig)]
+ if not test:
+ raise NotImplementedError(
+ f"Ufunc {ufunc} doesn't support "
+ f"unary input of dtype {dtype}"
+ ) from None
+ tout = test[0].split('->')[1]
+ else:
+ tout = dtype
+
+ for dt in tout:
+ self.ufunc_args += [
+ np.empty(stride_out*self.arrlen, dt)[::stride_out]
+ ]
- def setup(self, bfuncname, stride, dtype):
np.seterr(all='ignore')
- try:
- self.f = getattr(np, bfuncname)
- except AttributeError:
- raise NotImplementedError(f"No bfunc {bfuncname} found") from None
- N = 10000
- self.arr1 = np.ones(stride*N, dtype)
- def time_ufunc(self, bfuncname, stride, dtype):
- self.f(self.arr1[::stride])
+ def time_unary(self, ufunc, stride_in, stride_out, dtype):
+ ufunc(*self.ufunc_args)
+
+class UnaryFP(_AbstractUnary):
+ params = [UFUNCS_UNARY, [1, 2, 4], [1, 2, 4], ['e', 'f', 'd']]
+
+ def setup(self, ufunc, stride_in, stride_out, dtype):
+ _AbstractUnary.setup(self, ufunc, stride_in, stride_out, dtype)
+ if (ufunc.__name__ == 'arccosh'):
+ self.ufunc_args[0] += 1.0
+
+class UnaryFPSpecial(UnaryFP):
+ data_finite = False
+ data_denormal = True
+ data_zeros = True
+
+class BinaryFP(_AbstractBinary):
+ params = [
+ [np.maximum, np.minimum, np.fmax, np.fmin, np.ldexp],
+ [1, 2, 4], [1, 2, 4], [1, 2, 4], ['f', 'd']
+ ]
+
+class BinaryFPSpecial(BinaryFP):
+ data_finite = False
+ data_denormal = True
+ data_zeros = True
+
+class BinaryComplex(_AbstractBinary):
+ params = [
+ [np.add, np.subtract, np.multiply, np.divide],
+ [1, 2, 4], [1, 2, 4], [1, 2, 4],
+ ['F', 'D']
+ ]
+
+class UnaryComplex(_AbstractUnary):
+ params = [
+ [np.reciprocal, np.absolute, np.square, np.conjugate],
+ [1, 2, 4], [1, 2, 4], ['F', 'D']
+ ]
+
+class BinaryInt(_AbstractBinary):
+ arrlen = 100000
+ params = [
+ [np.maximum, np.minimum],
+ [1, 2], [1, 2], [1, 2],
+ ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q']
+ ]
+
+class BinaryIntContig(_AbstractBinary):
+ params = [
+ [getattr(np, uf) for uf in (
+ 'add', 'subtract', 'multiply', 'bitwise_and', 'bitwise_or',
+ 'bitwise_xor', 'logical_and', 'logical_or', 'logical_xor',
+ 'right_shift', 'left_shift',
+ )],
+ [1], [1], [1],
+ ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q']
+ ]
+
+class UnaryIntContig(_AbstractUnary):
+ arrlen = 100000
+ params = [
+ [getattr(np, uf) for uf in (
+ 'positive', 'square', 'reciprocal', 'conjugate', 'logical_not',
+ 'invert', 'isnan', 'isinf', 'isfinite',
+ 'absolute', 'sign'
+ )],
+ [1], [1],
+ ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q']
+ ]
class Mandelbrot(Benchmark):
def f(self,z):
return np.abs(z) < 4.0
def g(self,z,c):
- return np.sum(np.multiply(z,z) + c)
+ return np.sum(np.multiply(z, z) + c)
def mandelbrot_numpy(self, c, maxiter):
- output = np.zeros(c.shape, np.int)
+ output = np.zeros(c.shape, np.int32)
z = np.empty(c.shape, np.complex64)
for it in range(maxiter):
notdone = self.f(z)
diff --git a/benchmarks/benchmarks/common.py b/benchmarks/benchmarks/common.py
index 0c40e85b0..d10fe999d 100644
--- a/benchmarks/benchmarks/common.py
+++ b/benchmarks/benchmarks/common.py
@@ -1,5 +1,8 @@
-import numpy
+import numpy as np
import random
+import os
+from functools import lru_cache
+from pathlib import Path
# Various pre-crafted datasets/variables for testing
# !!! Must not be changed -- only appended !!!
@@ -7,7 +10,7 @@ import random
# sequences
random.seed(1)
# but will seed it nevertheless
-numpy.random.seed(1)
+np.random.seed(1)
nx, ny = 1000, 1000
# reduced squares based on indexes_rand, primarily for testing more
@@ -19,37 +22,37 @@ TYPES1 = [
'int16', 'float16',
'int32', 'float32',
'int64', 'float64', 'complex64',
- 'longfloat', 'complex128',
+ 'longdouble', 'complex128',
]
-if 'complex256' in numpy.sctypeDict:
- TYPES1.append('complex256')
+if 'complex256' in np.sctypeDict:
+ TYPES1.append('clongdouble')
+DLPACK_TYPES = [
+ 'int16', 'float16',
+ 'int32', 'float32',
+ 'int64', 'float64', 'complex64',
+ 'complex128', 'bool',
+]
-def memoize(func):
- result = []
- def wrapper():
- if not result:
- result.append(func())
- return result[0]
- return wrapper
-
+# Path for caching
+CACHE_ROOT = Path(__file__).resolve().parent.parent / 'env' / 'numpy_benchdata'
# values which will be used to construct our sample data matrices
# replicate 10 times to speed up initial imports of this helper
# and generate some redundancy
-@memoize
+@lru_cache(typed=True)
def get_values():
- rnd = numpy.random.RandomState(1)
- values = numpy.tile(rnd.uniform(0, 100, size=nx*ny//10), 10)
+ rnd = np.random.RandomState(1)
+ values = np.tile(rnd.uniform(0, 100, size=nx*ny//10), 10)
return values
-@memoize
+@lru_cache(typed=True)
def get_squares():
values = get_values()
- squares = {t: numpy.array(values,
- dtype=getattr(numpy, t)).reshape((nx, ny))
+ squares = {t: np.array(values,
+ dtype=getattr(np, t)).reshape((nx, ny))
for t in TYPES1}
# adjust complex ones to have non-degenerated imagery part -- use
@@ -60,42 +63,42 @@ def get_squares():
return squares
-@memoize
+@lru_cache(typed=True)
def get_squares_():
# smaller squares
squares_ = {t: s[:nxs, :nys] for t, s in get_squares().items()}
return squares_
-@memoize
+@lru_cache(typed=True)
def get_vectors():
# vectors
vectors = {t: s[0] for t, s in get_squares().items()}
return vectors
-@memoize
+@lru_cache(typed=True)
def get_indexes():
indexes = list(range(nx))
# so we do not have all items
indexes.pop(5)
indexes.pop(95)
- indexes = numpy.array(indexes)
+ indexes = np.array(indexes)
return indexes
-@memoize
+@lru_cache(typed=True)
def get_indexes_rand():
rnd = random.Random(1)
indexes_rand = get_indexes().tolist() # copy
rnd.shuffle(indexes_rand) # in-place shuffle
- indexes_rand = numpy.array(indexes_rand)
+ indexes_rand = np.array(indexes_rand)
return indexes_rand
-@memoize
+@lru_cache(typed=True)
def get_indexes_():
# smaller versions
indexes = get_indexes()
@@ -103,12 +106,112 @@ def get_indexes_():
return indexes_
-@memoize
+@lru_cache(typed=True)
def get_indexes_rand_():
indexes_rand = get_indexes_rand()
indexes_rand_ = indexes_rand[indexes_rand < nxs]
return indexes_rand_
+@lru_cache(typed=True)
+def get_data(size, dtype, ip_num=0, zeros=False, finite=True, denormal=False):
+ """
+ Generates a cached random array that covers several scenarios that
+ may affect the benchmark for fairness and to stabilize the benchmark.
+
+ Parameters
+ ----------
+ size: int
+ Array length.
+
+ dtype: dtype or dtype specifier
+
+ ip_num: int
+ Input number, to avoid memory overload
+ and to provide unique data for each operand.
+
+ zeros: bool
+ Spreading zeros along with generated data.
+
+ finite: bool
+ Avoid spreading fp special cases nan/inf.
+
+ denormal:
+ Spreading subnormal numbers along with generated data.
+ """
+ dtype = np.dtype(dtype)
+ dname = dtype.name
+ cache_name = f'{dname}_{size}_{ip_num}_{int(zeros)}'
+ if dtype.kind in 'fc':
+ cache_name += f'{int(finite)}{int(denormal)}'
+ cache_name += '.bin'
+ cache_path = CACHE_ROOT / cache_name
+ if cache_path.exists():
+ return np.fromfile(cache_path, dtype)
+
+ array = np.ones(size, dtype)
+ rands = []
+ if dtype.kind == 'i':
+ dinfo = np.iinfo(dtype)
+ scale = 8
+ if zeros:
+ scale += 1
+ lsize = size // scale
+ for low, high in (
+ (-0x80, -1),
+ (1, 0x7f),
+ (-0x8000, -1),
+ (1, 0x7fff),
+ (-0x80000000, -1),
+ (1, 0x7fffffff),
+ (-0x8000000000000000, -1),
+ (1, 0x7fffffffffffffff),
+ ):
+ rands += [np.random.randint(
+ max(low, dinfo.min),
+ min(high, dinfo.max),
+ lsize, dtype
+ )]
+ elif dtype.kind == 'u':
+ dinfo = np.iinfo(dtype)
+ scale = 4
+ if zeros:
+ scale += 1
+ lsize = size // scale
+ for high in (0xff, 0xffff, 0xffffffff, 0xffffffffffffffff):
+ rands += [np.random.randint(1, min(high, dinfo.max), lsize, dtype)]
+ elif dtype.kind in 'fc':
+ scale = 1
+ if zeros:
+ scale += 1
+ if not finite:
+ scale += 2
+ if denormal:
+ scale += 1
+ dinfo = np.finfo(dtype)
+ lsize = size // scale
+ rands = [np.random.rand(lsize).astype(dtype)]
+ if not finite:
+ rands += [
+ np.empty(lsize, dtype=dtype), np.empty(lsize, dtype=dtype)
+ ]
+ rands[1].fill(float('nan'))
+ rands[2].fill(float('inf'))
+ if denormal:
+ rands += [np.empty(lsize, dtype=dtype)]
+ rands[-1].fill(dinfo.smallest_subnormal)
+
+ if rands:
+ if zeros:
+ rands += [np.zeros(lsize, dtype)]
+ stride = len(rands)
+ for start, r in enumerate(rands):
+ array[start:len(r)*stride:stride] = r
+
+ if not CACHE_ROOT.exists():
+ CACHE_ROOT.mkdir(parents=True)
+ array.tofile(cache_path)
+ return array
+
class Benchmark:
pass
diff --git a/build_requirements.txt b/build_requirements.txt
index fccc2e4fa..de57da279 100644
--- a/build_requirements.txt
+++ b/build_requirements.txt
@@ -1,5 +1,5 @@
meson-python>=0.10.0
-Cython>=0.29.30,<3.0
-wheel==0.37.0
+Cython>=0.29.34,<3.0
+wheel==0.38.1
ninja
-git+https://github.com/scientific-python/devpy
+spin==0.3
diff --git a/building_with_meson.md b/building_with_meson.md
index dd1ca1d94..259498998 100644
--- a/building_with_meson.md
+++ b/building_with_meson.md
@@ -9,17 +9,23 @@ into a problem._
**Install build tools:** Use one of:
-- `mamba create -f environment.yml
-- `pip install -r build_requirements.txt # also make sure you have pkg-config and the usual system dependencies for NumPy`
+- `mamba env create -f environment.yml && mamba activate numpy-dev`
-**Compile and install:** `./dev.py build`
+- `python -m pip install -r build_requirements.txt
+ # also make sure you have pkg-config and the usual system dependencies for
+ # NumPy`
+
+Then install spin:
+- `python -m pip install spin`
+
+**Compile and install:** `spin build`
This builds in the `build/` directory, and installs into the `build-install` directory.
-Then run the test suite or a shell via `dev.py`:
+Then run the test suite or a shell via `spin`:
```
-./dev.py test
-./dev.py ipython
+spin test
+spin ipython
```
Alternatively, to use the package, add it to your `PYTHONPATH`:
@@ -38,7 +44,7 @@ Note that `pip` will use the default build system, which is (as of now) still
After that is done, `pip install .` or `pip install --no-build-isolation .`
will work as expected. As does building an sdist or wheel with `python -m build`.
Note, however, that `pip install -e .` (in-place developer install) does not!
-Use `dev.py` instead (see above).
+Use `spin` instead (see above).
@@ -61,5 +67,5 @@ Libs: -L${libdir} -lopenblas
Then build with:
```
-./dev.py build -- -Dpkg_config_path=${HOME}/lib/pkgconfig
+spin build -- -Dpkg_config_path=${HOME}/lib/pkgconfig
```
diff --git a/dev.py b/dev.py
deleted file mode 100755
index 205014938..000000000
--- a/dev.py
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/env python
-#
-# Example stub for running `python -m dev.py`
-#
-# Copy this into your project root.
-
-import os
-import sys
-import runpy
-
-sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0])))
-try:
- runpy.run_module("devpy", run_name="__main__")
-except ImportError:
- print("Cannot import devpy; please install it using")
- print()
- print(" pip install git+https://github.com/scientific-python/devpy")
- print()
- sys.exit(1)
diff --git a/doc/changelog/1.24.0-changelog.rst b/doc/changelog/1.24.0-changelog.rst
new file mode 100644
index 000000000..7ae8cd4c9
--- /dev/null
+++ b/doc/changelog/1.24.0-changelog.rst
@@ -0,0 +1,634 @@
+
+Contributors
+============
+
+A total of 177 people contributed to this release. People with a "+" by their
+names contributed a patch for the first time.
+
+* @DWesl
+* @amagicmuffin +
+* @dg3192 +
+* @h-vetinari
+* @juztamau5 +
+* @swagatip +
+* Aaron Meurer
+* Aayush Agrawal +
+* Adam Knapp +
+* Adarsh Singh +
+* Al-Baraa El-Hag
+* Alban Desmaison +
+* Ales Crocaro +
+* Alex Low +
+* Alexander Grund +
+* Alexander Kanavin +
+* Andras Deak
+* Andrew Nelson
+* Angéllica Araujo +
+* Anselm Hahn +
+* Ari Cooper Davis +
+* Axel Huebl +
+* Bas van Beek
+* Bhavuk Kalra
+* Bram Ton +
+* Brent Tan +
+* Brigitta Sipőcz
+* Callum O'Riley +
+* Charles Harris
+* Chiara Marmo
+* Christoph Reiter
+* Damien Caliste
+* Dan Schult +
+* Daniel da Silva
+* DanielHabenicht +
+* Dave Goel +
+* David Gilbertson +
+* Developer-Ecosystem-Engineering
+* Digya Acharya +
+* Dimitri Papadopoulos Orfanos
+* Eric-Shang +
+* Evgeni Burovski
+* Ewout ter Hoeven
+* Felix Hirwa Nshuti +
+* Frazer McLean +
+* Ganesh Kathiresan
+* Gavin Zhang +
+* Gaëtan de Menten
+* Greg Lucas
+* Greg Sadetsky +
+* Gregory P. Smith [Google LLC] +
+* Hameer Abbasi
+* Hannah Aizenman +
+* Hood Chatham
+* I-Shen Leong
+* Iantra Solari +
+* Ikko Ashimine +
+* Inessa Pawson
+* Jake Bowhay +
+* Jake Close +
+* Jarrod Millman
+* Jason Thai
+* JessePires +
+* Jessé Pires +
+* Jhonatan Cunha +
+* Jingxuan He +
+* Johnathon Cusick +
+* Jon Morris
+* Jordy Williams +
+* Josh Wilson
+* João Fontes Gonçalves +
+* Juan Luis Cano Rodríguez
+* Jyn Spring 琴春 +
+* KIU Shueng Chuan
+* Karl Otness +
+* Kevin Sheppard
+* Kinshuk Dua +
+* Leo Fang
+* Leona Taric +
+* Lev Maximov +
+* Lillian Zha +
+* Loïc Estève
+* M. Eric Irrgang +
+* Mariusz Felisiak
+* Mark Harfouche
+* Martin Zacho +
+* Matheus Santana Patriarca +
+* Matt Williams +
+* Matteo Raso +
+* Matthew Barber
+* Matthew Brett
+* Matthew Sterrett +
+* Matthias Bussonnier
+* Matthias Koeppe +
+* Matti Picus
+* Meekail Zain +
+* Melissa Weber Mendonça
+* Michael Osthege +
+* Michael Siebert +
+* Mike Toews
+* Miki Watanabe +
+* Miles Cranmer +
+* Monika Kubek +
+* Muhammad Jarir Kanji +
+* Mukulika Pahari
+* Namami Shanker
+* Nathan Goldbaum +
+* Nathan Rooy +
+* Navpreet Singh +
+* Noritada Kobayashi +
+* Oleksiy Kononenko +
+* Omar Ali +
+* Pal Barta +
+* Pamphile Roy
+* Patrick Hoefler +
+* Pearu Peterson
+* Pedro Nacht +
+* Petar Mlinarić +
+* Peter Hawkins
+* Pey Lian Lim
+* Pieter Eendebak
+* Pradipta Ghosh
+* Pranab Das +
+* Precision Wordcraft LLC +
+* PrecisionWordcraft +
+* Rafael CF Sousa +
+* Rafael Cardoso Fernandes Sousa
+* Rafael Sousa +
+* Raghuveer Devulapalli
+* Ralf Gommers
+* Rin Cat (鈴猫) +
+* Robert Kern
+* Rohit Davas +
+* Rohit Goswami
+* Ross Barnowski
+* Roy Smart +
+* Ruth Comer +
+* Sabiha Tahsin Soha +
+* Sachin Krishnan T V +
+* Sanjana M Moodbagil +
+* Sanya Sinha +
+* Sarah Coffland +
+* Saransh Chopra +
+* Satish Kumar Mishra +
+* Satish Mishra +
+* Sayed Adel
+* Schrijvers Luc +
+* Sebastian Berg
+* Serge Guelton
+* Seva Lotoshnikov +
+* Shashank Gupta +
+* Shoban Chiddarth +
+* Shreya Singh +
+* Shreyas Joshi +
+* Sicheng Zeng +
+* Simran Makandar +
+* Shuangchi He +
+* Srimukh Sripada +
+* Stefan van der Walt
+* Stefanie Molin +
+* Stuart Archibald
+* Tania Allard
+* Thomas A Caswell
+* Thomas Kastl +
+* Thomas Mansencal +
+* Tony Newton / Teng Liu +
+* Toshiki Kataoka
+* Tyler Reddy
+* Vardhaman Kalloli +
+* Warren Weckesser
+* Will Ayd +
+* William Stein +
+* XinRu Lin +
+* Yin Li +
+* Yunika Bajracharya +
+* Zachary Blackwood +
+* 渡邉 美希 +
+
+Pull requests merged
+====================
+
+A total of 444 pull requests were merged for this release.
+
+* `#12065 <https://github.com/numpy/numpy/pull/12065>`__: API: Optimize np.isin and np.in1d for integer arrays and add...
+* `#15782 <https://github.com/numpy/numpy/pull/15782>`__: ENH: complete the 'vars' list of a module
+* `#16022 <https://github.com/numpy/numpy/pull/16022>`__: ENH: Adding __array_ufunc__ capability to MaskedArrays
+* `#16154 <https://github.com/numpy/numpy/pull/16154>`__: ENH: Add support for symbol to polynomial package
+* `#16507 <https://github.com/numpy/numpy/pull/16507>`__: BUG: Do not allow nditer to reduce the mask
+* `#16971 <https://github.com/numpy/numpy/pull/16971>`__: BUG: Fix three ``complex``- & ``float128``-related issues with ``nd_grid``
+* `#19388 <https://github.com/numpy/numpy/pull/19388>`__: ENH: Support character and character string arrays
+* `#20321 <https://github.com/numpy/numpy/pull/20321>`__: ENH: allow NumPy created .npy files to be appended in-place
+* `#20659 <https://github.com/numpy/numpy/pull/20659>`__: BUG: cross product. Added dtype conversions of inputs. See. #19138
+* `#20913 <https://github.com/numpy/numpy/pull/20913>`__: ENH, SIMD: Extend universal intrinsics to support IBMZ
+* `#20914 <https://github.com/numpy/numpy/pull/20914>`__: BUG: change ``ma.mean`` dtype to be consistent with ``np.mean``
+* `#20924 <https://github.com/numpy/numpy/pull/20924>`__: MAINT: Simplify element setting and use it for filling
+* `#20949 <https://github.com/numpy/numpy/pull/20949>`__: MAINT: Rename INSTALL.rst.txt to INSTALL.rst
+* `#21041 <https://github.com/numpy/numpy/pull/21041>`__: ENH: Implement string comparison ufuncs (or almost)
+* `#21084 <https://github.com/numpy/numpy/pull/21084>`__: MAINT: Fix computation of numpy.array_api.linalg.vector_norm
+* `#21098 <https://github.com/numpy/numpy/pull/21098>`__: DOC: Fix axis naming in ``argpartition`` docs
+* `#21103 <https://github.com/numpy/numpy/pull/21103>`__: NEP: Add NEP 50 draft about fixing scalar promotion rules
+* `#21152 <https://github.com/numpy/numpy/pull/21152>`__: DOC: verifying bug existence and fixes - replacement for PR 17851
+* `#21248 <https://github.com/numpy/numpy/pull/21248>`__: DOC: improve description of the ``data`` entry in ``__array_interface__``
+* `#21308 <https://github.com/numpy/numpy/pull/21308>`__: MAINT: Start testing with Python 3.11.
+* `#21403 <https://github.com/numpy/numpy/pull/21403>`__: MAINT: remove some names from main numpy namespace
+* `#21437 <https://github.com/numpy/numpy/pull/21437>`__: ENH: Add floating point error checking to (almost) all casts
+* `#21468 <https://github.com/numpy/numpy/pull/21468>`__: ENH: Use ``threadpoolctl`` in ``show_runtime`` (a new function)
+* `#21471 <https://github.com/numpy/numpy/pull/21471>`__: DOC: adding docstring to TooHardError class
+* `#21483 <https://github.com/numpy/numpy/pull/21483>`__: SIMD: Use universal intrinsics to implement comparison functions
+* `#21501 <https://github.com/numpy/numpy/pull/21501>`__: DOC: improve ``ascontiguousarray()`` and ``asfortranarray()`` examples
+* `#21504 <https://github.com/numpy/numpy/pull/21504>`__: MAINT: Fix some typos.
+* `#21507 <https://github.com/numpy/numpy/pull/21507>`__: BUG: Better report integer division overflow
+* `#21527 <https://github.com/numpy/numpy/pull/21527>`__: PERF: Fast path for dtype lookup of float and long
+* `#21537 <https://github.com/numpy/numpy/pull/21537>`__: DOC: Add a policy about inactive PRs.
+* `#21564 <https://github.com/numpy/numpy/pull/21564>`__: TST: Enable doctests in IO Howto with testsetup and testcleanup
+* `#21567 <https://github.com/numpy/numpy/pull/21567>`__: BUG: Adding the default pytest doctest instead of the ValueError
+* `#21572 <https://github.com/numpy/numpy/pull/21572>`__: MAINT: allow absolute module names in refguide-check
+* `#21579 <https://github.com/numpy/numpy/pull/21579>`__: DOC: Improve docstring of numpy.testing.assert_allclose
+* `#21581 <https://github.com/numpy/numpy/pull/21581>`__: REL: Prepare main for NumPy 1.24.0 development
+* `#21582 <https://github.com/numpy/numpy/pull/21582>`__: MAINT: Fix some small refcounting and similar issues
+* `#21583 <https://github.com/numpy/numpy/pull/21583>`__: Add ``CODEOWNER`` for the ``array_api`` module
+* `#21587 <https://github.com/numpy/numpy/pull/21587>`__: DOC: update logarithm docs as per theory.
+* `#21591 <https://github.com/numpy/numpy/pull/21591>`__: BUG, STY: Fix doctest failure in EXAMPLE_DOCSTRING.
+* `#21595 <https://github.com/numpy/numpy/pull/21595>`__: ENH: Add strict parameter to assert_array_equal. Fixes #9542
+* `#21596 <https://github.com/numpy/numpy/pull/21596>`__: TYP,MAINT: Allow unsigned integer inplace-ops to accept signed...
+* `#21600 <https://github.com/numpy/numpy/pull/21600>`__: DOC: Add missing links for NEP36 and NEP49
+* `#21601 <https://github.com/numpy/numpy/pull/21601>`__: DOC: update NEP29 to address PEP0602
+* `#21602 <https://github.com/numpy/numpy/pull/21602>`__: BUILD: fix tag name for travis: it is v1.23.0rc1
+* `#21605 <https://github.com/numpy/numpy/pull/21605>`__: MAINT: Adapt npt._GenericAlias to Python 3.11 types.GenericAlias...
+* `#21609 <https://github.com/numpy/numpy/pull/21609>`__: MAINT: Fix some API versions.
+* `#21611 <https://github.com/numpy/numpy/pull/21611>`__: MAINT: make MismatchCAPIWarnining into MismatchCAPIError
+* `#21615 <https://github.com/numpy/numpy/pull/21615>`__: MAINT: Remove compatibility shims for old versions of PyPy
+* `#21616 <https://github.com/numpy/numpy/pull/21616>`__: DOC: Describe the changed Python release cadence better
+* `#21617 <https://github.com/numpy/numpy/pull/21617>`__: MAINT, STY: Make download-wheels download source files.
+* `#21620 <https://github.com/numpy/numpy/pull/21620>`__: MAINT: Fortify masked in-place ops against promotion warnings
+* `#21622 <https://github.com/numpy/numpy/pull/21622>`__: TST: Skip F2PY tests without Fortran compilers
+* `#21623 <https://github.com/numpy/numpy/pull/21623>`__: ENH: Add equals_nans kwarg to np.unique
+* `#21624 <https://github.com/numpy/numpy/pull/21624>`__: DOC: move ``import_array`` and ``import_umath`` above ``PyModule_Create``
+* `#21626 <https://github.com/numpy/numpy/pull/21626>`__: API: Introduce optional (and partial) NEP 50 weak scalar logic
+* `#21627 <https://github.com/numpy/numpy/pull/21627>`__: ENH: adding casting option to numpy.stack.
+* `#21630 <https://github.com/numpy/numpy/pull/21630>`__: MAINT: back out conversion of npymath component to c++
+* `#21632 <https://github.com/numpy/numpy/pull/21632>`__: API: Retain ``arr.base`` more strictly in ``np.frombuffer``
+* `#21639 <https://github.com/numpy/numpy/pull/21639>`__: BUG: use explicit einsum_path whenever it is given
+* `#21641 <https://github.com/numpy/numpy/pull/21641>`__: DOC: Note version-switcher update in the release walkthrough
+* `#21644 <https://github.com/numpy/numpy/pull/21644>`__: MAINT: Fixup ``unique``'s ``equal_nan`` kwarg to match ``np.array_equal``
+* `#21645 <https://github.com/numpy/numpy/pull/21645>`__: DEP: Remove ``normed=`` keyword argument from histograms
+* `#21648 <https://github.com/numpy/numpy/pull/21648>`__: ENH: issue overflow warning when using ``abs`` on ``np.int8(-128)``
+* `#21651 <https://github.com/numpy/numpy/pull/21651>`__: MAINT: Remove unused/not useful CODEOWNERS file again
+* `#21654 <https://github.com/numpy/numpy/pull/21654>`__: MAINT: limit the number of decimals in Polynomial representation
+* `#21658 <https://github.com/numpy/numpy/pull/21658>`__: MAINT: improved overflow check to avoid undefined behavior
+* `#21659 <https://github.com/numpy/numpy/pull/21659>`__: BUG: Fix a bug left after f2py2e refactor
+* `#21661 <https://github.com/numpy/numpy/pull/21661>`__: TYP, ENH: Add annotations for the ``equal_nan`` keyword to ``np.unique``
+* `#21662 <https://github.com/numpy/numpy/pull/21662>`__: DOC: ``np`` in double backticks.
+* `#21663 <https://github.com/numpy/numpy/pull/21663>`__: DEP: Deprecate (rather than remove) the int-via-float parsing...
+* `#21664 <https://github.com/numpy/numpy/pull/21664>`__: DOC: Misc RST reformatting.
+* `#21666 <https://github.com/numpy/numpy/pull/21666>`__: MAINT: Change array API unique_*() functions to not compare nans...
+* `#21675 <https://github.com/numpy/numpy/pull/21675>`__: DOC: Add note about broadcasting support for ``random.Generator.multinomial``
+* `#21677 <https://github.com/numpy/numpy/pull/21677>`__: DOC: RST Titles Underline reordering
+* `#21681 <https://github.com/numpy/numpy/pull/21681>`__: MAINT: Point documentation version switcher at the docs homepage
+* `#21687 <https://github.com/numpy/numpy/pull/21687>`__: BUG: switch _CMP_NEQ_OQ to _CMP_NEQ_UQ for npyv_cmpneq_f[32,64]
+* `#21689 <https://github.com/numpy/numpy/pull/21689>`__: DOC: Fix broadcasting documentation.
+* `#21690 <https://github.com/numpy/numpy/pull/21690>`__: BUG: Prevent attempted broadcasting of 0-D output operands in...
+* `#21691 <https://github.com/numpy/numpy/pull/21691>`__: BUG: Small fixupes found using valgrind
+* `#21692 <https://github.com/numpy/numpy/pull/21692>`__: MAINT: Renamed doc/records.rst.txt to doc/records.rst
+* `#21701 <https://github.com/numpy/numpy/pull/21701>`__: BUG: Enable fortran preprocessing for ifort on Windows
+* `#21704 <https://github.com/numpy/numpy/pull/21704>`__: DOC: Mention positional-only arguments in the array API compatibility...
+* `#21705 <https://github.com/numpy/numpy/pull/21705>`__: BLD, SIMD: Fix detect armhf and hardened the Neon/ASIMD compile-time...
+* `#21707 <https://github.com/numpy/numpy/pull/21707>`__: MAINT: generate_umath.py: do not write full path into output...
+* `#21709 <https://github.com/numpy/numpy/pull/21709>`__: TST: Fixup loadtxt int-via-float tests when in release mode
+* `#21712 <https://github.com/numpy/numpy/pull/21712>`__: BUG: ``.f2py_f2cmap`` doesn't map ``long_long`` and other options
+* `#21715 <https://github.com/numpy/numpy/pull/21715>`__: DOC: Tell people to use only-binary option
+* `#21723 <https://github.com/numpy/numpy/pull/21723>`__: DOC: Update sphinx, numpydoc, and pydata-sphinx-theme
+* `#21727 <https://github.com/numpy/numpy/pull/21727>`__: MAINT, SIMD: Handle overflow gracefully in integer division
+* `#21731 <https://github.com/numpy/numpy/pull/21731>`__: ENH: Ensure dispatcher TypeErrors report original name
+* `#21732 <https://github.com/numpy/numpy/pull/21732>`__: ENH: Add support for platforms with missing fenv flags
+* `#21733 <https://github.com/numpy/numpy/pull/21733>`__: ENH: Fix pointer size determination for cross build
+* `#21734 <https://github.com/numpy/numpy/pull/21734>`__: ENH: Check that we are targeting x86 or x64 architecture before...
+* `#21735 <https://github.com/numpy/numpy/pull/21735>`__: ENH: cross compilation: use sysconfig to determine if x86_64...
+* `#21745 <https://github.com/numpy/numpy/pull/21745>`__: MAINT: Remove FPE helper code that is unnecessary on C99/C++11
+* `#21748 <https://github.com/numpy/numpy/pull/21748>`__: BUG: Fix for npyv_orc_b8 and npyv_xnor_b8 for s390x (z13)
+* `#21749 <https://github.com/numpy/numpy/pull/21749>`__: BUG, SIMD: Fix detecting NEON/ASIMD on aarch64
+* `#21750 <https://github.com/numpy/numpy/pull/21750>`__: CI: Fix CI SIMD build on s390x
+* `#21755 <https://github.com/numpy/numpy/pull/21755>`__: BUG: Do not skip value-based promotion path for large Python...
+* `#21759 <https://github.com/numpy/numpy/pull/21759>`__: BUG: Fix small reference leaks found with pytest-leaks
+* `#21763 <https://github.com/numpy/numpy/pull/21763>`__: MAINT: use f-string format in test_abc.py
+* `#21764 <https://github.com/numpy/numpy/pull/21764>`__: BUG: Fix a potential variable misuse bug
+* `#21766 <https://github.com/numpy/numpy/pull/21766>`__: CI: Guard compile-time CPU features tests
+* `#21771 <https://github.com/numpy/numpy/pull/21771>`__: MAINT: Add a check of the return value of PyMem_Calloc().
+* `#21773 <https://github.com/numpy/numpy/pull/21773>`__: MAINT: fix typo in string_ufuncs.cpp
+* `#21776 <https://github.com/numpy/numpy/pull/21776>`__: CI: add workflow for non-optimized builds
+* `#21778 <https://github.com/numpy/numpy/pull/21778>`__: MAINT, SIMD: remove orphan path vsx/conversion.h
+* `#21779 <https://github.com/numpy/numpy/pull/21779>`__: MAINT: random: Disallow complex inputs in multivariate_normal
+* `#21786 <https://github.com/numpy/numpy/pull/21786>`__: MAINT: fix up use of ``NPY_NO_DEPRECATED_API`` usage in f2py
+* `#21789 <https://github.com/numpy/numpy/pull/21789>`__: ENH: Change f2c declarations with void return type to int
+* `#21790 <https://github.com/numpy/numpy/pull/21790>`__: ENH: add overflow handling for scalar ``negative`` operation
+* `#21793 <https://github.com/numpy/numpy/pull/21793>`__: ENH: Add overflow handling for negative integers scalar multiplication
+* `#21794 <https://github.com/numpy/numpy/pull/21794>`__: BUG: lib: A loadtxt error message had two values reversed.
+* `#21795 <https://github.com/numpy/numpy/pull/21795>`__: ENH: Ensure that assertion of unsigned dtypes does not return...
+* `#21796 <https://github.com/numpy/numpy/pull/21796>`__: BUG: Fix comparison for empty structured arrays
+* `#21797 <https://github.com/numpy/numpy/pull/21797>`__: MAINT: Reduce object construction in np.require
+* `#21798 <https://github.com/numpy/numpy/pull/21798>`__: MAINT: use PyErr_SetString in _import_array where possible
+* `#21807 <https://github.com/numpy/numpy/pull/21807>`__: ENH: Handle the value attribute in F2Py wrappers
+* `#21812 <https://github.com/numpy/numpy/pull/21812>`__: BUG: Define ``<``, ``<=``, ``>``, ``>=`` for masked arrays
+* `#21815 <https://github.com/numpy/numpy/pull/21815>`__: BUG: Fix discovered MachAr (still used within valgrind)
+* `#21817 <https://github.com/numpy/numpy/pull/21817>`__: ENH: Always fill object fields with None rather than NULL
+* `#21823 <https://github.com/numpy/numpy/pull/21823>`__: MAINT: Try fixing broken Anaconda uploads.
+* `#21828 <https://github.com/numpy/numpy/pull/21828>`__: MAINT: Update main after 1.23.0 release.
+* `#21830 <https://github.com/numpy/numpy/pull/21830>`__: DOC: Change substract to subtract in comment
+* `#21832 <https://github.com/numpy/numpy/pull/21832>`__: PERF: Micro optimize np.linspace
+* `#21835 <https://github.com/numpy/numpy/pull/21835>`__: TST: Add f2py CLI tests
+* `#21836 <https://github.com/numpy/numpy/pull/21836>`__: DOC: F2PY documentation improvements
+* `#21837 <https://github.com/numpy/numpy/pull/21837>`__: DOC: Document expectation for object array initialization
+* `#21842 <https://github.com/numpy/numpy/pull/21842>`__: BUG: Fix in1d for empty integer array as input
+* `#21844 <https://github.com/numpy/numpy/pull/21844>`__: DOC: Rephrase dimensionality, size in broadcasting.rst
+* `#21848 <https://github.com/numpy/numpy/pull/21848>`__: BUG: Handle NaNs correctly for float16 during sorting
+* `#21849 <https://github.com/numpy/numpy/pull/21849>`__: MAINT: Update the documentation Makefile
+* `#21851 <https://github.com/numpy/numpy/pull/21851>`__: BUG: Use ``keepdims`` during normalization in ``np.average`` and...
+* `#21853 <https://github.com/numpy/numpy/pull/21853>`__: DOC: Update the docstring for np.around
+* `#21854 <https://github.com/numpy/numpy/pull/21854>`__: DOC: mention changes to ``max_rows`` behaviour in ``np.loadtxt``
+* `#21855 <https://github.com/numpy/numpy/pull/21855>`__: DOC: Replace the mathematical notation N(...) with text.
+* `#21856 <https://github.com/numpy/numpy/pull/21856>`__: DOC: Mention uniform in the np.random.Generator.random function.
+* `#21857 <https://github.com/numpy/numpy/pull/21857>`__: BUG: Reject non integer array-likes with size 1 in delete
+* `#21860 <https://github.com/numpy/numpy/pull/21860>`__: BUG: Fix numpy.isin for timedelta dtype
+* `#21861 <https://github.com/numpy/numpy/pull/21861>`__: DOC: clarify loadtxt input cols requirement
+* `#21863 <https://github.com/numpy/numpy/pull/21863>`__: ENH,MAINT: Improve and simplify scalar floating point warnings
+* `#21872 <https://github.com/numpy/numpy/pull/21872>`__: CI: tools: Remove a long but obsolete comment from travis-test.sh
+* `#21875 <https://github.com/numpy/numpy/pull/21875>`__: ENH: Implement correct scalar and integer overflow errors for...
+* `#21876 <https://github.com/numpy/numpy/pull/21876>`__: DOC: Fixed minor typo in reference
+* `#21877 <https://github.com/numpy/numpy/pull/21877>`__: DOC: dark theme css rules
+* `#21879 <https://github.com/numpy/numpy/pull/21879>`__: CI: skip azp and circleCI logic
+* `#21881 <https://github.com/numpy/numpy/pull/21881>`__: MAINT: Disable checks for Win workaround for GCC
+* `#21884 <https://github.com/numpy/numpy/pull/21884>`__: MAINT: Fix non-void function does not return a value warning
+* `#21885 <https://github.com/numpy/numpy/pull/21885>`__: DOC: Link to PEP-484 in the typing docs
+* `#21886 <https://github.com/numpy/numpy/pull/21886>`__: Fix lib flags for librandom
+* `#21887 <https://github.com/numpy/numpy/pull/21887>`__: BLD: Allow GCC compile on mingw-w64-based systems
+* `#21890 <https://github.com/numpy/numpy/pull/21890>`__: BUG: Fix KeyError in crackfortran operator support
+* `#21894 <https://github.com/numpy/numpy/pull/21894>`__: BUG: Fix datetime_to_timedelta_resolve_descriptors signature
+* `#21895 <https://github.com/numpy/numpy/pull/21895>`__: ENH, CI: Add Emscripten to CI
+* `#21896 <https://github.com/numpy/numpy/pull/21896>`__: BLD: Make can_link_svml return False for 32bit builds on x86_64
+* `#21904 <https://github.com/numpy/numpy/pull/21904>`__: DOC: Add usage example to np.char.center docstring
+* `#21905 <https://github.com/numpy/numpy/pull/21905>`__: DOC: Fix the interpolation formulae for quantile and percentile
+* `#21909 <https://github.com/numpy/numpy/pull/21909>`__: DOC: fix typo in ``numpy._typing._NestedSequence`` docstring example
+* `#21921 <https://github.com/numpy/numpy/pull/21921>`__: DOC: Fix typo on Byte-swapping page
+* `#21922 <https://github.com/numpy/numpy/pull/21922>`__: DOC: fix typo on custom array containers page
+* `#21924 <https://github.com/numpy/numpy/pull/21924>`__: CI: Use OpenBLAS again on Cygwin
+* `#21925 <https://github.com/numpy/numpy/pull/21925>`__: BUG: Fix subarray to object cast ownership details
+* `#21926 <https://github.com/numpy/numpy/pull/21926>`__: MAINT: random: Annotate default_rng with cython.embedsignature
+* `#21927 <https://github.com/numpy/numpy/pull/21927>`__: DOC: Add a note about security and NumPy to the documentation
+* `#21928 <https://github.com/numpy/numpy/pull/21928>`__: BUG: Fix the implementation of numpy.array_api.vecdot
+* `#21943 <https://github.com/numpy/numpy/pull/21943>`__: DOC, MAINT: Document the C-API incompatibility error and point...
+* `#21945 <https://github.com/numpy/numpy/pull/21945>`__: MAINT, DOC: Update release guide
+* `#21946 <https://github.com/numpy/numpy/pull/21946>`__: BUG: Reorder extern "C" to only apply to function declarations...
+* `#21948 <https://github.com/numpy/numpy/pull/21948>`__: [DOC] Double backticks in lagfit.
+* `#21954 <https://github.com/numpy/numpy/pull/21954>`__: TST: Add tests for FP16 umath functions
+* `#21955 <https://github.com/numpy/numpy/pull/21955>`__: ENH: Vectorize FP16 umath functions using AVX512
+* `#21956 <https://github.com/numpy/numpy/pull/21956>`__: MAINT: Update main after 1.23.1 release.
+* `#21957 <https://github.com/numpy/numpy/pull/21957>`__: TYP,ENH: Mark all unhashable classes as such
+* `#21959 <https://github.com/numpy/numpy/pull/21959>`__: BUG: Use ``Popen`` to silently invoke f77 -v
+* `#21960 <https://github.com/numpy/numpy/pull/21960>`__: Revert "ENH: Adding __array_ufunc__ capability to MaskedArrays"
+* `#21963 <https://github.com/numpy/numpy/pull/21963>`__: BLD: remove outdated pin for ``packaging`` on macOS arm64
+* `#21965 <https://github.com/numpy/numpy/pull/21965>`__: Added priority in bug-report issue-template
+* `#21968 <https://github.com/numpy/numpy/pull/21968>`__: ENH: Add ``__array_ufunc__`` typing support to the ``nin=1`` ufuncs
+* `#21973 <https://github.com/numpy/numpy/pull/21973>`__: DOC: correct docstring for numpy.correlate()
+* `#21974 <https://github.com/numpy/numpy/pull/21974>`__: MAINT, TYP: Fix ``np.angle`` dtype-overloads
+* `#21976 <https://github.com/numpy/numpy/pull/21976>`__: ENH: Add the capability to swap the singleton bit generator
+* `#21977 <https://github.com/numpy/numpy/pull/21977>`__: ENH: Adding __array_ufunc__ capability to MaskedArrays
+* `#21979 <https://github.com/numpy/numpy/pull/21979>`__: BUG: Fix experimental dtype slot numbers
+* `#21981 <https://github.com/numpy/numpy/pull/21981>`__: TST: ensure ``np.equal.reduce`` raises a ``TypeError``
+* `#21982 <https://github.com/numpy/numpy/pull/21982>`__: MAINT: Do not let ``_GenericAlias`` wrap the underlying classes'...
+* `#21983 <https://github.com/numpy/numpy/pull/21983>`__: TYP,MAINT: Allow ``einsum`` subscripts to be passed via integer...
+* `#21984 <https://github.com/numpy/numpy/pull/21984>`__: MAINT,TYP: Add object-overloads for the ``np.generic`` rich comparisons
+* `#21986 <https://github.com/numpy/numpy/pull/21986>`__: MAINT: Remove two unused imports
+* `#21991 <https://github.com/numpy/numpy/pull/21991>`__: MAINT: rm old warning
+* `#21992 <https://github.com/numpy/numpy/pull/21992>`__: DOC: cross-reference descriptions of frombuffer and ndarray.tobytes
+* `#21993 <https://github.com/numpy/numpy/pull/21993>`__: BUG: fix ma.minimum.reduce with axis keyword
+* `#21995 <https://github.com/numpy/numpy/pull/21995>`__: BUG: Distinguish exact vs. equivalent dtype for C type aliases.
+* `#21996 <https://github.com/numpy/numpy/pull/21996>`__: BUG: Avoid errors on NULL during deepcopy
+* `#21997 <https://github.com/numpy/numpy/pull/21997>`__: DOC: unify ``np.transpose``, ``np.ndarray.transpose``, and ``np.ndarray.T``
+* `#21999 <https://github.com/numpy/numpy/pull/21999>`__: BUG: Fix masked median bug
+* `#22000 <https://github.com/numpy/numpy/pull/22000>`__: DOC: some improvements in the NumPy/MATLAB comparison
+* `#22002 <https://github.com/numpy/numpy/pull/22002>`__: DOC: add links to ``linalg`` in docs of ``dot`` and ``matmul``
+* `#22004 <https://github.com/numpy/numpy/pull/22004>`__: DEP: Finalize ragged array creation deprecation
+* `#22008 <https://github.com/numpy/numpy/pull/22008>`__: MAINT: remove unneeded ``__future__`` imports
+* `#22009 <https://github.com/numpy/numpy/pull/22009>`__: BUG: fix np.average for Fraction elements
+* `#22010 <https://github.com/numpy/numpy/pull/22010>`__: DOC: Add versionchanged for converter callable behavior.
+* `#22013 <https://github.com/numpy/numpy/pull/22013>`__: DOC: updated masked_print_option and added explanation
+* `#22014 <https://github.com/numpy/numpy/pull/22014>`__: BUG/ENH: Allow bit generators to supply their own constructor
+* `#22016 <https://github.com/numpy/numpy/pull/22016>`__: BUG: Revert using __array_ufunc__ for MaskedArray
+* `#22017 <https://github.com/numpy/numpy/pull/22017>`__: ENH: reorder includes for testing on top of system installations...
+* `#22022 <https://github.com/numpy/numpy/pull/22022>`__: MAINT,TYP: Allow the ``squeeze`` and ``transpose`` method to...
+* `#22024 <https://github.com/numpy/numpy/pull/22024>`__: BUG: Expose string_heapsort algorithm in a shared header
+* `#22043 <https://github.com/numpy/numpy/pull/22043>`__: BLD: use macos-11 image on azure, macos-1015 is deprecated
+* `#22045 <https://github.com/numpy/numpy/pull/22045>`__: ENH: allow importlib.LazyLoader to work with numpy and add test...
+* `#22046 <https://github.com/numpy/numpy/pull/22046>`__: BUG: Make ``mask_invalid`` consistent with ``mask_where`` if ``copy``...
+* `#22053 <https://github.com/numpy/numpy/pull/22053>`__: MAINT: Quiet the anaconda uploads.
+* `#22060 <https://github.com/numpy/numpy/pull/22060>`__: PERF: Improve import time of numpy
+* `#22071 <https://github.com/numpy/numpy/pull/22071>`__: MAINT: fix typo in test_array_object.py
+* `#22081 <https://github.com/numpy/numpy/pull/22081>`__: PERF: Remove numpy.compat._pep440 from default imports
+* `#22083 <https://github.com/numpy/numpy/pull/22083>`__: BUG: Fix skip condition for test_loss_of_precision[complex256]
+* `#22087 <https://github.com/numpy/numpy/pull/22087>`__: ENH: raise TypeError when arange() is called with string dtype
+* `#22090 <https://github.com/numpy/numpy/pull/22090>`__: MAINT: Simplify npymath
+* `#22096 <https://github.com/numpy/numpy/pull/22096>`__: PERF: Improve intrinsics for tobits and pack on Apple silicon
+* `#22099 <https://github.com/numpy/numpy/pull/22099>`__: TST: Remove workaround for gh-9968 from ``test_scalar_methods.test_roundtrip``
+* `#22102 <https://github.com/numpy/numpy/pull/22102>`__: BLD: Try building python3.11 wheels.
+* `#22105 <https://github.com/numpy/numpy/pull/22105>`__: TST: fix test_linear_interpolation_formula_symmetric
+* `#22110 <https://github.com/numpy/numpy/pull/22110>`__: BUG: Address failures in aarch64 gcc builds due to #22096
+* `#22111 <https://github.com/numpy/numpy/pull/22111>`__: BUG: Missed a case in the diff merge for #22110
+* `#22112 <https://github.com/numpy/numpy/pull/22112>`__: MAINT: remove redundant reversal of eigenvalues order in svd...
+* `#22116 <https://github.com/numpy/numpy/pull/22116>`__: MAINT,DOC: Remove sphinx-panels in favor of sphinx-design
+* `#22117 <https://github.com/numpy/numpy/pull/22117>`__: DOC: Reorganize user guide outline
+* `#22118 <https://github.com/numpy/numpy/pull/22118>`__: DOC: Fix documentation for percentile and quantile
+* `#22120 <https://github.com/numpy/numpy/pull/22120>`__: DOC: Explain spawn_key a little more.
+* `#22122 <https://github.com/numpy/numpy/pull/22122>`__: DOC: Explain how to use sequences of integers as seeds.
+* `#22124 <https://github.com/numpy/numpy/pull/22124>`__: MAINT: support IBM i system
+* `#22127 <https://github.com/numpy/numpy/pull/22127>`__: BLD: Add Python 3.11 wheels to aarch64 build
+* `#22130 <https://github.com/numpy/numpy/pull/22130>`__: MAINT: Update main after 1.23.2 release.
+* `#22132 <https://github.com/numpy/numpy/pull/22132>`__: DOC: Add a release note for bit generator reduce changes
+* `#22139 <https://github.com/numpy/numpy/pull/22139>`__: DEP: drop support for msvc<=1900 and Interix
+* `#22142 <https://github.com/numpy/numpy/pull/22142>`__: MAINT: Update setup.py for Python 3.11.
+* `#22143 <https://github.com/numpy/numpy/pull/22143>`__: MAINT: Update the RELEASE_WALKTHROUGH file.
+* `#22144 <https://github.com/numpy/numpy/pull/22144>`__: DOC: Add missing word
+* `#22147 <https://github.com/numpy/numpy/pull/22147>`__: DOC: fix linalg.tensorsolve docstring
+* `#22149 <https://github.com/numpy/numpy/pull/22149>`__: DOC: Copy-edit the 'partition' docstring.
+* `#22150 <https://github.com/numpy/numpy/pull/22150>`__: CI: Test NumPy build against old versions of GCC(6, 7, 8)
+* `#22152 <https://github.com/numpy/numpy/pull/22152>`__: BUG: Support using libunwind for backtrack
+* `#22154 <https://github.com/numpy/numpy/pull/22154>`__: DOC: add more prominent warnings to pin setuptools
+* `#22159 <https://github.com/numpy/numpy/pull/22159>`__: DEP: drop older cygwin compatibility shims
+* `#22161 <https://github.com/numpy/numpy/pull/22161>`__: MAINT: simplify complex math function handling in npymath
+* `#22163 <https://github.com/numpy/numpy/pull/22163>`__: PERF: Eliminate slow check for pypy during numpy import
+* `#22164 <https://github.com/numpy/numpy/pull/22164>`__: ENH: Improve tanh for architectures without efficient gather/scatter...
+* `#22168 <https://github.com/numpy/numpy/pull/22168>`__: ENH: Remove AVX related functions from non x86 based builds
+* `#22169 <https://github.com/numpy/numpy/pull/22169>`__: MAINT: fix defines for universal2 python builds of NumPy
+* `#22171 <https://github.com/numpy/numpy/pull/22171>`__: DOC: Note symmetry requirement in ``multivariate_normal`` error
+* `#22179 <https://github.com/numpy/numpy/pull/22179>`__: TST: Implemented an unused test for np.random.randint
+* `#22189 <https://github.com/numpy/numpy/pull/22189>`__: MAINT: fix an incorrect pointer type usage in f2py
+* `#22193 <https://github.com/numpy/numpy/pull/22193>`__: BUG: change overloads to play nice with pyright.
+* `#22194 <https://github.com/numpy/numpy/pull/22194>`__: BUG: Fix circleci build
+* `#22199 <https://github.com/numpy/numpy/pull/22199>`__: MAINT: Unpin towncrier
+* `#22204 <https://github.com/numpy/numpy/pull/22204>`__: TST,BUG: Use fork context to fix MacOS savez test
+* `#22205 <https://github.com/numpy/numpy/pull/22205>`__: DOC: Clarify that ``like`` is not passed to ``function``
+* `#22206 <https://github.com/numpy/numpy/pull/22206>`__: DOC: Fix typo disutils -> distutils in numpy.distutils migration...
+* `#22210 <https://github.com/numpy/numpy/pull/22210>`__: DOC: Fix typos in cast warning release note
+* `#22212 <https://github.com/numpy/numpy/pull/22212>`__: TYP,BUG: Reduce argument validation in C-based ``__class_getitem__``
+* `#22227 <https://github.com/numpy/numpy/pull/22227>`__: DOC: fix up release note
+* `#22228 <https://github.com/numpy/numpy/pull/22228>`__: MAINT: Remove long deprecated functionality from np.ma
+* `#22235 <https://github.com/numpy/numpy/pull/22235>`__: Remove incorrect comment about checking generated C files in
+* `#22236 <https://github.com/numpy/numpy/pull/22236>`__: BUG: Fix incorrect refcounting in new ``asarray`` path
+* `#22239 <https://github.com/numpy/numpy/pull/22239>`__: MAINT: Update main after 1.23.3 release.
+* `#22240 <https://github.com/numpy/numpy/pull/22240>`__: ENH: Use SVML for fp32 and fp64 power and arctan2
+* `#22242 <https://github.com/numpy/numpy/pull/22242>`__: BLD: add back stdlib.h include in pcg64.h
+* `#22245 <https://github.com/numpy/numpy/pull/22245>`__: MAINT: random: remove ``get_info`` from "extending with Cython"...
+* `#22251 <https://github.com/numpy/numpy/pull/22251>`__: TST: Move new ``asarray`` test to a more appropriate place.
+* `#22253 <https://github.com/numpy/numpy/pull/22253>`__: DOC: Update concatenate exception message.
+* `#22254 <https://github.com/numpy/numpy/pull/22254>`__: DOC: Improve ``converters`` parameter description for loadtxt
+* `#22256 <https://github.com/numpy/numpy/pull/22256>`__: DOC: Add C API example for NPY_ITER_MULTI_INDEX
+* `#22259 <https://github.com/numpy/numpy/pull/22259>`__: DOC: Better report integer division overflow (#21506)
+* `#22261 <https://github.com/numpy/numpy/pull/22261>`__: NEP: Make NEP 51 to propose changing the scalar representation
+* `#22263 <https://github.com/numpy/numpy/pull/22263>`__: DOC: Clarified how finfo works with complex numbers (#22260)
+* `#22272 <https://github.com/numpy/numpy/pull/22272>`__: MAINT, Haiku defines neither __STDC_NO_THREADS__ nor __GLIBC__
+* `#22276 <https://github.com/numpy/numpy/pull/22276>`__: DOC: Update for return value of np.ptp()
+* `#22279 <https://github.com/numpy/numpy/pull/22279>`__: DOC: Add example to msort docstring
+* `#22280 <https://github.com/numpy/numpy/pull/22280>`__: DOC: updated the description for array-like type in histogramdd...
+* `#22282 <https://github.com/numpy/numpy/pull/22282>`__: DOC: Add example for find
+* `#22285 <https://github.com/numpy/numpy/pull/22285>`__: DOC: Add ``isupper`` examples
+* `#22291 <https://github.com/numpy/numpy/pull/22291>`__: DOC: Add examples for isdigit and str_len
+* `#22292 <https://github.com/numpy/numpy/pull/22292>`__: DOC: Add copyto example
+* `#22294 <https://github.com/numpy/numpy/pull/22294>`__: DOC: add examples to np.char.multiply
+* `#22295 <https://github.com/numpy/numpy/pull/22295>`__: DOC: Added an example for isupper() function
+* `#22296 <https://github.com/numpy/numpy/pull/22296>`__: BUG: Memory leaks in numpy.nested_iters
+* `#22297 <https://github.com/numpy/numpy/pull/22297>`__: DOC: Add example to np.prod
+* `#22298 <https://github.com/numpy/numpy/pull/22298>`__: DOC: add example for ma.unique function
+* `#22299 <https://github.com/numpy/numpy/pull/22299>`__: DOC: Add examples for join and index
+* `#22300 <https://github.com/numpy/numpy/pull/22300>`__: DOC: examples for ``np.char.isdecimal`` and ``np.char.isnumeric``
+* `#22302 <https://github.com/numpy/numpy/pull/22302>`__: DOC: Change in the documentation for chebpts2 method
+* `#22304 <https://github.com/numpy/numpy/pull/22304>`__: DOC: default_rng cleanup
+* `#22306 <https://github.com/numpy/numpy/pull/22306>`__: ENH: Implement essential intrinsics required by the upcoming...
+* `#22308 <https://github.com/numpy/numpy/pull/22308>`__: DOC: add examples to numpy.char.replace
+* `#22309 <https://github.com/numpy/numpy/pull/22309>`__: DOC: Correct usage example for np.char.decode docstring
+* `#22312 <https://github.com/numpy/numpy/pull/22312>`__: MAINT: Shorten test output on travis builds
+* `#22313 <https://github.com/numpy/numpy/pull/22313>`__: DEP: Deprecate fastCopyAndTranspose
+* `#22314 <https://github.com/numpy/numpy/pull/22314>`__: MAINT: Shorten wheel test output on travis builds
+* `#22316 <https://github.com/numpy/numpy/pull/22316>`__: ENH: Allow creating structured void scalars by passing dtype
+* `#22319 <https://github.com/numpy/numpy/pull/22319>`__: TST: add functional tests for kron
+* `#22321 <https://github.com/numpy/numpy/pull/22321>`__: MAINT: core: Fix a typo in a datetime error message.
+* `#22324 <https://github.com/numpy/numpy/pull/22324>`__: MAINT: update function's __module__ attribute in deprecate
+* `#22325 <https://github.com/numpy/numpy/pull/22325>`__: SIMD: Improve the performance of NEON vector initializer
+* `#22326 <https://github.com/numpy/numpy/pull/22326>`__: CI, SIMD: Test and build without the support of AVX2 and AVX512
+* `#22327 <https://github.com/numpy/numpy/pull/22327>`__: BUG: Fix complex vector dot with more than NPY_CBLAS_CHUNK elements
+* `#22331 <https://github.com/numpy/numpy/pull/22331>`__: DOC: Adding examples to ``ma.max`` function
+* `#22332 <https://github.com/numpy/numpy/pull/22332>`__: DOC: Minor typo in docs
+* `#22334 <https://github.com/numpy/numpy/pull/22334>`__: DOC: Added missing dtype attribute to ``iinfo`` and ``finfo`` docstring
+* `#22336 <https://github.com/numpy/numpy/pull/22336>`__: MAINT: Rm numpydoc remnant example docstring.
+* `#22342 <https://github.com/numpy/numpy/pull/22342>`__: MAINT: update sde toolkit to 9.0, fix download link
+* `#22343 <https://github.com/numpy/numpy/pull/22343>`__: DOC: fixed two more typos in docstrings
+* `#22344 <https://github.com/numpy/numpy/pull/22344>`__: DOC: fixed minor typo in percentile docstring
+* `#22354 <https://github.com/numpy/numpy/pull/22354>`__: MAINT: switch sponsor link from numfocus to opencollective
+* `#22356 <https://github.com/numpy/numpy/pull/22356>`__: REV: Loosen ``lookfor``'s import try/except again
+* `#22357 <https://github.com/numpy/numpy/pull/22357>`__: TYP,ENH: Mark ``numpy.typing`` protocols as runtime checkable
+* `#22358 <https://github.com/numpy/numpy/pull/22358>`__: ENH,TYP: Add special casing for ``ndarray``-based indexing
+* `#22359 <https://github.com/numpy/numpy/pull/22359>`__: TYP,MAINT: Change more overloads to play nice with pyright
+* `#22360 <https://github.com/numpy/numpy/pull/22360>`__: TST,TYP: Bump mypy to 0.981
+* `#22362 <https://github.com/numpy/numpy/pull/22362>`__: DOC: Updated amin/amax output dimensions for tuple axis
+* `#22363 <https://github.com/numpy/numpy/pull/22363>`__: DOC: Added examples to ``ma.min`` function
+* `#22365 <https://github.com/numpy/numpy/pull/22365>`__: BUG: Add ``__array_api_version__`` to ``numpy.array_api`` namespace
+* `#22367 <https://github.com/numpy/numpy/pull/22367>`__: BUILD: add permissions to github actions
+* `#22371 <https://github.com/numpy/numpy/pull/22371>`__: MAINT: remove permission restrictions for PR labeler [skip ci]
+* `#22372 <https://github.com/numpy/numpy/pull/22372>`__: DOC: Update delimiter param description.
+* `#22373 <https://github.com/numpy/numpy/pull/22373>`__: DOC, MAINT: Remove unused files
+* `#22374 <https://github.com/numpy/numpy/pull/22374>`__: DOC: typo additional colon in beginners tutorial
+* `#22375 <https://github.com/numpy/numpy/pull/22375>`__: DOC: How to partition domains
+* `#22379 <https://github.com/numpy/numpy/pull/22379>`__: ENH: allow explicit ``like=None`` in all array creation functions
+* `#22385 <https://github.com/numpy/numpy/pull/22385>`__: DEP: Deprecate conversion of out-of-bound Python integers
+* `#22393 <https://github.com/numpy/numpy/pull/22393>`__: MAINT: Ensure graceful handling of large header sizes
+* `#22398 <https://github.com/numpy/numpy/pull/22398>`__: DOC: Update broken link to diataxis framework.
+* `#22399 <https://github.com/numpy/numpy/pull/22399>`__: MAINT: Fix typos found by codespell
+* `#22401 <https://github.com/numpy/numpy/pull/22401>`__: MAINT: fix obsolete code comment
+* `#22404 <https://github.com/numpy/numpy/pull/22404>`__: DOC: added ``ma.round`` and ``ma.round_`` examples
+* `#22406 <https://github.com/numpy/numpy/pull/22406>`__: DOC: Add changelog for ``masked_invalid`` change.
+* `#22407 <https://github.com/numpy/numpy/pull/22407>`__: DOC: Fix title level for release note improvements
+* `#22409 <https://github.com/numpy/numpy/pull/22409>`__: DOC: Adopt a harmonic color scheme with the dark mode of pydata-sphinx-theme
+* `#22411 <https://github.com/numpy/numpy/pull/22411>`__: DOC: Remove documentation specific to Python 2
+* `#22418 <https://github.com/numpy/numpy/pull/22418>`__: TST, BLD: Fix failing aarch64 wheel builds.
+* `#22419 <https://github.com/numpy/numpy/pull/22419>`__: MAINT: Remove PyCObject from the SWIG interface
+* `#22421 <https://github.com/numpy/numpy/pull/22421>`__: DOC: Replace CObject with Capsule consistently
+* `#22422 <https://github.com/numpy/numpy/pull/22422>`__: ENH: Expose ``ufunc.resolve_dtypes`` and strided loop access
+* `#22430 <https://github.com/numpy/numpy/pull/22430>`__: MAINT: Update main after 1.23.4 release.
+* `#22432 <https://github.com/numpy/numpy/pull/22432>`__: MAINT: always use sys.base_prefix, never use sys.real_prefix
+* `#22433 <https://github.com/numpy/numpy/pull/22433>`__: DOC: remove reference to Python 2
+* `#22436 <https://github.com/numpy/numpy/pull/22436>`__: DOC: Clarify docstring of ``masked_equal`` and ``masked_values``
+* `#22438 <https://github.com/numpy/numpy/pull/22438>`__: MAINT: Update versioneer 0.19 → 0.26
+* `#22440 <https://github.com/numpy/numpy/pull/22440>`__: MAINT: fix typo in f2c_lapack.c
+* `#22441 <https://github.com/numpy/numpy/pull/22441>`__: MAINT,DOC: Revert "MAINT: fix typo in f2c_lapack.c"
+* `#22442 <https://github.com/numpy/numpy/pull/22442>`__: ENH: unstructured_to_structured converts dtype argument
+* `#22447 <https://github.com/numpy/numpy/pull/22447>`__: TYP: Spelling alignment for array flag literal
+* `#22450 <https://github.com/numpy/numpy/pull/22450>`__: BUG: Fix bounds checking for random.logseries
+* `#22452 <https://github.com/numpy/numpy/pull/22452>`__: DEV: Update GH actions and Dockerfile for Gitpod
+* `#22453 <https://github.com/numpy/numpy/pull/22453>`__: DOC: Update NEP 50 text since integer conversion errors now exist.
+* `#22456 <https://github.com/numpy/numpy/pull/22456>`__: DEP: Proposal to deprecate the ``msort`` convenience function
+* `#22458 <https://github.com/numpy/numpy/pull/22458>`__: ENH: Allow all allocated operands in nditer/NpyIter
+* `#22462 <https://github.com/numpy/numpy/pull/22462>`__: MAINT: refactor mandatory npymath functions to #define macros
+* `#22463 <https://github.com/numpy/numpy/pull/22463>`__: DOC: remove mention of ``ipython -p numpy``.
+* `#22466 <https://github.com/numpy/numpy/pull/22466>`__: MAINT: Ensure that fmin loops do not show up multiple times
+* `#22475 <https://github.com/numpy/numpy/pull/22475>`__: MAINT: remove code specific to Python 2
+* `#22478 <https://github.com/numpy/numpy/pull/22478>`__: BUG: -unsigned_int(0) no overflow warning
+* `#22479 <https://github.com/numpy/numpy/pull/22479>`__: MAINT: remove u-prefix for former Unicode strings
+* `#22480 <https://github.com/numpy/numpy/pull/22480>`__: DOC: fix typo in advanced_debugging.rst [skip ci]
+* `#22481 <https://github.com/numpy/numpy/pull/22481>`__: GitHub Workflows security hardening
+* `#22482 <https://github.com/numpy/numpy/pull/22482>`__: ENH: Add OpenSSF Scorecard GitHub Action
+* `#22483 <https://github.com/numpy/numpy/pull/22483>`__: MAINT: change subprocess arguments from Python>=3.7
+* `#22485 <https://github.com/numpy/numpy/pull/22485>`__: TST: Make test_partial_iteration_cleanup robust but require leak...
+* `#22487 <https://github.com/numpy/numpy/pull/22487>`__: TST, MAINT: Replace most setup with setup_method (also teardown)
+* `#22488 <https://github.com/numpy/numpy/pull/22488>`__: MAINT, CI: Switch to cygwin/cygwin-install-action@v2
+* `#22491 <https://github.com/numpy/numpy/pull/22491>`__: CI: Make cygwin build run for branches.
+* `#22496 <https://github.com/numpy/numpy/pull/22496>`__: MAINT: Use SPDX license expression in project metadata
+* `#22498 <https://github.com/numpy/numpy/pull/22498>`__: REL: readme in PyPI plus test
+* `#22500 <https://github.com/numpy/numpy/pull/22500>`__: DOC: added example in char
+* `#22503 <https://github.com/numpy/numpy/pull/22503>`__: CI: Only fetch in actions/checkout
+* `#22504 <https://github.com/numpy/numpy/pull/22504>`__: DOC: Add code-formatting on install instructions
+* `#22505 <https://github.com/numpy/numpy/pull/22505>`__: DOC: fixed minor typo in f2py docs
+* `#22510 <https://github.com/numpy/numpy/pull/22510>`__: MAINT: Ensure raw dlpack deleter works when called without the...
+* `#22519 <https://github.com/numpy/numpy/pull/22519>`__: DOC: fixed pad_width description in numpy.pad
+* `#22521 <https://github.com/numpy/numpy/pull/22521>`__: DOC: Remove "current" from byte-order note and expand it slightly
+* `#22524 <https://github.com/numpy/numpy/pull/22524>`__: MAINT, BLD: Wheel CI: Update cibuildwheel to 2.11.2
+* `#22525 <https://github.com/numpy/numpy/pull/22525>`__: BLD: update OpenBLAS to 0.3.21 and clean up openblas download...
+* `#22531 <https://github.com/numpy/numpy/pull/22531>`__: BLD, CI: Update setup-python
+* `#22538 <https://github.com/numpy/numpy/pull/22538>`__: DOC: update libnpymath docs on its status and how to consume...
+* `#22540 <https://github.com/numpy/numpy/pull/22540>`__: DEP: Expire deprecation of dtype/signature allowing instances
+* `#22541 <https://github.com/numpy/numpy/pull/22541>`__: DEP: Expire deprecation to ignore bad dtype= in logical ufuncs
+* `#22542 <https://github.com/numpy/numpy/pull/22542>`__: API: Always use BufferError when dlpack export fails
+* `#22543 <https://github.com/numpy/numpy/pull/22543>`__: DOC: Add instruction to initialize git submodules
+* `#22548 <https://github.com/numpy/numpy/pull/22548>`__: MAINT: Fix designator order not matching declaration order
+* `#22550 <https://github.com/numpy/numpy/pull/22550>`__: MAINT: Fix Fortran order flag use (using bool rather than enum)
+* `#22552 <https://github.com/numpy/numpy/pull/22552>`__: MAINT: Do not use temporary struct construct
+* `#22554 <https://github.com/numpy/numpy/pull/22554>`__: MAINT: Match arguments of constant in ``isless()``
+* `#22557 <https://github.com/numpy/numpy/pull/22557>`__: BUG: Decrement ref count in gentype_reduce if allocated memory...
+* `#22561 <https://github.com/numpy/numpy/pull/22561>`__: BUG: Histogramdd breaks on big arrays in Windows
+* `#22566 <https://github.com/numpy/numpy/pull/22566>`__: BUG: Fix use and errorchecking of ObjectType use
+* `#22567 <https://github.com/numpy/numpy/pull/22567>`__: CI: Add PR write permissions to artifact redirector.
+* `#22571 <https://github.com/numpy/numpy/pull/22571>`__: DOC: Mention numpy types in ``isnat`` error message
+* `#22576 <https://github.com/numpy/numpy/pull/22576>`__: BUG: fix issue with broken assert statement in ``templ_common.h.src``
+* `#22578 <https://github.com/numpy/numpy/pull/22578>`__: BLD: fix issue with header includes in dlpack.c
+* `#22579 <https://github.com/numpy/numpy/pull/22579>`__: MAINT: remove ``NPY_RESTRICT`` in favor of C99 ``restrict``
+* `#22580 <https://github.com/numpy/numpy/pull/22580>`__: BLD: remove unused ``ncola`` variables from lapack-lite
+* `#22583 <https://github.com/numpy/numpy/pull/22583>`__: MAINT: Patch to remove ncola variable from f2c_blas.c
+* `#22586 <https://github.com/numpy/numpy/pull/22586>`__: MAINT: Rm meaningless checks in determining typeless data
+* `#22587 <https://github.com/numpy/numpy/pull/22587>`__: TYP: Update type annotations for new 1.24 features
+* `#22588 <https://github.com/numpy/numpy/pull/22588>`__: DOC: Clarify relationship between row_stack and vstack.
+* `#22589 <https://github.com/numpy/numpy/pull/22589>`__: DOC: expand docs on debugging with gdb
+* `#22598 <https://github.com/numpy/numpy/pull/22598>`__: MAINT, CI: Update Ubuntu 18.04 to Ubuntu 20.04
+* `#22601 <https://github.com/numpy/numpy/pull/22601>`__: MAINT: Use C99 flexible struct construct for ``NpyIter_InternalOnly``
+* `#22605 <https://github.com/numpy/numpy/pull/22605>`__: MAINT: (array-coercion) Silence invalid read warning in some...
+* `#22607 <https://github.com/numpy/numpy/pull/22607>`__: DEP: Next step in scalar type alias deprecations/futurewarnings
+* `#22608 <https://github.com/numpy/numpy/pull/22608>`__: MAINT, CI: Enable coverage checking.
+* `#22612 <https://github.com/numpy/numpy/pull/22612>`__: BLD: update wheel builds on macos to macos-12 image
+* `#22614 <https://github.com/numpy/numpy/pull/22614>`__: MAINT: remove macOS specific long double handling in numpyconfig.h
+* `#22615 <https://github.com/numpy/numpy/pull/22615>`__: DOC: Rm ``round_`` from the autosummary for rounding
+* `#22616 <https://github.com/numpy/numpy/pull/22616>`__: TST: Rename setup to setup_method in _locales
+* `#22620 <https://github.com/numpy/numpy/pull/22620>`__: DOC: testing: Fix typo: nulps -> nulp
+* `#22628 <https://github.com/numpy/numpy/pull/22628>`__: DOC: Add example for np.ma.power
+* `#22629 <https://github.com/numpy/numpy/pull/22629>`__: MAINT: Update main after 1.23.5 release.
+* `#22630 <https://github.com/numpy/numpy/pull/22630>`__: BLD: Use cibuildwheel 2.9.0 for Python 3.8 aarch64 builds
+* `#22634 <https://github.com/numpy/numpy/pull/22634>`__: CI: Increase travis timeout to 20 minutes.
+* `#22636 <https://github.com/numpy/numpy/pull/22636>`__: CI: Increase travis timeout to 30 minutes (2).
+* `#22639 <https://github.com/numpy/numpy/pull/22639>`__: DOC: Remove traces of interrupt handling utilities
+* `#22662 <https://github.com/numpy/numpy/pull/22662>`__: rel: prepare for the numpy 1.24.0rc1 release.
+* `#22665 <https://github.com/numpy/numpy/pull/22665>`__: MAINT: Fix 1.24.0 release note markup.
+* `#22697 <https://github.com/numpy/numpy/pull/22697>`__: MAINT: Rename symbols in textreading/ that may clash when statically...
+* `#22698 <https://github.com/numpy/numpy/pull/22698>`__: MAINT: check user-defined dtype has an ensure_canonical implementation
+* `#22699 <https://github.com/numpy/numpy/pull/22699>`__: BUG: Ensure string aliases "int0", etc. remain valid for now
+* `#22700 <https://github.com/numpy/numpy/pull/22700>`__: BLD: revert function() -> #define for 3 npymath functions
+* `#22702 <https://github.com/numpy/numpy/pull/22702>`__: MAINT: pin ubuntu and python for emscripten
+* `#22716 <https://github.com/numpy/numpy/pull/22716>`__: BLD, CI: Use cirrus for building aarch64 wheels
+* `#22717 <https://github.com/numpy/numpy/pull/22717>`__: TST: Skip when numba/numpy compat issues cause SystemError
+* `#22719 <https://github.com/numpy/numpy/pull/22719>`__: CI: Make benchmark asv run quick to only check that benchmarks...
+* `#22729 <https://github.com/numpy/numpy/pull/22729>`__: REL: Prepare for the NumPy 1.24.0rc2 release.
+* `#22743 <https://github.com/numpy/numpy/pull/22743>`__: DOC: Fix release note link to out-of-bounds integer deprecation
+* `#22746 <https://github.com/numpy/numpy/pull/22746>`__: BUG: Fix some valgrind errors (and probably harmless warnings)
+* `#22748 <https://github.com/numpy/numpy/pull/22748>`__: BUG: ``keepdims=True`` is ignored if ``out`` is not ``None``...
+* `#22749 <https://github.com/numpy/numpy/pull/22749>`__: MAINT: check if PyArrayDTypeMeta_Spec->casts is set
+* `#22757 <https://github.com/numpy/numpy/pull/22757>`__: CI: fix CIRRUS_TAG check when tagging. Closes #22730.
+* `#22758 <https://github.com/numpy/numpy/pull/22758>`__: DOC: Some updates to the array_api compat document (#22747)
+* `#22759 <https://github.com/numpy/numpy/pull/22759>`__: BUG, SIMD: Fix rounding large numbers >= 2^52 on SSE2
+* `#22760 <https://github.com/numpy/numpy/pull/22760>`__: CI, SIMD: Add workflow to test default baseline features only
+* `#22761 <https://github.com/numpy/numpy/pull/22761>`__: BUG: Fix deepcopy cleanup on error
+* `#22793 <https://github.com/numpy/numpy/pull/22793>`__: BUG: Fix infinite recursion in longdouble/large integer scalar...
+* `#22795 <https://github.com/numpy/numpy/pull/22795>`__: BUG: Ensure arguments to ``npy_floatstatus_..._barrier()`` can...
+* `#22805 <https://github.com/numpy/numpy/pull/22805>`__: REV: revert change to ``numpyconfig.h`` for sizeof(type) hardcoding...
+* `#22815 <https://github.com/numpy/numpy/pull/22815>`__: BLD: use newer version of delocate
diff --git a/doc/changelog/1.24.1-changelog.rst b/doc/changelog/1.24.1-changelog.rst
new file mode 100644
index 000000000..5650438b6
--- /dev/null
+++ b/doc/changelog/1.24.1-changelog.rst
@@ -0,0 +1,43 @@
+
+Contributors
+============
+
+A total of 12 people contributed to this release. People with a "+" by their
+names contributed a patch for the first time.
+
+* Andrew Nelson
+* Ben Greiner +
+* Charles Harris
+* Clément Robert
+* Matteo Raso
+* Matti Picus
+* Melissa Weber Mendonça
+* Miles Cranmer
+* Ralf Gommers
+* Rohit Goswami
+* Sayed Adel
+* Sebastian Berg
+
+Pull requests merged
+====================
+
+A total of 18 pull requests were merged for this release.
+
+* `#22820 <https://github.com/numpy/numpy/pull/22820>`__: BLD: add workaround in setup.py for newer setuptools
+* `#22830 <https://github.com/numpy/numpy/pull/22830>`__: BLD: CIRRUS_TAG redux
+* `#22831 <https://github.com/numpy/numpy/pull/22831>`__: DOC: fix a couple typos in 1.23 notes
+* `#22832 <https://github.com/numpy/numpy/pull/22832>`__: BUG: Fix refcounting errors found using pytest-leaks
+* `#22834 <https://github.com/numpy/numpy/pull/22834>`__: BUG, SIMD: Fix invalid value encountered in several ufuncs
+* `#22837 <https://github.com/numpy/numpy/pull/22837>`__: TST: ignore more np.distutils.log imports
+* `#22839 <https://github.com/numpy/numpy/pull/22839>`__: BUG: Do not use getdata() in np.ma.masked_invalid
+* `#22847 <https://github.com/numpy/numpy/pull/22847>`__: BUG: Ensure correct behavior for rows ending in delimiter in...
+* `#22848 <https://github.com/numpy/numpy/pull/22848>`__: BUG, SIMD: Fix the bitmask of the boolean comparison
+* `#22857 <https://github.com/numpy/numpy/pull/22857>`__: BLD: Help raspian arm + clang 13 about __builtin_mul_overflow
+* `#22858 <https://github.com/numpy/numpy/pull/22858>`__: API: Ensure a full mask is returned for masked_invalid
+* `#22866 <https://github.com/numpy/numpy/pull/22866>`__: BUG: Polynomials now copy properly (#22669)
+* `#22867 <https://github.com/numpy/numpy/pull/22867>`__: BUG, SIMD: Fix memory overlap in ufunc comparison loops
+* `#22868 <https://github.com/numpy/numpy/pull/22868>`__: BUG: Fortify string casts against floating point warnings
+* `#22875 <https://github.com/numpy/numpy/pull/22875>`__: TST: Ignore nan-warnings in randomized out tests
+* `#22883 <https://github.com/numpy/numpy/pull/22883>`__: MAINT: restore npymath implementations needed for freebsd
+* `#22884 <https://github.com/numpy/numpy/pull/22884>`__: BUG: Fix integer overflow in in1d for mixed integer dtypes #22877
+* `#22887 <https://github.com/numpy/numpy/pull/22887>`__: BUG: Use whole file for encoding checks with ``charset_normalizer``.
diff --git a/doc/changelog/1.24.2-changelog.rst b/doc/changelog/1.24.2-changelog.rst
new file mode 100644
index 000000000..399092eb0
--- /dev/null
+++ b/doc/changelog/1.24.2-changelog.rst
@@ -0,0 +1,44 @@
+
+Contributors
+============
+
+A total of 14 people contributed to this release. People with a "+" by their
+names contributed a patch for the first time.
+
+* Bas van Beek
+* Charles Harris
+* Khem Raj +
+* Mark Harfouche
+* Matti Picus
+* Panagiotis Zestanakis +
+* Peter Hawkins
+* Pradipta Ghosh
+* Ross Barnowski
+* Sayed Adel
+* Sebastian Berg
+* Syam Gadde +
+* dmbelov +
+* pkubaj +
+
+Pull requests merged
+====================
+
+A total of 17 pull requests were merged for this release.
+
+* `#22965 <https://github.com/numpy/numpy/pull/22965>`__: MAINT: Update python 3.11-dev to 3.11.
+* `#22966 <https://github.com/numpy/numpy/pull/22966>`__: DOC: Remove dangling deprecation warning
+* `#22967 <https://github.com/numpy/numpy/pull/22967>`__: ENH: Detect CPU features on FreeBSD/powerpc64*
+* `#22968 <https://github.com/numpy/numpy/pull/22968>`__: BUG: np.loadtxt cannot load text file with quoted fields separated...
+* `#22969 <https://github.com/numpy/numpy/pull/22969>`__: TST: Add fixture to avoid issue with randomizing test order.
+* `#22970 <https://github.com/numpy/numpy/pull/22970>`__: BUG: Fix fill violating read-only flag. (#22959)
+* `#22971 <https://github.com/numpy/numpy/pull/22971>`__: MAINT: Add additional information to missing scalar AttributeError
+* `#22972 <https://github.com/numpy/numpy/pull/22972>`__: MAINT: Move export for scipy arm64 helper into main module
+* `#22976 <https://github.com/numpy/numpy/pull/22976>`__: BUG, SIMD: Fix spurious invalid exception for sin/cos on arm64/clang
+* `#22989 <https://github.com/numpy/numpy/pull/22989>`__: BUG: Ensure correct loop order in sin, cos, and arctan2
+* `#23030 <https://github.com/numpy/numpy/pull/23030>`__: DOC: Add version added information for the strict parameter in...
+* `#23031 <https://github.com/numpy/numpy/pull/23031>`__: BUG: use ``_Alignof`` rather than ``offsetof()`` on most compilers
+* `#23147 <https://github.com/numpy/numpy/pull/23147>`__: BUG: Fix for npyv__trunc_s32_f32 (VXE)
+* `#23148 <https://github.com/numpy/numpy/pull/23148>`__: BUG: Fix integer / float scalar promotion
+* `#23149 <https://github.com/numpy/numpy/pull/23149>`__: BUG: Add missing <type_traits> header.
+* `#23150 <https://github.com/numpy/numpy/pull/23150>`__: TYP, MAINT: Add a missing explicit ``Any`` parameter to the ``npt.ArrayLike``...
+* `#23161 <https://github.com/numpy/numpy/pull/23161>`__: BLD: remove redundant definition of npy_nextafter [wheel build]
diff --git a/doc/changelog/1.24.3-changelog.rst b/doc/changelog/1.24.3-changelog.rst
new file mode 100644
index 000000000..ccfdfe8f0
--- /dev/null
+++ b/doc/changelog/1.24.3-changelog.rst
@@ -0,0 +1,42 @@
+
+Contributors
+============
+
+A total of 12 people contributed to this release. People with a "+" by their
+names contributed a patch for the first time.
+
+* Aleksei Nikiforov +
+* Alexander Heger
+* Bas van Beek
+* Bob Eldering
+* Brock Mendel
+* Charles Harris
+* Kyle Sunden
+* Peter Hawkins
+* Rohit Goswami
+* Sebastian Berg
+* Warren Weckesser
+* dependabot[bot]
+
+Pull requests merged
+====================
+
+A total of 17 pull requests were merged for this release.
+
+* `#23206 <https://github.com/numpy/numpy/pull/23206>`__: BUG: fix for f2py string scalars (#23194)
+* `#23207 <https://github.com/numpy/numpy/pull/23207>`__: BUG: datetime64/timedelta64 comparisons return NotImplemented
+* `#23208 <https://github.com/numpy/numpy/pull/23208>`__: MAINT: Pin matplotlib to version 3.6.3 for refguide checks
+* `#23221 <https://github.com/numpy/numpy/pull/23221>`__: DOC: Fix matplotlib error in documentation
+* `#23226 <https://github.com/numpy/numpy/pull/23226>`__: CI: Ensure submodules are initialized in gitpod.
+* `#23341 <https://github.com/numpy/numpy/pull/23341>`__: TYP: Replace duplicate reduce in ufunc type signature with reduceat.
+* `#23342 <https://github.com/numpy/numpy/pull/23342>`__: TYP: Remove duplicate CLIP/WRAP/RAISE in ``__init__.pyi``.
+* `#23343 <https://github.com/numpy/numpy/pull/23343>`__: TYP: Mark ``d`` argument to fftfreq and rfftfreq as optional...
+* `#23344 <https://github.com/numpy/numpy/pull/23344>`__: TYP: Add type annotations for comparison operators to MaskedArray.
+* `#23345 <https://github.com/numpy/numpy/pull/23345>`__: TYP: Remove some stray type-check-only imports of ``msort``
+* `#23370 <https://github.com/numpy/numpy/pull/23370>`__: BUG: Ensure like is only stripped for ``like=`` dispatched functions
+* `#23543 <https://github.com/numpy/numpy/pull/23543>`__: BUG: fix loading and storing big arrays on s390x
+* `#23544 <https://github.com/numpy/numpy/pull/23544>`__: MAINT: Bump larsoner/circleci-artifacts-redirector-action
+* `#23634 <https://github.com/numpy/numpy/pull/23634>`__: BUG: Ignore invalid and overflow warnings in masked setitem
+* `#23635 <https://github.com/numpy/numpy/pull/23635>`__: BUG: Fix masked array raveling when ``order="A"`` or ``order="K"``
+* `#23636 <https://github.com/numpy/numpy/pull/23636>`__: MAINT: Update conftest for newer hypothesis versions
+* `#23637 <https://github.com/numpy/numpy/pull/23637>`__: BUG: Fix bug in parsing F77 style string arrays.
diff --git a/doc/make.bat b/doc/make.bat
new file mode 100644
index 000000000..974d5c3d8
--- /dev/null
+++ b/doc/make.bat
@@ -0,0 +1,74 @@
+@echo off
+
+pushd %~dp0
+
+:: Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=LANG=C sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+if defined SPHINXOPTS goto skipopts
+set SPHINXOPTS=-W --keep-going -d build/doctrees %SPHINXOPTS% source
+set DOXYGEN=doxygen
+set FILES=
+:skipopts
+
+if "%1" == "" goto help
+if "%1" == "clean" goto clean
+if "%1" == "docenv" goto docenv
+if "%1" == "html" goto html
+if "%1" == "linkcheck" goto linkcheck
+if "%1" == "show" goto show
+
+:help
+ echo.
+ echo Please use "make.bat <target>" where ^<target^> is one of
+ echo.
+ echo clean to remove generated doc files and start fresh
+ echo docenv make a virtual environment in which to build docs
+ echo html to make standalone HTML files
+ echo linkcheck to check all external links for integrity
+ echo show to show the html output in a browser
+goto end
+
+:clean
+if exist "%SOURCEDIR%\build\" (
+ rmdir /s /q "%SOURCEDIR%\build"
+ :: TODO
+ :: find . -name generated -type d -prune -exec rm -rf "{}" ";"
+)
+goto end
+
+:docenv
+echo Not implemented
+Rem python -mvenv docenv
+Rem ( \
+Rem . docenv/bin/activate; \
+Rem pip install -q --upgrade pip; \
+Rem pip install -q -r ../test_requirements.txt; \
+Rem pip install -q -r ../doc_requirements.txt; \
+Rem pip install -q ..; \
+Rem )
+goto end
+
+:html
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:linkcheck
+ md build
+ md build\linkcheck
+ md build\doctrees
+ %SPHINXBUILD% -b linkcheck %SOURCEDIR% build\linkcheck
+ echo.
+ echo Link check complete; look for any errors in the above output
+ echo or in build\linkcheck\output.txt.
+goto end
+
+:show
+python -m webbrowser -t "%~dp0\build\html\index.html"
+
+:end
+popd
diff --git a/doc/neps/nep-0013-ufunc-overrides.rst b/doc/neps/nep-0013-ufunc-overrides.rst
index c132113db..d69af6924 100644
--- a/doc/neps/nep-0013-ufunc-overrides.rst
+++ b/doc/neps/nep-0013-ufunc-overrides.rst
@@ -20,6 +20,8 @@ NEP 13 — A mechanism for overriding Ufuncs
:Date: 2017-03-31
:Status: Final
+:Updated: 2023-02-19
+:Author: Roy Smart
Executive summary
=================
@@ -173,12 +175,12 @@ where in all current cases only a single output makes sense).
The function dispatch proceeds as follows:
-- If one of the input or output arguments implements
+- If one of the input, output, or ``where`` arguments implements
``__array_ufunc__``, it is executed instead of the ufunc.
- If more than one of the arguments implements ``__array_ufunc__``,
they are tried in the following order: subclasses before superclasses,
- inputs before outputs, otherwise left to right.
+ inputs before outputs, outputs before ``where``, otherwise left to right.
- The first ``__array_ufunc__`` method returning something else than
:obj:`NotImplemented` determines the return value of the Ufunc.
@@ -326,7 +328,10 @@ equivalent to::
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
# Cannot handle items that have __array_ufunc__ (other than our own).
outputs = kwargs.get('out', ())
- for item in inputs + outputs:
+ objs = inputs + outputs
+ if "where" in kwargs:
+ objs = objs + (kwargs["where"], )
+ for item in objs:
if (hasattr(item, '__array_ufunc__') and
type(item).__array_ufunc__ is not ndarray.__array_ufunc__):
return NotImplemented
diff --git a/doc/neps/nep-0019-rng-policy.rst b/doc/neps/nep-0019-rng-policy.rst
index c5c46603b..44033357a 100644
--- a/doc/neps/nep-0019-rng-policy.rst
+++ b/doc/neps/nep-0019-rng-policy.rst
@@ -17,7 +17,7 @@ Abstract
For the past decade, NumPy has had a strict backwards compatibility policy for
the number stream of all of its random number distributions. Unlike other
numerical components in ``numpy``, which are usually allowed to return
-different when results when they are modified if they remain correct, we have
+different results when modified if the results remain correct, we have
obligated the random number distributions to always produce the exact same
numbers in every version. The objective of our stream-compatibility guarantee
was to provide exact reproducibility for simulations across numpy versions in
diff --git a/doc/neps/nep-0021-advanced-indexing.rst b/doc/neps/nep-0021-advanced-indexing.rst
index 7751d309b..4c7a16375 100644
--- a/doc/neps/nep-0021-advanced-indexing.rst
+++ b/doc/neps/nep-0021-advanced-indexing.rst
@@ -143,7 +143,7 @@ exactly.
Vectorized indexing in particular can be challenging to implement with array
storage backends not based on NumPy. In contrast, indexing by 1D arrays along
-at least one dimension in the style of outer indexing is much more acheivable.
+at least one dimension in the style of outer indexing is much more achievable.
This has led many libraries (including dask and h5py) to attempt to define a
safe subset of NumPy-style indexing that is equivalent to outer indexing, e.g.,
by only allowing indexing with an array along at most one dimension. However,
diff --git a/doc/neps/nep-0029-deprecation_policy.rst b/doc/neps/nep-0029-deprecation_policy.rst
index 4eb2aa50f..ccb566227 100644
--- a/doc/neps/nep-0029-deprecation_policy.rst
+++ b/doc/neps/nep-0029-deprecation_policy.rst
@@ -127,7 +127,10 @@ Apr 14, 2023 3.9+ 1.21+
Jun 23, 2023 3.9+ 1.22+
Jan 01, 2024 3.9+ 1.23+
Apr 05, 2024 3.10+ 1.23+
-Apr 04, 2025 3.11+ 1.23+
+Jun 22, 2024 3.10+ 1.24+
+Dec 18, 2024 3.10+ 1.25+
+Apr 04, 2025 3.11+ 1.25+
+Apr 24, 2026 3.12+ 1.25+
============ ====== =====
@@ -150,7 +153,10 @@ Drop Schedule
On Jun 23, 2023 drop support for NumPy 1.21 (initially released on Jun 22, 2021)
On Jan 01, 2024 drop support for NumPy 1.22 (initially released on Dec 31, 2021)
On Apr 05, 2024 drop support for Python 3.9 (initially released on Oct 05, 2020)
+ On Jun 22, 2024 drop support for NumPy 1.23 (initially released on Jun 22, 2022)
+ On Dec 18, 2024 drop support for NumPy 1.24 (initially released on Dec 18, 2022)
On Apr 04, 2025 drop support for Python 3.10 (initially released on Oct 04, 2021)
+ On Apr 24, 2026 drop support for Python 3.11 (initially released on Oct 24, 2022)
Implementation
@@ -284,6 +290,9 @@ Code to generate support and drop schedule tables ::
Jun 22, 2021: NumPy 1.21
Oct 04, 2021: Python 3.10
Dec 31, 2021: NumPy 1.22
+ Jun 22, 2022: NumPy 1.23
+ Oct 24, 2022: Python 3.11
+ Dec 18, 2022: NumPy 1.24
"""
releases = []
diff --git a/doc/neps/nep-0046-sponsorship-guidelines.rst b/doc/neps/nep-0046-sponsorship-guidelines.rst
index ed54c21d5..746a73f5d 100644
--- a/doc/neps/nep-0046-sponsorship-guidelines.rst
+++ b/doc/neps/nep-0046-sponsorship-guidelines.rst
@@ -108,11 +108,12 @@ About page) will be added to acknowledge all current and previous sponsors,
partners, and any other entities and individuals who provided $5,000 or more of
financial or in-kind support. This page will include relevant details of
support (dates, amounts, names, and purpose); no logos will be used on this
-page. The rationale for the $5,000 minimum level is to keep the amount of work
-maintaining the page reasonable; the level is the equivalent of, e.g., one GSoC
-or a person-week's worth of engineering time in a Western country, which seems
-like a reasonable lower limit.
-
+page. Such support, if provided for a specific enhancement or fix, may be
+acknowledged in the appropriate release note snippet. The rationale for the
+$5,000 minimum level is to keep the amount of work maintaining the page
+reasonable; the level is the equivalent of, e.g., one GSoC or a person-week's
+worth of engineering time in a Western country, which seems like a reasonable
+lower limit.
Implementation
--------------
@@ -128,7 +129,6 @@ The following content changes need to be made:
- Update https://numpy.org/about with details on how to get in touch with the
NumPy project about sponsorship related matters (see next section).
-
NumPy Funding Team
~~~~~~~~~~~~~~~~~~
diff --git a/doc/neps/nep-0051-scalar-representation.rst b/doc/neps/nep-0051-scalar-representation.rst
index 851f173de..1e6f27a5f 100644
--- a/doc/neps/nep-0051-scalar-representation.rst
+++ b/doc/neps/nep-0051-scalar-representation.rst
@@ -70,7 +70,7 @@ Even ``np.float64``, which is very similar to Python's ``float`` and inherits
from it, does behave differently for example when dividing by zero.
Another common source of confusion are the NumPy booleans. Python programmers
-somtimes write ``obj is True`` and will surprised when an object that shows
+sometimes write ``obj is True`` and will be surprised when an object that shows
as ``True`` fails to pass the test.
It is much easier to understand this behavior when the value is
shown as ``np.True_``.
@@ -96,7 +96,7 @@ Jupyter notebook cells will often show the type information intact.
np.longdouble('3.0')
-to allow round-tripping. Addtionally to this change, ``float128`` will
+to allow round-tripping. Additionally to this change, ``float128`` will
now always be printed as ``longdouble`` since the old name gives a false
impression of precision.
@@ -137,7 +137,7 @@ The proposal is to change the default (back) to use ``str`` rather than
Detailed description
====================
-This NEP proposes to change the represenatation for NumPy scalars to:
+This NEP proposes to change the representation for NumPy scalars to:
* ``np.True_`` and ``np.False_`` for booleans (their singleton instances)
* ``np.scalar(<value>)``, i.e. ``np.float64(3.0)`` for all numerical dtypes.
@@ -202,8 +202,8 @@ to always print as ``longdouble`` and never ``float128`` or ``float96``.
It does not include deprecating the ``np.float128`` alias.
However, such a deprecation may occur independently of the NEP.
-Integer scalar type name and instance represenatation
------------------------------------------------------
+Integer scalar type name and instance representation
+----------------------------------------------------
One detail is that due to NumPy scalar types being based on the C types,
NumPy sometimes distinguishes them, for example on most 64 bit systems
@@ -263,7 +263,7 @@ allowing customized formatting for all DTypes.
Making ``get_formatter`` public allows it to be used for ``np.record`` and
masked arrays.
-Currenlty, the formatters themselves seem semi-public; using a single
+Currently, the formatters themselves seem semi-public; using a single
entry-point will hopefully provide a clear API for formatting NumPy values.
The large part for the scalar representation changes had previously been done
diff --git a/doc/postprocess.py b/doc/postprocess.py
index 3e066d22e..4b48fa443 100755
--- a/doc/postprocess.py
+++ b/doc/postprocess.py
@@ -2,7 +2,7 @@
"""
Post-processes HTML and Latex files output by Sphinx.
"""
-import io
+
def main():
import argparse
@@ -15,13 +15,13 @@ def main():
mode = args.mode
for fn in args.file:
- with io.open(fn, 'r', encoding="utf-8") as f:
+ with open(fn, encoding="utf-8") as f:
if mode == 'html':
lines = process_html(fn, f.readlines())
elif mode == 'tex':
lines = process_tex(f.readlines())
- with io.open(fn, 'w', encoding="utf-8") as f:
+ with open(fn, 'w', encoding="utf-8") as f:
f.write("".join(lines))
def process_html(fn, lines):
diff --git a/doc/preprocess.py b/doc/preprocess.py
index 870d3e123..83980bb2f 100755
--- a/doc/preprocess.py
+++ b/doc/preprocess.py
@@ -32,7 +32,7 @@ def doxy_config(root_path):
confs = []
dsrc_path = os.path.join(root_path, "doc", "source")
sub = dict(ROOT_DIR=root_path)
- with open(os.path.join(dsrc_path, "doxyfile"), "r") as fd:
+ with open(os.path.join(dsrc_path, "doxyfile")) as fd:
conf = DoxyTpl(fd.read())
confs.append(conf.substitute(CUR_DIR=dsrc_path, **sub))
@@ -40,7 +40,7 @@ def doxy_config(root_path):
if ".doxyfile" not in files:
continue
conf_path = os.path.join(dpath, ".doxyfile")
- with open(conf_path, "r") as fd:
+ with open(conf_path) as fd:
conf = DoxyTpl(fd.read())
confs.append(conf.substitute(CUR_DIR=dpath, **sub))
return confs
diff --git a/doc/records.rst b/doc/records.rst
index 3c0d55216..2e9b1251a 100644
--- a/doc/records.rst
+++ b/doc/records.rst
@@ -74,7 +74,7 @@ New possibilities for the "data-type"
Reference counting for ``PyArray_Descr *`` objects.
```````````````````````````````````````````````````
-Most functions that take ``PyArary_Descr *`` as arguments and return a
+Most functions that take ``PyArray_Descr *`` as arguments and return a
``PyObject *`` steal the reference unless otherwise noted in the code:
Functions that return ``PyArray_Descr *`` objects return a new
diff --git a/doc/release/numpy2_changes/23089.change.rst b/doc/release/numpy2_changes/23089.change.rst
new file mode 100644
index 000000000..fbd87e0aa
--- /dev/null
+++ b/doc/release/numpy2_changes/23089.change.rst
@@ -0,0 +1,2 @@
+* ``np.gradient()`` now returns a tuple rather than list making the
+ return value immutable.
diff --git a/doc/release/numpy2_changes/README.md b/doc/release/numpy2_changes/README.md
new file mode 100644
index 000000000..fb59749a9
--- /dev/null
+++ b/doc/release/numpy2_changes/README.md
@@ -0,0 +1,4 @@
+Same as ``../upcoming_changes`` but for changes that currently
+are opt-in behind ``NPY_NUMPY_2_BEHAVIOR=1``.
+
+This is to get a start on release notes for such changes.
diff --git a/doc/release/upcoming_changes/10615.deprecation.rst b/doc/release/upcoming_changes/10615.deprecation.rst
new file mode 100644
index 000000000..7fa948ea8
--- /dev/null
+++ b/doc/release/upcoming_changes/10615.deprecation.rst
@@ -0,0 +1,14 @@
+Only ndim-0 arrays are treated as scalars
+-----------------------------------------
+NumPy used to treat all arrays of size 1 (e.g., ``np.array([3.14])``) as scalars.
+In the future, this will be limited to arrays of ndim 0 (e.g., ``np.array(3.14)``).
+The following expressions will report a deprecation warning:
+
+.. code-block:: python
+
+ a = np.array([3.14])
+ float(a) # better: a[0] to get the numpy.float or a.item()
+
+ b = np.array([[3.14]])
+ c = numpy.random.rand(10)
+ c[0] = b # better: c[0] = b[0, 0]
diff --git a/doc/release/upcoming_changes/18053.new_feature.rst b/doc/release/upcoming_changes/18053.new_feature.rst
new file mode 100644
index 000000000..fea04f79a
--- /dev/null
+++ b/doc/release/upcoming_changes/18053.new_feature.rst
@@ -0,0 +1,4 @@
+``np.einsum`` now accepts arrays with ``object`` dtype
+------------------------------------------------------
+The code path will call python operators on object dtype arrays, much
+like ``np.dot`` and ``np.matmul``.
diff --git a/doc/release/upcoming_changes/21120.new_feature.rst b/doc/release/upcoming_changes/21120.new_feature.rst
new file mode 100644
index 000000000..7d4dbf743
--- /dev/null
+++ b/doc/release/upcoming_changes/21120.new_feature.rst
@@ -0,0 +1,21 @@
+Add support for inplace matrix multiplication
+----------------------------------------------
+It is now possible to perform inplace matrix multiplication
+via the ``@=`` operator.
+
+.. code-block:: python
+
+ >>> import numpy as np
+
+ >>> a = np.arange(6).reshape(3, 2)
+ >>> print(a)
+ [[0 1]
+ [2 3]
+ [4 5]]
+
+ >>> b = np.ones((2, 2), dtype=int)
+ >>> a @= b
+ >>> print(a)
+ [[1 1]
+ [5 5]
+ [9 9]]
diff --git a/doc/release/upcoming_changes/22315.performance.rst b/doc/release/upcoming_changes/22315.performance.rst
new file mode 100644
index 000000000..a2f623e5a
--- /dev/null
+++ b/doc/release/upcoming_changes/22315.performance.rst
@@ -0,0 +1,7 @@
+Faster ``np.sort`` on AVX-512 enabled processors
+------------------------------------------------
+Quicksort for 16-bit and 64-bit dtypes gain up to 15x and 9x speed up on
+processors that support AVX-512 instruction set.
+
+Thanks to `Intel corporation <https://open.intel.com/>`_ for sponsoring this
+work.
diff --git a/doc/release/upcoming_changes/22539.change.rst b/doc/release/upcoming_changes/22539.change.rst
new file mode 100644
index 000000000..9df62be30
--- /dev/null
+++ b/doc/release/upcoming_changes/22539.change.rst
@@ -0,0 +1,19 @@
+``np.r_[]`` and ``np.c_[]`` with certain scalar values
+------------------------------------------------------
+In rare cases, using mainly ``np.r_`` with scalars can lead to different
+results. The main potential changes are highlighted by the following::
+
+ >>> np.r_[np.arange(5, dtype=np.uint8), -1].dtype
+ int16 # rather than the default integer (int64 or int32)
+ >>> np.r_[np.arange(5, dtype=np.int8), 255]
+ array([ 0, 1, 2, 3, 4, 255], dtype=int16)
+
+Where the second example returned::
+
+ array([ 0, 1, 2, 3, 4, -1], dtype=int8)
+
+The first one is due to a signed integer scalar with an unsigned integer
+array, while the second is due to ``255`` not fitting into ``int8`` and
+NumPy currently inspecting values to make this work.
+(Note that the second example is expected to change in the future due to
+:ref:`NEP 50 <NEP50>`; it will then raise an error.)
diff --git a/doc/release/upcoming_changes/22539.deprecation.rst b/doc/release/upcoming_changes/22539.deprecation.rst
new file mode 100644
index 000000000..a30434b7e
--- /dev/null
+++ b/doc/release/upcoming_changes/22539.deprecation.rst
@@ -0,0 +1,29 @@
+``np.find_common_type`` is deprecated
+-------------------------------------
+`numpy.find_common_type` is now deprecated and its use should be replaced
+with either `numpy.result_type` or `numpy.promote_types`.
+Most users leave the second ``scalar_types`` argument to ``find_common_type``
+as ``[]`` in which case ``np.result_type`` and ``np.promote_types`` are both
+faster and more robust.
+When not using ``scalar_types`` the main difference is that the replacement
+intentionally converts non-native byte-order to native byte order.
+Further, ``find_common_type`` returns ``object`` dtype rather than failing
+promotion. This leads to differences when the inputs are not all numeric.
+Importantly, this also happens for e.g. timedelta/datetime for which NumPy
+promotion rules are currently sometimes surprising.
+
+When the ``scalar_types`` argument is not ``[]`` things are more complicated.
+In most cases, using ``np.result_type`` and passing the Python values
+``0``, ``0.0``, or ``0j`` has the same result as using ``int``, ``float``,
+or ``complex`` in `scalar_types`.
+
+When ``scalar_types`` is constructed, ``np.result_type`` is the
+correct replacement and it may be passed scalar values like ``np.float32(0.0)``.
+Passing values other than 0, may lead to value-inspecting behavior
+(which ``np.find_common_type`` never used and NEP 50 may change in the future).
+The main possible change in behavior in this case, is when the array types
+are signed integers and scalar types are unsigned.
+
+If you are unsure about how to replace a use of ``scalar_types`` or when
+non-numeric dtypes are likely, please do not hesitate to open a NumPy issue
+to ask for help.
diff --git a/doc/release/upcoming_changes/22769.improvement.rst b/doc/release/upcoming_changes/22769.improvement.rst
new file mode 100644
index 000000000..3566648ac
--- /dev/null
+++ b/doc/release/upcoming_changes/22769.improvement.rst
@@ -0,0 +1,5 @@
+`np.show_config` uses information from Meson
+--------------------------------------------
+Build and system information now contains information from Meson.
+`np.show_config` now has a new optional parameter ``mode`` to help
+customize the output.
diff --git a/doc/release/upcoming_changes/22776.improvement.rst b/doc/release/upcoming_changes/22776.improvement.rst
new file mode 100644
index 000000000..669fcff16
--- /dev/null
+++ b/doc/release/upcoming_changes/22776.improvement.rst
@@ -0,0 +1,6 @@
+Fix ``np.ma.diff`` not preserving the mask when called with arguments prepend/append.
+-------------------------------------------------------------------------------------
+Calling ``np.ma.diff`` with arguments prepend and/or append now returns a
+``MaskedArray`` with the input mask preserved.
+
+Previously, a ``MaskedArray`` without the mask was returned. \ No newline at end of file
diff --git a/doc/release/upcoming_changes/22863.new_feature.rst b/doc/release/upcoming_changes/22863.new_feature.rst
new file mode 100644
index 000000000..3f45ed834
--- /dev/null
+++ b/doc/release/upcoming_changes/22863.new_feature.rst
@@ -0,0 +1,4 @@
+String functions in np.char are compatible with NEP 42 custom dtypes
+--------------------------------------------------------------------
+Custom dtypes that represent unicode strings or byte strings can now be
+passed to the string functions in np.char.
diff --git a/doc/release/upcoming_changes/22963.new_feature.rst b/doc/release/upcoming_changes/22963.new_feature.rst
new file mode 100644
index 000000000..88ec3f641
--- /dev/null
+++ b/doc/release/upcoming_changes/22963.new_feature.rst
@@ -0,0 +1,7 @@
+String dtype instances can be created from the string abstract dtype classes
+----------------------------------------------------------------------------
+It is now possible to create a string dtype instance with a size without
+using the string name of the dtype. For example, ``type(np.dtype('U'))(8)``
+will create a dtype that is equivalent to ``np.dtype('U8')``. This feature
+is most useful when writing generic code dealing with string dtype
+classes.
diff --git a/doc/release/upcoming_changes/22982.new_feature.rst b/doc/release/upcoming_changes/22982.new_feature.rst
new file mode 100644
index 000000000..c98f2791c
--- /dev/null
+++ b/doc/release/upcoming_changes/22982.new_feature.rst
@@ -0,0 +1,13 @@
+Fujitsu C/C++ compiler is now supported
+----------------------------------------------
+Support for Fujitsu compiler has been added.
+To build with Fujitsu compiler, run:
+
+ python setup.py build -c fujitsu
+
+
+SSL2 is now supported
+-----------------------------------
+Support for SSL2 has been added. SSL2 is a library that provides OpenBLAS compatible GEMM functions.
+To enable SSL2, you need to edit site.cfg and build with the Fujitsu compiler.
+See site.cfg.example.
diff --git a/doc/release/upcoming_changes/22997.improvement.rst b/doc/release/upcoming_changes/22997.improvement.rst
new file mode 100644
index 000000000..156c9dece
--- /dev/null
+++ b/doc/release/upcoming_changes/22997.improvement.rst
@@ -0,0 +1,5 @@
+Corrected error handling for NumPy C-API in Cython
+--------------------------------------------------
+Many NumPy C functions defined for use in Cython were lacking the
+correct error indicator like ``except -1`` or ``except *``.
+These have now been added.
diff --git a/doc/release/upcoming_changes/22998.expired.rst b/doc/release/upcoming_changes/22998.expired.rst
new file mode 100644
index 000000000..a4399b639
--- /dev/null
+++ b/doc/release/upcoming_changes/22998.expired.rst
@@ -0,0 +1,2 @@
+* ``+arr`` will now raise an error when the dtype is not
+ numeric (and positive is undefined).
diff --git a/doc/release/upcoming_changes/23011.deprecation.rst b/doc/release/upcoming_changes/23011.deprecation.rst
new file mode 100644
index 000000000..72168ff87
--- /dev/null
+++ b/doc/release/upcoming_changes/23011.deprecation.rst
@@ -0,0 +1 @@
+* ``np.finfo(None)`` is deprecated.
diff --git a/doc/release/upcoming_changes/23019.expired.rst b/doc/release/upcoming_changes/23019.expired.rst
new file mode 100644
index 000000000..0879c2658
--- /dev/null
+++ b/doc/release/upcoming_changes/23019.expired.rst
@@ -0,0 +1,2 @@
+* A sequence must now be passed into the stacking family of functions
+ (``stack``, ``vstack``, ``hstack``, ``dstack`` and ``column_stack``).
diff --git a/doc/release/upcoming_changes/23020.change.rst b/doc/release/upcoming_changes/23020.change.rst
new file mode 100644
index 000000000..b4198fcba
--- /dev/null
+++ b/doc/release/upcoming_changes/23020.change.rst
@@ -0,0 +1,9 @@
+Most NumPy functions are wrapped into a C-callable
+--------------------------------------------------
+To speed up the ``__array_function__`` dispatching, most NumPy functions
+are now wrapped into C-callables and are not proper Python functions or
+C methods.
+They still look and feel the same as before (like a Python function), and this
+should only improve performance and user experience (cleaner tracebacks).
+However, please inform the NumPy developers if this change confuses your
+program for some reason.
diff --git a/doc/release/upcoming_changes/23020.performance.rst b/doc/release/upcoming_changes/23020.performance.rst
new file mode 100644
index 000000000..db9fcbf3c
--- /dev/null
+++ b/doc/release/upcoming_changes/23020.performance.rst
@@ -0,0 +1,5 @@
+``__array_function__`` machinery is now much faster
+---------------------------------------------------
+The overhead of the majority of functions in NumPy is now smaller
+especially when keyword arguments are used. This change significantly
+speeds up many simple function calls.
diff --git a/doc/release/upcoming_changes/23041.expired.rst b/doc/release/upcoming_changes/23041.expired.rst
new file mode 100644
index 000000000..9049ea70f
--- /dev/null
+++ b/doc/release/upcoming_changes/23041.expired.rst
@@ -0,0 +1,27 @@
+Nose support has been removed
+-----------------------------
+NumPy switched to using pytest in 2018 and nose has been unmaintained for many
+years. We have kept NumPy's nose support to avoid breaking downstream projects
+who might have been using it and not yet switched to pytest or some other
+testing framework. With the arrival of Python 3.12, unpatched nose will raise
+an error. It is time to move on.
+
+Decorators removed
+^^^^^^^^^^^^^^^^^^
+- raises
+- slow
+- setastest
+- skipif
+- knownfailif
+- deprecated
+- parametrize
+- _needs_refcount
+
+These are not to be confused with pytest versions with similar names, e.g.,
+pytest.mark.slow, pytest.mark.skipif, pytest.mark.parametrize.
+
+Functions removed
+^^^^^^^^^^^^^^^^^
+- Tester
+- import_nose
+- run_module_suite
diff --git a/doc/release/upcoming_changes/23060.expired.rst b/doc/release/upcoming_changes/23060.expired.rst
new file mode 100644
index 000000000..f80ba6fd0
--- /dev/null
+++ b/doc/release/upcoming_changes/23060.expired.rst
@@ -0,0 +1,5 @@
+The ``numpy.testing.utils`` shim has been removed.
+--------------------------------------------------
+Importing from the ``numpy.testing.utils`` shim has been deprecated since 2019,
+the shim has now been removed. All imports should be made directly from
+``numpy.testing``.
diff --git a/doc/release/upcoming_changes/23105.compatibility.rst b/doc/release/upcoming_changes/23105.compatibility.rst
new file mode 100644
index 000000000..8a0b677e5
--- /dev/null
+++ b/doc/release/upcoming_changes/23105.compatibility.rst
@@ -0,0 +1,5 @@
+* When loading data from a file handle using ``np.load``,
+ if the handle is at the end of file, as can happen when reading
+ multiple arrays by calling ``np.load`` repeatedly, numpy previously
+ raised ``ValueError`` if ``allow_pickle=False``, and ``OSError`` if
+ ``allow_pickle=True``. Now it raises ``EOFError`` instead, in both cases.
diff --git a/doc/release/upcoming_changes/23113.improvement.rst b/doc/release/upcoming_changes/23113.improvement.rst
new file mode 100644
index 000000000..299b8f762
--- /dev/null
+++ b/doc/release/upcoming_changes/23113.improvement.rst
@@ -0,0 +1,3 @@
+- The ``NDArrayOperatorsMixin`` class now specifies that it contains no
+  ``__slots__`` ensuring that subclasses can now make use of this feature in
+ Python.
diff --git a/doc/release/upcoming_changes/23136.performance.rst b/doc/release/upcoming_changes/23136.performance.rst
new file mode 100644
index 000000000..1096f8bd1
--- /dev/null
+++ b/doc/release/upcoming_changes/23136.performance.rst
@@ -0,0 +1,18 @@
+``ufunc.at`` can be much faster
+-------------------------------
+Generic ``ufunc.at`` can be up to 9x faster. The conditions for this speedup:
+
+- operands are aligned
+- no casting
+
+For ufuncs with appropriate indexed loops on 1d arguments meeting the above
+conditions, ``ufunc.at`` can be up to 60x faster (an additional 7x speedup).
+Appropriate indexed loops have been added to ``add``, ``subtract``,
+``multiply``, ``floor_divide``, ``maximum``, ``minimum``, ``fmax``, and
+``fmin``.
+
+The internal logic is similar to the logic used for regular ufuncs, which also
+have fast paths.
+
+Thanks to the `D. E. Shaw group <https://deshaw.com/>`_ for sponsoring this
+work.
diff --git a/doc/release/upcoming_changes/23195.improvement.rst b/doc/release/upcoming_changes/23195.improvement.rst
new file mode 100644
index 000000000..38b33e849
--- /dev/null
+++ b/doc/release/upcoming_changes/23195.improvement.rst
@@ -0,0 +1,20 @@
+Ability to directly spawn random number generators
+--------------------------------------------------
+`numpy.random.Generator.spawn` now allows directly spawning new
+independent child generators via the `numpy.random.SeedSequence.spawn`
+mechanism.
+`numpy.random.BitGenerator.spawn` does the same for the underlying
+bit generator.
+
+Additionally, `numpy.random.BitGenerator.seed_seq` now gives direct
+access to the seed sequence used for initializing the bit generator.
+This allows for example::
+
+ seed = 0x2e09b90939db40c400f8f22dae617151
+ rng = np.random.default_rng(seed)
+ child_rng1, child_rng2 = rng.spawn(2)
+
+ # safely use rng, child_rng1, and child_rng2
+
+Previously, this was hard to do without passing the ``SeedSequence``
+explicitly. Please see `numpy.random.SeedSequence` for more information.
diff --git a/doc/release/upcoming_changes/23229.compatibility.rst b/doc/release/upcoming_changes/23229.compatibility.rst
new file mode 100644
index 000000000..284cc06ab
--- /dev/null
+++ b/doc/release/upcoming_changes/23229.compatibility.rst
@@ -0,0 +1,3 @@
+- The ``busday_count`` method now correctly handles cases where the ``begindates`` is later in time
+ than the ``enddates``. Previously, the ``enddates`` was included, even though the documentation states
+ it is always excluded.
diff --git a/doc/release/upcoming_changes/23240.compatibility.rst b/doc/release/upcoming_changes/23240.compatibility.rst
new file mode 100644
index 000000000..28536a020
--- /dev/null
+++ b/doc/release/upcoming_changes/23240.compatibility.rst
@@ -0,0 +1,10 @@
+Array-likes that define ``__array_ufunc__`` can now override ufuncs if used as ``where``
+----------------------------------------------------------------------------------------
+If the ``where`` keyword argument of a :class:`numpy.ufunc` is a subclass of
+:class:`numpy.ndarray` or is a duck type that defines
+:func:`numpy.class.__array_ufunc__` it can override the behavior of the ufunc
+using the same mechanism as the input and output arguments.
+Note that for this to work properly, the ``where.__array_ufunc__``
+implementation will have to unwrap the ``where`` argument to pass it into the
+default implementation of the ``ufunc`` or, for :class:`numpy.ndarray`
+subclasses before using ``super().__array_ufunc__``. \ No newline at end of file
diff --git a/doc/release/upcoming_changes/23275.improvement.rst b/doc/release/upcoming_changes/23275.improvement.rst
new file mode 100644
index 000000000..14ed5d9ad
--- /dev/null
+++ b/doc/release/upcoming_changes/23275.improvement.rst
@@ -0,0 +1,4 @@
+``numpy.logspace`` now supports a non-scalar ``base`` argument
+--------------------------------------------------------------
+The ``base`` argument of ``numpy.logspace`` can now be array-like if it's
+broadcastable against the ``start`` and ``stop`` arguments. \ No newline at end of file
diff --git a/doc/release/upcoming_changes/23302.deprecation.rst b/doc/release/upcoming_changes/23302.deprecation.rst
new file mode 100644
index 000000000..9e36d658c
--- /dev/null
+++ b/doc/release/upcoming_changes/23302.deprecation.rst
@@ -0,0 +1 @@
+* ``np.round_`` is deprecated. Use `np.round` instead.
diff --git a/doc/release/upcoming_changes/23314.deprecation.rst b/doc/release/upcoming_changes/23314.deprecation.rst
new file mode 100644
index 000000000..8bed1aef8
--- /dev/null
+++ b/doc/release/upcoming_changes/23314.deprecation.rst
@@ -0,0 +1,4 @@
+* ``np.product`` is deprecated. Use `np.prod` instead.
+* ``np.cumproduct`` is deprecated. Use `np.cumprod` instead.
+* ``np.sometrue`` is deprecated. Use `np.any` instead.
+* ``np.alltrue`` is deprecated. Use `np.all` instead.
diff --git a/doc/release/upcoming_changes/23322.improvement.rst b/doc/release/upcoming_changes/23322.improvement.rst
new file mode 100644
index 000000000..ce5ab8cf5
--- /dev/null
+++ b/doc/release/upcoming_changes/23322.improvement.rst
@@ -0,0 +1,4 @@
+`np.ma.dot()` now supports non-2d arrays
+--------------------------------------------
+Previously `np.ma.dot()` only worked if `a` and `b` were both 2d.
+Now it works for non-2d arrays as well as `np.dot()`.
diff --git a/doc/release/upcoming_changes/23357.improvement.rst b/doc/release/upcoming_changes/23357.improvement.rst
new file mode 100644
index 000000000..3b474146b
--- /dev/null
+++ b/doc/release/upcoming_changes/23357.improvement.rst
@@ -0,0 +1,9 @@
+Explicitly show keys of .npz file in repr
+-----------------------------------------
+``NpzFile`` shows keys of loaded .npz file when printed.
+
+.. code-block:: python
+
+ >>> npzfile = np.load('arr.npz')
+ >>> npzfile
+ NpzFile 'arr.npz' with keys arr_0, arr_1, arr_2, arr_3, arr_4...
diff --git a/doc/release/upcoming_changes/23358.improvement.rst b/doc/release/upcoming_changes/23358.improvement.rst
new file mode 100644
index 000000000..a8a0e56ef
--- /dev/null
+++ b/doc/release/upcoming_changes/23358.improvement.rst
@@ -0,0 +1,5 @@
+NumPy now exposes DType classes in ``np.dtypes``
+------------------------------------------------
+The new `numpy.dtypes` module now exposes DType classes and
+will contain future dtype related functionality.
+Most users should have no need to use these classes directly.
diff --git a/doc/release/upcoming_changes/23376.expired.rst b/doc/release/upcoming_changes/23376.expired.rst
new file mode 100644
index 000000000..e289b087c
--- /dev/null
+++ b/doc/release/upcoming_changes/23376.expired.rst
@@ -0,0 +1,9 @@
+Environment variable to disable dispatching removed
+---------------------------------------------------
+Support for the ``NUMPY_EXPERIMENTAL_ARRAY_FUNCTION`` environment variable has
+been removed. This variable disabled dispatching with ``__array_function__``.
+
+Support for ``y=`` as an alias of ``out=`` removed
+--------------------------------------------------
+The ``fix``, ``isposinf`` and ``isneginf`` functions allowed using ``y=`` as a
+(deprecated) alias for ``out=``. This is no longer supported.
diff --git a/doc/release/upcoming_changes/23403.expired.rst b/doc/release/upcoming_changes/23403.expired.rst
new file mode 100644
index 000000000..b099eb4e9
--- /dev/null
+++ b/doc/release/upcoming_changes/23403.expired.rst
@@ -0,0 +1,4 @@
+* ``np.clip`` now defaults to same-kind casting. Falling back to
+ unsafe casting was deprecated in NumPy 1.17.
+* ``np.clip`` will now propagate ``np.nan`` values passed as ``min`` or ``max``.
+ Previously, a scalar NaN was usually ignored. This was deprecated in NumPy 1.17.
diff --git a/doc/release/upcoming_changes/23480.expired.rst b/doc/release/upcoming_changes/23480.expired.rst
new file mode 100644
index 000000000..164677b98
--- /dev/null
+++ b/doc/release/upcoming_changes/23480.expired.rst
@@ -0,0 +1 @@
+* The ``np.dual`` submodule has been removed.
diff --git a/doc/release/upcoming_changes/23528.compatibility.rst b/doc/release/upcoming_changes/23528.compatibility.rst
new file mode 100644
index 000000000..439fb6a27
--- /dev/null
+++ b/doc/release/upcoming_changes/23528.compatibility.rst
@@ -0,0 +1,16 @@
+By default, the exported NumPy C API is now compatible with NumPy 1.19
+----------------------------------------------------------------------
+Starting with NumPy 1.25 when including NumPy headers, NumPy now
+defaults to exposing a backwards compatible API.
+This means that by default binaries such as wheels build against
+NumPy 1.25 will also work with NumPy 1.16 because it has the same API version
+as NumPy 1.19 which is the oldest NumPy version compatible with Python 3.9.
+
+You can customize this behavior using::
+
+ #define NPY_TARGET_VERSION NPY_1_22_API_VERSION
+
+or the equivalent ``-D`` option to the compiler. For more details
+please see `for-downstream-package-authors`_.
+A change should only be required in rare cases when a package relies on newly
+added C-API.
diff --git a/doc/release/upcoming_changes/23601.change.rst b/doc/release/upcoming_changes/23601.change.rst
new file mode 100644
index 000000000..e09bd50fe
--- /dev/null
+++ b/doc/release/upcoming_changes/23601.change.rst
@@ -0,0 +1,6 @@
+C++ standard library usage
+--------------------------
+
+NumPy builds now depend on the C++ standard library, because
+the ``numpy.core._multiarray_umath`` extension is linked with
+the C++ linker.
diff --git a/doc/release/upcoming_changes/23660.expired.rst b/doc/release/upcoming_changes/23660.expired.rst
new file mode 100644
index 000000000..615367350
--- /dev/null
+++ b/doc/release/upcoming_changes/23660.expired.rst
@@ -0,0 +1,2 @@
+* NumPy now always ignores sequence behavior for an array-like (defining
+ one of the array protocols). (Deprecation started NumPy 1.20)
diff --git a/doc/release/upcoming_changes/23661.performance.rst b/doc/release/upcoming_changes/23661.performance.rst
new file mode 100644
index 000000000..e3e55f3d5
--- /dev/null
+++ b/doc/release/upcoming_changes/23661.performance.rst
@@ -0,0 +1,4 @@
+Faster membership test on ``NpzFile``
+-------------------------------------
+Membership test on ``NpzFile`` will no longer
+decompress the archive if it is successful.
diff --git a/doc/release/upcoming_changes/23666.expired.rst b/doc/release/upcoming_changes/23666.expired.rst
new file mode 100644
index 000000000..eb51c91f3
--- /dev/null
+++ b/doc/release/upcoming_changes/23666.expired.rst
@@ -0,0 +1,5 @@
+* The niche ``FutureWarning`` when casting to a subarray dtype in ``astype``
+ or the array creation functions such as ``asarray`` is now finalized.
+ The behavior is now always the same as if the subarray dtype was
+ wrapped into a single field (which was the workaround, previously).
+ (FutureWarning since NumPy 1.20)
diff --git a/doc/release/upcoming_changes/23713.improvement.rst b/doc/release/upcoming_changes/23713.improvement.rst
new file mode 100644
index 000000000..15a4f412b
--- /dev/null
+++ b/doc/release/upcoming_changes/23713.improvement.rst
@@ -0,0 +1,9 @@
+Signed and unsigned integers always compare correctly
+-----------------------------------------------------
+When ``uint64`` and ``int64`` are mixed in NumPy, NumPy typically
+promotes both to ``float64``. This behavior may be argued about
+but is confusing for comparisons ``==``, ``<=``, since the results
+returned can be incorrect but the conversion is hidden since the
+result is a boolean.
+NumPy will now return the correct results for these by avoiding
+the cast to float.
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 9546db5f2..ed5c11781 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -62,6 +62,13 @@ def replace_scalar_type_names():
replace_scalar_type_names()
+
+# As of NumPy 1.25, a deprecation of `str`/`bytes` attributes happens.
+# For some reasons, the doc build accesses these, so ignore them.
+import warnings
+warnings.filterwarnings("ignore", "In the future.*NumPy scalar", FutureWarning)
+
+
# -----------------------------------------------------------------------------
# General configuration
# -----------------------------------------------------------------------------
@@ -156,7 +163,6 @@ def setup(app):
# If we deemed it desirable, we could in future make these real modules, which
# would make `from numpy.char import split` work.
sys.modules['numpy.char'] = numpy.char
-sys.modules['numpy.testing.dec'] = numpy.testing.dec
# -----------------------------------------------------------------------------
# HTML output
@@ -248,25 +254,39 @@ latex_documents = [
#latex_use_parts = False
latex_elements = {
- 'fontenc': r'\usepackage[LGR,T1]{fontenc}'
}
# Additional stuff for the LaTeX preamble.
latex_elements['preamble'] = r'''
+\newfontfamily\FontForChinese{FandolSong-Regular}[Extension=.otf]
+\catcode`琴\active\protected\def琴{{\FontForChinese\string琴}}
+\catcode`春\active\protected\def春{{\FontForChinese\string春}}
+\catcode`鈴\active\protected\def鈴{{\FontForChinese\string鈴}}
+\catcode`猫\active\protected\def猫{{\FontForChinese\string猫}}
+\catcode`傅\active\protected\def傅{{\FontForChinese\string傅}}
+\catcode`立\active\protected\def立{{\FontForChinese\string立}}
+\catcode`业\active\protected\def业{{\FontForChinese\string业}}
+\catcode`(\active\protected\def({{\FontForChinese\string(}}
+\catcode`)\active\protected\def){{\FontForChinese\string)}}
+
% In the parameters section, place a newline after the Parameters
-% header
-\usepackage{xcolor}
+% header. This is default with Sphinx 5.0.0+, so no need for
+% the old hack then.
+% Unfortunately sphinx.sty 5.0.0 did not bump its version date
+% so we instead check sphinxpackagefootnote.sty (which exists
+% since Sphinx 4.0.0).
+\makeatletter
+\@ifpackagelater{sphinxpackagefootnote}{2022/02/12}
+ {}% Sphinx >= 5.0.0, nothing to do
+ {%
\usepackage{expdlist}
\let\latexdescription=\description
\def\description{\latexdescription{}{} \breaklabel}
% but expdlist old LaTeX package requires fixes:
% 1) remove extra space
\usepackage{etoolbox}
-\makeatletter
\patchcmd\@item{{\@breaklabel} }{{\@breaklabel}}{}{}
-\makeatother
% 2) fix bug in expdlist's way of breaking the line after long item label
-\makeatletter
\def\breaklabel{%
\def\@breaklabel{%
\leavevmode\par
@@ -274,6 +294,7 @@ latex_elements['preamble'] = r'''
\def\leavevmode{\def\leavevmode{\unhbox\voidb@x}}%
}%
}
+ }% Sphinx < 5.0.0 (and assumed >= 4.0.0)
\makeatother
% Make Examples/etc section headers smaller and more compact
diff --git a/doc/source/dev/alignment.rst b/doc/source/dev/alignment.rst
index bb1198ebf..f2321d17b 100644
--- a/doc/source/dev/alignment.rst
+++ b/doc/source/dev/alignment.rst
@@ -3,7 +3,7 @@
.. _alignment:
****************
-Memory Alignment
+Memory alignment
****************
NumPy alignment goals
diff --git a/doc/source/dev/depending_on_numpy.rst b/doc/source/dev/depending_on_numpy.rst
index 0582048cb..868b44162 100644
--- a/doc/source/dev/depending_on_numpy.rst
+++ b/doc/source/dev/depending_on_numpy.rst
@@ -48,70 +48,80 @@ job, either all warnings or otherwise at least ``DeprecationWarning`` and
adapt your code.
+.. _depending_on_numpy:
+
Adding a dependency on NumPy
----------------------------
Build-time dependency
~~~~~~~~~~~~~~~~~~~~~
+.. note::
+
+ Before NumPy 1.25, the NumPy C-API was *not* backwards compatible. This
+ means that when compiling with a NumPy version earlier than 1.25 you
+ have to compile with the oldest version you wish to support.
+ This can be done by using
+ `oldest-supported-numpy <https://github.com/scipy/oldest-supported-numpy/>`__.
+ Please see the NumPy 1.24 documentation at
+ `numpy.org/doc/1.24 <https://numpy.org/doc/1.24/dev/depending_on_numpy.html>`__.
+
+
If a package either uses the NumPy C API directly or it uses some other tool
that depends on it like Cython or Pythran, NumPy is a *build-time* dependency
-of the package. Because the NumPy ABI is only forward compatible, you must
-build your own binaries (wheels or other package formats) against the lowest
-NumPy version that you support (or an even older version).
-
-Picking the correct NumPy version to build against for each Python version and
-platform can get complicated. There are a couple of ways to do this.
-Build-time dependencies are specified in ``pyproject.toml`` (see PEP 517),
-which is the file used to build wheels by PEP 517 compliant tools (e.g.,
-when using ``pip wheel``).
-
-You can specify everything manually in ``pyproject.toml``, or you can instead
-rely on the `oldest-supported-numpy <https://github.com/scipy/oldest-supported-numpy/>`__
-metapackage. ``oldest-supported-numpy`` will specify the correct NumPy version
-at build time for wheels, taking into account Python version, Python
-implementation (CPython or PyPy), operating system and hardware platform. It
-will specify the oldest NumPy version that supports that combination of
-characteristics. Note: for platforms for which NumPy provides wheels on PyPI,
-it will be the first version with wheels (even if some older NumPy version
-happens to build).
-
-For conda-forge it's a little less complicated: there's dedicated handling for
-NumPy in build-time and runtime dependencies, so typically this is enough
-(see `here <https://conda-forge.org/docs/maintainer/knowledge_base.html#building-against-numpy>`__ for docs)::
+of the package.
- host:
- - numpy
- run:
- - {{ pin_compatible('numpy') }}
+By default, NumPy will expose an API that is backwards compatible with the
+oldest NumPy version that supports the currently oldest compatible Python
+version. NumPy 1.25.0 supports Python 3.9 and higher and NumPy 1.19 is the
+first version to support Python 3.9. Thus, we guarantee that, when using
+defaults, NumPy 1.25 will expose a C-API compatible with NumPy 1.19
+(the exact version is set within NumPy-internal header files).
-.. note::
+NumPy is also forward compatible for all minor releases, but a major release
+will require recompilation.
+
+The default behavior can be customized for example by adding::
- ``pip`` has ``--no-use-pep517`` and ``--no-build-isolation`` flags that may
- ignore ``pyproject.toml`` or treat it differently - if users use those
- flags, they are responsible for installing the correct build dependencies
- themselves.
+ #define NPY_TARGET_VERSION NPY_1_22_API_VERSION
- ``conda`` will always use ``-no-build-isolation``; dependencies for conda
- builds are given in the conda recipe (``meta.yaml``), the ones in
- ``pyproject.toml`` have no effect.
+before including any NumPy headers (or the equivalent ``-D`` compiler flag) in
+every extension module that requires the NumPy C-API.
+This is mainly useful if you need to use newly added API at the cost of not
+being compatible with older versions.
- Please do not use ``setup_requires`` (it is deprecated and may invoke
- ``easy_install``).
+If for some reason you wish to compile for the currently installed NumPy
+version by default you can add::
-Because for NumPy you have to care about ABI compatibility, you
-specify the version with ``==`` to the lowest supported version. For your other
-build dependencies you can probably be looser, however it's still important to
-set lower and upper bounds for each dependency. It's fine to specify either a
-range or a specific version for a dependency like ``wheel`` or ``setuptools``.
+ #ifndef NPY_TARGET_VERSION
+ #define NPY_TARGET_VERSION NPY_API_VERSION
+ #endif
-.. warning::
+Which allows a user to override the default via ``-DNPY_TARGET_VERSION``.
+This define must be consistent for each extension module (use of
+``import_array()``) and also applies to the umath module.
+
+When you compile against NumPy, you should add the proper version restrictions
+to your ``pyproject.toml`` (see PEP 517), since your extension will not be
+compatible with a new major release of NumPy and may not be compatible with
+very old versions.
+
+For conda-forge packages, please see
+`here <https://conda-forge.org/docs/maintainer/knowledge_base.html#building-against-numpy>`__.
+
+As of now, it is usually as easy as including::
+
+ host:
+ - numpy
+ run:
+ - {{ pin_compatible('numpy') }}
+
+.. note::
- Note that ``setuptools`` does major releases often and those may contain
- changes that break ``numpy.distutils``, which will *not* be updated anymore
- for new ``setuptools`` versions. It is therefore recommended to set an
- upper version bound in your build configuration for the last known version
- of ``setuptools`` that works with your build.
+ At the time of NumPy 1.25, NumPy 2.0 is expected to be the next release
+ of NumPy. The NumPy 2.0 release is expected to require a different pin,
+ since NumPy 2+ will be needed in order to be compatible with both NumPy
+ 1.x and 2.x.
Runtime dependency & version ranges
diff --git a/doc/source/dev/development_environment.rst b/doc/source/dev/development_environment.rst
index 0ac02271d..aedc489d6 100644
--- a/doc/source/dev/development_environment.rst
+++ b/doc/source/dev/development_environment.rst
@@ -18,15 +18,36 @@ sources needs some additional steps, which are explained below. For the rest
of this chapter we assume that you have set up your git repo as described in
:ref:`using-git`.
-.. note:: If you are having trouble building NumPy from source or setting up
- your local development environment, you can try
- to :ref:`build NumPy with Gitpod <development-gitpod>`.
+.. note::
+
+ If you are having trouble building NumPy from source or setting up your
+ local development environment, you can try to build NumPy with GitHub
+ Codespaces. It allows you to create the correct development environment
+ right in your browser, reducing the need to install local development
+ environments and deal with incompatible dependencies.
+
+ If you have good internet connectivity and want a temporary set-up, it is
+ often faster to work on NumPy in a Codespaces environment. For documentation
+ on how to get started with Codespaces, see
+ `the Codespaces docs <https://docs.github.com/en/codespaces>`__.
+ When creating a codespace for the ``numpy/numpy`` repository, the default
+ 2-core machine type works; 4-core will build and work a bit faster (but of
+ course at a cost of halving your number of free usage hours). Once your
+ codespace has started, you can run ``conda activate numpy-dev`` and your
+ development environment is completely set up - you can then follow the
+ relevant parts of the NumPy documentation to build, test, develop, write
+ docs, and contribute to NumPy.
+
.. _testing-builds:
Testing builds
--------------
+Before running the tests, first install the test dependencies::
+
+ $ python -m pip install -r test_requirements.txt
+
To build the development version of NumPy and run tests, spawn
interactive shells with the Python import paths properly set up etc.,
do one of::
diff --git a/doc/source/dev/development_gitpod.rst b/doc/source/dev/development_gitpod.rst
deleted file mode 100644
index 4e386867d..000000000
--- a/doc/source/dev/development_gitpod.rst
+++ /dev/null
@@ -1,271 +0,0 @@
-.. _development-gitpod:
-
-
-Using Gitpod for NumPy development
-==================================
-
-This section of the documentation will guide you through:
-
-* using GitPod for your NumPy development environment
-* creating a personal fork of the NumPy repository on GitHub
-* a quick tour of Gitpod and VSCode
-* working on the NumPy documentation in Gitpod
-
-Gitpod
-------
-
-`Gitpod`_ is an open-source platform for automated and ready-to-code
-development environments. It enables developers to describe their dev
-environment as code and start instant and fresh development environments for
-each new task directly from your browser. This reduces the need to install local
-development environments and deal with incompatible dependencies.
-
-Gitpod GitHub integration
--------------------------
-
-To be able to use Gitpod, you will need to have the Gitpod app installed on your
-GitHub account, so if
-you do not have an account yet, you will need to create one first.
-
-Head over to the `Gitpod`_ website and click on the **Continue with GitHub**
-button. You will be redirected to the GitHub authentication page.
-You will then be asked to install the `Gitpod GitHub app <https://github.com/marketplace/gitpod-io>`_.
-
-Make sure to select **All repositories** access option to avoid issues with
-permissions later on. Click on the green **Install** button
-
-.. image:: ./gitpod-imgs/installing-gitpod-io.png
- :alt: Gitpod repository access and installation screenshot
-
-This will install the necessary hooks for the integration.
-
-Forking the NumPy repository
-----------------------------
-
-The best way to work on NumPy as a contributor is by making a fork of the
-repository first.
-
-#. Browse to the `NumPy repository on GitHub`_ and `create your own fork`_.
-#. Browse to your fork. Your fork will have a URL like
- https://github.com/melissawm/NumPy, except with your GitHub username in place of ``melissawm``.
-
-Starting Gitpod
----------------
-Once you have authenticated to Gitpod through GitHub, you can install the
-`Gitpod browser extension <https://www.gitpod.io/docs/browser-extension>`_
-which will add a **Gitpod** button next to the **Code** button in the
-repository:
-
-.. image:: ./gitpod-imgs/NumPy-github.png
- :alt: NumPy repository with Gitpod button screenshot
-
-#. If you install the extension - you can click the **Gitpod** button to start
- a new workspace.
-
-#. Alternatively, if you do not want to install the browser extension, you can
- visit https://gitpod.io/#https://github.com/USERNAME/NumPy replacing
- ``USERNAME`` with your GitHub username.
-
-#. In both cases, this will open a new tab on your web browser and start
- building your development environment. Please note this can take a few
- minutes.
-
-#. Once the build is complete, you will be directed to your workspace,
- including the VSCode editor and all the dependencies you need to work on
- NumPy. The first time you start your workspace, you will notice that there
- might be some actions running. This will ensure that you have a development
- version of NumPy installed and that the docs are being pre-built for you.
-
-#. When your workspace is ready, you can :ref:`test the build<testing-builds>` by
- entering::
-
- $ python runtests.py -v
-
-``runtests.py`` is another script in the NumPy root directory. It runs a suite
-of tests that make sure NumPy is working as it should, and ``-v`` activates the
-``--verbose`` option to show all the test output.
-
-Quick workspace tour
---------------------
-Gitpod uses VSCode as the editor. If you have not used this editor before, you
-can check the Getting started `VSCode docs`_ to familiarize yourself with it.
-
-Your workspace will look similar to the image below:
-
-.. image:: ./gitpod-imgs/gitpod-workspace.png
- :alt: Gitpod workspace screenshot
-
-.. note:: By default, VSCode initializes with a light theme. You can change to
- a dark theme by with the keyboard shortcut :kbd:`Cmd-K Cmd-T` in Mac or
- :kbd:`Ctrl-K Ctrl-T` in Linux and Windows.
-
-We have marked some important sections in the editor:
-
-#. Your current Python interpreter - by default, this is ``numpy-dev`` and
- should be displayed in the status bar and on your terminal. You do not need
- to activate the conda environment as this will always be activated for you.
-#. Your current branch is always displayed in the status bar. You can also use
- this button to change or create branches.
-#. GitHub Pull Requests extension - you can use this to work with Pull Requests
- from your workspace.
-#. Marketplace extensions - we have added some essential extensions to the NumPy
- Gitpod. Still, you can also install other extensions or syntax highlighting
- themes for your user, and these will be preserved for you.
-#. Your workspace directory - by default, it is ``/workspace/numpy``. **Do not
- change this** as this is the only directory preserved in Gitpod.
-
-We have also pre-installed a few tools and VSCode extensions to help with the
-development experience:
-
-* `GitHub CLI <https://cli.github.com/>`_
-* `VSCode rst extension <https://marketplace.visualstudio.com/items?itemName=lextudio.restructuredtext>`_
-* `VSCode Live server extension <https://marketplace.visualstudio.com/items?itemName=ritwickdey.LiveServer>`_
-* `VSCode Gitlens extension <https://marketplace.visualstudio.com/items?itemName=eamodio.gitlens>`_
-* `VSCode autodocstrings extension <https://marketplace.visualstudio.com/items?itemName=njpwerner.autodocstring>`_
-* `VSCode Git Graph extension <https://marketplace.visualstudio.com/items?itemName=mhutchie.git-graph>`_
-
-Development workflow with Gitpod
---------------------------------
-The :ref:`development-workflow` section of this documentation contains
-information regarding the NumPy development workflow. Make sure to check this
-before working on your contributions.
-
-When using Gitpod, git is pre configured for you:
-
-#. You do not need to configure your git username, and email as this should be
- done for you as you authenticated through GitHub. You can check the git
- configuration with the command ``git config --list`` in your terminal.
-#. As you started your workspace from your own NumPy fork, you will by default
- have both ``upstream`` and ``origin`` added as remotes. You can verify this by
- typing ``git remote`` on your terminal or by clicking on the **branch name**
- on the status bar (see image below).
-
- .. image:: ./gitpod-imgs/NumPy-gitpod-branches.png
- :alt: Gitpod workspace branches plugin screenshot
-
-Rendering the NumPy documentation
----------------------------------
-You can find the detailed documentation on how rendering the documentation with
-Sphinx works in the :ref:`howto-build-docs` section.
-
-The documentation is pre-built during your workspace initialization. So once
-this task is completed, you have two main options to render the documentation
-in Gitpod.
-
-Option 1: Using Liveserve
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-#. View the documentation in ``NumPy/doc/build/html``. You can start with
- ``index.html`` and browse, or you can jump straight to the file you're
- interested in.
-#. To see the rendered version of a page, you can right-click on the ``.html``
- file and click on **Open with Live Serve**. Alternatively, you can open the
- file in the editor and click on the **Go live** button on the status bar.
-
- .. image:: ./gitpod-imgs/vscode-statusbar.png
- :alt: Gitpod workspace VSCode start live serve screenshot
-
-#. A simple browser will open to the right-hand side of the editor. We recommend
- closing it and click on the **Open in browser** button in the pop-up.
-#. To stop the server click on the **Port: 5500** button on the status bar.
-
-Option 2: Using the rst extension
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-A quick and easy way to see live changes in a ``.rst`` file as you work on it
-uses the rst extension with docutils.
-
-.. note:: This will generate a simple live preview of the document without the
- ``html`` theme, and some backlinks might not be added correctly. But it is an
- easy and lightweight way to get instant feedback on your work.
-
-#. Open any of the source documentation files located in ``doc/source`` in the
- editor.
-#. Open VSCode Command Palette with :kbd:`Cmd-Shift-P` in Mac or
- :kbd:`Ctrl-Shift-P` in Linux and Windows. Start typing "restructured"
- and choose either "Open preview" or "Open preview to the Side".
-
- .. image:: ./gitpod-imgs/vscode-rst.png
- :alt: Gitpod workspace VSCode open rst screenshot
-
-#. As you work on the document, you will see a live rendering of it on the editor.
-
- .. image:: ./gitpod-imgs/rst-rendering.png
- :alt: Gitpod workspace VSCode rst rendering screenshot
-
-If you want to see the final output with the ``html`` theme you will need to
-rebuild the docs with ``make html`` and use Live Serve as described in option 1.
-
-FAQ's and troubleshooting
--------------------------
-
-How long is my Gitpod workspace kept for?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Your stopped workspace will be kept for 14 days and deleted afterwards if you do
-not use them.
-
-Can I come back to a previous workspace?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Yes, let's say you stepped away for a while and you want to carry on working on
-your NumPy contributions. You need to visit https://gitpod.io/workspaces and
-click on the workspace you want to spin up again. All your changes will be there
-as you last left them.
-
-Can I install additional VSCode extensions?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Absolutely! Any extensions you installed will be installed in your own workspace
-and preserved.
-
-I registered on Gitpod but I still cannot see a ``Gitpod`` button in my repositories.
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Head to https://gitpod.io/integrations and make sure you are logged in.
-Hover over GitHub and click on the three buttons that appear on the right.
-Click on edit permissions and make sure you have ``user:email``,
-``read:user``, and ``public_repo`` checked. Click on **Update Permissions**
-and confirm the changes in the GitHub application page.
-
-.. image:: ./gitpod-imgs/gitpod-edit-permissions-gh.png
- :alt: Gitpod integrations - edit GH permissions screenshot
-
-How long does my workspace stay active if I'm not using it?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-If you keep your workspace open in a browser tab but don't interact with it,
-it will shut down after 30 minutes. If you close the browser tab, it will
-shut down after 3 minutes.
-
-My terminal is blank - there is no cursor and it's completely unresponsive
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Unfortunately this is a known-issue on Gitpod's side. You can sort this
-issue in two ways:
-
-#. Create a new Gitpod workspace altogether.
-#. Head to your `Gitpod dashboard <https://gitpod.io/workspaces>`_ and locate
- the running workspace. Hover on it and click on the **three dots menu**
- and then click on **Stop**. When the workspace is completely stopped you
- can click on its name to restart it again.
-
-.. image:: ./gitpod-imgs/gitpod-dashboard-stop.png
- :alt: Gitpod dashboard and workspace menu screenshot
-
-I authenticated through GitHub but I still cannot commit to the repository through Gitpod.
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Head to https://gitpod.io/integrations and make sure you are logged in.
-Hover over GitHub and click on the three buttons that appear on the right.
-Click on edit permissions and make sure you have ``public_repo`` checked.
-Click on **Update Permissions** and confirm the changes in the
-GitHub application page.
-
-.. image:: ./gitpod-imgs/gitpod-edit-permissions-repo.png
- :alt: Gitpod integrations - edit GH repository permissions screenshot
-
-.. _Gitpod: https://www.gitpod.io/
-.. _NumPy repository on GitHub: https://github.com/NumPy/NumPy
-.. _create your own fork: https://help.github.com/en/articles/fork-a-repo
-.. _VSCode docs: https://code.visualstudio.com/docs/getstarted/tips-and-tricks
diff --git a/doc/source/dev/development_workflow.rst b/doc/source/dev/development_workflow.rst
index 33bd0126d..9fa9c5809 100644
--- a/doc/source/dev/development_workflow.rst
+++ b/doc/source/dev/development_workflow.rst
@@ -200,12 +200,46 @@ sequences. In such cases you may explicitly skip CI by including one of these
fragments in your commit message:
* ``[skip ci]``: skip all CI
-* ``[skip github]``: skip GitHub Actions "build numpy and run tests" jobs
-* ``[skip actions]``: skip all GitHub Actions
+
+ Only recommended if you are still not ready for the checks to run on your PR
+ (for example, if this is only a draft).
+
+* ``[skip actions]``: skip GitHub Actions jobs
+
+ `GitHub Actions <https://docs.github.com/actions>`__ is where most of the CI
+ checks are run, including the linter, benchmarking, running basic tests for
+ most architectures and OSs, and several compiler and CPU optimization
+ settings.
+ `See the configuration files for these checks. <https://github.com/numpy/numpy/tree/main/.github/workflows>`__
+
* ``[skip travis]``: skip TravisCI jobs
+
+ `TravisCI <https://www.travis-ci.com/>`__ will test your changes against
+ Python 3.9 on the PowerPC and s390x architectures.
+ `See the configuration file for these checks. <https://github.com/numpy/numpy/blob/main/.travis.yml>`__
+
* ``[skip azp]``: skip Azure jobs
+
+ `Azure <https://azure.microsoft.com/en-us/products/devops/pipelines>`__ is
+ where all comprehensive tests are run. This is an expensive run, and one you
+ could typically skip if you do documentation-only changes, for example.
+ `See the main configuration file for these checks. <https://github.com/numpy/numpy/blob/main/azure-pipelines.yml>`__
+
* ``[skip circle]``: skip CircleCI jobs
+
+ `CircleCI <https://circleci.com/>`__ is where we build the documentation and
+ store the generated artifact for preview in each PR. This check will also run
+ all the docstrings examples and verify their results. If you don't make
+ documentation changes, but you make changes to a function's API, for example,
+ you may need to run these tests to verify that the doctests are still valid.
+ `See the configuration file for these checks. <https://github.com/numpy/numpy/blob/main/.circleci/config.yml>`__
+
+* ``[skip cirrus]``: skip Cirrus jobs
+
+ `CirrusCI <https://cirrus-ci.org/>`__ mostly triggers Linux aarch64 and macOS arm64
+ wheel uploads.
+ `See the configuration file for these checks. <https://github.com/numpy/numpy/blob/main/.cirrus.star>`__
+
Test building wheels
~~~~~~~~~~~~~~~~~~~~
@@ -481,6 +515,28 @@ usual::
git commit -am 'ENH - much better code'
git push origin my-feature-branch # pushes directly into your repo
+
+Checkout changes from an existing pull request
+==============================================
+
+If you want to test the changes in a pull request or continue the work in a
+new pull request, the commits need to be fetched into a local branch in your
+forked repository.
+
+First ensure your upstream points to the main repo, as described in :ref:`linking-to-upstream`.
+
+Then, fetch the changes and create a local branch. Assuming ``$ID`` is the pull request number
+and ``$BRANCHNAME`` is the name of the *new local* branch you wish to create::
+
+ git fetch upstream pull/$ID/head:$BRANCHNAME
+
+Checkout the newly created branch::
+
+ git checkout $BRANCHNAME
+
+You now have the changes in the pull request.
+
+
Exploring your repository
=========================
diff --git a/doc/source/dev/gitpod-imgs/NumPy-github.png b/doc/source/dev/gitpod-imgs/NumPy-github.png
deleted file mode 100644
index 010b0fc5e..000000000
--- a/doc/source/dev/gitpod-imgs/NumPy-github.png
+++ /dev/null
Binary files differ
diff --git a/doc/source/dev/gitpod-imgs/NumPy-gitpod-branches.png b/doc/source/dev/gitpod-imgs/NumPy-gitpod-branches.png
deleted file mode 100644
index 3ee6c5f20..000000000
--- a/doc/source/dev/gitpod-imgs/NumPy-gitpod-branches.png
+++ /dev/null
Binary files differ
diff --git a/doc/source/dev/gitpod-imgs/gitpod-dashboard-stop.png b/doc/source/dev/gitpod-imgs/gitpod-dashboard-stop.png
deleted file mode 100644
index 40f137745..000000000
--- a/doc/source/dev/gitpod-imgs/gitpod-dashboard-stop.png
+++ /dev/null
Binary files differ
diff --git a/doc/source/dev/gitpod-imgs/gitpod-edit-permissions-gh.png b/doc/source/dev/gitpod-imgs/gitpod-edit-permissions-gh.png
deleted file mode 100644
index 8955e907a..000000000
--- a/doc/source/dev/gitpod-imgs/gitpod-edit-permissions-gh.png
+++ /dev/null
Binary files differ
diff --git a/doc/source/dev/gitpod-imgs/gitpod-edit-permissions-repo.png b/doc/source/dev/gitpod-imgs/gitpod-edit-permissions-repo.png
deleted file mode 100644
index 8bfaff81c..000000000
--- a/doc/source/dev/gitpod-imgs/gitpod-edit-permissions-repo.png
+++ /dev/null
Binary files differ
diff --git a/doc/source/dev/gitpod-imgs/gitpod-workspace.png b/doc/source/dev/gitpod-imgs/gitpod-workspace.png
deleted file mode 100644
index a65c9bd7e..000000000
--- a/doc/source/dev/gitpod-imgs/gitpod-workspace.png
+++ /dev/null
Binary files differ
diff --git a/doc/source/dev/gitpod-imgs/installing-gitpod-io.png b/doc/source/dev/gitpod-imgs/installing-gitpod-io.png
deleted file mode 100644
index 97319a729..000000000
--- a/doc/source/dev/gitpod-imgs/installing-gitpod-io.png
+++ /dev/null
Binary files differ
diff --git a/doc/source/dev/gitpod-imgs/rst-rendering.png b/doc/source/dev/gitpod-imgs/rst-rendering.png
deleted file mode 100644
index 41cc305f3..000000000
--- a/doc/source/dev/gitpod-imgs/rst-rendering.png
+++ /dev/null
Binary files differ
diff --git a/doc/source/dev/gitpod-imgs/vscode-rst.png b/doc/source/dev/gitpod-imgs/vscode-rst.png
deleted file mode 100644
index 5b574c115..000000000
--- a/doc/source/dev/gitpod-imgs/vscode-rst.png
+++ /dev/null
Binary files differ
diff --git a/doc/source/dev/gitpod-imgs/vscode-statusbar.png b/doc/source/dev/gitpod-imgs/vscode-statusbar.png
deleted file mode 100644
index 3febbcee0..000000000
--- a/doc/source/dev/gitpod-imgs/vscode-statusbar.png
+++ /dev/null
Binary files differ
diff --git a/doc/source/dev/gitwash/development_setup.rst b/doc/source/dev/gitwash/development_setup.rst
index 4cb6c8358..75b7fb2f8 100644
--- a/doc/source/dev/gitwash/development_setup.rst
+++ b/doc/source/dev/gitwash/development_setup.rst
@@ -107,6 +107,10 @@ Make the local copy
git submodule update --init
+#. Pull from upstream to get latest tag information: ::
+
+ git pull
+
******************************************************************************
Look it over
******************************************************************************
diff --git a/doc/source/dev/gitwash/git_resources.rst b/doc/source/dev/gitwash/git_resources.rst
index c41af762c..b6930f394 100644
--- a/doc/source/dev/gitwash/git_resources.rst
+++ b/doc/source/dev/gitwash/git_resources.rst
@@ -1,7 +1,7 @@
.. _git-resources:
=========================
-Additional Git_ Resources
+Additional Git_ resources
=========================
Tutorials and summaries
diff --git a/doc/source/dev/howto_build_docs.rst b/doc/source/dev/howto_build_docs.rst
index 02a8820c9..b3d2e3055 100644
--- a/doc/source/dev/howto_build_docs.rst
+++ b/doc/source/dev/howto_build_docs.rst
@@ -14,22 +14,13 @@ in several different formats.
Development environments
========================
-Before proceeding further it should be noted that the documentation is built with the ``make`` tool,
-which is not natively available on Windows. MacOS or Linux users can jump
-to :ref:`how-todoc.prerequisites`. It is recommended for Windows users to set up their development
-environment on :ref:`Gitpod <development-gitpod>` or `Windows Subsystem
-for Linux (WSL) <https://docs.microsoft.com/en-us/windows/wsl/install-win10>`_. WSL is a good option
-for a persistent local set-up.
-
-Gitpod
-~~~~~~
-Gitpod is an open-source platform that automatically creates the correct development environment right
-in your browser, reducing the need to install local development environments and deal with
-incompatible dependencies.
-
-If you have good internet connectivity and want a temporary set-up,
-it is often faster to build with Gitpod. Here are the in-depth instructions for
-:ref:`building NumPy with Gitpod <development-gitpod>`.
+Before proceeding further it should be noted that the documentation is built
+with the ``make`` tool, which is not natively available on Windows. MacOS or
+Linux users can jump to :ref:`how-todoc.prerequisites`. It is recommended for
+Windows users to set up their development environment on
+GitHub Codespaces (see :ref:`recommended-development-setup`) or
+`Windows Subsystem for Linux (WSL) <https://learn.microsoft.com/en-us/windows/wsl/install>`_.
+WSL is a good option for a persistent local set-up.
.. _how-todoc.prerequisites:
@@ -44,7 +35,8 @@ NumPy
Since large parts of the main documentation are obtained from NumPy via
``import numpy`` and examining the docstrings, you will need to first
-:ref:`build <building-from-source>` and install it so that the correct version is imported.
+:ref:`build <development-environment>` and install it so that the correct version is
+imported.
NumPy has to be re-built and re-installed every time you fetch the latest version of the
repository, before generating the documentation. This ensures that the NumPy version and
the git repository version are in sync.
diff --git a/doc/source/dev/index.rst b/doc/source/dev/index.rst
index 93b57f7e2..b4479fa0d 100644
--- a/doc/source/dev/index.rst
+++ b/doc/source/dev/index.rst
@@ -45,7 +45,7 @@ Here's the short summary, complete TOC links are below:
* Clone the project to your local computer::
- git clone https://github.com/your-username/numpy.git
+ git clone --recurse-submodules https://github.com/your-username/numpy.git
* Change the directory::
@@ -60,12 +60,16 @@ Here's the short summary, complete TOC links are below:
- ``upstream``, which refers to the ``numpy`` repository
- ``origin``, which refers to your personal fork
-2. Develop your contribution:
-
- * Pull the latest changes from upstream::
+ * Pull the latest changes from upstream, including tags::
git checkout main
- git pull upstream main
+ git pull upstream main --tags
+
+ * Initialize numpy's submodules::
+
+ git submodule update --init
+
+2. Develop your contribution:
* Create a branch for the feature you want to work on. Since the
branch name will appear in the merge message, use a sensible name
@@ -180,7 +184,7 @@ Guidelines
get no response to your pull request within a week.
.. _stylistic-guidelines:
-
+
Stylistic Guidelines
--------------------
@@ -210,7 +214,7 @@ Running NumPy's test suite locally requires some additional packages, such as
in ``test_requirements.txt`` in the top-level directory, and can conveniently
be installed with::
- pip install -r test_requirements.txt
+ $ python -m pip install -r test_requirements.txt
Tests for a module should ideally cover all code in that module,
i.e., statement coverage should be at 100%.
@@ -258,7 +262,6 @@ The rest of the story
Git Basics <gitwash/index>
development_environment
- development_gitpod
howto_build_docs
development_workflow
development_advanced_debugging
diff --git a/doc/source/dev/reviewer_guidelines.rst b/doc/source/dev/reviewer_guidelines.rst
index 8d938319e..0ed9bf2e5 100644
--- a/doc/source/dev/reviewer_guidelines.rst
+++ b/doc/source/dev/reviewer_guidelines.rst
@@ -1,7 +1,7 @@
.. _reviewer-guidelines:
===================
-Reviewer Guidelines
+Reviewer guidelines
===================
Reviewing open pull requests (PRs) helps move the project forward. We encourage
@@ -101,7 +101,34 @@ For maintainers
If a PR becomes inactive, maintainers may make larger changes.
Remember, a PR is a collaboration between a contributor and a reviewer/s,
sometimes a direct push is the best way to finish it.
-
+
+API Changes
+-----------
+As mentioned, most public API changes should be discussed ahead of time and
+often with a wider audience (on the mailing list, or even through a NEP).
+
+For changes in the public C-API be aware that the NumPy C-API is backwards
+compatible, so any addition must be ABI compatible with previous versions.
+When it is not the case, you must add a guard.
+
+For example ``PyUnicodeScalarObject`` struct contains the following::
+
+ #if NPY_FEATURE_VERSION >= NPY_1_20_API_VERSION
+ char *buffer_fmt;
+ #endif
+
+This is because the ``buffer_fmt`` field was added to its end in NumPy 1.20 (all
+previous fields remained ABI compatible).
+Similarly, any function added to the API table in
+``numpy/core/code_generators/numpy_api.py`` must use the ``MinVersion``
+annotation.
+For example::
+
+ 'PyDataMem_SetHandler': (304, MinVersion("1.22")),
+
+Header only functionality (such as a new macro) typically does not need to be
+guarded.
+
GitHub Workflow
---------------
diff --git a/doc/source/f2py/buildtools/cmake.rst b/doc/source/f2py/buildtools/cmake.rst
index 8c654c73e..db64453b4 100644
--- a/doc/source/f2py/buildtools/cmake.rst
+++ b/doc/source/f2py/buildtools/cmake.rst
@@ -18,7 +18,7 @@ but this `extensive CMake collection`_ of resources is great.
``f2py`` is not particularly native or pleasant; and a more natural approach
is to consider :ref:`f2py-skbuild`
-Fibonacci Walkthrough (F77)
+Fibonacci walkthrough (F77)
===========================
Returning to the ``fib`` example from :ref:`f2py-getting-started` section.
diff --git a/doc/source/f2py/buildtools/meson.rst b/doc/source/f2py/buildtools/meson.rst
index 6b4392880..23b454ee5 100644
--- a/doc/source/f2py/buildtools/meson.rst
+++ b/doc/source/f2py/buildtools/meson.rst
@@ -23,7 +23,7 @@ build system like ``meson``. We will acquire this by:
.. code-block:: bash
- python -n numpy.f2py fib1.f -m fib2
+ python -m numpy.f2py fib1.f -m fib2
Now, consider the following ``meson.build`` file for the ``fib`` and ``scalar``
examples from :ref:`f2py-getting-started` section:
@@ -57,7 +57,7 @@ to lowercase the original Fortran file with say:
.. code-block:: bash
tr "[:upper:]" "[:lower:]" < fib1.f > fib1.f
- python -n numpy.f2py fib1.f -m fib2
+ python -m numpy.f2py fib1.f -m fib2
meson --wipe builddir
meson compile -C builddir
cd builddir
@@ -68,7 +68,7 @@ possible. The easiest way to solve this is to let ``f2py`` deal with it:
.. code-block:: bash
- python -n numpy.f2py fib1.f -m fib2 --lower
+ python -m numpy.f2py fib1.f -m fib2 --lower
meson --wipe builddir
meson compile -C builddir
cd builddir
diff --git a/doc/source/f2py/code/ftype.f b/doc/source/f2py/code/ftype.f
index cabbb9e2d..67d5a224b 100644
--- a/doc/source/f2py/code/ftype.f
+++ b/doc/source/f2py/code/ftype.f
@@ -4,6 +4,6 @@ C FILE: FTYPE.F
Cf2py integer optional,intent(in) :: n = 13
REAL A,X
COMMON /DATA/ A,X(3)
- PRINT*, "IN FOO: N=",N," A=",A," X=[",X(1),X(2),X(3),"]"
+C PRINT*, "IN FOO: N=",N," A=",A," X=[",X(1),X(2),X(3),"]"
END
C END OF FTYPE.F
diff --git a/doc/source/f2py/f2py.getting-started.rst b/doc/source/f2py/f2py.getting-started.rst
index da88b46f5..e96564814 100644
--- a/doc/source/f2py/f2py.getting-started.rst
+++ b/doc/source/f2py/f2py.getting-started.rst
@@ -18,6 +18,7 @@ following steps:
* F2PY reads a signature file and writes a Python C/API module containing
Fortran/C/Python bindings.
+
* F2PY compiles all sources and builds an extension module containing
the wrappers.
@@ -26,6 +27,13 @@ following steps:
Fortran, SGI MIPSpro, Absoft, NAG, Compaq etc. For different build systems,
see :ref:`f2py-bldsys`.
+ * Depending on your operating system, you may need to install the Python
+ development headers (which provide the file ``Python.h``) separately. In
+ Linux Debian-based distributions this package should be called ``python3-dev``,
+ in Fedora-based distributions it is ``python3-devel``. For macOS, depending
+ on how Python was installed, your mileage may vary. In Windows, the headers are
+ typically installed already.
+
Depending on the situation, these steps can be carried out in a single composite
command or step-by-step; in which case some steps can be omitted or combined
with others.
diff --git a/doc/source/f2py/windows/index.rst b/doc/source/f2py/windows/index.rst
index c1e6b4128..980346667 100644
--- a/doc/source/f2py/windows/index.rst
+++ b/doc/source/f2py/windows/index.rst
@@ -56,7 +56,7 @@ PGI Compilers (commercial)
Windows support`_.
Cygwin (FOSS)
- Can also be used for ``gfortran``. Howeve, the POSIX API compatibility layer provided by
+ Can also be used for ``gfortran``. However, the POSIX API compatibility layer provided by
Cygwin is meant to compile UNIX software on Windows, instead of building
native Windows programs. This means cross compilation is required.
diff --git a/doc/source/f2py/windows/pgi.rst b/doc/source/f2py/windows/pgi.rst
index 644259abe..28e25f016 100644
--- a/doc/source/f2py/windows/pgi.rst
+++ b/doc/source/f2py/windows/pgi.rst
@@ -22,7 +22,5 @@ as classic Flang requires a custom LLVM and compilation from sources.
As of 29-01-2022, `PGI compiler toolchains`_ have been superseded by the Nvidia
HPC SDK, with no `native Windows support`_.
-However,
-
.. _PGI compiler toolchains: https://www.pgroup.com/index.html
.. _native Windows support: https://developer.nvidia.com/nvidia-hpc-sdk-downloads#collapseFour
diff --git a/doc/source/index.rst b/doc/source/index.rst
index f31e730d7..5e31ec0a4 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -17,7 +17,6 @@ NumPy documentation
**Version**: |version|
**Download documentation**:
-`PDF Version <https://numpy.org/doc/stable/numpy-user.pdf>`_ |
`Historical versions of documentation <https://numpy.org/doc/>`_
**Useful links**:
diff --git a/doc/source/reference/arrays.classes.rst b/doc/source/reference/arrays.classes.rst
index 2f40423ee..34da83670 100644
--- a/doc/source/reference/arrays.classes.rst
+++ b/doc/source/reference/arrays.classes.rst
@@ -71,10 +71,11 @@ NumPy provides several hooks that classes can customize:
The method should return either the result of the operation, or
:obj:`NotImplemented` if the operation requested is not implemented.
- If one of the input or output arguments has a :func:`__array_ufunc__`
+ If one of the input, output, or ``where`` arguments has a :func:`__array_ufunc__`
method, it is executed *instead* of the ufunc. If more than one of the
arguments implements :func:`__array_ufunc__`, they are tried in the
- order: subclasses before superclasses, inputs before outputs, otherwise
+ order: subclasses before superclasses, inputs before outputs,
+ outputs before ``where``, otherwise
left to right. The first routine returning something other than
:obj:`NotImplemented` determines the result. If all of the
:func:`__array_ufunc__` operations return :obj:`NotImplemented`, a
@@ -162,15 +163,6 @@ NumPy provides several hooks that classes can customize:
.. versionadded:: 1.16
- .. note::
-
- - In NumPy 1.17, the protocol is enabled by default, but can be disabled
- with ``NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=0``.
- - In NumPy 1.16, you need to set the environment variable
- ``NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1`` before importing NumPy to use
- NumPy function overrides.
- - Eventually, expect to ``__array_function__`` to always be enabled.
-
- ``func`` is an arbitrary callable exposed by NumPy's public API,
which was called in the form ``func(*args, **kwargs)``.
- ``types`` is a collection :py:class:`collections.abc.Collection`
diff --git a/doc/source/reference/arrays.dtypes.rst b/doc/source/reference/arrays.dtypes.rst
index 8606bc8f1..f4218ce86 100644
--- a/doc/source/reference/arrays.dtypes.rst
+++ b/doc/source/reference/arrays.dtypes.rst
@@ -188,10 +188,10 @@ Built-in Python types
(all others) :class:`object_`
================ ===============
- Note that ``str`` refers to either null terminated bytes or unicode strings
- depending on the Python version. In code targeting both Python 2 and 3
- ``np.unicode_`` should be used as a dtype for strings.
- See :ref:`Note on string types<string-dtype-note>`.
+ Note that ``str`` corresponds to UCS4 encoded unicode strings, while
+ ``string`` is an alias to ``bytes_``. The name ``np.unicode_`` is also
+ available as an alias to ``np.str_``, see :ref:`Note on string
+ types<string-dtype-note>`.
.. admonition:: Example
@@ -263,11 +263,11 @@ Array-protocol type strings (see :ref:`arrays.interface`)
.. admonition:: Note on string types
- For backward compatibility with Python 2 the ``S`` and ``a`` typestrings
- remain zero-terminated bytes and `numpy.string_` continues to alias
- `numpy.bytes_`. To use actual strings in Python 3 use ``U`` or `numpy.str_`.
- For signed bytes that do not need zero-termination ``b`` or ``i1`` can be
- used.
+ For backward compatibility with existing code originally written to support
+ Python 2, ``S`` and ``a`` typestrings are zero-terminated bytes and
+ `numpy.string_` continues to alias `numpy.bytes_`. For unicode strings,
+ use ``U``, `numpy.str_`, or `numpy.unicode_`. For signed bytes that do not
+ need zero-termination ``b`` or ``i1`` can be used.
String with comma-separated fields
A short-hand notation for specifying the format of a structured data type is
diff --git a/doc/source/reference/arrays.nditer.rst b/doc/source/reference/arrays.nditer.rst
index 8cabc1a06..fb1c91a07 100644
--- a/doc/source/reference/arrays.nditer.rst
+++ b/doc/source/reference/arrays.nditer.rst
@@ -3,7 +3,7 @@
.. _arrays.nditer:
*********************
-Iterating Over Arrays
+Iterating over arrays
*********************
.. note::
@@ -23,7 +23,7 @@ can accelerate the inner loop in Cython. Since the Python exposure of
iterator API, these ideas will also provide help working with array
iteration from C or C++.
-Single Array Iteration
+Single array iteration
======================
The most basic task that can be done with the :class:`nditer` is to
@@ -64,7 +64,7 @@ namely the order they are stored in memory, whereas the elements of
`a.T.copy(order='C')` get visited in a different order because they
have been put into a different memory layout.
-Controlling Iteration Order
+Controlling iteration order
---------------------------
There are times when it is important to visit the elements of an array
@@ -88,7 +88,7 @@ order='C' for C order and order='F' for Fortran order.
.. _nditer-context-manager:
-Modifying Array Values
+Modifying array values
----------------------
By default, the :class:`nditer` treats the input operand as a read-only
@@ -128,7 +128,7 @@ note that prior to 1.15, :class:`nditer` was not a context manager and
did not have a `close` method. Instead it relied on the destructor to
initiate the writeback of the buffer.
-Using an External Loop
+Using an external loop
----------------------
In all the examples so far, the elements of `a` are provided by the
@@ -161,7 +161,7 @@ elements each.
...
[0 3] [1 4] [2 5]
-Tracking an Index or Multi-Index
+Tracking an index or multi-index
--------------------------------
During iteration, you may want to use the index of the current
@@ -210,7 +210,7 @@ raise an exception.
File "<stdin>", line 1, in <module>
ValueError: Iterator flag EXTERNAL_LOOP cannot be used if an index or multi-index is being tracked
-Alternative Looping and Element Access
+Alternative looping and element access
--------------------------------------
To make its properties more readily accessible during iteration,
@@ -246,7 +246,7 @@ produce identical results to the ones in the previous section.
array([[ 0, 1, 2],
[-1, 0, 1]])
-Buffering the Array Elements
+Buffering the array elements
----------------------------
When forcing an iteration order, we observed that the external loop
@@ -274,7 +274,7 @@ is enabled.
...
[0 3 1 4 2 5]
-Iterating as a Specific Data Type
+Iterating as a specific data type
---------------------------------
There are times when it is necessary to treat an array as a different
@@ -382,7 +382,7 @@ would violate the casting rule.
File "<stdin>", line 2, in <module>
TypeError: Iterator requested dtype could not be cast from dtype('float64') to dtype('int64'), the operand 0 dtype, according to the rule 'same_kind'
-Broadcasting Array Iteration
+Broadcasting array iteration
============================
NumPy has a set of rules for dealing with arrays that have differing
@@ -417,7 +417,7 @@ which includes the input shapes to help diagnose the problem.
...
ValueError: operands could not be broadcast together with shapes (2,) (2,3)
-Iterator-Allocated Output Arrays
+Iterator-allocated output arrays
--------------------------------
A common case in NumPy functions is to have outputs allocated based
diff --git a/doc/source/reference/arrays.scalars.rst b/doc/source/reference/arrays.scalars.rst
index 30d7be9f9..4dba54d6b 100644
--- a/doc/source/reference/arrays.scalars.rst
+++ b/doc/source/reference/arrays.scalars.rst
@@ -293,6 +293,10 @@ elements the data type consists of.)
:members: __init__
:exclude-members: __init__
+.. autoclass:: numpy.character
+ :members: __init__
+ :exclude-members: __init__
+
.. autoclass:: numpy.bytes_
:members: __init__
:exclude-members: __init__
@@ -333,8 +337,6 @@ are also provided.
.. note that these are documented with ..attribute because that is what
autoclass does for aliases under the hood.
-.. autoclass:: numpy.bool8
-
.. attribute:: int8
int16
int32
diff --git a/doc/source/reference/c-api/array.rst b/doc/source/reference/c-api/array.rst
index 8772b494c..038702bcf 100644
--- a/doc/source/reference/c-api/array.rst
+++ b/doc/source/reference/c-api/array.rst
@@ -423,7 +423,7 @@ From other objects
:c:data:`NPY_ARRAY_FORCECAST` is present in ``flags``,
this call will generate an error if the data
type cannot be safely obtained from the object. If you want to use
- ``NULL`` for the *dtype* and ensure the array is notswapped then
+ ``NULL`` for the *dtype* and ensure the array is not swapped then
use :c:func:`PyArray_CheckFromAny`. A value of 0 for either of the
depth parameters causes the parameter to be ignored. Any of the
following array flags can be added (*e.g.* using \|) to get the
@@ -548,22 +548,6 @@ From other objects
:c:data:`NPY_ARRAY_F_CONTIGUOUS` \| :c:data:`NPY_ARRAY_WRITEABLE` \|
:c:data:`NPY_ARRAY_ALIGNED` \| :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`
-.. c:function:: int PyArray_GetArrayParamsFromObject( \
- PyObject* op, PyArray_Descr* requested_dtype, npy_bool writeable, \
- PyArray_Descr** out_dtype, int* out_ndim, npy_intp* out_dims, \
- PyArrayObject** out_arr, PyObject* context)
-
- .. deprecated:: NumPy 1.19
-
- Unless NumPy is made aware of an issue with this, this function
- is scheduled for rapid removal without replacement.
-
- .. versionchanged:: NumPy 1.19
-
- `context` is never used. Its use results in an error.
-
- .. versionadded:: 1.6
-
.. c:function:: PyObject* PyArray_CheckFromAny( \
PyObject* op, PyArray_Descr* dtype, int min_depth, int max_depth, \
int requirements, PyObject* context)
@@ -1165,29 +1149,13 @@ Converting data types
.. versionadded:: 1.6
- This applies type promotion to all the inputs,
- using the NumPy rules for combining scalars and arrays, to
- determine the output type of a set of operands. This is the
- same result type that ufuncs produce. The specific algorithm
- used is as follows.
-
- Categories are determined by first checking which of boolean,
- integer (int/uint), or floating point (float/complex) the maximum
- kind of all the arrays and the scalars are.
+ This applies type promotion to all the input arrays and dtype
+ objects, using the NumPy rules for combining scalars and arrays, to
+ determine the output type for an operation with the given set of
+ operands. This is the same result type that ufuncs produce.
- If there are only scalars or the maximum category of the scalars
- is higher than the maximum category of the arrays,
- the data types are combined with :c:func:`PyArray_PromoteTypes`
- to produce the return value.
-
- Otherwise, PyArray_MinScalarType is called on each array, and
- the resulting data types are all combined with
- :c:func:`PyArray_PromoteTypes` to produce the return value.
-
- The set of int values is not a subset of the uint values for types
- with the same number of bits, something not reflected in
- :c:func:`PyArray_MinScalarType`, but handled as a special case in
- PyArray_ResultType.
+ See the documentation of :func:`numpy.result_type` for more
+ detail about the type promotion algorithm.
.. c:function:: int PyArray_ObjectType(PyObject* op, int mintype)
@@ -1202,17 +1170,6 @@ Converting data types
return value is the enumerated typenumber that represents the
data-type that *op* should have.
-.. c:function:: void PyArray_ArrayType( \
- PyObject* op, PyArray_Descr* mintype, PyArray_Descr* outtype)
-
- This function is superseded by :c:func:`PyArray_ResultType`.
-
- This function works similarly to :c:func:`PyArray_ObjectType` (...)
- except it handles flexible arrays. The *mintype* argument can have
- an itemsize member and the *outtype* argument will have an
- itemsize member at least as big but perhaps bigger depending on
- the object *op*.
-
.. c:function:: PyArrayObject** PyArray_ConvertToCommonType( \
PyObject* op, int* n)
@@ -1490,7 +1447,7 @@ of the constant names is deprecated in 1.7.
:c:func:`PyArray_FromAny` and a copy had to be made of some other
array (and the user asked for this flag to be set in such a
situation). The base attribute then points to the "misbehaved"
- array (which is set read_only). :c:func`PyArray_ResolveWritebackIfCopy`
+ array (which is set read_only). :c:func:`PyArray_ResolveWritebackIfCopy`
will copy its contents back to the "misbehaved"
array (casting if necessary) and will reset the "misbehaved" array
to :c:data:`NPY_ARRAY_WRITEABLE`. If the "misbehaved" array was not
@@ -2276,7 +2233,7 @@ Array Functions
output array must have the correct shape, type, and be
C-contiguous, or an exception is raised.
-.. c:function:: PyObject* PyArray_EinsteinSum( \
+.. c:function:: PyArrayObject* PyArray_EinsteinSum( \
char* subscripts, npy_intp nop, PyArrayObject** op_in, \
PyArray_Descr* dtype, NPY_ORDER order, NPY_CASTING casting, \
PyArrayObject* out)
@@ -2475,9 +2432,9 @@ As of NumPy 1.6.0, these array iterators are superseded by
the new array iterator, :c:type:`NpyIter`.
An array iterator is a simple way to access the elements of an
-N-dimensional array quickly and efficiently. Section `2
-<#sec-array-iterator>`__ provides more description and examples of
-this useful approach to looping over an array.
+N-dimensional array quickly and efficiently, as seen in :ref:`the
+example <iteration-example>` which provides more description
+of this useful approach to looping over an array from C.
.. c:function:: PyObject* PyArray_IterNew(PyObject* arr)
@@ -3378,7 +3335,7 @@ Memory management
.. c:function:: int PyArray_ResolveWritebackIfCopy(PyArrayObject* obj)
- If ``obj.flags`` has :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`, this function
+ If ``obj->flags`` has :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`, this function
clears the flags, `DECREF` s
`obj->base` and makes it writeable, and sets ``obj->base`` to NULL. It then
copies ``obj->data`` to `obj->base->data`, and returns the error state of
@@ -3608,29 +3565,17 @@ Miscellaneous Macros
Returns the reference count of any Python object.
-.. c:function:: void PyArray_DiscardWritebackIfCopy(PyObject* obj)
+.. c:function:: void PyArray_DiscardWritebackIfCopy(PyArrayObject* obj)
- If ``obj.flags`` has :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`, this function
+ If ``obj->flags`` has :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`, this function
clears the flags, `DECREF` s
`obj->base` and makes it writeable, and sets ``obj->base`` to NULL. In
- contrast to :c:func:`PyArray_DiscardWritebackIfCopy` it makes no attempt
- to copy the data from `obj->base` This undoes
+ contrast to :c:func:`PyArray_ResolveWritebackIfCopy` it makes no attempt
+ to copy the data from `obj->base`. This undoes
:c:func:`PyArray_SetWritebackIfCopyBase`. Usually this is called after an
error when you are finished with ``obj``, just before ``Py_DECREF(obj)``.
It may be called multiple times, or with ``NULL`` input.
-.. c:function:: void PyArray_XDECREF_ERR(PyObject* obj)
-
- Deprecated in 1.14, use :c:func:`PyArray_DiscardWritebackIfCopy`
- followed by ``Py_XDECREF``
-
- DECREF's an array object which may have the
- :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`
- flag set without causing the contents to be copied back into the
- original array. Resets the :c:data:`NPY_ARRAY_WRITEABLE` flag on the base
- object. This is useful for recovering from an error condition when
- writeback semantics are used, but will lead to wrong results.
-
Enumerated Types
~~~~~~~~~~~~~~~~
diff --git a/doc/source/reference/c-api/data_memory.rst b/doc/source/reference/c-api/data_memory.rst
index 2084ab5d0..cab587efc 100644
--- a/doc/source/reference/c-api/data_memory.rst
+++ b/doc/source/reference/c-api/data_memory.rst
@@ -159,3 +159,87 @@ A better technique would be to use a ``PyCapsule`` as a base object:
return NULL;
}
...
+
+Example of memory tracing with ``np.lib.tracemalloc_domain``
+------------------------------------------------------------
+
+Note that since Python 3.6, the builtin ``tracemalloc`` module can be used to
+track allocations inside NumPy. NumPy places its CPU memory allocations into the
+``np.lib.tracemalloc_domain`` domain.
+For additional information, see https://docs.python.org/3/library/tracemalloc.html.
+
+Here is an example of how to use ``np.lib.tracemalloc_domain``:
+
+.. code-block:: python
+
+ """
+ The goal of this example is to show how to trace memory
+ from an application that has NumPy and non-NumPy sections.
+ We only select the sections using NumPy related calls.
+ """
+
+ import tracemalloc
+ import numpy as np
+
+ # Flag to determine if we select NumPy domain
+ use_np_domain = True
+
+ nx = 300
+ ny = 500
+
+ # Start to trace memory
+ tracemalloc.start()
+
+ # Section 1
+ # ---------
+
+ # NumPy related call
+ a = np.zeros((nx,ny))
+
+ # non-NumPy related call
+ b = [i**2 for i in range(nx*ny)]
+
+ snapshot1 = tracemalloc.take_snapshot()
+ # We filter the snapshot to only select NumPy related calls
+ np_domain = np.lib.tracemalloc_domain
+ dom_filter = tracemalloc.DomainFilter(inclusive=use_np_domain,
+ domain=np_domain)
+ snapshot1 = snapshot1.filter_traces([dom_filter])
+ top_stats1 = snapshot1.statistics('traceback')
+
+ print("================ SNAPSHOT 1 =================")
+ for stat in top_stats1:
+ print(f"{stat.count} memory blocks: {stat.size / 1024:.1f} KiB")
+ print(stat.traceback.format()[-1])
+
+ # Clear traces of memory blocks allocated by Python
+ # before moving to the next section.
+ tracemalloc.clear_traces()
+
+ # Section 2
+ # ---------
+
+ # We are only using NumPy
+ c = np.sum(a*a)
+
+ snapshot2 = tracemalloc.take_snapshot()
+ top_stats2 = snapshot2.statistics('traceback')
+
+ print()
+ print("================ SNAPSHOT 2 =================")
+ for stat in top_stats2:
+ print(f"{stat.count} memory blocks: {stat.size / 1024:.1f} KiB")
+ print(stat.traceback.format()[-1])
+
+ tracemalloc.stop()
+
+ print()
+ print("============================================")
+ print("\nTracing Status : ", tracemalloc.is_tracing())
+
+ try:
+ print("\nTrying to Take Snapshot After Tracing is Stopped.")
+ snap = tracemalloc.take_snapshot()
+ except Exception as e:
+ print("Exception : ", e)
+
diff --git a/doc/source/reference/c-api/deprecations.rst b/doc/source/reference/c-api/deprecations.rst
index 5b1abc6f2..d805421f2 100644
--- a/doc/source/reference/c-api/deprecations.rst
+++ b/doc/source/reference/c-api/deprecations.rst
@@ -58,3 +58,7 @@ On compilers which support a #warning mechanism, NumPy issues a
compiler warning if you do not define the symbol NPY_NO_DEPRECATED_API.
This way, the fact that there are deprecations will be flagged for
third-party developers who may not have read the release notes closely.
+
+Note that defining NPY_NO_DEPRECATED_API is not sufficient to make your
+extension ABI compatible with a given NumPy version. See
+:ref:`for-downstream-package-authors`.
diff --git a/doc/source/reference/c-api/dtype.rst b/doc/source/reference/c-api/dtype.rst
index 642f62749..bdd5a6f55 100644
--- a/doc/source/reference/c-api/dtype.rst
+++ b/doc/source/reference/c-api/dtype.rst
@@ -25,161 +25,161 @@ select the precision desired.
Enumerated Types
----------------
-.. c:enumerator:: NPY_TYPES
+.. c:enum:: NPY_TYPES
-There is a list of enumerated types defined providing the basic 24
-data types plus some useful generic names. Whenever the code requires
-a type number, one of these enumerated types is requested. The types
-are all called ``NPY_{NAME}``:
+ There is a list of enumerated types defined providing the basic 24
+ data types plus some useful generic names. Whenever the code requires
+ a type number, one of these enumerated types is requested. The types
+ are all called ``NPY_{NAME}``:
-.. c:enumerator:: NPY_BOOL
+ .. c:enumerator:: NPY_BOOL
- The enumeration value for the boolean type, stored as one byte.
- It may only be set to the values 0 and 1.
+ The enumeration value for the boolean type, stored as one byte.
+ It may only be set to the values 0 and 1.
-.. c:enumerator:: NPY_BYTE
-.. c:enumerator:: NPY_INT8
+ .. c:enumerator:: NPY_BYTE
+ .. c:enumerator:: NPY_INT8
- The enumeration value for an 8-bit/1-byte signed integer.
+ The enumeration value for an 8-bit/1-byte signed integer.
-.. c:enumerator:: NPY_SHORT
-.. c:enumerator:: NPY_INT16
+ .. c:enumerator:: NPY_SHORT
+ .. c:enumerator:: NPY_INT16
- The enumeration value for a 16-bit/2-byte signed integer.
+ The enumeration value for a 16-bit/2-byte signed integer.
-.. c:enumerator:: NPY_INT
-.. c:enumerator:: NPY_INT32
+ .. c:enumerator:: NPY_INT
+ .. c:enumerator:: NPY_INT32
- The enumeration value for a 32-bit/4-byte signed integer.
+ The enumeration value for a 32-bit/4-byte signed integer.
-.. c:enumerator:: NPY_LONG
+ .. c:enumerator:: NPY_LONG
- Equivalent to either NPY_INT or NPY_LONGLONG, depending on the
- platform.
+ Equivalent to either NPY_INT or NPY_LONGLONG, depending on the
+ platform.
-.. c:enumerator:: NPY_LONGLONG
-.. c:enumerator:: NPY_INT64
+ .. c:enumerator:: NPY_LONGLONG
+ .. c:enumerator:: NPY_INT64
- The enumeration value for a 64-bit/8-byte signed integer.
+ The enumeration value for a 64-bit/8-byte signed integer.
-.. c:enumerator:: NPY_UBYTE
-.. c:enumerator:: NPY_UINT8
+ .. c:enumerator:: NPY_UBYTE
+ .. c:enumerator:: NPY_UINT8
- The enumeration value for an 8-bit/1-byte unsigned integer.
+ The enumeration value for an 8-bit/1-byte unsigned integer.
-.. c:enumerator:: NPY_USHORT
-.. c:enumerator:: NPY_UINT16
+ .. c:enumerator:: NPY_USHORT
+ .. c:enumerator:: NPY_UINT16
- The enumeration value for a 16-bit/2-byte unsigned integer.
+ The enumeration value for a 16-bit/2-byte unsigned integer.
-.. c:enumerator:: NPY_UINT
-.. c:enumerator:: NPY_UINT32
+ .. c:enumerator:: NPY_UINT
+ .. c:enumerator:: NPY_UINT32
- The enumeration value for a 32-bit/4-byte unsigned integer.
+ The enumeration value for a 32-bit/4-byte unsigned integer.
-.. c:enumerator:: NPY_ULONG
+ .. c:enumerator:: NPY_ULONG
- Equivalent to either NPY_UINT or NPY_ULONGLONG, depending on the
- platform.
+ Equivalent to either NPY_UINT or NPY_ULONGLONG, depending on the
+ platform.
-.. c:enumerator:: NPY_ULONGLONG
-.. c:enumerator:: NPY_UINT64
+ .. c:enumerator:: NPY_ULONGLONG
+ .. c:enumerator:: NPY_UINT64
- The enumeration value for a 64-bit/8-byte unsigned integer.
+ The enumeration value for a 64-bit/8-byte unsigned integer.
-.. c:enumerator:: NPY_HALF
-.. c:enumerator:: NPY_FLOAT16
+ .. c:enumerator:: NPY_HALF
+ .. c:enumerator:: NPY_FLOAT16
- The enumeration value for a 16-bit/2-byte IEEE 754-2008 compatible floating
- point type.
+ The enumeration value for a 16-bit/2-byte IEEE 754-2008 compatible floating
+ point type.
-.. c:enumerator:: NPY_FLOAT
-.. c:enumerator:: NPY_FLOAT32
+ .. c:enumerator:: NPY_FLOAT
+ .. c:enumerator:: NPY_FLOAT32
- The enumeration value for a 32-bit/4-byte IEEE 754 compatible floating
- point type.
+ The enumeration value for a 32-bit/4-byte IEEE 754 compatible floating
+ point type.
-.. c:enumerator:: NPY_DOUBLE
-.. c:enumerator:: NPY_FLOAT64
+ .. c:enumerator:: NPY_DOUBLE
+ .. c:enumerator:: NPY_FLOAT64
- The enumeration value for a 64-bit/8-byte IEEE 754 compatible floating
- point type.
+ The enumeration value for a 64-bit/8-byte IEEE 754 compatible floating
+ point type.
-.. c:enumerator:: NPY_LONGDOUBLE
+ .. c:enumerator:: NPY_LONGDOUBLE
- The enumeration value for a platform-specific floating point type which is
- at least as large as NPY_DOUBLE, but larger on many platforms.
+ The enumeration value for a platform-specific floating point type which is
+ at least as large as NPY_DOUBLE, but larger on many platforms.
-.. c:enumerator:: NPY_CFLOAT
-.. c:enumerator:: NPY_COMPLEX64
+ .. c:enumerator:: NPY_CFLOAT
+ .. c:enumerator:: NPY_COMPLEX64
- The enumeration value for a 64-bit/8-byte complex type made up of
- two NPY_FLOAT values.
+ The enumeration value for a 64-bit/8-byte complex type made up of
+ two NPY_FLOAT values.
-.. c:enumerator:: NPY_CDOUBLE
-.. c:enumerator:: NPY_COMPLEX128
+ .. c:enumerator:: NPY_CDOUBLE
+ .. c:enumerator:: NPY_COMPLEX128
- The enumeration value for a 128-bit/16-byte complex type made up of
- two NPY_DOUBLE values.
+ The enumeration value for a 128-bit/16-byte complex type made up of
+ two NPY_DOUBLE values.
-.. c:enumerator:: NPY_CLONGDOUBLE
+ .. c:enumerator:: NPY_CLONGDOUBLE
- The enumeration value for a platform-specific complex floating point
- type which is made up of two NPY_LONGDOUBLE values.
+ The enumeration value for a platform-specific complex floating point
+ type which is made up of two NPY_LONGDOUBLE values.
-.. c:enumerator:: NPY_DATETIME
+ .. c:enumerator:: NPY_DATETIME
- The enumeration value for a data type which holds dates or datetimes with
- a precision based on selectable date or time units.
+ The enumeration value for a data type which holds dates or datetimes with
+ a precision based on selectable date or time units.
-.. c:enumerator:: NPY_TIMEDELTA
+ .. c:enumerator:: NPY_TIMEDELTA
- The enumeration value for a data type which holds lengths of times in
- integers of selectable date or time units.
+ The enumeration value for a data type which holds lengths of times in
+ integers of selectable date or time units.
-.. c:enumerator:: NPY_STRING
+ .. c:enumerator:: NPY_STRING
- The enumeration value for ASCII strings of a selectable size. The
- strings have a fixed maximum size within a given array.
+ The enumeration value for ASCII strings of a selectable size. The
+ strings have a fixed maximum size within a given array.
-.. c:enumerator:: NPY_UNICODE
+ .. c:enumerator:: NPY_UNICODE
- The enumeration value for UCS4 strings of a selectable size. The
- strings have a fixed maximum size within a given array.
+ The enumeration value for UCS4 strings of a selectable size. The
+ strings have a fixed maximum size within a given array.
-.. c:enumerator:: NPY_OBJECT
+ .. c:enumerator:: NPY_OBJECT
- The enumeration value for references to arbitrary Python objects.
+ The enumeration value for references to arbitrary Python objects.
-.. c:enumerator:: NPY_VOID
+ .. c:enumerator:: NPY_VOID
- Primarily used to hold struct dtypes, but can contain arbitrary
- binary data.
+ Primarily used to hold struct dtypes, but can contain arbitrary
+ binary data.
-Some useful aliases of the above types are
+ Some useful aliases of the above types are
-.. c:enumerator:: NPY_INTP
+ .. c:enumerator:: NPY_INTP
- The enumeration value for a signed integer type which is the same
- size as a (void \*) pointer. This is the type used by all
- arrays of indices.
+ The enumeration value for a signed integer type which is the same
+ size as a (void \*) pointer. This is the type used by all
+ arrays of indices.
-.. c:enumerator:: NPY_UINTP
+ .. c:enumerator:: NPY_UINTP
- The enumeration value for an unsigned integer type which is the
- same size as a (void \*) pointer.
+ The enumeration value for an unsigned integer type which is the
+ same size as a (void \*) pointer.
-.. c:enumerator:: NPY_MASK
+ .. c:enumerator:: NPY_MASK
- The enumeration value of the type used for masks, such as with
- the :c:data:`NPY_ITER_ARRAYMASK` iterator flag. This is equivalent
- to :c:data:`NPY_UINT8`.
+ The enumeration value of the type used for masks, such as with
+ the :c:data:`NPY_ITER_ARRAYMASK` iterator flag. This is equivalent
+ to :c:data:`NPY_UINT8`.
-.. c:enumerator:: NPY_DEFAULT_TYPE
+ .. c:enumerator:: NPY_DEFAULT_TYPE
- The default type to use when no dtype is explicitly specified, for
- example when calling np.zero(shape). This is equivalent to
- :c:data:`NPY_DOUBLE`.
+ The default type to use when no dtype is explicitly specified, for
+ example when calling np.zeros(shape). This is equivalent to
+ :c:data:`NPY_DOUBLE`.
Other useful related constants are
diff --git a/doc/source/reference/c-api/iterator.rst b/doc/source/reference/c-api/iterator.rst
index 09c61e5fc..92e3e435b 100644
--- a/doc/source/reference/c-api/iterator.rst
+++ b/doc/source/reference/c-api/iterator.rst
@@ -26,8 +26,10 @@ which may be of interest for those using this C API. In many instances,
testing out ideas by creating the iterator in Python is a good idea
before writing the C iteration code.
+.. _iteration-example:
+
Iteration Example
-------------------------
+-----------------
The best way to become familiar with the iterator is to look at its
usage within the NumPy codebase itself. For example, here is a slightly
diff --git a/doc/source/reference/c-api/types-and-structures.rst b/doc/source/reference/c-api/types-and-structures.rst
index 34437bd30..cff1b3d38 100644
--- a/doc/source/reference/c-api/types-and-structures.rst
+++ b/doc/source/reference/c-api/types-and-structures.rst
@@ -122,7 +122,7 @@ PyArray_Type and PyArrayObject
``ndarraytypes.h`` points to this data member. :c:data:`NPY_MAXDIMS`
is the largest number of dimensions for any array.
- .. c:member:: npy_intp dimensions
+ .. c:member:: npy_intp *dimensions
An array of integers providing the shape in each dimension as
long as nd :math:`\geq` 1. The integer is always large enough
@@ -225,7 +225,7 @@ PyArrayDescr_Type and PyArray_Descr
- Never declare a non-pointer instance of the struct
- Never perform pointer arithmetic
- - Never use ``sizof(PyArray_Descr)``
+ - Never use ``sizeof(PyArray_Descr)``
It has the following structure:
@@ -285,83 +285,16 @@ PyArrayDescr_Type and PyArray_Descr
array like behavior. Each bit in this member is a flag which are named
as:
- .. c:member:: int alignment
-
- Non-NULL if this type is an array (C-contiguous) of some other type
-
-
-..
- dedented to allow internal linking, pending a refactoring
-
-.. c:macro:: NPY_ITEM_REFCOUNT
-
- Indicates that items of this data-type must be reference
- counted (using :c:func:`Py_INCREF` and :c:func:`Py_DECREF` ).
-
- .. c:macro:: NPY_ITEM_HASOBJECT
-
- Same as :c:data:`NPY_ITEM_REFCOUNT`.
-
-..
- dedented to allow internal linking, pending a refactoring
-
-.. c:macro:: NPY_LIST_PICKLE
-
- Indicates arrays of this data-type must be converted to a list
- before pickling.
-
-.. c:macro:: NPY_ITEM_IS_POINTER
-
- Indicates the item is a pointer to some other data-type
-
-.. c:macro:: NPY_NEEDS_INIT
-
- Indicates memory for this data-type must be initialized (set
- to 0) on creation.
-
-.. c:macro:: NPY_NEEDS_PYAPI
-
- Indicates this data-type requires the Python C-API during
- access (so don't give up the GIL if array access is going to
- be needed).
-
-.. c:macro:: NPY_USE_GETITEM
-
- On array access use the ``f->getitem`` function pointer
- instead of the standard conversion to an array scalar. Must
- use if you don't define an array scalar to go along with
- the data-type.
-
-.. c:macro:: NPY_USE_SETITEM
-
- When creating a 0-d array from an array scalar use
- ``f->setitem`` instead of the standard copy from an array
- scalar. Must use if you don't define an array scalar to go
- along with the data-type.
-
- .. c:macro:: NPY_FROM_FIELDS
-
- The bits that are inherited for the parent data-type if these
- bits are set in any field of the data-type. Currently (
- :c:data:`NPY_NEEDS_INIT` \| :c:data:`NPY_LIST_PICKLE` \|
- :c:data:`NPY_ITEM_REFCOUNT` \| :c:data:`NPY_NEEDS_PYAPI` ).
-
- .. c:macro:: NPY_OBJECT_DTYPE_FLAGS
-
- Bits set for the object data-type: ( :c:data:`NPY_LIST_PICKLE`
- \| :c:data:`NPY_USE_GETITEM` \| :c:data:`NPY_ITEM_IS_POINTER` \|
- :c:data:`NPY_ITEM_REFCOUNT` \| :c:data:`NPY_NEEDS_INIT` \|
- :c:data:`NPY_NEEDS_PYAPI`).
-
- .. c:function:: int PyDataType_FLAGCHK(PyArray_Descr *dtype, int flags)
-
- Return true if all the given flags are set for the data-type
- object.
-
- .. c:function:: int PyDataType_REFCHK(PyArray_Descr *dtype)
-
- Equivalent to :c:func:`PyDataType_FLAGCHK` (*dtype*,
- :c:data:`NPY_ITEM_REFCOUNT`).
+ * :c:macro:`NPY_ITEM_REFCOUNT`
+ * :c:macro:`NPY_ITEM_HASOBJECT`
+ * :c:macro:`NPY_LIST_PICKLE`
+ * :c:macro:`NPY_ITEM_IS_POINTER`
+ * :c:macro:`NPY_NEEDS_INIT`
+ * :c:macro:`NPY_NEEDS_PYAPI`
+ * :c:macro:`NPY_USE_GETITEM`
+ * :c:macro:`NPY_USE_SETITEM`
+ * :c:macro:`NPY_FROM_FIELDS`
+ * :c:macro:`NPY_OBJECT_DTYPE_FLAGS`
.. c:member:: int type_num
@@ -452,6 +385,73 @@ PyArrayDescr_Type and PyArray_Descr
Currently unused. Reserved for future use in caching
hash values.
+.. c:macro:: NPY_ITEM_REFCOUNT
+
+ Indicates that items of this data-type must be reference
+ counted (using :c:func:`Py_INCREF` and :c:func:`Py_DECREF` ).
+
+.. c:macro:: NPY_ITEM_HASOBJECT
+
+ Same as :c:data:`NPY_ITEM_REFCOUNT`.
+
+.. c:macro:: NPY_LIST_PICKLE
+
+ Indicates arrays of this data-type must be converted to a list
+ before pickling.
+
+.. c:macro:: NPY_ITEM_IS_POINTER
+
+ Indicates the item is a pointer to some other data-type
+
+.. c:macro:: NPY_NEEDS_INIT
+
+ Indicates memory for this data-type must be initialized (set
+ to 0) on creation.
+
+.. c:macro:: NPY_NEEDS_PYAPI
+
+ Indicates this data-type requires the Python C-API during
+ access (so don't give up the GIL if array access is going to
+ be needed).
+
+.. c:macro:: NPY_USE_GETITEM
+
+ On array access use the ``f->getitem`` function pointer
+ instead of the standard conversion to an array scalar. Must
+ use if you don't define an array scalar to go along with
+ the data-type.
+
+.. c:macro:: NPY_USE_SETITEM
+
+ When creating a 0-d array from an array scalar use
+ ``f->setitem`` instead of the standard copy from an array
+ scalar. Must use if you don't define an array scalar to go
+ along with the data-type.
+
+.. c:macro:: NPY_FROM_FIELDS
+
+ The bits that are inherited for the parent data-type if these
+ bits are set in any field of the data-type. Currently (
+ :c:data:`NPY_NEEDS_INIT` \| :c:data:`NPY_LIST_PICKLE` \|
+ :c:data:`NPY_ITEM_REFCOUNT` \| :c:data:`NPY_NEEDS_PYAPI` ).
+
+.. c:macro:: NPY_OBJECT_DTYPE_FLAGS
+
+ Bits set for the object data-type: ( :c:data:`NPY_LIST_PICKLE`
+ \| :c:data:`NPY_USE_GETITEM` \| :c:data:`NPY_ITEM_IS_POINTER` \|
+ :c:data:`NPY_ITEM_REFCOUNT` \| :c:data:`NPY_NEEDS_INIT` \|
+ :c:data:`NPY_NEEDS_PYAPI`).
+
+.. c:function:: int PyDataType_FLAGCHK(PyArray_Descr *dtype, int flags)
+
+ Return true if all the given flags are set for the data-type
+ object.
+
+.. c:function:: int PyDataType_REFCHK(PyArray_Descr *dtype)
+
+ Equivalent to :c:func:`PyDataType_FLAGCHK` (*dtype*,
+ :c:data:`NPY_ITEM_REFCOUNT`).
+
.. c:type:: PyArray_ArrFuncs
Functions implementing internal features. Not all of these
@@ -973,7 +973,7 @@ PyUFunc_Type and PyUFuncObject
Some fallback support for this slot exists, but will be removed
eventually. A universal function that relied on this will
have to be ported eventually.
- See ref:`NEP 41 <NEP41>` and ref:`NEP 43 <NEP43>`
+ See :ref:`NEP 41 <NEP41>` and :ref:`NEP 43 <NEP43>`
.. c:member:: void *reserved2
@@ -997,10 +997,14 @@ PyUFunc_Type and PyUFuncObject
.. c:member:: npy_uint32 *core_dim_flags
- For each distinct core dimension, a set of ``UFUNC_CORE_DIM*`` flags
+ For each distinct core dimension, a set of flags (
+ :c:macro:`UFUNC_CORE_DIM_CAN_IGNORE` and
+ :c:macro:`UFUNC_CORE_DIM_SIZE_INFERRED`)
+
+ .. c:member:: PyObject *identity_value
-..
- dedented to allow internal linking, pending a refactoring
+ Identity for reduction, when :c:member:`PyUFuncObject.identity`
+ is equal to :c:data:`PyUFunc_IdentityValue`.
.. c:macro:: UFUNC_CORE_DIM_CAN_IGNORE
@@ -1011,11 +1015,6 @@ PyUFunc_Type and PyUFuncObject
if the dim size will be determined from the operands
and not from a :ref:`frozen <frozen>` signature
- .. c:member:: PyObject *identity_value
-
- Identity for reduction, when :c:member:`PyUFuncObject.identity`
- is equal to :c:data:`PyUFunc_IdentityValue`.
-
PyArrayIter_Type and PyArrayIterObject
--------------------------------------
@@ -1253,7 +1252,7 @@ ScalarArrayTypes
----------------
There is a Python type for each of the different built-in data types
-that can be present in the array Most of these are simple wrappers
+that can be present in the array. Most of these are simple wrappers
around the corresponding data type in C. The C-names for these types
are ``Py{TYPE}ArrType_Type`` where ``{TYPE}`` can be
@@ -1457,22 +1456,6 @@ memory management. These types are not accessible directly from
Python, and are not exposed to the C-API. They are included here only
for completeness and assistance in understanding the code.
-
-.. c:type:: PyUFuncLoopObject
-
- A loose wrapper for a C-structure that contains the information
- needed for looping. This is useful if you are trying to understand
- the ufunc looping code. The :c:type:`PyUFuncLoopObject` is the associated
- C-structure. It is defined in the ``ufuncobject.h`` header.
-
-.. c:type:: PyUFuncReduceObject
-
- A loose wrapper for the C-structure that contains the information
- needed for reduce-like methods of ufuncs. This is useful if you are
- trying to understand the reduce, accumulate, and reduce-at
- code. The :c:type:`PyUFuncReduceObject` is the associated C-structure. It
- is defined in the ``ufuncobject.h`` header.
-
.. c:type:: PyUFunc_Loop1d
A simple linked-list of C-structures containing the information needed
@@ -1483,8 +1466,12 @@ for completeness and assistance in understanding the code.
Advanced indexing is handled with this Python type. It is simply a
loose wrapper around the C-structure containing the variables
- needed for advanced array indexing. The associated C-structure,
- ``PyArrayMapIterObject``, is useful if you are trying to
+ needed for advanced array indexing.
+
+.. c:type:: PyArrayMapIterObject
+
+ The C-structure associated with :c:var:`PyArrayMapIter_Type`.
+ This structure is useful if you are trying to
understand the advanced-index mapping code. It is defined in the
``arrayobject.h`` header. This type is not exposed to Python and
could be replaced with a C-structure. As a Python type it takes
diff --git a/doc/source/reference/distutils_guide.rst b/doc/source/reference/distutils_guide.rst
index a72a9c4d6..6f927c231 100644
--- a/doc/source/reference/distutils_guide.rst
+++ b/doc/source/reference/distutils_guide.rst
@@ -1,6 +1,6 @@
.. _distutils-user-guide:
-NumPy Distutils - Users Guide
+NumPy distutils - users guide
=============================
.. warning::
diff --git a/doc/source/reference/distutils_status_migration.rst b/doc/source/reference/distutils_status_migration.rst
index eda4790b5..67695978c 100644
--- a/doc/source/reference/distutils_status_migration.rst
+++ b/doc/source/reference/distutils_status_migration.rst
@@ -83,7 +83,7 @@ present in ``setuptools``:
- Support for a few other scientific libraries, like FFTW and UMFPACK
- Better MinGW support
- Per-compiler build flag customization (e.g. `-O3` and `SSE2` flags are default)
-- a simple user build config system, see [site.cfg.example](https://github.com/numpy/numpy/blob/master/site.cfg.example)
+- a simple user build config system, see `site.cfg.example <https://github.com/numpy/numpy/blob/master/site.cfg.example>`__
- SIMD intrinsics support
The most widely used feature is nested ``setup.py`` files. This feature will
diff --git a/doc/source/reference/figures/dtype-hierarchy.dia b/doc/source/reference/figures/dtype-hierarchy.dia
index 62e925cfd..d5c01bf27 100644
--- a/doc/source/reference/figures/dtype-hierarchy.dia
+++ b/doc/source/reference/figures/dtype-hierarchy.dia
Binary files differ
diff --git a/doc/source/reference/figures/dtype-hierarchy.pdf b/doc/source/reference/figures/dtype-hierarchy.pdf
index 6ce496a3e..b2375e152 100644
--- a/doc/source/reference/figures/dtype-hierarchy.pdf
+++ b/doc/source/reference/figures/dtype-hierarchy.pdf
Binary files differ
diff --git a/doc/source/reference/figures/dtype-hierarchy.png b/doc/source/reference/figures/dtype-hierarchy.png
index 6c45758b1..891de096c 100644
--- a/doc/source/reference/figures/dtype-hierarchy.png
+++ b/doc/source/reference/figures/dtype-hierarchy.png
Binary files differ
diff --git a/doc/source/reference/global_state.rst b/doc/source/reference/global_state.rst
index 81685ec7d..a898450af 100644
--- a/doc/source/reference/global_state.rst
+++ b/doc/source/reference/global_state.rst
@@ -1,7 +1,7 @@
.. _global_state:
************
-Global State
+Global state
************
NumPy has a few import-time, compile-time, or runtime options
@@ -11,10 +11,10 @@ purposes and will not be interesting to the vast majority
of users.
-Performance-Related Options
+Performance-related options
===========================
-Number of Threads used for Linear Algebra
+Number of threads used for Linear Algebra
-----------------------------------------
NumPy itself is normally intentionally limited to a single thread
@@ -49,25 +49,17 @@ is to use madvise on Kernels 4.6 and newer. These kernels presumably
experience a large speedup with hugepage support.
This flag is checked at import time.
+SIMD feature selection
+----------------------
-Interoperability-Related Options
-================================
+Setting ``NPY_DISABLE_CPU_FEATURES`` will exclude SIMD features at runtime.
+See :ref:`runtime-simd-dispatch`.
-The array function protocol which allows array-like objects to
-hook into the NumPy API is currently enabled by default.
-This option exists since NumPy 1.16 and is enabled by default since
-NumPy 1.17. It can be disabled using::
- NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=0
-
-See also :py:meth:`numpy.class.__array_function__` for more information.
-This flag is checked at import time.
-
-
-Debugging-Related Options
+Debugging-related options
=========================
-Relaxed Strides Checking
+Relaxed strides checking
------------------------
The *compile-time* environment variable::
@@ -90,3 +82,17 @@ memory allocation policy, the default will be to call ``free``. If
``NUMPY_WARN_IF_NO_MEM_POLICY`` is set to ``"1"``, a ``RuntimeWarning`` will
be emitted. A better alternative is to use a ``PyCapsule`` with a deallocator
and set the ``ndarray.base``.
+
+
+Testing planned future behavior
+===============================
+
+NumPy has some code paths which are planned to be activated in the future
+but are not yet the default behavior.
+You can try testing some of these which may be shipped with a new "major"
+release (NumPy 2.0) by setting an environment variable before importing NumPy::
+
+ NPY_NUMPY_2_BEHAVIOR=1
+
+By default this will also activate the :ref:`NEP 50 <NEP50>` related setting
+``NPY_PROMOTION_STATE`` (please see the NEP for details on this).
diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst
index 8e4a81436..4756af5fa 100644
--- a/doc/source/reference/index.rst
+++ b/doc/source/reference/index.rst
@@ -3,7 +3,7 @@
.. _reference:
###############
-NumPy Reference
+NumPy reference
###############
:Release: |version|
diff --git a/doc/source/reference/internals.code-explanations.rst b/doc/source/reference/internals.code-explanations.rst
index d34314610..db3b3b452 100644
--- a/doc/source/reference/internals.code-explanations.rst
+++ b/doc/source/reference/internals.code-explanations.rst
@@ -1,7 +1,7 @@
:orphan:
*************************
-NumPy C Code Explanations
+NumPy C code explanations
*************************
.. This document has been moved to ../dev/internals.code-explanations.rst.
diff --git a/doc/source/reference/maskedarray.baseclass.rst b/doc/source/reference/maskedarray.baseclass.rst
index fcd310faa..7121914b9 100644
--- a/doc/source/reference/maskedarray.baseclass.rst
+++ b/doc/source/reference/maskedarray.baseclass.rst
@@ -72,19 +72,19 @@ Attributes and properties of masked arrays
.. seealso:: :ref:`Array Attributes <arrays.ndarray.attributes>`
-.. autoattribute:: MaskedArray.data
+.. autoattribute:: numpy::ma.MaskedArray.data
-.. autoattribute:: MaskedArray.mask
+.. autoattribute:: numpy::ma.MaskedArray.mask
-.. autoattribute:: MaskedArray.recordmask
+.. autoattribute:: numpy::ma.MaskedArray.recordmask
-.. autoattribute:: MaskedArray.fill_value
+.. autoattribute:: numpy::ma.MaskedArray.fill_value
-.. autoattribute:: MaskedArray.baseclass
+.. autoattribute:: numpy::ma.MaskedArray.baseclass
-.. autoattribute:: MaskedArray.sharedmask
+.. autoattribute:: numpy::ma.MaskedArray.sharedmask
-.. autoattribute:: MaskedArray.hardmask
+.. autoattribute:: numpy::ma.MaskedArray.hardmask
As :class:`MaskedArray` is a subclass of :class:`~numpy.ndarray`, a masked array also inherits all the attributes and properties of a :class:`~numpy.ndarray` instance.
diff --git a/doc/source/reference/random/bit_generators/index.rst b/doc/source/reference/random/bit_generators/index.rst
index d93f38d0b..e523b4339 100644
--- a/doc/source/reference/random/bit_generators/index.rst
+++ b/doc/source/reference/random/bit_generators/index.rst
@@ -1,5 +1,7 @@
.. currentmodule:: numpy.random
+.. _random-bit-generators:
+
Bit Generators
==============
@@ -50,6 +52,8 @@ The included BitGenerators are:
Philox <philox>
SFC64 <sfc64>
+.. _seeding_and_entropy:
+
Seeding and Entropy
===================
@@ -127,6 +131,16 @@ of 12 instances:
.. end_block
+If you already have an initial random generator instance, you can shorten
+the above by using the `~BitGenerator.spawn` method:
+
+.. code-block:: python
+
+ from numpy.random import PCG64, SeedSequence
+ # High quality initial entropy
+ entropy = 0x87351080e25cb0fad77a44a3be03b491
+ base_bitgen = PCG64(entropy)
+ generators = base_bitgen.spawn(12)
An alternative way is to use the fact that a `~SeedSequence` can be initialized
by a tuple of elements. Here we use a base entropy value and an integer
diff --git a/doc/source/reference/random/compatibility.rst b/doc/source/reference/random/compatibility.rst
new file mode 100644
index 000000000..138f03ca3
--- /dev/null
+++ b/doc/source/reference/random/compatibility.rst
@@ -0,0 +1,87 @@
+.. _random-compatibility:
+
+.. currentmodule:: numpy.random
+
+Compatibility Policy
+====================
+
+`numpy.random` has a somewhat stricter compatibility policy than the rest of
+NumPy. Users of pseudorandomness often have use cases for being able to
+reproduce runs in fine detail given the same seed (so-called "stream
+compatibility"), and so we try to balance those needs with the flexibility to
+enhance our algorithms. :ref:`NEP 19 <NEP19>` describes the evolution of this
+policy.
+
+The main kind of compatibility that we enforce is stream-compatibility from run
+to run under certain conditions. If you create a `Generator` with the same
+`BitGenerator`, with the same seed, perform the same sequence of method calls
+with the same arguments, on the same build of ``numpy``, in the same
+environment, on the same machine, you should get the same stream of numbers.
+Note that these conditions are very strict. There are a number of factors
+outside of NumPy's control that limit our ability to guarantee much more than
+this. For example, different CPUs implement floating point arithmetic
+differently, and this can cause differences in certain edge cases that cascade
+to the rest of the stream. `Generator.multivariate_normal`, for another
+example, uses a matrix decomposition from ``numpy.linalg``. Even on the same
+platform, a different build of ``numpy`` may use a different version of this
+matrix decomposition algorithm from the LAPACK that it links to, causing
+`Generator.multivariate_normal` to return completely different (but equally
+valid!) results. We strive to prefer algorithms that are more resistant to
+these effects, but this is always imperfect.
+
+.. note::
+
+ Most of the `Generator` methods allow you to draw multiple values from
+ a distribution as arrays. The requested size of this array is a parameter,
+ for the purposes of the above policy. Calling ``rng.random()`` 5 times is
+ not *guaranteed* to give the same numbers as ``rng.random(5)``. We reserve
+ the ability to decide to use different algorithms for different-sized
+ blocks. In practice, this happens rarely.
+
+Like the rest of NumPy, we generally maintain API source
+compatibility from version to version. If we *must* make an API-breaking
+change, then we will only do so with an appropriate deprecation period and
+warnings, according to :ref:`general NumPy policy <NEP23>`.
+
+Breaking stream-compatibility in order to introduce new features or
+improve performance in `Generator` or `default_rng` will be *allowed* with
+*caution*. Such changes will be considered features, and as such will be no
+faster than the standard release cadence of features (i.e. on ``X.Y`` releases,
+never ``X.Y.Z``). Slowness will not be considered a bug for this purpose.
+Correctness bug fixes that break stream-compatibility can happen on bugfix
+releases, per usual, but developers should consider if they can wait until the
+next feature release. We encourage developers to strongly weight user’s pain
+from the break in stream-compatibility against the improvements. One example
+of a worthwhile improvement would be to change algorithms for a significant
+increase in performance, for example, moving from the `Box-Muller transform
+<https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform>`_ method of
+Gaussian variate generation to the faster `Ziggurat algorithm
+<https://en.wikipedia.org/wiki/Ziggurat_algorithm>`_. An example of
+a discouraged improvement would be tweaking the Ziggurat tables just a little
+bit for a small performance improvement.
+
+.. note::
+
+ In particular, `default_rng` is allowed to change the default
+ `BitGenerator` that it uses (again, with *caution* and plenty of advance
+ warning).
+
+In general, `BitGenerator` classes have stronger guarantees of
+version-to-version stream compatibility. This allows them to be a firmer
+building block for downstream users that need it. Their limited API surface
+makes it easier for them to maintain this compatibility from version to version. See
+the docstrings of each `BitGenerator` class for their individual compatibility
+guarantees.
+
+The legacy `RandomState` and the :ref:`associated convenience functions
+<functions-in-numpy-random>` have a stricter version-to-version
+compatibility guarantee. For reasons outlined in :ref:`NEP 19 <NEP19>`, we had made
+stronger promises about their version-to-version stability early in NumPy's
+development. There are still some limited use cases for this kind of
+compatibility (like generating data for tests), so we maintain as much
+compatibility as we can. There will be no more modifications to `RandomState`,
+not even to fix correctness bugs. There are a few gray areas where we can make
+minor fixes to keep `RandomState` working without segfaulting as NumPy's
+internals change, and some docstring fixes. However, the previously-mentioned
+caveats about the variability from machine to machine and build to build still
+apply to `RandomState` just as much as it does to `Generator`.
diff --git a/doc/source/reference/random/extending.rst b/doc/source/reference/random/extending.rst
index 6bb941496..998faf80a 100644
--- a/doc/source/reference/random/extending.rst
+++ b/doc/source/reference/random/extending.rst
@@ -25,7 +25,7 @@ provided by ``ctypes.next_double``.
Both CTypes and CFFI allow the more complicated distributions to be used
directly in Numba after compiling the file distributions.c into a ``DLL`` or
``so``. An example showing the use of a more complicated distribution is in
-the `examples` section below.
+the `Examples`_ section below.
.. _random_cython:
@@ -113,6 +113,6 @@ Examples
.. toctree::
Numba <examples/numba>
- CFFI + Numba <examples/numba_cffi>
+ CFFI + Numba <examples/numba_cffi>
Cython <examples/cython/index>
CFFI <examples/cffi>
diff --git a/doc/source/reference/random/generator.rst b/doc/source/reference/random/generator.rst
index dc71cb1f9..e08395b17 100644
--- a/doc/source/reference/random/generator.rst
+++ b/doc/source/reference/random/generator.rst
@@ -18,12 +18,13 @@ can be changed by passing an instantized BitGenerator to ``Generator``.
:members: __init__
:exclude-members: __init__
-Accessing the BitGenerator
---------------------------
+Accessing the BitGenerator and Spawning
+---------------------------------------
.. autosummary::
:toctree: generated/
~numpy.random.Generator.bit_generator
+ ~numpy.random.Generator.spawn
Simple random data
------------------
diff --git a/doc/source/reference/random/index.rst b/doc/source/reference/random/index.rst
index 83a27d80c..38555b133 100644
--- a/doc/source/reference/random/index.rst
+++ b/doc/source/reference/random/index.rst
@@ -7,210 +7,124 @@
Random sampling (:mod:`numpy.random`)
=====================================
-Numpy's random number routines produce pseudo random numbers using
-combinations of a `BitGenerator` to create sequences and a `Generator`
-to use those sequences to sample from different statistical distributions:
-
-* BitGenerators: Objects that generate random numbers. These are typically
- unsigned integer words filled with sequences of either 32 or 64 random bits.
-* Generators: Objects that transform sequences of random bits from a
- BitGenerator into sequences of numbers that follow a specific probability
- distribution (such as uniform, Normal or Binomial) within a specified
- interval.
-
-Since Numpy version 1.17.0 the Generator can be initialized with a
-number of different BitGenerators. It exposes many different probability
-distributions. See `NEP 19 <https://www.numpy.org/neps/
-nep-0019-rng-policy.html>`_ for context on the updated random Numpy number
-routines. The legacy `RandomState` random number routines are still
-available, but limited to a single BitGenerator. See :ref:`new-or-different`
-for a complete list of improvements and differences from the legacy
-``RandomState``.
-
-For convenience and backward compatibility, a single `RandomState`
-instance's methods are imported into the numpy.random namespace, see
-:ref:`legacy` for the complete list.
-
.. _random-quick-start:
Quick Start
-----------
-Call `default_rng` to get a new instance of a `Generator`, then call its
-methods to obtain samples from different distributions. By default,
-`Generator` uses bits provided by `PCG64` which has better statistical
-properties than the legacy `MT19937` used in `RandomState`.
-
-.. code-block:: python
-
- # Do this (new version)
- from numpy.random import default_rng
- rng = default_rng()
- vals = rng.standard_normal(10)
- more_vals = rng.standard_normal(10)
-
- # instead of this (legacy version)
- from numpy import random
- vals = random.standard_normal(10)
- more_vals = random.standard_normal(10)
-
-`Generator` can be used as a replacement for `RandomState`. Both class
-instances hold an internal `BitGenerator` instance to provide the bit
-stream, it is accessible as ``gen.bit_generator``. Some long-overdue API
-cleanup means that legacy and compatibility methods have been removed from
-`Generator`
-
-=================== ============== ============
-`RandomState` `Generator` Notes
-------------------- -------------- ------------
-``random_sample``, ``random`` Compatible with `random.random`
-``rand``
-------------------- -------------- ------------
-``randint``, ``integers`` Add an ``endpoint`` kwarg
-``random_integers``
-------------------- -------------- ------------
-``tomaxint`` removed Use ``integers(0, np.iinfo(np.int_).max,``
- ``endpoint=False)``
-------------------- -------------- ------------
-``seed`` removed Use `SeedSequence.spawn`
-=================== ============== ============
-
-See :ref:`new-or-different` for more information.
-
-Something like the following code can be used to support both ``RandomState``
-and ``Generator``, with the understanding that the interfaces are slightly
-different
-
-.. code-block:: python
-
- try:
- rng_integers = rng.integers
- except AttributeError:
- rng_integers = rng.randint
- a = rng_integers(1000)
-
-Seeds can be passed to any of the BitGenerators. The provided value is mixed
-via `SeedSequence` to spread a possible sequence of seeds across a wider
-range of initialization states for the BitGenerator. Here `PCG64` is used and
-is wrapped with a `Generator`.
-
-.. code-block:: python
-
- from numpy.random import Generator, PCG64
- rng = Generator(PCG64(12345))
- rng.standard_normal()
-
-Here we use `default_rng` to create an instance of `Generator` to generate a
-random float:
-
->>> import numpy as np
->>> rng = np.random.default_rng(12345)
->>> print(rng)
-Generator(PCG64)
->>> rfloat = rng.random()
->>> rfloat
-0.22733602246716966
->>> type(rfloat)
-<class 'float'>
-
-Here we use `default_rng` to create an instance of `Generator` to generate 3
-random integers between 0 (inclusive) and 10 (exclusive):
-
->>> import numpy as np
->>> rng = np.random.default_rng(12345)
->>> rints = rng.integers(low=0, high=10, size=3)
->>> rints
-array([6, 2, 7])
->>> type(rints[0])
-<class 'numpy.int64'>
-
-Introduction
-------------
-The new infrastructure takes a different approach to producing random numbers
-from the `RandomState` object. Random number generation is separated into
-two components, a bit generator and a random generator.
-
-The `BitGenerator` has a limited set of responsibilities. It manages state
-and provides functions to produce random doubles and random unsigned 32- and
-64-bit values.
-
-The `random generator <Generator>` takes the
-bit generator-provided stream and transforms them into more useful
-distributions, e.g., simulated normal random values. This structure allows
-alternative bit generators to be used with little code duplication.
-
-The `Generator` is the user-facing object that is nearly identical to the
-legacy `RandomState`. It accepts a bit generator instance as an argument.
-The default is currently `PCG64` but this may change in future versions.
-As a convenience NumPy provides the `default_rng` function to hide these
-details:
-
->>> from numpy.random import default_rng
->>> rng = default_rng(12345)
->>> print(rng)
-Generator(PCG64)
->>> print(rng.random())
-0.22733602246716966
-
-One can also instantiate `Generator` directly with a `BitGenerator` instance.
-
-To use the default `PCG64` bit generator, one can instantiate it directly and
-pass it to `Generator`:
-
->>> from numpy.random import Generator, PCG64
->>> rng = Generator(PCG64(12345))
->>> print(rng)
-Generator(PCG64)
-
-Similarly to use the older `MT19937` bit generator (not recommended), one can
-instantiate it directly and pass it to `Generator`:
-
->>> from numpy.random import Generator, MT19937
->>> rng = Generator(MT19937(12345))
->>> print(rng)
-Generator(MT19937)
-
-What's New or Different
-~~~~~~~~~~~~~~~~~~~~~~~
+The :mod:`numpy.random` module implements pseudo-random number generators
+(PRNGs or RNGs, for short) with the ability to draw samples from a variety of
+probability distributions. In general, users will create a `Generator` instance
+with `default_rng` and call the various methods on it to obtain samples from
+different distributions.
+
+::
+
+ >>> import numpy as np
+ >>> rng = np.random.default_rng()
+ # Generate one random float uniformly distributed over the range [0, 1)
+ >>> rng.random() #doctest: +SKIP
+ 0.06369197489564249 # may vary
+ # Generate an array of 10 numbers according to a unit Gaussian distribution.
+ >>> rng.standard_normal(10) #doctest: +SKIP
+ array([-0.31018314, -1.8922078 , -0.3628523 , -0.63526532, 0.43181166, # may vary
+ 0.51640373, 1.25693945, 0.07779185, 0.84090247, -2.13406828])
+ # Generate an array of 5 integers uniformly over the range [0, 10).
+ >>> rng.integers(low=0, high=10, size=5) #doctest: +SKIP
+ array([8, 7, 6, 2, 0]) # may vary
+
+Our RNGs are deterministic sequences and can be reproduced by specifying a seed integer to
+derive their initial state. By default, with no seed provided, `default_rng` will
+seed the RNG from nondeterministic data from the operating system and therefore
+generate different numbers each time. The pseudo-random sequences will be
+independent for all practical purposes, at least those purposes for which our
+pseudo-randomness was good for in the first place.
+
+::
+
+ >>> rng1 = np.random.default_rng()
+ >>> rng1.random() #doctest: +SKIP
+ 0.6596288841243357 # may vary
+ >>> rng2 = np.random.default_rng()
+ >>> rng2.random() #doctest: +SKIP
+ 0.11885628817151628 # may vary
+
.. warning::
- The Box-Muller method used to produce NumPy's normals is no longer available
- in `Generator`. It is not possible to reproduce the exact random
- values using Generator for the normal distribution or any other
- distribution that relies on the normal such as the `RandomState.gamma` or
- `RandomState.standard_t`. If you require bitwise backward compatible
- streams, use `RandomState`.
-
-* The Generator's normal, exponential and gamma functions use 256-step Ziggurat
- methods which are 2-10 times faster than NumPy's Box-Muller or inverse CDF
- implementations.
-* Optional ``dtype`` argument that accepts ``np.float32`` or ``np.float64``
- to produce either single or double precision uniform random variables for
- select distributions
-* Optional ``out`` argument that allows existing arrays to be filled for
- select distributions
-* All BitGenerators can produce doubles, uint64s and uint32s via CTypes
- (`PCG64.ctypes`) and CFFI (`PCG64.cffi`). This allows the bit generators
- to be used in numba.
-* The bit generators can be used in downstream projects via
- :ref:`Cython <random_cython>`.
-* `Generator.integers` is now the canonical way to generate integer
- random numbers from a discrete uniform distribution. The ``rand`` and
- ``randn`` methods are only available through the legacy `RandomState`.
- The ``endpoint`` keyword can be used to specify open or closed intervals.
- This replaces both ``randint`` and the deprecated ``random_integers``.
-* `Generator.random` is now the canonical way to generate floating-point
- random numbers, which replaces `RandomState.random_sample`,
- `RandomState.sample`, and `RandomState.ranf`. This is consistent with
- Python's `random.random`.
-* All BitGenerators in numpy use `SeedSequence` to convert seeds into
- initialized states.
-* The addition of an ``axis`` keyword argument to methods such as
- `Generator.choice`, `Generator.permutation`, and `Generator.shuffle`
- improves support for sampling from and shuffling multi-dimensional arrays.
-
-See :ref:`new-or-different` for a complete list of improvements and
-differences from the traditional ``Randomstate``.
+ The pseudo-random number generators implemented in this module are designed
+ for statistical modeling and simulation. They are not suitable for security
+ or cryptographic purposes. See the :py:mod:`secrets` module from the
+ standard library for such use cases.
+
+Seeds should be large positive integers. `default_rng` can take positive
+integers of any size. We recommend using very large, unique numbers to ensure
+that your seed is different from anyone else's. This is good practice to ensure
+that your results are statistically independent from theirs unless you are
+intentionally *trying* to reproduce their result. A convenient way to get
+such a seed number is to use :py:func:`secrets.randbits` to get an
+arbitrary 128-bit integer.
+
+::
+
+ >>> import secrets
+ >>> import numpy as np
+ >>> secrets.randbits(128) #doctest: +SKIP
+ 122807528840384100672342137672332424406 # may vary
+ >>> rng1 = np.random.default_rng(122807528840384100672342137672332424406)
+ >>> rng1.random()
+ 0.5363922081269535
+ >>> rng2 = np.random.default_rng(122807528840384100672342137672332424406)
+ >>> rng2.random()
+ 0.5363922081269535
+
+See the documentation on `default_rng` and `SeedSequence` for more advanced
+options for controlling the seed in specialized scenarios.
+
+`Generator` and its associated infrastructure was introduced in NumPy version
+1.17.0. There is still a lot of code that uses the older `RandomState` and the
+functions in `numpy.random`. While there are no plans to remove them at this
+time, we do recommend transitioning to `Generator` as you can. The algorithms
+are faster, more flexible, and will receive more improvements in the future.
+For the most part, `Generator` can be used as a replacement for `RandomState`.
+See :ref:`legacy` for information on the legacy infrastructure,
+:ref:`new-or-different` for information on transitioning, and :ref:`NEP 19
+<NEP19>` for some of the reasoning for the transition.
+
+Design
+------
+
+Users primarily interact with `Generator` instances. Each `Generator` instance
+owns a `BitGenerator` instance that implements the core RNG algorithm. The
+`BitGenerator` has a limited set of responsibilities. It manages state and
+provides functions to produce random doubles and random unsigned 32- and 64-bit
+values.
+
+The `Generator` takes the bit generator-provided stream and transforms it
+into more useful distributions, e.g., simulated normal random values. This
+structure allows alternative bit generators to be used with little code
+duplication.
+
+NumPy implements several different `BitGenerator` classes implementing
+different RNG algorithms. `default_rng` currently uses `~PCG64` as the
+default `BitGenerator`. It has better statistical properties and performance
+than the `~MT19937` algorithm used in the legacy `RandomState`. See
+:ref:`random-bit-generators` for more details on the supported BitGenerators.
+
+`default_rng` and BitGenerators delegate the conversion of seeds into RNG
+states to `SeedSequence` internally. `SeedSequence` implements a sophisticated
+algorithm that intermediates between the user's input and the internal
+implementation details of each `BitGenerator` algorithm, each of which can
+require different amounts of bits for its state. Importantly, it lets you use
+arbitrary-sized integers and arbitrary sequences of such integers to mix
+together into the RNG state. This is a useful primitive for constructing
+a :ref:`flexible pattern for parallel RNG streams <seedsequence-spawn>`.
+
+For backward compatibility, we still maintain the legacy `RandomState` class.
+It continues to use the `~MT19937` algorithm by default, and old seeds continue
+to reproduce the same results. The convenience :ref:`functions-in-numpy-random`
+are still aliases to the methods on a single global `RandomState` instance. See
+:ref:`legacy` for the complete details. See :ref:`new-or-different` for
+a detailed comparison between `Generator` and `RandomState`.
Parallel Generation
~~~~~~~~~~~~~~~~~~~
@@ -235,6 +149,7 @@ Concepts
Legacy Generator (RandomState) <legacy>
BitGenerators, SeedSequences <bit_generators/index>
Upgrading PCG64 with PCG64DXSM <upgrading-pcg64>
+ compatibility
Features
--------
diff --git a/doc/source/reference/random/legacy.rst b/doc/source/reference/random/legacy.rst
index b1fce49a1..00921c477 100644
--- a/doc/source/reference/random/legacy.rst
+++ b/doc/source/reference/random/legacy.rst
@@ -52,7 +52,7 @@ using the state of the `RandomState`:
:exclude-members: __init__
Seeding and State
------------------
+=================
.. autosummary::
:toctree: generated/
@@ -62,7 +62,7 @@ Seeding and State
~RandomState.seed
Simple random data
-------------------
+==================
.. autosummary::
:toctree: generated/
@@ -75,7 +75,7 @@ Simple random data
~RandomState.bytes
Permutations
-------------
+============
.. autosummary::
:toctree: generated/
@@ -83,7 +83,7 @@ Permutations
~RandomState.permutation
Distributions
--------------
+==============
.. autosummary::
:toctree: generated/
@@ -123,8 +123,10 @@ Distributions
~RandomState.weibull
~RandomState.zipf
+.. _functions-in-numpy-random:
+
Functions in `numpy.random`
----------------------------
+===========================
Many of the RandomState methods above are exported as functions in
`numpy.random` This usage is discouraged, as it is implemented via a global
`RandomState` instance which is not advised on two counts:
@@ -133,8 +135,7 @@ Many of the RandomState methods above are exported as functions in
- It uses a `RandomState` rather than the more modern `Generator`.
-For backward compatible legacy reasons, we cannot change this. See
-:ref:`random-quick-start`.
+For backward compatible legacy reasons, we will not change this.
.. autosummary::
:toctree: generated/
diff --git a/doc/source/reference/random/new-or-different.rst b/doc/source/reference/random/new-or-different.rst
index 8f4a70540..9b5bf38e5 100644
--- a/doc/source/reference/random/new-or-different.rst
+++ b/doc/source/reference/random/new-or-different.rst
@@ -5,17 +5,9 @@
What's New or Different
-----------------------
-.. warning::
-
- The Box-Muller method used to produce NumPy's normals is no longer available
- in `Generator`. It is not possible to reproduce the exact random
- values using ``Generator`` for the normal distribution or any other
- distribution that relies on the normal such as the `Generator.gamma` or
- `Generator.standard_t`. If you require bitwise backward compatible
- streams, use `RandomState`, i.e., `RandomState.gamma` or
- `RandomState.standard_t`.
-
-Quick comparison of legacy :ref:`mtrand <legacy>` to the new `Generator`
+NumPy 1.17.0 introduced `Generator` as an improved replacement for
+the :ref:`legacy <legacy>` `RandomState`. Here is a quick comparison of the two
+implementations.
================== ==================== =============
Feature Older Equivalent Notes
@@ -44,21 +36,17 @@ Feature Older Equivalent Notes
``high`` interval endpoint
================== ==================== =============
-And in more detail:
-
-* Simulate from the complex normal distribution
- (`~.Generator.complex_normal`)
* The normal, exponential and gamma generators use 256-step Ziggurat
methods which are 2-10 times faster than NumPy's default implementation in
`~.Generator.standard_normal`, `~.Generator.standard_exponential` or
- `~.Generator.standard_gamma`.
-
+ `~.Generator.standard_gamma`. Because of the change in algorithms, it is not
+ possible to reproduce the exact random values using ``Generator`` for these
+ distributions or any distribution method that relies on them.
.. ipython:: python
- from numpy.random import Generator, PCG64
import numpy.random
- rng = Generator(PCG64())
+ rng = np.random.default_rng()
%timeit -n 1 rng.standard_normal(100000)
%timeit -n 1 numpy.random.standard_normal(100000)
@@ -74,18 +62,25 @@ And in more detail:
* `~.Generator.integers` is now the canonical way to generate integer
- random numbers from a discrete uniform distribution. The ``rand`` and
- ``randn`` methods are only available through the legacy `~.RandomState`.
- This replaces both ``randint`` and the deprecated ``random_integers``.
-* The Box-Muller method used to produce NumPy's normals is no longer available.
+ random numbers from a discrete uniform distribution. This replaces both
+ ``randint`` and the deprecated ``random_integers``.
+* The ``rand`` and ``randn`` methods are only available through the legacy
+ `~.RandomState`.
+* `Generator.random` is now the canonical way to generate floating-point
+ random numbers, which replaces `RandomState.random_sample`,
+ `sample`, and `ranf`, all of which were aliases. This is consistent with
+ Python's `random.random`.
* All bit generators can produce doubles, uint64s and
uint32s via CTypes (`~PCG64.ctypes`) and CFFI (`~PCG64.cffi`).
This allows these bit generators to be used in numba.
* The bit generators can be used in downstream projects via
Cython.
+* All bit generators use `SeedSequence` to :ref:`convert seed integers to
+ initialized states <seeding_and_entropy>`.
* Optional ``dtype`` argument that accepts ``np.float32`` or ``np.float64``
to produce either single or double precision uniform random variables for
- select distributions
+ select distributions. `~.Generator.integers` accepts a ``dtype`` argument
+ with any signed or unsigned integer dtype.
* Uniforms (`~.Generator.random` and `~.Generator.integers`)
* Normals (`~.Generator.standard_normal`)
@@ -94,9 +89,10 @@ And in more detail:
.. ipython:: python
- rng = Generator(PCG64(0))
- rng.random(3, dtype='d')
- rng.random(3, dtype='f')
+ rng = np.random.default_rng()
+ rng.random(3, dtype=np.float64)
+ rng.random(3, dtype=np.float32)
+ rng.integers(0, 256, size=3, dtype=np.uint8)
* Optional ``out`` argument that allows existing arrays to be filled for
select distributions
@@ -111,6 +107,7 @@ And in more detail:
.. ipython:: python
+ rng = np.random.default_rng()
existing = np.zeros(4)
rng.random(out=existing[:2])
print(existing)
@@ -121,9 +118,12 @@ And in more detail:
.. ipython:: python
- rng = Generator(PCG64(123456789))
+ rng = np.random.default_rng()
a = np.arange(12).reshape((3, 4))
a
rng.choice(a, axis=1, size=5)
rng.shuffle(a, axis=1) # Shuffle in-place
a
+
+* Added a method to sample from the complex normal distribution
+ (`~.Generator.complex_normal`)
diff --git a/doc/source/reference/random/parallel.rst b/doc/source/reference/random/parallel.rst
index b625d34b7..b4934a0ca 100644
--- a/doc/source/reference/random/parallel.rst
+++ b/doc/source/reference/random/parallel.rst
@@ -12,6 +12,11 @@ or distributed).
`~SeedSequence` spawning
------------------------
+NumPy allows you to spawn new (with very high probability) independent
+`~BitGenerator` and `~Generator` instances via their ``spawn()`` method.
+This spawning is implemented by the `~SeedSequence` used for initializing
+the bit generator's random stream.
+
`~SeedSequence` `implements an algorithm`_ to process a user-provided seed,
typically as an integer of some size, and to convert it into an initial state for
a `~BitGenerator`. It uses hashing techniques to ensure that low-quality seeds
@@ -53,15 +58,25 @@ wrap this together into an API that is easy to use and difficult to misuse.
.. end_block
-Child `~SeedSequence` objects can also spawn to make grandchildren, and so on.
-Each `~SeedSequence` has its position in the tree of spawned `~SeedSequence`
-objects mixed in with the user-provided seed to generate independent (with very
-high probability) streams.
+For convenience the direct use of `~SeedSequence` is not necessary.
+The above ``streams`` can be spawned directly from a parent generator
+via `~Generator.spawn`:
+
+.. code-block:: python
+
+ parent_rng = default_rng(12345)
+ streams = parent_rng.spawn(10)
+
+.. end_block
+
+Child objects can also spawn to make grandchildren, and so on.
+Each child has a `~SeedSequence` with its position in the tree of spawned
+child objects mixed in with the user-provided seed to generate independent
+(with very high probability) streams.
.. code-block:: python
- grandchildren = child_seeds[0].spawn(4)
- grand_streams = [default_rng(s) for s in grandchildren]
+ grandchildren = streams[0].spawn(4)
.. end_block
diff --git a/doc/source/reference/routines.ctypeslib.rst b/doc/source/reference/routines.ctypeslib.rst
index c6127ca64..fe5ffb5a8 100644
--- a/doc/source/reference/routines.ctypeslib.rst
+++ b/doc/source/reference/routines.ctypeslib.rst
@@ -1,7 +1,7 @@
.. module:: numpy.ctypeslib
***********************************************************
-C-Types Foreign Function Interface (:mod:`numpy.ctypeslib`)
+C-Types foreign function interface (:mod:`numpy.ctypeslib`)
***********************************************************
.. currentmodule:: numpy.ctypeslib
diff --git a/doc/source/reference/routines.datetime.rst b/doc/source/reference/routines.datetime.rst
index 966ed5a47..72513882b 100644
--- a/doc/source/reference/routines.datetime.rst
+++ b/doc/source/reference/routines.datetime.rst
@@ -1,6 +1,6 @@
.. _routines.datetime:
-Datetime Support Functions
+Datetime support functions
**************************
.. currentmodule:: numpy
@@ -12,7 +12,7 @@ Datetime Support Functions
datetime_data
-Business Day Functions
+Business day functions
======================
.. currentmodule:: numpy
diff --git a/doc/source/reference/routines.dual.rst b/doc/source/reference/routines.dual.rst
deleted file mode 100644
index 18c7791d0..000000000
--- a/doc/source/reference/routines.dual.rst
+++ /dev/null
@@ -1,47 +0,0 @@
-Optionally SciPy-accelerated routines (:mod:`numpy.dual`)
-=========================================================
-
-.. automodule:: numpy.dual
-
-Linear algebra
---------------
-
-.. currentmodule:: numpy.linalg
-
-.. autosummary::
-
- cholesky
- det
- eig
- eigh
- eigvals
- eigvalsh
- inv
- lstsq
- norm
- pinv
- solve
- svd
-
-FFT
----
-
-.. currentmodule:: numpy.fft
-
-.. autosummary::
-
- fft
- fft2
- fftn
- ifft
- ifft2
- ifftn
-
-Other
------
-
-.. currentmodule:: numpy
-
-.. autosummary::
-
- i0
diff --git a/doc/source/reference/routines.io.rst b/doc/source/reference/routines.io.rst
index 2542b336f..1ec2ccb5e 100644
--- a/doc/source/reference/routines.io.rst
+++ b/doc/source/reference/routines.io.rst
@@ -83,7 +83,7 @@ Data sources
DataSource
-Binary Format Description
+Binary format description
-------------------------
.. autosummary::
:toctree: generated/
diff --git a/doc/source/reference/routines.ma.rst b/doc/source/reference/routines.ma.rst
index d503cc243..fd22a74aa 100644
--- a/doc/source/reference/routines.ma.rst
+++ b/doc/source/reference/routines.ma.rst
@@ -31,6 +31,7 @@ From existing data
ma.fromfunction
ma.MaskedArray.copy
+ ma.diagflat
Ones and zeros
@@ -72,6 +73,9 @@ Inspecting the array
ma.isMaskedArray
ma.isMA
ma.isarray
+ ma.isin
+ ma.in1d
+ ma.unique
ma.MaskedArray.all
@@ -394,6 +398,17 @@ Clipping and rounding
ma.MaskedArray.clip
ma.MaskedArray.round
+Set operations
+~~~~~~~~~~~~~~
+.. autosummary::
+ :toctree: generated/
+
+
+ ma.intersect1d
+ ma.setdiff1d
+ ma.setxor1d
+ ma.union1d
+
Miscellanea
~~~~~~~~~~~
diff --git a/doc/source/reference/routines.math.rst b/doc/source/reference/routines.math.rst
index 35771cc9c..4cc8719fb 100644
--- a/doc/source/reference/routines.math.rst
+++ b/doc/source/reference/routines.math.rst
@@ -39,6 +39,7 @@ Rounding
.. autosummary::
:toctree: generated/
+ round
around
rint
fix
@@ -148,13 +149,15 @@ Extrema Finding
:toctree: generated/
maximum
- fmax
+ max
amax
+ fmax
nanmax
minimum
- fmin
+ min
amin
+ fmin
nanmin
diff --git a/doc/source/reference/routines.other.rst b/doc/source/reference/routines.other.rst
index 7b60545f1..769b3d910 100644
--- a/doc/source/reference/routines.other.rst
+++ b/doc/source/reference/routines.other.rst
@@ -59,3 +59,5 @@ Matlab-like Functions
disp
.. automodule:: numpy.exceptions
+
+.. automodule:: numpy.dtypes
diff --git a/doc/source/reference/routines.rst b/doc/source/reference/routines.rst
index 24117895b..72ed80972 100644
--- a/doc/source/reference/routines.rst
+++ b/doc/source/reference/routines.rst
@@ -24,7 +24,6 @@ indentation.
routines.ctypeslib
routines.datetime
routines.dtype
- routines.dual
routines.emath
routines.err
routines.fft
diff --git a/doc/source/reference/routines.testing.rst b/doc/source/reference/routines.testing.rst
index 16d53bb4e..82e43dfdb 100644
--- a/doc/source/reference/routines.testing.rst
+++ b/doc/source/reference/routines.testing.rst
@@ -51,11 +51,6 @@ Decorators
.. autosummary::
:toctree: generated/
- dec.deprecated
- dec.knownfailureif
- dec.setastest
- dec.skipif
- dec.slow
decorate_methods
Test Running
@@ -63,10 +58,8 @@ Test Running
.. autosummary::
:toctree: generated/
- Tester
clear_and_catch_warnings
measure
- run_module_suite
rundocs
suppress_warnings
diff --git a/doc/source/reference/simd/build-options.rst b/doc/source/reference/simd/build-options.rst
index 0994f15aa..d1e2e6b8e 100644
--- a/doc/source/reference/simd/build-options.rst
+++ b/doc/source/reference/simd/build-options.rst
@@ -333,7 +333,7 @@ and here is how it looks on x86_64/gcc:
.. literalinclude:: log_example.txt
:language: bash
-As you see, there is a separate report for each of ``build_ext`` and ``build_clib``
+There is a separate report for each of ``build_ext`` and ``build_clib``
that includes several sections, and each section has several values, representing the following:
**Platform**:
@@ -371,6 +371,17 @@ that includes several sections, and each section has several values, representin
- The lines that come after the above property and end with a ':' on a separate line,
represent the paths of c/c++ sources that define the generated optimizations.
-Runtime Trace
--------------
-To be completed.
+.. _runtime-simd-dispatch:
+
+Runtime dispatch
+----------------
+Importing NumPy triggers a scan of the available CPU features from the set
+of dispatchable features. This can be further restricted by setting the
+environment variable ``NPY_DISABLE_CPU_FEATURES`` to a comma-, tab-, or
+space-separated list of features to disable. This will raise an error if
+parsing fails or if the feature was not enabled. For instance, on ``x86_64``
+this will disable ``AVX2`` and ``FMA3``::
+
+ NPY_DISABLE_CPU_FEATURES="AVX2,FMA3"
+
+If the feature is not available, a warning will be emitted.
diff --git a/doc/source/reference/simd/gen_features.py b/doc/source/reference/simd/gen_features.py
index 9a38ef5c9..b141e23d0 100644
--- a/doc/source/reference/simd/gen_features.py
+++ b/doc/source/reference/simd/gen_features.py
@@ -168,7 +168,7 @@ if __name__ == '__main__':
gen_path = path.join(
path.dirname(path.realpath(__file__)), "generated_tables"
)
- with open(path.join(gen_path, 'cpu_features.inc'), 'wt') as fd:
+ with open(path.join(gen_path, 'cpu_features.inc'), 'w') as fd:
fd.write(f'.. generated via {__file__}\n\n')
for arch in (
("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64", "S390X")
@@ -177,7 +177,7 @@ if __name__ == '__main__':
table = Features(arch, 'gcc').table()
fd.write(wrapper_section(title, table))
- with open(path.join(gen_path, 'compilers-diff.inc'), 'wt') as fd:
+ with open(path.join(gen_path, 'compilers-diff.inc'), 'w') as fd:
fd.write(f'.. generated via {__file__}\n\n')
for arch, cc_names in (
("x86", ("clang", "ICC", "MSVC")),
diff --git a/doc/source/reference/simd/generated_tables/compilers-diff.inc b/doc/source/reference/simd/generated_tables/compilers-diff.inc
index 4b9009a68..d5a87da3c 100644
--- a/doc/source/reference/simd/generated_tables/compilers-diff.inc
+++ b/doc/source/reference/simd/generated_tables/compilers-diff.inc
@@ -1,33 +1,35 @@
-.. generated via /home/seiko/work/repos/numpy/doc/source/reference/simd/./gen_features.py
+.. generated via /numpy/numpy/./doc/source/reference/simd/gen_features.py
On x86::Intel Compiler
~~~~~~~~~~~~~~~~~~~~~~
.. table::
:align: left
- ================ ==========================================================================================================================================
- Name Implies
- ================ ==========================================================================================================================================
- FMA3 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2`
- AVX2 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3`
- AVX512F SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD`
- :disabled:`XOP` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX`
- :disabled:`FMA4` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX`
- ================ ==========================================================================================================================================
+ ====================== ================================================================================================================================================================================================================================================================================================================================== ======================
+ Name Implies Gathers
+ ====================== ================================================================================================================================================================================================================================================================================================================================== ======================
+ FMA3 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2`
+ AVX2 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3`
+ AVX512F SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD`
+ :disabled:`XOP` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX`
+ :disabled:`FMA4` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX`
+ :disabled:`AVX512_SPR` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512_SKX` :disabled:`AVX512_CLX` :disabled:`AVX512_CNL` :disabled:`AVX512_ICL` :disabled:`AVX512FP16`
+ ====================== ================================================================================================================================================================================================================================================================================================================================== ======================
On x86::Microsoft Visual C/C++
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. table::
:align: left
- ====================== ============================================================================================================================================================================================================================================================= =============================================================================
- Name Implies Gathers
- ====================== ============================================================================================================================================================================================================================================================= =============================================================================
- FMA3 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2`
- AVX2 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3`
- AVX512F SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD` :enabled:`AVX512_SKX`
- AVX512CD SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F :enabled:`AVX512_SKX`
- :disabled:`AVX512_KNL` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512ER` :disabled:`AVX512PF`
- :disabled:`AVX512_KNM` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512_KNL` :disabled:`AVX5124FMAPS` :disabled:`AVX5124VNNIW` :disabled:`AVX512VPOPCNTDQ`
- ====================== ============================================================================================================================================================================================================================================================= =============================================================================
+ ====================== ================================================================================================================================================================================================================================================================================================================================== =============================================================================
+ Name Implies Gathers
+ ====================== ================================================================================================================================================================================================================================================================================================================================== =============================================================================
+ FMA3 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2`
+ AVX2 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3`
+ AVX512F SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD` :enabled:`AVX512_SKX`
+ AVX512CD SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F :enabled:`AVX512_SKX`
+ :disabled:`AVX512_KNL` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512ER` :disabled:`AVX512PF`
+ :disabled:`AVX512_KNM` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512_KNL` :disabled:`AVX5124FMAPS` :disabled:`AVX5124VNNIW` :disabled:`AVX512VPOPCNTDQ`
+ :disabled:`AVX512_SPR` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512_SKX` :disabled:`AVX512_CLX` :disabled:`AVX512_CNL` :disabled:`AVX512_ICL` :disabled:`AVX512FP16`
+ ====================== ================================================================================================================================================================================================================================================================================================================================== =============================================================================
diff --git a/doc/source/reference/simd/generated_tables/cpu_features.inc b/doc/source/reference/simd/generated_tables/cpu_features.inc
index 7782172d2..603370e21 100644
--- a/doc/source/reference/simd/generated_tables/cpu_features.inc
+++ b/doc/source/reference/simd/generated_tables/cpu_features.inc
@@ -1,35 +1,36 @@
-.. generated via /home/seiko/work/repos/review/numpy/doc/source/reference/simd/gen_features.py
+.. generated via /numpy/numpy/./doc/source/reference/simd/gen_features.py
On x86
~~~~~~
.. table::
:align: left
- ============== =========================================================================================================================================================================== =====================================================
- Name Implies Gathers
- ============== =========================================================================================================================================================================== =====================================================
- ``SSE`` ``SSE2``
- ``SSE2`` ``SSE``
- ``SSE3`` ``SSE`` ``SSE2``
- ``SSSE3`` ``SSE`` ``SSE2`` ``SSE3``
- ``SSE41`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3``
- ``POPCNT`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41``
- ``SSE42`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT``
- ``AVX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42``
- ``XOP`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
- ``FMA4`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
- ``F16C`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
- ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``
- ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``
- ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2``
- ``AVX512CD`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F``
- ``AVX512_KNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512ER`` ``AVX512PF``
- ``AVX512_KNM`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_KNL`` ``AVX5124FMAPS`` ``AVX5124VNNIW`` ``AVX512VPOPCNTDQ``
- ``AVX512_SKX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512VL`` ``AVX512BW`` ``AVX512DQ``
- ``AVX512_CLX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512VNNI``
- ``AVX512_CNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512IFMA`` ``AVX512VBMI``
- ``AVX512_ICL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512_CLX`` ``AVX512_CNL`` ``AVX512VBMI2`` ``AVX512BITALG`` ``AVX512VPOPCNTDQ``
- ============== =========================================================================================================================================================================== =====================================================
+ ============== ========================================================================================================================================================================================== =====================================================
+ Name Implies Gathers
+ ============== ========================================================================================================================================================================================== =====================================================
+ ``SSE`` ``SSE2``
+ ``SSE2`` ``SSE``
+ ``SSE3`` ``SSE`` ``SSE2``
+ ``SSSE3`` ``SSE`` ``SSE2`` ``SSE3``
+ ``SSE41`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3``
+ ``POPCNT`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41``
+ ``SSE42`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT``
+ ``AVX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42``
+ ``XOP`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
+ ``FMA4`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
+ ``F16C`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
+ ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``
+ ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``
+ ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2``
+ ``AVX512CD`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F``
+ ``AVX512_KNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512ER`` ``AVX512PF``
+ ``AVX512_KNM`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_KNL`` ``AVX5124FMAPS`` ``AVX5124VNNIW`` ``AVX512VPOPCNTDQ``
+ ``AVX512_SKX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512VL`` ``AVX512BW`` ``AVX512DQ``
+ ``AVX512_CLX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512VNNI``
+ ``AVX512_CNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512IFMA`` ``AVX512VBMI``
+ ``AVX512_ICL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512_CLX`` ``AVX512_CNL`` ``AVX512VBMI2`` ``AVX512BITALG`` ``AVX512VPOPCNTDQ``
+ ``AVX512_SPR`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512_CLX`` ``AVX512_CNL`` ``AVX512_ICL`` ``AVX512FP16``
+ ============== ========================================================================================================================================================================================== =====================================================
On IBM/POWER big-endian
~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/reference/simd/how-it-works.rst b/doc/source/reference/simd/how-it-works.rst
index 19b3dba45..dac80dd02 100644
--- a/doc/source/reference/simd/how-it-works.rst
+++ b/doc/source/reference/simd/how-it-works.rst
@@ -1,6 +1,6 @@
-**********************************
+*********************************
How does the CPU dispatcher work?
-**********************************
+*********************************
NumPy dispatcher is based on multi-source compiling, which means taking
a certain source and compiling it multiple times with different compiler
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 84aff7f66..b8a877924 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -6,6 +6,9 @@ Release notes
:maxdepth: 3
1.25.0 <release/1.25.0-notes>
+ 1.24.3 <release/1.24.3-notes>
+ 1.24.2 <release/1.24.2-notes>
+ 1.24.1 <release/1.24.1-notes>
1.24.0 <release/1.24.0-notes>
1.23.5 <release/1.23.5-notes>
1.23.4 <release/1.23.4-notes>
diff --git a/doc/source/release/1.15.0-notes.rst b/doc/source/release/1.15.0-notes.rst
index 2d9d068e5..e84386f0f 100644
--- a/doc/source/release/1.15.0-notes.rst
+++ b/doc/source/release/1.15.0-notes.rst
@@ -92,7 +92,7 @@ Deprecations
``np.sum(np.from_iter(generator))`` or the built-in Python ``sum`` instead.
* Users of the C-API should call ``PyArrayResolveWriteBackIfCopy`` or
- ``PyArray_DiscardWritbackIfCopy`` on any array with the ``WRITEBACKIFCOPY``
+ ``PyArray_DiscardWritebackIfCopy`` on any array with the ``WRITEBACKIFCOPY``
flag set, before deallocating the array. A deprecation warning will be
emitted if those calls are not used when needed.
diff --git a/doc/source/release/1.23.0-notes.rst b/doc/source/release/1.23.0-notes.rst
index c5c23620a..2301192cc 100644
--- a/doc/source/release/1.23.0-notes.rst
+++ b/doc/source/release/1.23.0-notes.rst
@@ -83,8 +83,8 @@ Expired deprecations
* The ``UPDATEIFCOPY`` array flag has been removed together with the enum
``NPY_ARRAY_UPDATEIFCOPY``. The associated (and deprecated)
``PyArray_XDECREF_ERR`` was also removed. These were all deprecated in 1.14. They
- are replaced by ``WRITEBACKIFCOPY``, that requires calling
- ``PyArray_ResoveWritebackIfCopy`` before the array is deallocated.
+ are replaced by ``NPY_ARRAY_WRITEBACKIFCOPY``, that requires calling
+ ``PyArray_ResolveWritebackIfCopy`` before the array is deallocated.
(`gh-20589 <https://github.com/numpy/numpy/pull/20589>`__)
diff --git a/doc/source/release/1.24.0-notes.rst b/doc/source/release/1.24.0-notes.rst
index 68134bce2..1c9e719b3 100644
--- a/doc/source/release/1.24.0-notes.rst
+++ b/doc/source/release/1.24.0-notes.rst
@@ -1,45 +1,515 @@
.. currentmodule:: numpy
-==========================
-NumPy 1.24.0 Release Notes
-==========================
+========================
+NumPy 1.24 Release Notes
+========================
+The NumPy 1.24.0 release continues the ongoing work to improve the handling and
+promotion of dtypes, increase the execution speed, and clarify the
+documentation. There are also a large number of new and expired deprecations
+due to changes in promotion and cleanups. This might be called a deprecation
+release. Highlights are
+* Many new deprecations, check them out.
+* Many expired deprecations.
+* New F2PY features and fixes.
+* New "dtype" and "casting" keywords for stacking functions.
-Highlights
-==========
+See below for the details.
-
-New functions
-=============
+This release supports Python versions 3.8-3.11.
Deprecations
============
+Deprecate fastCopyAndTranspose and PyArray_CopyAndTranspose
+-----------------------------------------------------------
+The ``numpy.fastCopyAndTranspose`` function has been deprecated. Use the
+corresponding copy and transpose methods directly::
+
+ arr.T.copy()
+
+The underlying C function ``PyArray_CopyAndTranspose`` has also been deprecated
+from the NumPy C-API.
+
+(`gh-22313 <https://github.com/numpy/numpy/pull/22313>`__)
+
+Conversion of out-of-bound Python integers
+------------------------------------------
+Attempting a conversion from a Python integer to a NumPy value will now always
+check whether the result can be represented by NumPy. This means the following
+examples will fail in the future and give a ``DeprecationWarning`` now::
+
+ np.uint8(-1)
+ np.array([3000], dtype=np.int8)
+
+Many of these did succeed before. Such code was mainly useful for unsigned
+integers with negative values such as ``np.uint8(-1)`` giving
+``np.iinfo(np.uint8).max``.
+
+Note that conversion between NumPy integers is unaffected, so that
+``np.array(-1).astype(np.uint8)`` continues to work and use C integer overflow
+logic. For negative values, it will also work to view the array:
+``np.array(-1, dtype=np.int8).view(np.uint8)``.
+In some cases, using ``np.iinfo(np.uint8).max`` or ``val % 2**8`` may also
+work well.
-Future Changes
-==============
+In rare cases input data may mix both negative values and very large unsigned
+values (i.e. ``-1`` and ``2**63``). There it is unfortunately necessary
+to use ``%`` on the Python value or use signed or unsigned conversion
+depending on whether negative values are expected.
+
+(`gh-22385 <https://github.com/numpy/numpy/pull/22385>`__)
+
+Deprecate ``msort``
+-------------------
+The ``numpy.msort`` function is deprecated. Use ``np.sort(a, axis=0)`` instead.
+
+(`gh-22456 <https://github.com/numpy/numpy/pull/22456>`__)
+
+``np.str0`` and similar are now deprecated
+------------------------------------------
+The scalar type aliases ending in a 0 bit size: ``np.object0``, ``np.str0``,
+``np.bytes0``, ``np.void0``, ``np.int0``, ``np.uint0`` as well as ``np.bool8``
+are now deprecated and will eventually be removed.
+
+(`gh-22607 <https://github.com/numpy/numpy/pull/22607>`__)
Expired deprecations
====================
+* The ``normed`` keyword argument has been removed from
+ `np.histogram`, `np.histogram2d`, and `np.histogramdd`.
+ Use ``density`` instead. If ``normed`` was passed by
+ position, ``density`` is now used.
+
+ (`gh-21645 <https://github.com/numpy/numpy/pull/21645>`__)
+
+* Ragged array creation will now always raise a ``ValueError`` unless
+ ``dtype=object`` is passed. This includes very deeply nested sequences.
+
+ (`gh-22004 <https://github.com/numpy/numpy/pull/22004>`__)
+
+* Support for Visual Studio 2015 and earlier has been removed.
+
+* Support for the Windows Interix POSIX interop layer has been removed.
+
+ (`gh-22139 <https://github.com/numpy/numpy/pull/22139>`__)
+
+* Support for Cygwin < 3.3 has been removed.
+
+ (`gh-22159 <https://github.com/numpy/numpy/pull/22159>`__)
+
+* The ``mini()`` method of ``np.ma.MaskedArray`` has been removed. Use either
+ ``np.ma.MaskedArray.min()`` or ``np.ma.minimum.reduce()``.
+
+* The single-argument form of ``np.ma.minimum`` and ``np.ma.maximum`` has been
+ removed. Use ``np.ma.minimum.reduce()`` or ``np.ma.maximum.reduce()``
+ instead.
+
+ (`gh-22228 <https://github.com/numpy/numpy/pull/22228>`__)
+
+* Passing dtype instances other than the canonical (mainly native byte-order)
+ ones to ``dtype=`` or ``signature=`` in ufuncs will now raise a
+ ``TypeError``. We recommend passing the strings ``"int8"`` or scalar types
+ ``np.int8`` since the byte-order, datetime/timedelta unit, etc. are never
+ enforced. (Initially deprecated in NumPy 1.21.)
+
+ (`gh-22540 <https://github.com/numpy/numpy/pull/22540>`__)
+
+* The ``dtype=`` argument to comparison ufuncs is now applied correctly. That
+ means that only ``bool`` and ``object`` are valid values and ``dtype=object``
+ is enforced.
+
+ (`gh-22541 <https://github.com/numpy/numpy/pull/22541>`__)
+
+* The deprecation for the aliases ``np.object``, ``np.bool``, ``np.float``,
+  ``np.complex``, ``np.str``, and ``np.int`` is expired (introduced in NumPy
+  1.20). Some of these will now give a FutureWarning in addition to raising an
+ error since they will be mapped to the NumPy scalars in the future.
+
+ (`gh-22607 <https://github.com/numpy/numpy/pull/22607>`__)
+
Compatibility notes
===================
+``array.fill(scalar)`` may behave slightly different
+----------------------------------------------------
+``numpy.ndarray.fill`` may in some cases behave slightly different now due to
+the fact that the logic is aligned with item assignment::
+
+ arr = np.array([1]) # with any dtype/value
+ arr.fill(scalar)
+ # is now identical to:
+ arr[0] = scalar
+
+Previously casting may have produced slightly different answers when using
+values that could not be represented in the target ``dtype`` or when the target
+had ``object`` dtype.
+
+(`gh-20924 <https://github.com/numpy/numpy/pull/20924>`__)
+
+Subarray to object cast now copies
+----------------------------------
+Casting a dtype that includes a subarray to an object will now ensure a copy of
+the subarray. Previously an unsafe view was returned::
+
+ arr = np.ones(3, dtype=[("f", "i", 3)])
+ subarray_fields = arr.astype(object)[0]
+ subarray = subarray_fields[0] # "f" field
+
+ np.may_share_memory(subarray, arr)
+
+is now always False, while previously it was True for this specific cast.
+
+(`gh-21925 <https://github.com/numpy/numpy/pull/21925>`__)
+
+Returned arrays respect uniqueness of dtype kwarg objects
+---------------------------------------------------------
+When the ``dtype`` keyword argument is used with :py:func:`np.array()` or
+:py:func:`asarray()`, the dtype of the returned array now always exactly
+matches the dtype provided by the caller.
+
+In some cases this change means that a *view* rather than the input array is
+returned. The following is an example for this on 64bit Linux where ``long``
+and ``longlong`` are the same precision but different ``dtypes``::
+
+ >>> arr = np.array([1, 2, 3], dtype="long")
+ >>> new_dtype = np.dtype("longlong")
+ >>> new = np.asarray(arr, dtype=new_dtype)
+ >>> new.dtype is new_dtype
+ True
+ >>> new is arr
+ False
+
+Before the change, the ``dtype`` did not match because ``new is arr`` was
+``True``.
+
+(`gh-21995 <https://github.com/numpy/numpy/pull/21995>`__)
+
+DLPack export raises ``BufferError``
+------------------------------------
+When an array buffer cannot be exported via DLPack a ``BufferError`` is now
+always raised where previously ``TypeError`` or ``RuntimeError`` was raised.
+This allows falling back to the buffer protocol or ``__array_interface__`` when
+DLPack was tried first.
-C API changes
-=============
+(`gh-22542 <https://github.com/numpy/numpy/pull/22542>`__)
+
+NumPy builds are no longer tested on GCC-6
+------------------------------------------
+Ubuntu 18.04 is deprecated for GitHub actions and GCC-6 is not available on
+Ubuntu 20.04, so builds using that compiler are no longer tested. We still test
+builds using GCC-7 and GCC-8.
+
+(`gh-22598 <https://github.com/numpy/numpy/pull/22598>`__)
New Features
============
+New attribute ``symbol`` added to polynomial classes
+----------------------------------------------------
+The polynomial classes in the ``numpy.polynomial`` package have a new
+``symbol`` attribute which is used to represent the indeterminate of the
+polynomial. This can be used to change the value of the variable when
+printing::
+
+ >>> P_y = np.polynomial.Polynomial([1, 0, -1], symbol="y")
+ >>> print(P_y)
+ 1.0 + 0.0·y¹ - 1.0·y²
+
+Note that the polynomial classes only support 1D polynomials, so operations
+that involve polynomials with different symbols are disallowed when the result
+would be multivariate::
+
+ >>> P = np.polynomial.Polynomial([1, -1]) # default symbol is "x"
+ >>> P_z = np.polynomial.Polynomial([1, 1], symbol="z")
+ >>> P * P_z
+  Traceback (most recent call last):
+ ...
+ ValueError: Polynomial symbols differ
+
+The symbol can be any valid Python identifier. The default is ``symbol="x"``,
+consistent with existing behavior.
+
+(`gh-16154 <https://github.com/numpy/numpy/pull/16154>`__)
+
+F2PY support for Fortran ``character`` strings
+----------------------------------------------
+F2PY now supports wrapping Fortran functions with:
+
+* character (e.g. ``character x``)
+* character array (e.g. ``character, dimension(n) :: x``)
+* character string (e.g. ``character(len=10) x``)
+* and character string array (e.g. ``character(len=10), dimension(n, m) :: x``)
+
+arguments, including passing Python unicode strings as Fortran character string
+arguments.
+
+(`gh-19388 <https://github.com/numpy/numpy/pull/19388>`__)
+
+New function ``np.show_runtime``
+--------------------------------
+A new function ``numpy.show_runtime`` has been added to display the runtime
+information of the machine in addition to ``numpy.show_config`` which displays
+the build-related information.
+
+(`gh-21468 <https://github.com/numpy/numpy/pull/21468>`__)
+
+``strict`` option for ``testing.assert_array_equal``
+----------------------------------------------------
+The ``strict`` option is now available for ``testing.assert_array_equal``.
+Setting ``strict=True`` will disable the broadcasting behaviour for scalars and
+ensure that input arrays have the same data type.
+
+(`gh-21595 <https://github.com/numpy/numpy/pull/21595>`__)
+
+New parameter ``equal_nan`` added to ``np.unique``
+--------------------------------------------------
+``np.unique`` was changed in 1.21 to treat all ``NaN`` values as equal and
+return a single ``NaN``. Setting ``equal_nan=False`` will restore pre-1.21
+behavior to treat ``NaNs`` as unique. Defaults to ``True``.
+
+(`gh-21623 <https://github.com/numpy/numpy/pull/21623>`__)
+
+``casting`` and ``dtype`` keyword arguments for ``numpy.stack``
+---------------------------------------------------------------
+The ``casting`` and ``dtype`` keyword arguments are now available for
+``numpy.stack``. To use them, write ``np.stack(..., dtype=None,
+casting='same_kind')``.
+
+``casting`` and ``dtype`` keyword arguments for ``numpy.vstack``
+----------------------------------------------------------------
+The ``casting`` and ``dtype`` keyword arguments are now available for
+``numpy.vstack``. To use them, write ``np.vstack(..., dtype=None,
+casting='same_kind')``.
+
+``casting`` and ``dtype`` keyword arguments for ``numpy.hstack``
+----------------------------------------------------------------
+The ``casting`` and ``dtype`` keyword arguments are now available for
+``numpy.hstack``. To use them, write ``np.hstack(..., dtype=None,
+casting='same_kind')``.
+
+(`gh-21627 <https://github.com/numpy/numpy/pull/21627>`__)
+
+The bit generator underlying the singleton RandomState can be changed
+---------------------------------------------------------------------
+The singleton ``RandomState`` instance exposed in the ``numpy.random`` module
+is initialized at startup with the ``MT19937`` bit generator. The new function
+``set_bit_generator`` allows the default bit generator to be replaced with a
+user-provided bit generator. This function has been introduced to provide a
+method allowing seamless integration of a high-quality, modern bit generator in
+new code with existing code that makes use of the singleton-provided random
+variate generating functions. The companion function ``get_bit_generator``
+returns the current bit generator being used by the singleton ``RandomState``.
+This is provided to simplify restoring the original source of randomness if
+required.
+
+The preferred method to generate reproducible random numbers is to use a modern
+bit generator in an instance of ``Generator``. The function ``default_rng``
+simplifies instantiation::
+
+ >>> rg = np.random.default_rng(3728973198)
+ >>> rg.random()
+
+The same bit generator can then be shared with the singleton instance so that
+calling functions in the ``random`` module will use the same bit generator::
+
+ >>> orig_bit_gen = np.random.get_bit_generator()
+ >>> np.random.set_bit_generator(rg.bit_generator)
+ >>> np.random.normal()
+
+The swap is permanent (until reversed) and so any call to functions in the
+``random`` module will use the new bit generator. The original can be restored
+if required for code to run correctly::
+
+ >>> np.random.set_bit_generator(orig_bit_gen)
+
+(`gh-21976 <https://github.com/numpy/numpy/pull/21976>`__)
+
+``np.void`` now has a ``dtype`` argument
+----------------------------------------
+NumPy now allows constructing structured void scalars directly by
+passing the ``dtype`` argument to ``np.void``.
+
+(`gh-22316 <https://github.com/numpy/numpy/pull/22316>`__)
+
Improvements
============
+F2PY Improvements
+-----------------
+* The generated extension modules don't use the deprecated NumPy-C API anymore
+* Improved ``f2py`` generated exception messages
+* Numerous bug and ``flake8`` warning fixes
+* various CPP macros that one can use within C-expressions of signature files
+ are prefixed with ``f2py_``. For example, one should use ``f2py_len(x)``
+ instead of ``len(x)``
+* A new construct ``character(f2py_len=...)`` is introduced to support
+ returning assumed length character strings (e.g. ``character(len=*)``) from
+ wrapper functions
+
+A hook to support rewriting ``f2py`` internal data structures after reading all
+its input files is introduced. This is required, for instance, for BC of SciPy
+support where character arguments are treated as character strings arguments in
+``C`` expressions.
+
+(`gh-19388 <https://github.com/numpy/numpy/pull/19388>`__)
+
+IBM zSystems Vector Extension Facility (SIMD)
+---------------------------------------------
+Added support for SIMD extensions of zSystem (z13, z14, z15), through the
+universal intrinsics interface. This support leads to performance improvements
+for all SIMD kernels implemented using the universal intrinsics, including the
+following operations: rint, floor, trunc, ceil, sqrt, absolute, square,
+reciprocal, tanh, sin, cos, equal, not_equal, greater, greater_equal, less,
+less_equal, maximum, minimum, fmax, fmin, argmax, argmin, add, subtract,
+multiply, divide.
+
+(`gh-20913 <https://github.com/numpy/numpy/pull/20913>`__)
+
+NumPy now gives floating point errors in casts
+----------------------------------------------
+In most cases, NumPy previously did not give floating point warnings or errors
+when these happened during casts. For examples, casts like::
+
+ np.array([2e300]).astype(np.float32) # overflow for float32
+ np.array([np.inf]).astype(np.int64)
+
+Should now generally give floating point warnings. These warnings should warn
+that floating point overflow occurred. For errors when converting floating
+point values to integers users should expect invalid value warnings.
+
+Users can modify the behavior of these warnings using ``np.errstate``.
+
+Note that for float to int casts, the exact warnings that are given may
+be platform dependent. For example::
+
+ arr = np.full(100, fill_value=1000, dtype=np.float64)
+ arr.astype(np.int8)
+
+May give a result equivalent to (the intermediate cast means no warning is
+given)::
+
+ arr.astype(np.int64).astype(np.int8)
+
+May return an undefined result, with a warning set::
+
+ RuntimeWarning: invalid value encountered in cast
+
+The precise behavior is subject to the C99 standard and its implementation in
+both software and hardware.
+
+(`gh-21437 <https://github.com/numpy/numpy/pull/21437>`__)
+
+F2PY supports the value attribute
+---------------------------------
+The Fortran standard requires that variables declared with the ``value``
+attribute must be passed by value instead of reference. F2PY now supports this
+use pattern correctly. So ``integer, intent(in), value :: x`` in Fortran codes
+will have correct wrappers generated.
+
+(`gh-21807 <https://github.com/numpy/numpy/pull/21807>`__)
+
+Added pickle support for third-party BitGenerators
+--------------------------------------------------
+The pickle format for bit generators was extended to allow each bit generator
+to supply its own constructor during pickling. Previous versions of NumPy
+only supported unpickling ``Generator`` instances created with one of the core
+set of bit generators supplied with NumPy. Attempting to unpickle a
+``Generator`` that used a third-party bit generator would fail since the
+constructor used during the unpickling was only aware of the bit generators
+included in NumPy.
+
+(`gh-22014 <https://github.com/numpy/numpy/pull/22014>`__)
+
+arange() now explicitly fails with dtype=str
+---------------------------------------------
+Previously, the ``np.arange(n, dtype=str)`` function worked for ``n=1`` and
+``n=2``, but would raise a non-specific exception message for other values of
+``n``. Now, it raises a `TypeError` informing that ``arange`` does not support
+string dtypes::
+
+ >>> np.arange(2, dtype=str)
+  Traceback (most recent call last):
+ ...
+ TypeError: arange() not supported for inputs with DType <class 'numpy.dtype[str_]'>.
+
+(`gh-22055 <https://github.com/numpy/numpy/pull/22055>`__)
+
+``numpy.typing`` protocols are now runtime checkable
+----------------------------------------------------
+The protocols used in ``numpy.typing.ArrayLike`` and ``numpy.typing.DTypeLike``
+are now properly marked as runtime checkable, making them easier to use for
+runtime type checkers.
+
+(`gh-22357 <https://github.com/numpy/numpy/pull/22357>`__)
+
+
+Performance improvements and changes
+====================================
+
+Faster version of ``np.isin`` and ``np.in1d`` for integer arrays
+----------------------------------------------------------------
+``np.in1d`` (used by ``np.isin``) can now switch to a faster algorithm (up to
+>10x faster) when it is passed two integer arrays. This is often automatically
+used, but you can use ``kind="sort"`` or ``kind="table"`` to force the old or
+new method, respectively.
+
+(`gh-12065 <https://github.com/numpy/numpy/pull/12065>`__)
+
+Faster comparison operators
+----------------------------
+The comparison functions (``numpy.equal``, ``numpy.not_equal``, ``numpy.less``,
+``numpy.less_equal``, ``numpy.greater`` and ``numpy.greater_equal``) are now
+much faster as they are now vectorized with universal intrinsics. For a CPU
+with SIMD extension AVX512BW, the performance gain is up to 2.57x, 1.65x and
+19.15x for integer, float and boolean data types, respectively (with N=50000).
+
+(`gh-21483 <https://github.com/numpy/numpy/pull/21483>`__)
+
Changes
=======
+
+Better reporting of integer division overflow
+---------------------------------------------
+Integer division overflow of scalars and arrays used to provide a
+``RuntimeWarning`` and the return value was undefined leading to crashes at
+rare occasions::
+
+ >>> np.array([np.iinfo(np.int32).min]*10, dtype=np.int32) // np.int32(-1)
+ <stdin>:1: RuntimeWarning: divide by zero encountered in floor_divide
+ array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)
+
+Integer division overflow now returns the input dtype's minimum value and raises
+the following ``RuntimeWarning``::
+
+ >>> np.array([np.iinfo(np.int32).min]*10, dtype=np.int32) // np.int32(-1)
+ <stdin>:1: RuntimeWarning: overflow encountered in floor_divide
+ array([-2147483648, -2147483648, -2147483648, -2147483648, -2147483648,
+ -2147483648, -2147483648, -2147483648, -2147483648, -2147483648],
+ dtype=int32)
+
+(`gh-21506 <https://github.com/numpy/numpy/pull/21506>`__)
+
+``masked_invalid`` now modifies the mask in-place
+-------------------------------------------------
+When used with ``copy=False``, ``numpy.ma.masked_invalid`` now modifies the
+input masked array in-place. This makes it behave identically to
+``masked_where`` and better matches the documentation.
+
+(`gh-22046 <https://github.com/numpy/numpy/pull/22046>`__)
+
+``nditer``/``NpyIter`` allows allocating all operands
+---------------------------------------------------------
+The NumPy iterator available through ``np.nditer`` in Python and as ``NpyIter``
+in C now supports allocating all arrays. The iterator shape defaults to ``()``
+in this case. The operands' dtype must be provided, since a "common dtype"
+cannot be inferred from the other inputs.
+
+(`gh-22457 <https://github.com/numpy/numpy/pull/22457>`__)
diff --git a/doc/source/release/1.24.1-notes.rst b/doc/source/release/1.24.1-notes.rst
new file mode 100644
index 000000000..c346f6d20
--- /dev/null
+++ b/doc/source/release/1.24.1-notes.rst
@@ -0,0 +1,50 @@
+.. currentmodule:: numpy
+
+==========================
+NumPy 1.24.1 Release Notes
+==========================
+NumPy 1.24.1 is a maintenance release that fixes bugs and regressions discovered after the
+1.24.0 release. The Python versions supported by this release are 3.8-3.11.
+
+Contributors
+============
+
+A total of 12 people contributed to this release. People with a "+" by their
+names contributed a patch for the first time.
+
+* Andrew Nelson
+* Ben Greiner +
+* Charles Harris
+* Clément Robert
+* Matteo Raso
+* Matti Picus
+* Melissa Weber Mendonça
+* Miles Cranmer
+* Ralf Gommers
+* Rohit Goswami
+* Sayed Adel
+* Sebastian Berg
+
+Pull requests merged
+====================
+
+A total of 18 pull requests were merged for this release.
+
+* `#22820 <https://github.com/numpy/numpy/pull/22820>`__: BLD: add workaround in setup.py for newer setuptools
+* `#22830 <https://github.com/numpy/numpy/pull/22830>`__: BLD: CIRRUS_TAG redux
+* `#22831 <https://github.com/numpy/numpy/pull/22831>`__: DOC: fix a couple typos in 1.23 notes
+* `#22832 <https://github.com/numpy/numpy/pull/22832>`__: BUG: Fix refcounting errors found using pytest-leaks
+* `#22834 <https://github.com/numpy/numpy/pull/22834>`__: BUG, SIMD: Fix invalid value encountered in several ufuncs
+* `#22837 <https://github.com/numpy/numpy/pull/22837>`__: TST: ignore more np.distutils.log imports
+* `#22839 <https://github.com/numpy/numpy/pull/22839>`__: BUG: Do not use getdata() in np.ma.masked_invalid
+* `#22847 <https://github.com/numpy/numpy/pull/22847>`__: BUG: Ensure correct behavior for rows ending in delimiter in...
+* `#22848 <https://github.com/numpy/numpy/pull/22848>`__: BUG, SIMD: Fix the bitmask of the boolean comparison
+* `#22857 <https://github.com/numpy/numpy/pull/22857>`__: BLD: Help raspian arm + clang 13 about __builtin_mul_overflow
+* `#22858 <https://github.com/numpy/numpy/pull/22858>`__: API: Ensure a full mask is returned for masked_invalid
+* `#22866 <https://github.com/numpy/numpy/pull/22866>`__: BUG: Polynomials now copy properly (#22669)
+* `#22867 <https://github.com/numpy/numpy/pull/22867>`__: BUG, SIMD: Fix memory overlap in ufunc comparison loops
+* `#22868 <https://github.com/numpy/numpy/pull/22868>`__: BUG: Fortify string casts against floating point warnings
+* `#22875 <https://github.com/numpy/numpy/pull/22875>`__: TST: Ignore nan-warnings in randomized out tests
+* `#22883 <https://github.com/numpy/numpy/pull/22883>`__: MAINT: restore npymath implementations needed for freebsd
+* `#22884 <https://github.com/numpy/numpy/pull/22884>`__: BUG: Fix integer overflow in in1d for mixed integer dtypes #22877
+* `#22887 <https://github.com/numpy/numpy/pull/22887>`__: BUG: Use whole file for encoding checks with ``charset_normalizer``.
diff --git a/doc/source/release/1.24.2-notes.rst b/doc/source/release/1.24.2-notes.rst
new file mode 100644
index 000000000..9e9412244
--- /dev/null
+++ b/doc/source/release/1.24.2-notes.rst
@@ -0,0 +1,51 @@
+.. currentmodule:: numpy
+
+==========================
+NumPy 1.24.2 Release Notes
+==========================
+NumPy 1.24.2 is a maintenance release that fixes bugs and regressions discovered after the
+1.24.1 release. The Python versions supported by this release are 3.8-3.11.
+
+Contributors
+============
+
+A total of 14 people contributed to this release. People with a "+" by their
+names contributed a patch for the first time.
+
+* Bas van Beek
+* Charles Harris
+* Khem Raj +
+* Mark Harfouche
+* Matti Picus
+* Panagiotis Zestanakis +
+* Peter Hawkins
+* Pradipta Ghosh
+* Ross Barnowski
+* Sayed Adel
+* Sebastian Berg
+* Syam Gadde +
+* dmbelov +
+* pkubaj +
+
+Pull requests merged
+====================
+
+A total of 17 pull requests were merged for this release.
+
+* `#22965 <https://github.com/numpy/numpy/pull/22965>`__: MAINT: Update python 3.11-dev to 3.11.
+* `#22966 <https://github.com/numpy/numpy/pull/22966>`__: DOC: Remove dangling deprecation warning
+* `#22967 <https://github.com/numpy/numpy/pull/22967>`__: ENH: Detect CPU features on FreeBSD/powerpc64*
+* `#22968 <https://github.com/numpy/numpy/pull/22968>`__: BUG: np.loadtxt cannot load text file with quoted fields separated...
+* `#22969 <https://github.com/numpy/numpy/pull/22969>`__: TST: Add fixture to avoid issue with randomizing test order.
+* `#22970 <https://github.com/numpy/numpy/pull/22970>`__: BUG: Fix fill violating read-only flag. (#22959)
+* `#22971 <https://github.com/numpy/numpy/pull/22971>`__: MAINT: Add additional information to missing scalar AttributeError
+* `#22972 <https://github.com/numpy/numpy/pull/22972>`__: MAINT: Move export for scipy arm64 helper into main module
+* `#22976 <https://github.com/numpy/numpy/pull/22976>`__: BUG, SIMD: Fix spurious invalid exception for sin/cos on arm64/clang
+* `#22989 <https://github.com/numpy/numpy/pull/22989>`__: BUG: Ensure correct loop order in sin, cos, and arctan2
+* `#23030 <https://github.com/numpy/numpy/pull/23030>`__: DOC: Add version added information for the strict parameter in...
+* `#23031 <https://github.com/numpy/numpy/pull/23031>`__: BUG: use ``_Alignof`` rather than ``offsetof()`` on most compilers
+* `#23147 <https://github.com/numpy/numpy/pull/23147>`__: BUG: Fix for npyv__trunc_s32_f32 (VXE)
+* `#23148 <https://github.com/numpy/numpy/pull/23148>`__: BUG: Fix integer / float scalar promotion
+* `#23149 <https://github.com/numpy/numpy/pull/23149>`__: BUG: Add missing <type_traits> header.
+* `#23150 <https://github.com/numpy/numpy/pull/23150>`__: TYP, MAINT: Add a missing explicit ``Any`` parameter to the ``npt.ArrayLike``...
+* `#23161 <https://github.com/numpy/numpy/pull/23161>`__: BLD: remove redundant definition of npy_nextafter [wheel build]
diff --git a/doc/source/release/1.24.3-notes.rst b/doc/source/release/1.24.3-notes.rst
new file mode 100644
index 000000000..1aaf79cb6
--- /dev/null
+++ b/doc/source/release/1.24.3-notes.rst
@@ -0,0 +1,49 @@
+.. currentmodule:: numpy
+
+==========================
+NumPy 1.24.3 Release Notes
+==========================
+NumPy 1.24.3 is a maintenance release that fixes bugs and regressions discovered after the
+1.24.2 release. The Python versions supported by this release are 3.8-3.11.
+
+Contributors
+============
+
+A total of 12 people contributed to this release. People with a "+" by their
+names contributed a patch for the first time.
+
+* Aleksei Nikiforov +
+* Alexander Heger
+* Bas van Beek
+* Bob Eldering
+* Brock Mendel
+* Charles Harris
+* Kyle Sunden
+* Peter Hawkins
+* Rohit Goswami
+* Sebastian Berg
+* Warren Weckesser
+* dependabot[bot]
+
+Pull requests merged
+====================
+
+A total of 17 pull requests were merged for this release.
+
+* `#23206 <https://github.com/numpy/numpy/pull/23206>`__: BUG: fix for f2py string scalars (#23194)
+* `#23207 <https://github.com/numpy/numpy/pull/23207>`__: BUG: datetime64/timedelta64 comparisons return NotImplemented
+* `#23208 <https://github.com/numpy/numpy/pull/23208>`__: MAINT: Pin matplotlib to version 3.6.3 for refguide checks
+* `#23221 <https://github.com/numpy/numpy/pull/23221>`__: DOC: Fix matplotlib error in documentation
+* `#23226 <https://github.com/numpy/numpy/pull/23226>`__: CI: Ensure submodules are initialized in gitpod.
+* `#23341 <https://github.com/numpy/numpy/pull/23341>`__: TYP: Replace duplicate reduce in ufunc type signature with reduceat.
+* `#23342 <https://github.com/numpy/numpy/pull/23342>`__: TYP: Remove duplicate CLIP/WRAP/RAISE in ``__init__.pyi``.
+* `#23343 <https://github.com/numpy/numpy/pull/23343>`__: TYP: Mark ``d`` argument to fftfreq and rfftfreq as optional...
+* `#23344 <https://github.com/numpy/numpy/pull/23344>`__: TYP: Add type annotations for comparison operators to MaskedArray.
+* `#23345 <https://github.com/numpy/numpy/pull/23345>`__: TYP: Remove some stray type-check-only imports of ``msort``
+* `#23370 <https://github.com/numpy/numpy/pull/23370>`__: BUG: Ensure like is only stripped for ``like=`` dispatched functions
+* `#23543 <https://github.com/numpy/numpy/pull/23543>`__: BUG: fix loading and storing big arrays on s390x
+* `#23544 <https://github.com/numpy/numpy/pull/23544>`__: MAINT: Bump larsoner/circleci-artifacts-redirector-action
+* `#23634 <https://github.com/numpy/numpy/pull/23634>`__: BUG: Ignore invalid and overflow warnings in masked setitem
+* `#23635 <https://github.com/numpy/numpy/pull/23635>`__: BUG: Fix masked array raveling when ``order="A"`` or ``order="K"``
+* `#23636 <https://github.com/numpy/numpy/pull/23636>`__: MAINT: Update conftest for newer hypothesis versions
+* `#23637 <https://github.com/numpy/numpy/pull/23637>`__: BUG: Fix bug in parsing F77 style string arrays.
diff --git a/doc/source/user/absolute_beginners.rst b/doc/source/user/absolute_beginners.rst
index dae2c4f06..dfcdc669b 100644
--- a/doc/source/user/absolute_beginners.rst
+++ b/doc/source/user/absolute_beginners.rst
@@ -64,8 +64,8 @@ To access NumPy and its functions import it in your Python code like this::
import numpy as np
We shorten the imported name to ``np`` for better readability of code using
-NumPy. This is a widely adopted convention that you should follow so that
-anyone working with your code can easily understand it.
+NumPy. This is a widely adopted convention that makes your code more readable
+for everyone working on it. We recommend always using ``import numpy as np``.
Reading the example code
------------------------
diff --git a/doc/source/user/basics.creation.rst b/doc/source/user/basics.creation.rst
index a97d69d8b..3ee501889 100644
--- a/doc/source/user/basics.creation.rst
+++ b/doc/source/user/basics.creation.rst
@@ -75,8 +75,8 @@ the computation, here ``uint32`` and ``int32`` can both be represented in
as ``int64``.
The default NumPy behavior is to create arrays in either 32 or 64-bit signed
-integers (platform dependent and matches C int size) or double precision
-floating point numbers, int32/int64 and float, respectively. If you expect your
+integers (platform dependent and matches C ``long`` size) or double precision
+floating point numbers. If you expect your
integer arrays to be a specific type, then you need to specify the dtype while
you create the array.
@@ -352,7 +352,7 @@ and :func:`numpy.genfromtxt`. These functions have more involved use cases in
2, 4
3, 9
-Importing ``simple.csv`` is accomplished using :func:`loadtxt`::
+Importing ``simple.csv`` is accomplished using :func:`numpy.loadtxt`::
>>> np.loadtxt('simple.csv', delimiter = ',', skiprows = 1) # doctest: +SKIP
array([[0., 0.],
diff --git a/doc/source/user/basics.dispatch.rst b/doc/source/user/basics.dispatch.rst
index 7c30272ad..a493ef769 100644
--- a/doc/source/user/basics.dispatch.rst
+++ b/doc/source/user/basics.dispatch.rst
@@ -281,14 +281,14 @@ Numpy provides some utilities to aid testing of custom array containers that
implement the ``__array_ufunc__`` and ``__array_function__`` protocols in the
``numpy.testing.overrides`` namespace.
-To check if a Numpy function can be overriden via ``__array_ufunc__``, you can
+To check if a Numpy function can be overridden via ``__array_ufunc__``, you can
use :func:`~numpy.testing.overrides.allows_array_ufunc_override`:
>>> from np.testing.overrides import allows_array_ufunc_override
>>> allows_array_ufunc_override(np.add)
True
-Similarly, you can check if a function can be overriden via
+Similarly, you can check if a function can be overridden via
``__array_function__`` using
:func:`~numpy.testing.overrides.allows_array_function_override`.
diff --git a/doc/source/user/basics.indexing.rst b/doc/source/user/basics.indexing.rst
index b140e223a..17fc9c0cc 100644
--- a/doc/source/user/basics.indexing.rst
+++ b/doc/source/user/basics.indexing.rst
@@ -481,12 +481,13 @@ tuple (of length :attr:`obj.ndim <ndarray.ndim>`) of integer index
arrays showing the :py:data:`True` elements of *obj*. However, it is
faster when ``obj.shape == x.shape``.
-If ``obj.ndim == x.ndim``, ``x[obj]`` returns a 1-dimensional array
-filled with the elements of *x* corresponding to the :py:data:`True`
-values of *obj*. The search order will be :term:`row-major`,
-C-style. If *obj* has :py:data:`True` values at entries that are outside
-of the bounds of *x*, then an index error will be raised. If *obj* is
-smaller than *x* it is identical to filling it with :py:data:`False`.
+If ``obj.ndim == x.ndim``, ``x[obj]``
+returns a 1-dimensional array filled with the elements of *x*
+corresponding to the :py:data:`True` values of *obj*. The search order
+will be :term:`row-major`, C-style. An index error will be raised if
+the shape of *obj* does not match the corresponding dimensions of *x*,
+regardless of whether those values are :py:data:`True` or
+:py:data:`False`.
A common use case for this is filtering for desired element values.
For example, one may wish to select all entries from an array which
diff --git a/doc/source/user/basics.rec.rst b/doc/source/user/basics.rec.rst
index b56b4d177..640cfaa8b 100644
--- a/doc/source/user/basics.rec.rst
+++ b/doc/source/user/basics.rec.rst
@@ -166,6 +166,11 @@ attribute of the dtype object::
>>> d.names
('x', 'y')
+The dtype of each individual field can be looked up by name::
+
+ >>> d['x']
+ dtype('int64')
+
The field names may be modified by assigning to the ``names`` attribute using a
sequence of strings of the same length.
diff --git a/doc/source/user/building.rst b/doc/source/user/building.rst
index d01b6c44e..442bda4b3 100644
--- a/doc/source/user/building.rst
+++ b/doc/source/user/building.rst
@@ -3,31 +3,14 @@
Building from source
====================
-There are two options for building NumPy- building with Gitpod or locally from
-source. Your choice depends on your operating system and familiarity with the
-command line.
-
-Gitpod
-------
-
-Gitpod is an open-source platform that automatically creates
-the correct development environment right in your browser, reducing the need to
-install local development environments and deal with incompatible dependencies.
-
-If you are a Windows user, unfamiliar with using the command line or building
-NumPy for the first time, it is often faster to build with Gitpod. Here are the
-in-depth instructions for building NumPy with `building NumPy with Gitpod`_.
-
-.. _building NumPy with Gitpod: https://numpy.org/devdocs/dev/development_gitpod.html
-
-Building locally
-----------------
-
-Building locally on your machine gives you
-more granular control. If you are a MacOS or Linux user familiar with using the
+Building locally on your machine gives you complete control over build options.
+If you are a MacOS or Linux user familiar with using the
command line, you can continue with building NumPy locally by following the
instructions below.
+.. note:: If you want to build NumPy for development purposes, please refer to
+ :ref:`development-environment` for additional information.
+
..
This page is referenced from numpy/numpy/__init__.py. Please keep its
location in sync with the link there.
@@ -37,7 +20,7 @@ Prerequisites
Building NumPy requires the following software installed:
-1) Python 3.8.x or newer
+1) Python 3.9.x or newer
Please note that the Python development headers also need to be installed,
e.g., on Debian/Ubuntu one needs to install both `python3` and
diff --git a/doc/source/user/how-to-index.rst b/doc/source/user/how-to-index.rst
index e47e9a204..97c451260 100644
--- a/doc/source/user/how-to-index.rst
+++ b/doc/source/user/how-to-index.rst
@@ -309,6 +309,30 @@ result as dimensions with size one::
array([[[2, 2, 2, 2, 2]],
<BLANKLINE>
[[2, 2, 2, 2, 2]]])
+
+To get the indices of each maximum or minimum value for each
+(N-1)-dimensional array in an N-dimensional array, use :meth:`reshape`
+to reshape the array to a 2D array, apply :meth:`argmax` or :meth:`argmin`
+along ``axis=1`` and use :meth:`unravel_index` to recover the index of the
+values per slice::
+
+ >>> x = np.arange(2*2*3).reshape(2, 2, 3) % 7 # 3D example array
+ >>> x
+ array([[[0, 1, 2],
+ [3, 4, 5]],
+ <BLANKLINE>
+ [[6, 0, 1],
+ [2, 3, 4]]])
+ >>> x_2d = np.reshape(x, (x.shape[0], -1))
+ >>> indices_2d = np.argmax(x_2d, axis=1)
+ >>> indices_2d
+ array([5, 0])
+ >>> np.unravel_index(indices_2d, x.shape[1:])
+ (array([1, 0]), array([2, 0]))
+
+The first array returned contains the indices along axis 1 in the original
+array, the second array contains the indices along axis 2. The highest
+value in ``x[0]`` is therefore ``x[0, 1, 2]``.
Index the same ndarray multiple times efficiently
=================================================
@@ -348,4 +372,4 @@ Exercises `6`_, `8`_, `10`_, `15`_, `16`_, `19`_, `20`_, `45`_, `59`_,
.. _87: https://github.com/rougier/numpy-100/blob/master/100_Numpy_exercises_with_solutions.md#87-consider-a-16x16-array-how-to-get-the-block-sum-block-size-is-4x4-
.. _90: https://github.com/rougier/numpy-100/blob/master/100_Numpy_exercises_with_solutions.md#90-given-an-arbitrary-number-of-vectors-build-the-cartesian-product-every-combinations-of-every-item-
.. _93: https://github.com/rougier/numpy-100/blob/master/100_Numpy_exercises_with_solutions.md#93-consider-two-arrays-a-and-b-of-shape-83-and-22-how-to-find-rows-of-a-that-contain-elements-of-each-row-of-b-regardless-of-the-order-of-the-elements-in-b-
-.. _94: https://github.com/rougier/numpy-100/blob/master/100_Numpy_exercises_with_solutions.md#94-considering-a-10x3-matrix-extract-rows-with-unequal-values-eg-223- \ No newline at end of file
+.. _94: https://github.com/rougier/numpy-100/blob/master/100_Numpy_exercises_with_solutions.md#94-considering-a-10x3-matrix-extract-rows-with-unequal-values-eg-223-
diff --git a/doc/source/user/how-to-verify-bug.rst b/doc/source/user/how-to-verify-bug.rst
index 4fc58c707..6e76f453a 100644
--- a/doc/source/user/how-to-verify-bug.rst
+++ b/doc/source/user/how-to-verify-bug.rst
@@ -76,7 +76,7 @@ The report references NumPy version 1.18.4, so that is the version you need to
install in this case.
Since this bug is tied to a release and not a specific commit, a pre-built wheel
-installed in your virtual environment via `pip` will suffice::
+installed in your virtual environment via ``pip`` will suffice::
pip install numpy==1.18.4
diff --git a/doc/source/user/howtos_index.rst b/doc/source/user/howtos_index.rst
index 4a0a22900..ca30f7e91 100644
--- a/doc/source/user/howtos_index.rst
+++ b/doc/source/user/howtos_index.rst
@@ -1,7 +1,7 @@
.. _howtos:
################
-NumPy How Tos
+NumPy how-tos
################
These documents are intended as recipes to common tasks using NumPy. For
diff --git a/doc/source/user/quickstart.rst b/doc/source/user/quickstart.rst
index d138242d7..783d5a447 100644
--- a/doc/source/user/quickstart.rst
+++ b/doc/source/user/quickstart.rst
@@ -1482,7 +1482,7 @@ Further reading
- The `Python tutorial <https://docs.python.org/tutorial/>`__
- :ref:`reference`
-- `SciPy Tutorial <https://docs.scipy.org/doc/scipy/reference/tutorial/index.html>`__
+- `SciPy Tutorial <https://docs.scipy.org/doc/scipy/tutorial/index.html>`__
- `SciPy Lecture Notes <https://scipy-lectures.org>`__
- A `matlab, R, IDL, NumPy/SciPy dictionary <http://mathesaurus.sf.net/>`__
- :doc:`tutorial-svd <numpy-tutorials:content/tutorial-svd>`
diff --git a/doc/source/user/theory.broadcasting.rst b/doc/source/user/theory.broadcasting.rst
index a4973e4e6..f277d4afd 100644
--- a/doc/source/user/theory.broadcasting.rst
+++ b/doc/source/user/theory.broadcasting.rst
@@ -1,7 +1,7 @@
:orphan:
===========================
-Array Broadcasting in Numpy
+Array broadcasting in Numpy
===========================
..
diff --git a/doc/source/user/troubleshooting-importerror.rst b/doc/source/user/troubleshooting-importerror.rst
index 2227d7b07..552748a6a 100644
--- a/doc/source/user/troubleshooting-importerror.rst
+++ b/doc/source/user/troubleshooting-importerror.rst
@@ -6,9 +6,9 @@
to this page.
-***************************
-Troubleshooting ImportError
-***************************
+***************
+Troubleshooting
+***************
.. note::
@@ -183,3 +183,65 @@ that usually works is to upgrade the NumPy version::
pip install numpy --upgrade
+Segfaults or crashes
+====================
+
+NumPy tries to use advanced CPU features (SIMD) to speed up operations. If you
+are getting an "illegal instruction" error or a segfault, one cause could be
+that the environment claims it can support one or more of these features but
+actually cannot. This can happen inside a docker image or a VM (qemu, VMWare,
+...).
+
+You can use the output of ``np.show_runtime()`` to show which SIMD features are
+detected. For instance::
+
+ >>> np.show_runtime()
+ WARNING: `threadpoolctl` not found in system! Install it by `pip install \
+ threadpoolctl`. Once installed, try `np.show_runtime` again for more detailed
+ build information
+ [{'simd_extensions': {'baseline': ['SSE', 'SSE2', 'SSE3'],
+ 'found': ['SSSE3',
+ 'SSE41',
+ 'POPCNT',
+ 'SSE42',
+ 'AVX',
+ 'F16C',
+ 'FMA3',
+ 'AVX2'],
+ 'not_found': ['AVX512F',
+ 'AVX512CD',
+ 'AVX512_KNL',
+ 'AVX512_KNM',
+ 'AVX512_SKX',
+ 'AVX512_CLX',
+ 'AVX512_CNL',
+ 'AVX512_ICL']}}]
+
+In this case, it shows AVX2 and FMA3 under the ``found`` section, so you can
+try disabling them by setting ``NPY_DISABLE_CPU_FEATURES="AVX2,FMA3"`` in your
+environment before running python (for cmd.exe on windows)::
+
+ >SET NPY_DISABLE_CPU_FEATURES="AVX2,FMA3"
+ >python <myprogram.py>
+
+By installing threadpoolctl ``np.show_runtime()`` will show additional information::
+
+ ...
+ {'architecture': 'Zen',
+ 'filepath': '/tmp/venv3/lib/python3.9/site-packages/numpy.libs/libopenblas64_p-r0-15028c96.3.21.so',
+ 'internal_api': 'openblas',
+ 'num_threads': 24,
+ 'prefix': 'libopenblas',
+ 'threading_layer': 'pthreads',
+ 'user_api': 'blas',
+ 'version': '0.3.21'}]
+
+If you use the wheel from PyPI, it contains code from the OpenBLAS project to
+speed up matrix operations. This code too can try to use SIMD instructions. It
+has a different mechanism for choosing which to use, based on a CPU
+architecture. You can override this architecture by setting
+``OPENBLAS_CORETYPE``: a minimal value for ``x86_64`` is
+``OPENBLAS_CORETYPE=Haswell``. This too needs to be set before running your
+python (this time for posix)::
+
+ $ OPENBLAS_CORETYPE=Haswell python <myprogram.py>
diff --git a/doc_requirements.txt b/doc_requirements.txt
index 4c7396efe..b8a82aff7 100644
--- a/doc_requirements.txt
+++ b/doc_requirements.txt
@@ -1,13 +1,13 @@
# doxygen required, use apt-get or dnf
sphinx>=4.5.0
numpydoc==1.4
-pydata-sphinx-theme==0.9.0
+pydata-sphinx-theme==0.13.3
sphinx-design
ipython!=8.1.0
scipy
matplotlib
pandas
-breathe
+breathe>4.33.0
# needed to build release notes
towncrier
diff --git a/environment.yml b/environment.yml
index 8dbb1f80d..a6a309c8d 100644
--- a/environment.yml
+++ b/environment.yml
@@ -15,8 +15,9 @@ dependencies:
- setuptools=59.2.0
- meson >= 0.64.0
- ninja
- - pkg-config # note: not available on Windows, comment out this line
+ - pkg-config
- meson-python
+ - pip # so you can use pip to install spin
# For testing
- pytest
- pytest-cov
@@ -33,7 +34,7 @@ dependencies:
- scipy
- pandas
- matplotlib
- - pydata-sphinx-theme=0.9.0
+ - pydata-sphinx-theme=0.13.3
- doxygen
# NOTE: breathe 4.33.0 collides with sphinx.ext.graphviz
- breathe>4.33.0
diff --git a/linter_requirements.txt b/linter_requirements.txt
index 6ed26c5c0..2e0298bae 100644
--- a/linter_requirements.txt
+++ b/linter_requirements.txt
@@ -1,2 +1,2 @@
pycodestyle==2.8.0
-GitPython==3.1.13 \ No newline at end of file
+GitPython>=3.1.30
diff --git a/meson.build b/meson.build
index 2a1a40384..47e71efc0 100644
--- a/meson.build
+++ b/meson.build
@@ -9,8 +9,9 @@ project(
meson_version: '>= 0.64.0',
default_options: [
'buildtype=debugoptimized',
+ 'b_ndebug=if-release',
'c_std=c99',
- 'cpp_std=c++14',
+ 'cpp_std=c++17',
'blas=openblas',
'lapack=openblas',
'pkgconfig.relocatable=true',
diff --git a/numpy/__config__.py.in b/numpy/__config__.py.in
index cda615904..6c6c21cb8 100644
--- a/numpy/__config__.py.in
+++ b/numpy/__config__.py.in
@@ -1,52 +1,130 @@
# This file is generated by numpy's build process
# It contains system_info results at the time of building this package.
-__all__ = ["get_info", "show"]
-
-import os
-import sys
-
-extra_dll_dir = os.path.join(os.path.dirname(__file__), '.libs')
-
-if sys.platform == 'win32' and os.path.isdir(extra_dll_dir):
- os.add_dll_directory(extra_dll_dir)
-
-blas_armpl_info={}
-blas_mkl_info={}
-blis_info={}
-openblas_info={}
-accelerate_info={}
-atlas_3_10_blas_threads_info={}
-atlas_3_10_blas_info={}
-atlas_blas_threads_info={}
-atlas_blas_info={}
-blas_info={}
-blas_src_info={}
-blas_opt_info={}
-lapack_armpl_info={}
-lapack_mkl_info={}
-openblas_lapack_info={}
-openblas_clapack_info={}
-flame_info={}
-atlas_3_10_threads_info={}
-atlas_3_10_info={}
-atlas_threads_info={}
-atlas_info={}
-lapack_info={}
-lapack_src_info={}
-lapack_opt_info={}
-numpy_linalg_lapack_lite={}
-
-def get_info(name):
- g = globals()
- return g.get(name, g.get(name + "_info", {}))
-
-def show():
+from enum import Enum
+from numpy.core._multiarray_umath import (
+ __cpu_features__,
+ __cpu_baseline__,
+ __cpu_dispatch__,
+)
+
+__all__ = ["show"]
+_built_with_meson = True
+
+
+class DisplayModes(Enum):
+ stdout = "stdout"
+ dicts = "dicts"
+
+
+def _cleanup(d):
"""
- Show libraries in the system on which NumPy was built.
+ Removes empty values in a `dict` recursively
+ This ensures we remove values that Meson could not provide to CONFIG
+ """
+ if isinstance(d, dict):
+ return {k: _cleanup(v) for k, v in d.items() if v and _cleanup(v)}
+ else:
+ return d
+
+
+CONFIG = _cleanup(
+ {
+ "Compilers": {
+ "c": {
+ "name": "@C_COMP@",
+ "linker": "@C_COMP_LINKER_ID@",
+ "version": "@C_COMP_VERSION@",
+ "commands": "@C_COMP_CMD_ARRAY@",
+ },
+ "cython": {
+ "name": "@CYTHON_COMP@",
+ "linker": "@CYTHON_COMP_LINKER_ID@",
+ "version": "@CYTHON_COMP_VERSION@",
+ "commands": "@CYTHON_COMP_CMD_ARRAY@",
+ },
+ "c++": {
+ "name": "@CPP_COMP@",
+ "linker": "@CPP_COMP_LINKER_ID@",
+ "version": "@CPP_COMP_VERSION@",
+ "commands": "@CPP_COMP_CMD_ARRAY@",
+ },
+ },
+ "Machine Information": {
+ "host": {
+ "cpu": "@HOST_CPU@",
+ "family": "@HOST_CPU_FAMILY@",
+ "endian": "@HOST_CPU_ENDIAN@",
+ "system": "@HOST_CPU_SYSTEM@",
+ },
+ "build": {
+ "cpu": "@BUILD_CPU@",
+ "family": "@BUILD_CPU_FAMILY@",
+ "endian": "@BUILD_CPU_ENDIAN@",
+ "system": "@BUILD_CPU_SYSTEM@",
+ },
+ "cross-compiled": bool("@CROSS_COMPILED@".lower().replace("false", "")),
+ },
+ "Build Dependencies": {
+ "blas": {
+ "name": "@BLAS_NAME@",
+ "found": bool("@BLAS_FOUND@".lower().replace("false", "")),
+ "version": "@BLAS_VERSION@",
+ "detection method": "@BLAS_TYPE_NAME@",
+ "include directory": r"@BLAS_INCLUDEDIR@",
+ "lib directory": r"@BLAS_LIBDIR@",
+ "openblas configuration": "@BLAS_OPENBLAS_CONFIG@",
+ "pc file directory": r"@BLAS_PCFILEDIR@",
+ },
+ "lapack": {
+ "name": "@LAPACK_NAME@",
+ "found": bool("@LAPACK_FOUND@".lower().replace("false", "")),
+ "version": "@LAPACK_VERSION@",
+ "detection method": "@LAPACK_TYPE_NAME@",
+ "include directory": r"@LAPACK_INCLUDEDIR@",
+ "lib directory": r"@LAPACK_LIBDIR@",
+ "openblas configuration": "@LAPACK_OPENBLAS_CONFIG@",
+ "pc file directory": r"@LAPACK_PCFILEDIR@",
+ },
+ },
+ "Python Information": {
+ "path": r"@PYTHON_PATH@",
+ "version": "@PYTHON_VERSION@",
+ },
+ "SIMD Extensions": {
+ "baseline": __cpu_baseline__,
+ "found": [
+ feature for feature in __cpu_dispatch__ if __cpu_features__[feature]
+ ],
+ "not found": [
+ feature for feature in __cpu_dispatch__ if not __cpu_features__[feature]
+ ],
+ },
+ }
+)
+
+
+def _check_pyyaml():
+ import yaml
+
+ return yaml
+
+
+def show(mode=DisplayModes.stdout.value):
+ """
+ Show libraries and system information on which NumPy was built
+ and is being used
+
+ Parameters
+ ----------
+ mode : {`'stdout'`, `'dicts'`}, optional.
+ Indicates how to display the config information.
+ `'stdout'` prints to console, `'dicts'` returns a dictionary
+ of the configuration.
- Print information about various resources (libraries, library
- directories, include directories, etc.) in the system on which
- NumPy was built.
+ Returns
+ -------
+ out : {`dict`, `None`}
+ If mode is `'dicts'`, a dict is returned, else None
See Also
--------
@@ -55,80 +133,24 @@ def show():
Notes
-----
- 1. Classes specifying the information to be printed are defined
- in the `numpy.distutils.system_info` module.
-
- Information may include:
-
- * ``language``: language used to write the libraries (mostly
- C or f77)
- * ``libraries``: names of libraries found in the system
- * ``library_dirs``: directories containing the libraries
- * ``include_dirs``: directories containing library header files
- * ``src_dirs``: directories containing library source files
- * ``define_macros``: preprocessor macros used by
- ``distutils.setup``
- * ``baseline``: minimum CPU features required
- * ``found``: dispatched features supported in the system
- * ``not found``: dispatched features that are not supported
- in the system
-
- 2. NumPy BLAS/LAPACK Installation Notes
-
- Installing a numpy wheel (``pip install numpy`` or force it
- via ``pip install numpy --only-binary :numpy: numpy``) includes
- an OpenBLAS implementation of the BLAS and LAPACK linear algebra
- APIs. In this case, ``library_dirs`` reports the original build
- time configuration as compiled with gcc/gfortran; at run time
- the OpenBLAS library is in
- ``site-packages/numpy.libs/`` (linux), or
- ``site-packages/numpy/.dylibs/`` (macOS), or
- ``site-packages/numpy/.libs/`` (windows).
-
- Installing numpy from source
- (``pip install numpy --no-binary numpy``) searches for BLAS and
- LAPACK dynamic link libraries at build time as influenced by
- environment variables NPY_BLAS_LIBS, NPY_CBLAS_LIBS, and
- NPY_LAPACK_LIBS; or NPY_BLAS_ORDER and NPY_LAPACK_ORDER;
- or the optional file ``~/.numpy-site.cfg``.
- NumPy remembers those locations and expects to load the same
- libraries at run-time.
- In NumPy 1.21+ on macOS, 'accelerate' (Apple's Accelerate BLAS
- library) is in the default build-time search order after
- 'openblas'.
-
- Examples
- --------
- >>> import numpy as np
- >>> np.show_config()
- blas_opt_info:
- language = c
- define_macros = [('HAVE_CBLAS', None)]
- libraries = ['openblas', 'openblas']
- library_dirs = ['/usr/local/lib']
+ 1. The `'stdout'` mode will give more readable
+ output if ``pyyaml`` is installed
+
"""
- from numpy.core._multiarray_umath import (
- __cpu_features__, __cpu_baseline__, __cpu_dispatch__
- )
- for name,info_dict in globals().items():
- if name[0] == "_" or type(info_dict) is not type({}): continue
- print(name + ":")
- if not info_dict:
- print(" NOT AVAILABLE")
- for k,v in info_dict.items():
- v = str(v)
- if k == "sources" and len(v) > 200:
- v = v[:60] + " ...\n... " + v[-60:]
- print(" %s = %s" % (k,v))
-
- features_found, features_not_found = [], []
- for feature in __cpu_dispatch__:
- if __cpu_features__[feature]:
- features_found.append(feature)
- else:
- features_not_found.append(feature)
-
- print("Supported SIMD extensions in this NumPy install:")
- print(" baseline = %s" % (','.join(__cpu_baseline__)))
- print(" found = %s" % (','.join(features_found)))
- print(" not found = %s" % (','.join(features_not_found)))
+ if mode == DisplayModes.stdout.value:
+ try: # Non-standard library, check import
+ yaml = _check_pyyaml()
+
+ print(yaml.dump(CONFIG))
+ except ModuleNotFoundError:
+ import warnings
+ import json
+
+ warnings.warn("Install `pyyaml` for better output", stacklevel=1)
+ print(json.dumps(CONFIG, indent=2))
+ elif mode == DisplayModes.dicts.value:
+ return CONFIG
+ else:
+ raise AttributeError(
+ f"Invalid `mode`, use one of: {', '.join([e.value for e in DisplayModes])}"
+ )
diff --git a/numpy/__init__.cython-30.pxd b/numpy/__init__.cython-30.pxd
index 7771d32bd..0dd2fff2b 100644
--- a/numpy/__init__.cython-30.pxd
+++ b/numpy/__init__.cython-30.pxd
@@ -401,7 +401,7 @@ cdef extern from "numpy/arrayobject.h":
int PyArray_TYPE(ndarray arr) nogil
object PyArray_GETITEM(ndarray arr, void *itemptr)
- int PyArray_SETITEM(ndarray arr, void *itemptr, object obj)
+ int PyArray_SETITEM(ndarray arr, void *itemptr, object obj) except -1
bint PyTypeNum_ISBOOL(int) nogil
bint PyTypeNum_ISUNSIGNED(int) nogil
@@ -452,7 +452,7 @@ cdef extern from "numpy/arrayobject.h":
bint PyArray_SAFEALIGNEDCOPY(ndarray) nogil
bint PyArray_ISNBO(char) nogil # works on ndarray.byteorder
- bint PyArray_IsNativeByteOrder(char) nogil # works on ndarray.byteorder
+ bint PyArray_IsNativeByteOrder(char) nogil # works on ndarray.byteorder
bint PyArray_ISNOTSWAPPED(ndarray) nogil
bint PyArray_ISBYTESWAPPED(ndarray) nogil
@@ -515,7 +515,6 @@ cdef extern from "numpy/arrayobject.h":
void* PyArray_GETPTR3(ndarray m, npy_intp i, npy_intp j, npy_intp k) nogil
void* PyArray_GETPTR4(ndarray m, npy_intp i, npy_intp j, npy_intp k, npy_intp l) nogil
- void PyArray_XDECREF_ERR(ndarray)
# Cannot be supported due to out arg
# void PyArray_DESCR_REPLACE(descr)
@@ -549,21 +548,21 @@ cdef extern from "numpy/arrayobject.h":
# Functions taking dtype and returning object/ndarray are disabled
# for now as they steal dtype references. I'm conservative and disable
# more than is probably needed until it can be checked further.
- int PyArray_SetNumericOps (object)
+ int PyArray_SetNumericOps (object) except -1
object PyArray_GetNumericOps ()
- int PyArray_INCREF (ndarray)
- int PyArray_XDECREF (ndarray)
+ int PyArray_INCREF (ndarray) except * # uses PyArray_Item_INCREF...
+ int PyArray_XDECREF (ndarray) except * # uses PyArray_Item_DECREF...
void PyArray_SetStringFunction (object, int)
dtype PyArray_DescrFromType (int)
object PyArray_TypeObjectFromType (int)
char * PyArray_Zero (ndarray)
char * PyArray_One (ndarray)
#object PyArray_CastToType (ndarray, dtype, int)
- int PyArray_CastTo (ndarray, ndarray)
- int PyArray_CastAnyTo (ndarray, ndarray)
- int PyArray_CanCastSafely (int, int)
- npy_bool PyArray_CanCastTo (dtype, dtype)
- int PyArray_ObjectType (object, int)
+ int PyArray_CastTo (ndarray, ndarray) except -1
+ int PyArray_CastAnyTo (ndarray, ndarray) except -1
+ int PyArray_CanCastSafely (int, int) # writes errors
+ npy_bool PyArray_CanCastTo (dtype, dtype) # writes errors
+ int PyArray_ObjectType (object, int) except 0
dtype PyArray_DescrFromObject (object, dtype)
#ndarray* PyArray_ConvertToCommonType (object, int *)
dtype PyArray_DescrFromScalar (object)
@@ -587,34 +586,34 @@ cdef extern from "numpy/arrayobject.h":
#object PyArray_FromIter (object, dtype, npy_intp)
object PyArray_Return (ndarray)
#object PyArray_GetField (ndarray, dtype, int)
- #int PyArray_SetField (ndarray, dtype, int, object)
+ #int PyArray_SetField (ndarray, dtype, int, object) except -1
object PyArray_Byteswap (ndarray, npy_bool)
object PyArray_Resize (ndarray, PyArray_Dims *, int, NPY_ORDER)
- int PyArray_MoveInto (ndarray, ndarray)
- int PyArray_CopyInto (ndarray, ndarray)
- int PyArray_CopyAnyInto (ndarray, ndarray)
- int PyArray_CopyObject (ndarray, object)
+ int PyArray_MoveInto (ndarray, ndarray) except -1
+ int PyArray_CopyInto (ndarray, ndarray) except -1
+ int PyArray_CopyAnyInto (ndarray, ndarray) except -1
+ int PyArray_CopyObject (ndarray, object) except -1
object PyArray_NewCopy (ndarray, NPY_ORDER)
object PyArray_ToList (ndarray)
object PyArray_ToString (ndarray, NPY_ORDER)
- int PyArray_ToFile (ndarray, stdio.FILE *, char *, char *)
- int PyArray_Dump (object, object, int)
+ int PyArray_ToFile (ndarray, stdio.FILE *, char *, char *) except -1
+ int PyArray_Dump (object, object, int) except -1
object PyArray_Dumps (object, int)
- int PyArray_ValidType (int)
+ int PyArray_ValidType (int) # Cannot error
void PyArray_UpdateFlags (ndarray, int)
object PyArray_New (type, int, npy_intp *, int, npy_intp *, void *, int, int, object)
#object PyArray_NewFromDescr (type, dtype, int, npy_intp *, npy_intp *, void *, int, object)
#dtype PyArray_DescrNew (dtype)
dtype PyArray_DescrNewFromType (int)
- double PyArray_GetPriority (object, double)
+ double PyArray_GetPriority (object, double) # clears errors as of 1.25
object PyArray_IterNew (object)
object PyArray_MultiIterNew (int, ...)
- int PyArray_PyIntAsInt (object)
+ int PyArray_PyIntAsInt (object) except? -1
npy_intp PyArray_PyIntAsIntp (object)
- int PyArray_Broadcast (broadcast)
- void PyArray_FillObjectArray (ndarray, object)
- int PyArray_FillWithScalar (ndarray, object)
+ int PyArray_Broadcast (broadcast) except -1
+ void PyArray_FillObjectArray (ndarray, object) except *
+ int PyArray_FillWithScalar (ndarray, object) except -1
npy_bool PyArray_CheckStrides (int, int, npy_intp, npy_intp, npy_intp *, npy_intp *)
dtype PyArray_DescrNewByteorder (dtype, char)
object PyArray_IterAllButAxis (object, int *)
@@ -628,10 +627,10 @@ cdef extern from "numpy/arrayobject.h":
object PyArray_NewFlagsObject (object)
npy_bool PyArray_CanCastScalar (type, type)
#int PyArray_CompareUCS4 (npy_ucs4 *, npy_ucs4 *, register size_t)
- int PyArray_RemoveSmallest (broadcast)
+ int PyArray_RemoveSmallest (broadcast) except -1
int PyArray_ElementStrides (object)
- void PyArray_Item_INCREF (char *, dtype)
- void PyArray_Item_XDECREF (char *, dtype)
+ void PyArray_Item_INCREF (char *, dtype) except *
+ void PyArray_Item_XDECREF (char *, dtype) except *
object PyArray_FieldNames (object)
object PyArray_Transpose (ndarray, PyArray_Dims *)
object PyArray_TakeFrom (ndarray, object, int, ndarray, NPY_CLIPMODE)
@@ -639,7 +638,7 @@ cdef extern from "numpy/arrayobject.h":
object PyArray_PutMask (ndarray, object, object)
object PyArray_Repeat (ndarray, object, int)
object PyArray_Choose (ndarray, object, ndarray, NPY_CLIPMODE)
- int PyArray_Sort (ndarray, int, NPY_SORTKIND)
+ int PyArray_Sort (ndarray, int, NPY_SORTKIND) except -1
object PyArray_ArgSort (ndarray, int, NPY_SORTKIND)
object PyArray_SearchSorted (ndarray, object, NPY_SEARCHSIDE, PyObject *)
object PyArray_ArgMax (ndarray, int, ndarray)
@@ -677,49 +676,49 @@ cdef extern from "numpy/arrayobject.h":
#int PyArray_As2D (object*, char ***, int *, int *, int)
int PyArray_Free (object, void *)
#int PyArray_Converter (object, object*)
- int PyArray_IntpFromSequence (object, npy_intp *, int)
+ int PyArray_IntpFromSequence (object, npy_intp *, int) except -1
object PyArray_Concatenate (object, int)
object PyArray_InnerProduct (object, object)
object PyArray_MatrixProduct (object, object)
object PyArray_CopyAndTranspose (object)
object PyArray_Correlate (object, object, int)
int PyArray_TypestrConvert (int, int)
- #int PyArray_DescrConverter (object, dtype*)
- #int PyArray_DescrConverter2 (object, dtype*)
- int PyArray_IntpConverter (object, PyArray_Dims *)
- #int PyArray_BufferConverter (object, chunk)
- int PyArray_AxisConverter (object, int *)
- int PyArray_BoolConverter (object, npy_bool *)
- int PyArray_ByteorderConverter (object, char *)
- int PyArray_OrderConverter (object, NPY_ORDER *)
- unsigned char PyArray_EquivTypes (dtype, dtype)
+ #int PyArray_DescrConverter (object, dtype*) except 0
+ #int PyArray_DescrConverter2 (object, dtype*) except 0
+ int PyArray_IntpConverter (object, PyArray_Dims *) except 0
+ #int PyArray_BufferConverter (object, chunk) except 0
+ int PyArray_AxisConverter (object, int *) except 0
+ int PyArray_BoolConverter (object, npy_bool *) except 0
+ int PyArray_ByteorderConverter (object, char *) except 0
+ int PyArray_OrderConverter (object, NPY_ORDER *) except 0
+ unsigned char PyArray_EquivTypes (dtype, dtype) # clears errors
#object PyArray_Zeros (int, npy_intp *, dtype, int)
#object PyArray_Empty (int, npy_intp *, dtype, int)
object PyArray_Where (object, object, object)
object PyArray_Arange (double, double, double, int)
#object PyArray_ArangeObj (object, object, object, dtype)
- int PyArray_SortkindConverter (object, NPY_SORTKIND *)
+ int PyArray_SortkindConverter (object, NPY_SORTKIND *) except 0
object PyArray_LexSort (object, int)
object PyArray_Round (ndarray, int, ndarray)
unsigned char PyArray_EquivTypenums (int, int)
- int PyArray_RegisterDataType (dtype)
- int PyArray_RegisterCastFunc (dtype, int, PyArray_VectorUnaryFunc *)
- int PyArray_RegisterCanCast (dtype, int, NPY_SCALARKIND)
+ int PyArray_RegisterDataType (dtype) except -1
+ int PyArray_RegisterCastFunc (dtype, int, PyArray_VectorUnaryFunc *) except -1
+ int PyArray_RegisterCanCast (dtype, int, NPY_SCALARKIND) except -1
#void PyArray_InitArrFuncs (PyArray_ArrFuncs *)
object PyArray_IntTupleFromIntp (int, npy_intp *)
int PyArray_TypeNumFromName (char *)
- int PyArray_ClipmodeConverter (object, NPY_CLIPMODE *)
- #int PyArray_OutputConverter (object, ndarray*)
+ int PyArray_ClipmodeConverter (object, NPY_CLIPMODE *) except 0
+ #int PyArray_OutputConverter (object, ndarray*) except 0
object PyArray_BroadcastToShape (object, npy_intp *, int)
void _PyArray_SigintHandler (int)
void* _PyArray_GetSigintBuf ()
- #int PyArray_DescrAlignConverter (object, dtype*)
- #int PyArray_DescrAlignConverter2 (object, dtype*)
- int PyArray_SearchsideConverter (object, void *)
+ #int PyArray_DescrAlignConverter (object, dtype*) except 0
+ #int PyArray_DescrAlignConverter2 (object, dtype*) except 0
+ int PyArray_SearchsideConverter (object, void *) except 0
object PyArray_CheckAxis (ndarray, int *, int)
npy_intp PyArray_OverflowMultiplyList (npy_intp *, int)
int PyArray_CompareString (char *, char *, size_t)
- int PyArray_SetBaseObject(ndarray, base) # NOTE: steals a reference to base! Use "set_array_base()" instead.
+ int PyArray_SetBaseObject(ndarray, base) except -1 # NOTE: steals a reference to base! Use "set_array_base()" instead.
# Typedefs that matches the runtime dtype objects in
@@ -906,7 +905,7 @@ cdef extern from "numpy/ufuncobject.h":
object PyUFunc_FromFuncAndData(PyUFuncGenericFunction *,
void **, char *, int, int, int, int, char *, char *, int)
int PyUFunc_RegisterLoopForType(ufunc, int,
- PyUFuncGenericFunction, int *, void *)
+ PyUFuncGenericFunction, int *, void *) except -1
void PyUFunc_f_f_As_d_d \
(char **, npy_intp *, npy_intp *, void *)
void PyUFunc_d_d \
@@ -956,7 +955,7 @@ cdef extern from "numpy/ufuncobject.h":
void PyUFunc_clearfperr()
int PyUFunc_getfperr()
int PyUFunc_handlefperr \
- (int, PyObject *, int, int *)
+ (int, PyObject *, int, int *) except -1
int PyUFunc_ReplaceLoopBySignature \
(ufunc, PyUFuncGenericFunction, int *, PyUFuncGenericFunction *)
object PyUFunc_FromFuncAndDataAndSignature \
diff --git a/numpy/__init__.pxd b/numpy/__init__.pxd
index 5ecdf3d0f..47d9294c1 100644
--- a/numpy/__init__.pxd
+++ b/numpy/__init__.pxd
@@ -359,7 +359,7 @@ cdef extern from "numpy/arrayobject.h":
int PyArray_TYPE(ndarray arr) nogil
object PyArray_GETITEM(ndarray arr, void *itemptr)
- int PyArray_SETITEM(ndarray arr, void *itemptr, object obj)
+ int PyArray_SETITEM(ndarray arr, void *itemptr, object obj) except -1
bint PyTypeNum_ISBOOL(int) nogil
bint PyTypeNum_ISUNSIGNED(int) nogil
@@ -473,7 +473,6 @@ cdef extern from "numpy/arrayobject.h":
void* PyArray_GETPTR3(ndarray m, npy_intp i, npy_intp j, npy_intp k) nogil
void* PyArray_GETPTR4(ndarray m, npy_intp i, npy_intp j, npy_intp k, npy_intp l) nogil
- void PyArray_XDECREF_ERR(ndarray)
# Cannot be supported due to out arg
# void PyArray_DESCR_REPLACE(descr)
@@ -507,21 +506,21 @@ cdef extern from "numpy/arrayobject.h":
# Functions taking dtype and returning object/ndarray are disabled
# for now as they steal dtype references. I'm conservative and disable
# more than is probably needed until it can be checked further.
- int PyArray_SetNumericOps (object)
+ int PyArray_SetNumericOps (object) except -1
object PyArray_GetNumericOps ()
- int PyArray_INCREF (ndarray)
- int PyArray_XDECREF (ndarray)
+ int PyArray_INCREF (ndarray) except * # uses PyArray_Item_INCREF...
+ int PyArray_XDECREF (ndarray) except * # uses PyArray_Item_DECREF...
void PyArray_SetStringFunction (object, int)
dtype PyArray_DescrFromType (int)
object PyArray_TypeObjectFromType (int)
char * PyArray_Zero (ndarray)
char * PyArray_One (ndarray)
#object PyArray_CastToType (ndarray, dtype, int)
- int PyArray_CastTo (ndarray, ndarray)
- int PyArray_CastAnyTo (ndarray, ndarray)
- int PyArray_CanCastSafely (int, int)
- npy_bool PyArray_CanCastTo (dtype, dtype)
- int PyArray_ObjectType (object, int)
+ int PyArray_CastTo (ndarray, ndarray) except -1
+ int PyArray_CastAnyTo (ndarray, ndarray) except -1
+ int PyArray_CanCastSafely (int, int) # writes errors
+ npy_bool PyArray_CanCastTo (dtype, dtype) # writes errors
+ int PyArray_ObjectType (object, int) except 0
dtype PyArray_DescrFromObject (object, dtype)
#ndarray* PyArray_ConvertToCommonType (object, int *)
dtype PyArray_DescrFromScalar (object)
@@ -545,34 +544,34 @@ cdef extern from "numpy/arrayobject.h":
#object PyArray_FromIter (object, dtype, npy_intp)
object PyArray_Return (ndarray)
#object PyArray_GetField (ndarray, dtype, int)
- #int PyArray_SetField (ndarray, dtype, int, object)
+ #int PyArray_SetField (ndarray, dtype, int, object) except -1
object PyArray_Byteswap (ndarray, npy_bool)
object PyArray_Resize (ndarray, PyArray_Dims *, int, NPY_ORDER)
- int PyArray_MoveInto (ndarray, ndarray)
- int PyArray_CopyInto (ndarray, ndarray)
- int PyArray_CopyAnyInto (ndarray, ndarray)
- int PyArray_CopyObject (ndarray, object)
+ int PyArray_MoveInto (ndarray, ndarray) except -1
+ int PyArray_CopyInto (ndarray, ndarray) except -1
+ int PyArray_CopyAnyInto (ndarray, ndarray) except -1
+ int PyArray_CopyObject (ndarray, object) except -1
object PyArray_NewCopy (ndarray, NPY_ORDER)
object PyArray_ToList (ndarray)
object PyArray_ToString (ndarray, NPY_ORDER)
- int PyArray_ToFile (ndarray, stdio.FILE *, char *, char *)
- int PyArray_Dump (object, object, int)
+ int PyArray_ToFile (ndarray, stdio.FILE *, char *, char *) except -1
+ int PyArray_Dump (object, object, int) except -1
object PyArray_Dumps (object, int)
- int PyArray_ValidType (int)
+ int PyArray_ValidType (int) # Cannot error
void PyArray_UpdateFlags (ndarray, int)
object PyArray_New (type, int, npy_intp *, int, npy_intp *, void *, int, int, object)
#object PyArray_NewFromDescr (type, dtype, int, npy_intp *, npy_intp *, void *, int, object)
#dtype PyArray_DescrNew (dtype)
dtype PyArray_DescrNewFromType (int)
- double PyArray_GetPriority (object, double)
+ double PyArray_GetPriority (object, double) # clears errors as of 1.25
object PyArray_IterNew (object)
object PyArray_MultiIterNew (int, ...)
- int PyArray_PyIntAsInt (object)
+ int PyArray_PyIntAsInt (object) except? -1
npy_intp PyArray_PyIntAsIntp (object)
- int PyArray_Broadcast (broadcast)
- void PyArray_FillObjectArray (ndarray, object)
- int PyArray_FillWithScalar (ndarray, object)
+ int PyArray_Broadcast (broadcast) except -1
+ void PyArray_FillObjectArray (ndarray, object) except *
+ int PyArray_FillWithScalar (ndarray, object) except -1
npy_bool PyArray_CheckStrides (int, int, npy_intp, npy_intp, npy_intp *, npy_intp *)
dtype PyArray_DescrNewByteorder (dtype, char)
object PyArray_IterAllButAxis (object, int *)
@@ -586,10 +585,10 @@ cdef extern from "numpy/arrayobject.h":
object PyArray_NewFlagsObject (object)
npy_bool PyArray_CanCastScalar (type, type)
#int PyArray_CompareUCS4 (npy_ucs4 *, npy_ucs4 *, register size_t)
- int PyArray_RemoveSmallest (broadcast)
+ int PyArray_RemoveSmallest (broadcast) except -1
int PyArray_ElementStrides (object)
- void PyArray_Item_INCREF (char *, dtype)
- void PyArray_Item_XDECREF (char *, dtype)
+ void PyArray_Item_INCREF (char *, dtype) except *
+ void PyArray_Item_XDECREF (char *, dtype) except *
object PyArray_FieldNames (object)
object PyArray_Transpose (ndarray, PyArray_Dims *)
object PyArray_TakeFrom (ndarray, object, int, ndarray, NPY_CLIPMODE)
@@ -597,7 +596,7 @@ cdef extern from "numpy/arrayobject.h":
object PyArray_PutMask (ndarray, object, object)
object PyArray_Repeat (ndarray, object, int)
object PyArray_Choose (ndarray, object, ndarray, NPY_CLIPMODE)
- int PyArray_Sort (ndarray, int, NPY_SORTKIND)
+ int PyArray_Sort (ndarray, int, NPY_SORTKIND) except -1
object PyArray_ArgSort (ndarray, int, NPY_SORTKIND)
object PyArray_SearchSorted (ndarray, object, NPY_SEARCHSIDE, PyObject *)
object PyArray_ArgMax (ndarray, int, ndarray)
@@ -635,49 +634,49 @@ cdef extern from "numpy/arrayobject.h":
#int PyArray_As2D (object*, char ***, int *, int *, int)
int PyArray_Free (object, void *)
#int PyArray_Converter (object, object*)
- int PyArray_IntpFromSequence (object, npy_intp *, int)
+ int PyArray_IntpFromSequence (object, npy_intp *, int) except -1
object PyArray_Concatenate (object, int)
object PyArray_InnerProduct (object, object)
object PyArray_MatrixProduct (object, object)
object PyArray_CopyAndTranspose (object)
object PyArray_Correlate (object, object, int)
int PyArray_TypestrConvert (int, int)
- #int PyArray_DescrConverter (object, dtype*)
- #int PyArray_DescrConverter2 (object, dtype*)
- int PyArray_IntpConverter (object, PyArray_Dims *)
- #int PyArray_BufferConverter (object, chunk)
- int PyArray_AxisConverter (object, int *)
- int PyArray_BoolConverter (object, npy_bool *)
- int PyArray_ByteorderConverter (object, char *)
- int PyArray_OrderConverter (object, NPY_ORDER *)
- unsigned char PyArray_EquivTypes (dtype, dtype)
+ #int PyArray_DescrConverter (object, dtype*) except 0
+ #int PyArray_DescrConverter2 (object, dtype*) except 0
+ int PyArray_IntpConverter (object, PyArray_Dims *) except 0
+ #int PyArray_BufferConverter (object, chunk) except 0
+ int PyArray_AxisConverter (object, int *) except 0
+ int PyArray_BoolConverter (object, npy_bool *) except 0
+ int PyArray_ByteorderConverter (object, char *) except 0
+ int PyArray_OrderConverter (object, NPY_ORDER *) except 0
+ unsigned char PyArray_EquivTypes (dtype, dtype) # clears errors
#object PyArray_Zeros (int, npy_intp *, dtype, int)
#object PyArray_Empty (int, npy_intp *, dtype, int)
object PyArray_Where (object, object, object)
object PyArray_Arange (double, double, double, int)
#object PyArray_ArangeObj (object, object, object, dtype)
- int PyArray_SortkindConverter (object, NPY_SORTKIND *)
+ int PyArray_SortkindConverter (object, NPY_SORTKIND *) except 0
object PyArray_LexSort (object, int)
object PyArray_Round (ndarray, int, ndarray)
unsigned char PyArray_EquivTypenums (int, int)
- int PyArray_RegisterDataType (dtype)
- int PyArray_RegisterCastFunc (dtype, int, PyArray_VectorUnaryFunc *)
- int PyArray_RegisterCanCast (dtype, int, NPY_SCALARKIND)
+ int PyArray_RegisterDataType (dtype) except -1
+ int PyArray_RegisterCastFunc (dtype, int, PyArray_VectorUnaryFunc *) except -1
+ int PyArray_RegisterCanCast (dtype, int, NPY_SCALARKIND) except -1
#void PyArray_InitArrFuncs (PyArray_ArrFuncs *)
object PyArray_IntTupleFromIntp (int, npy_intp *)
int PyArray_TypeNumFromName (char *)
- int PyArray_ClipmodeConverter (object, NPY_CLIPMODE *)
- #int PyArray_OutputConverter (object, ndarray*)
+ int PyArray_ClipmodeConverter (object, NPY_CLIPMODE *) except 0
+ #int PyArray_OutputConverter (object, ndarray*) except 0
object PyArray_BroadcastToShape (object, npy_intp *, int)
void _PyArray_SigintHandler (int)
void* _PyArray_GetSigintBuf ()
- #int PyArray_DescrAlignConverter (object, dtype*)
- #int PyArray_DescrAlignConverter2 (object, dtype*)
- int PyArray_SearchsideConverter (object, void *)
+ #int PyArray_DescrAlignConverter (object, dtype*) except 0
+ #int PyArray_DescrAlignConverter2 (object, dtype*) except 0
+ int PyArray_SearchsideConverter (object, void *) except 0
object PyArray_CheckAxis (ndarray, int *, int)
npy_intp PyArray_OverflowMultiplyList (npy_intp *, int)
int PyArray_CompareString (char *, char *, size_t)
- int PyArray_SetBaseObject(ndarray, base) # NOTE: steals a reference to base! Use "set_array_base()" instead.
+ int PyArray_SetBaseObject(ndarray, base) except -1 # NOTE: steals a reference to base! Use "set_array_base()" instead.
# Typedefs that matches the runtime dtype objects in
@@ -864,7 +863,7 @@ cdef extern from "numpy/ufuncobject.h":
object PyUFunc_FromFuncAndData(PyUFuncGenericFunction *,
void **, char *, int, int, int, int, char *, char *, int)
int PyUFunc_RegisterLoopForType(ufunc, int,
- PyUFuncGenericFunction, int *, void *)
+ PyUFuncGenericFunction, int *, void *) except -1
void PyUFunc_f_f_As_d_d \
(char **, npy_intp *, npy_intp *, void *)
void PyUFunc_d_d \
@@ -914,7 +913,7 @@ cdef extern from "numpy/ufuncobject.h":
void PyUFunc_clearfperr()
int PyUFunc_getfperr()
int PyUFunc_handlefperr \
- (int, PyObject *, int, int *)
+ (int, PyObject *, int, int *) except -1
int PyUFunc_ReplaceLoopBySignature \
(ufunc, PyUFuncGenericFunction, int *, PyUFuncGenericFunction *)
object PyUFunc_FromFuncAndDataAndSignature \
diff --git a/numpy/__init__.py b/numpy/__init__.py
index 1b5d90674..ae8631387 100644
--- a/numpy/__init__.py
+++ b/numpy/__init__.py
@@ -74,10 +74,6 @@ test
Run numpy unittests
show_config
Show numpy build configuration
-dual
- Overwrite certain functions with high-performance SciPy tools.
- Note: `numpy.dual` is deprecated. Use the functions from NumPy or Scipy
- directly instead of importing them from `numpy.dual`.
matlib
Make everything matrices.
__version__
@@ -122,6 +118,9 @@ except NameError:
if __NUMPY_SETUP__:
sys.stderr.write('Running from numpy source directory.\n')
else:
+ # Allow distributors to run custom init code before importing numpy.core
+ from . import _distributor_init
+
try:
from numpy.__config__ import show as show_config
except ImportError as e:
@@ -137,13 +136,11 @@ else:
# mapping of {name: (value, deprecation_msg)}
__deprecated_attrs__ = {}
- # Allow distributors to run custom init code
- from . import _distributor_init
-
from . import core
from .core import *
from . import compat
from . import exceptions
+ from . import dtypes
from . import lib
# NOTE: to be revisited following future namespace cleanup.
# See gh-14454 and gh-15672 for discussion.
@@ -158,12 +155,46 @@ else:
from . import matrixlib as _mat
from .matrixlib import *
+ # Deprecations introduced in NumPy 1.20.0, 2020-06-06
+ import builtins as _builtins
+
+ _msg = (
+ "module 'numpy' has no attribute '{n}'.\n"
+ "`np.{n}` was a deprecated alias for the builtin `{n}`. "
+ "To avoid this error in existing code, use `{n}` by itself. "
+ "Doing this will not modify any behavior and is safe. {extended_msg}\n"
+ "The aliases was originally deprecated in NumPy 1.20; for more "
+ "details and guidance see the original release note at:\n"
+ " https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations")
+
+ _specific_msg = (
+ "If you specifically wanted the numpy scalar type, use `np.{}` here.")
+
+ _int_extended_msg = (
+ "When replacing `np.{}`, you may wish to use e.g. `np.int64` "
+ "or `np.int32` to specify the precision. If you wish to review "
+ "your current use, check the release note link for "
+ "additional information.")
+
+ _type_info = [
+ ("object", ""), # The NumPy scalar only exists by name.
+ ("bool", _specific_msg.format("bool_")),
+ ("float", _specific_msg.format("float64")),
+ ("complex", _specific_msg.format("complex128")),
+ ("str", _specific_msg.format("str_")),
+ ("int", _int_extended_msg.format("int"))]
+
+ __former_attrs__ = {
+ n: _msg.format(n=n, extended_msg=extended_msg)
+ for n, extended_msg in _type_info
+ }
+
# Future warning introduced in NumPy 1.24.0, 2022-11-17
_msg = (
"`np.{n}` is a deprecated alias for `{an}`. (Deprecated NumPy 1.24)")
# Some of these are awkward (since `np.str` may be preferable in the long
- # term), but overall the names ending in 0 seem undesireable
+ # term), but overall the names ending in 0 seem undesirable
_type_info = [
("bool8", bool_, "np.bool_"),
("int0", intp, "np.intp"),
@@ -185,9 +216,16 @@ else:
__deprecated_attrs__.update({
n: (alias, _msg.format(n=n, an=an)) for n, alias, an in _type_info})
- del _msg, _type_info
+ import math
+
+ __deprecated_attrs__['math'] = (math,
+ "`np.math` is a deprecated alias for the standard library `math` "
+ "module (Deprecated Numpy 1.25). Replace usages of `np.math` with "
+ "`math`")
- from .core import round, abs, max, min
+ del math, _msg, _type_info
+
+ from .core import abs
# now that numpy modules are imported, can initialize limits
core.getlimits._register_known_types()
@@ -242,6 +280,7 @@ else:
# Warn for expired attributes, and return a dummy function
# that always raises an exception.
import warnings
+ import math
try:
msg = __expired_functions__[attr]
except KeyError:
@@ -268,27 +307,23 @@ else:
# the AttributeError
warnings.warn(
f"In the future `np.{attr}` will be defined as the "
- "corresponding NumPy scalar. (This may have returned Python "
- "scalars in past versions.", FutureWarning, stacklevel=2)
-
- # Importing Tester requires importing all of UnitTest which is not a
- # cheap import Since it is mainly used in test suits, we lazy import it
- # here to save on the order of 10 ms of import time for most users
- #
- # The previous way Tester was imported also had a side effect of adding
- # the full `numpy.testing` namespace
+ "corresponding NumPy scalar.", FutureWarning, stacklevel=2)
+
+ if attr in __former_attrs__:
+ raise AttributeError(__former_attrs__[attr])
+
if attr == 'testing':
import numpy.testing as testing
return testing
elif attr == 'Tester':
- from .testing import Tester
- return Tester
+ "Removed in NumPy 1.25.0"
+ raise RuntimeError("Tester was removed in NumPy 1.25.")
raise AttributeError("module {!r} has no attribute "
"{!r}".format(__name__, attr))
def __dir__():
- public_symbols = globals().keys() | {'Tester', 'testing'}
+ public_symbols = globals().keys() | {'testing'}
public_symbols -= {
"core", "matrixlib",
# These were moved in 1.25 and may be deprecated eventually:
@@ -385,13 +420,17 @@ else:
# Note that this will currently only make a difference on Linux
core.multiarray._set_madvise_hugepage(use_hugepage)
+ del use_hugepage
# Give a warning if NumPy is reloaded or imported on a sub-interpreter
# We do this from python, since the C-module may not be reloaded and
# it is tidier organized.
core.multiarray._multiarray_umath._reload_guard()
- core._set_promotion_state(os.environ.get("NPY_PROMOTION_STATE", "legacy"))
+ # default to "weak" promotion for "NumPy 2".
+ core._set_promotion_state(
+ os.environ.get("NPY_PROMOTION_STATE",
+ "weak" if _using_numpy2_behavior() else "legacy"))
# Tell PyInstaller where to find hook-numpy.py
def _pyinstaller_hooks_dir():
diff --git a/numpy/__init__.pyi b/numpy/__init__.pyi
index 69ac47a76..da1e98dd6 100644
--- a/numpy/__init__.pyi
+++ b/numpy/__init__.pyi
@@ -1,19 +1,15 @@
import builtins
import os
-import sys
import mmap
import ctypes as ct
import array as _array
import datetime as dt
import enum
from abc import abstractmethod
-from types import TracebackType, MappingProxyType
+from types import TracebackType, MappingProxyType, GenericAlias
from contextlib import ContextDecorator
from contextlib import contextmanager
-if sys.version_info >= (3, 9):
- from types import GenericAlias
-
from numpy._pytesttester import PytestTester
from numpy.core._internal import _ctypes
@@ -213,6 +209,8 @@ from numpy import (
random as random,
testing as testing,
version as version,
+ exceptions as exceptions,
+ dtypes as dtypes,
)
from numpy.core import defchararray, records
@@ -254,6 +252,8 @@ from numpy.core.fromnumeric import (
any as any,
cumsum as cumsum,
ptp as ptp,
+ max as max,
+ min as min,
amax as amax,
amin as amin,
prod as prod,
@@ -261,6 +261,7 @@ from numpy.core.fromnumeric import (
ndim as ndim,
size as size,
around as around,
+ round as round,
mean as mean,
std as std,
var as var,
@@ -395,7 +396,6 @@ from numpy.core.numerictypes import (
issubsctype as issubsctype,
issubdtype as issubdtype,
sctype2char as sctype2char,
- find_common_type as find_common_type,
nbytes as nbytes,
cast as cast,
ScalarType as ScalarType,
@@ -412,6 +412,15 @@ from numpy.core.shape_base import (
vstack as vstack,
)
+from numpy.exceptions import (
+ ComplexWarning as ComplexWarning,
+ ModuleDeprecationWarning as ModuleDeprecationWarning,
+ VisibleDeprecationWarning as VisibleDeprecationWarning,
+ TooHardError as TooHardError,
+ DTypePromotionError as DTypePromotionError,
+ AxisError as AxisError,
+)
+
from numpy.lib import (
emath as emath,
)
@@ -458,7 +467,6 @@ from numpy.lib.function_base import (
digitize as digitize,
cov as cov,
corrcoef as corrcoef,
- msort as msort,
median as median,
sinc as sinc,
hamming as hamming,
@@ -666,19 +674,9 @@ test: PytestTester
#
# Placeholders for classes
-# Some of these are aliases; others are wrappers with an identical signature
-round = around
-round_ = around
-max = amax
-min = amin
-product = prod
-cumproduct = cumprod
-sometrue = any
-alltrue = all
-
def show_config() -> None: ...
-_NdArraySubClass = TypeVar("_NdArraySubClass", bound=ndarray)
+_NdArraySubClass = TypeVar("_NdArraySubClass", bound=ndarray[Any, Any])
_DTypeScalar_co = TypeVar("_DTypeScalar_co", covariant=True, bound=generic)
_ByteOrder = L["S", "<", ">", "=", "|", "L", "B", "N", "I"]
@@ -804,7 +802,7 @@ class dtype(Generic[_DTypeScalar_co]):
@overload
def __new__(cls, dtype: _VoidCodes, align: bool = ..., copy: bool = ...) -> dtype[void]: ...
@overload
- def __new__(cls, dtype: _ObjectCodes | type[ct.py_object], align: bool = ..., copy: bool = ...) -> dtype[object_]: ...
+ def __new__(cls, dtype: _ObjectCodes | type[ct.py_object[Any]], align: bool = ..., copy: bool = ...) -> dtype[object_]: ...
# dtype of a dtype is the same dtype
@overload
@@ -846,8 +844,7 @@ class dtype(Generic[_DTypeScalar_co]):
copy: bool = ...,
) -> dtype[object_]: ...
- if sys.version_info >= (3, 9):
- def __class_getitem__(self, item: Any) -> GenericAlias: ...
+ def __class_getitem__(self, item: Any) -> GenericAlias: ...
@overload
def __getitem__(self: dtype[void], key: list[builtins.str]) -> dtype[void]: ...
@@ -929,13 +926,13 @@ class dtype(Generic[_DTypeScalar_co]):
_ArrayLikeInt = Union[
int,
- integer,
- Sequence[Union[int, integer]],
+ integer[Any],
+ Sequence[Union[int, integer[Any]]],
Sequence[Sequence[Any]], # TODO: wait for support for recursive types
- ndarray
+ ndarray[Any, Any]
]
-_FlatIterSelf = TypeVar("_FlatIterSelf", bound=flatiter)
+_FlatIterSelf = TypeVar("_FlatIterSelf", bound=flatiter[Any])
@final
class flatiter(Generic[_NdArraySubClass]):
@@ -953,7 +950,7 @@ class flatiter(Generic[_NdArraySubClass]):
@overload
def __getitem__(
self: flatiter[ndarray[Any, dtype[_ScalarType]]],
- key: int | integer | tuple[int | integer],
+ key: int | integer[Any] | tuple[int | integer[Any]],
) -> _ScalarType: ...
@overload
def __getitem__(
@@ -1148,7 +1145,7 @@ class _ArrayOrScalarCommon:
axis: None | SupportsIndex = ...,
kind: None | _SortKind = ...,
order: None | str | Sequence[str] = ...,
- ) -> ndarray: ...
+ ) -> ndarray[Any, Any]: ...
@overload
def choose(
@@ -1156,7 +1153,7 @@ class _ArrayOrScalarCommon:
choices: ArrayLike,
out: None = ...,
mode: _ModeKind = ...,
- ) -> ndarray: ...
+ ) -> ndarray[Any, Any]: ...
@overload
def choose(
self,
@@ -1172,7 +1169,7 @@ class _ArrayOrScalarCommon:
max: None | ArrayLike = ...,
out: None = ...,
**kwargs: Any,
- ) -> ndarray: ...
+ ) -> ndarray[Any, Any]: ...
@overload
def clip(
self,
@@ -1180,7 +1177,7 @@ class _ArrayOrScalarCommon:
max: ArrayLike = ...,
out: None = ...,
**kwargs: Any,
- ) -> ndarray: ...
+ ) -> ndarray[Any, Any]: ...
@overload
def clip(
self,
@@ -1204,7 +1201,7 @@ class _ArrayOrScalarCommon:
a: ArrayLike,
axis: None | SupportsIndex = ...,
out: None = ...,
- ) -> ndarray: ...
+ ) -> ndarray[Any, Any]: ...
@overload
def compress(
self,
@@ -1223,7 +1220,7 @@ class _ArrayOrScalarCommon:
axis: None | SupportsIndex = ...,
dtype: DTypeLike = ...,
out: None = ...,
- ) -> ndarray: ...
+ ) -> ndarray[Any, Any]: ...
@overload
def cumprod(
self,
@@ -1238,7 +1235,7 @@ class _ArrayOrScalarCommon:
axis: None | SupportsIndex = ...,
dtype: DTypeLike = ...,
out: None = ...,
- ) -> ndarray: ...
+ ) -> ndarray[Any, Any]: ...
@overload
def cumsum(
self,
@@ -1483,7 +1480,7 @@ class _SupportsImag(Protocol[_T_co]):
class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType_co]):
__hash__: ClassVar[None]
@property
- def base(self) -> None | ndarray: ...
+ def base(self) -> None | ndarray[Any, Any]: ...
@property
def ndim(self) -> int: ...
@property
@@ -1510,8 +1507,7 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType_co]):
order: _OrderKACF = ...,
) -> _ArraySelf: ...
- if sys.version_info >= (3, 9):
- def __class_getitem__(self, item: Any) -> GenericAlias: ...
+ def __class_getitem__(self, item: Any) -> GenericAlias: ...
@overload
def __array__(self, dtype: None = ..., /) -> ndarray[Any, _DType_co]: ...
@@ -1536,7 +1532,7 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType_co]):
# NOTE: In practice any object is accepted by `obj`, but as `__array_finalize__`
# is a pseudo-abstract method the type has been narrowed down in order to
- # grant subclasses a bit more flexiblity
+ # grant subclasses a bit more flexibility
def __array_finalize__(self, obj: None | NDArray[Any], /) -> None: ...
def __array_wrap__(
@@ -1651,7 +1647,7 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType_co]):
# 1D + 1D returns a scalar;
# all other with at least 1 non-0D array return an ndarray.
@overload
- def dot(self, b: _ScalarLike_co, out: None = ...) -> ndarray: ...
+ def dot(self, b: _ScalarLike_co, out: None = ...) -> ndarray[Any, Any]: ...
@overload
def dot(self, b: ArrayLike, out: None = ...) -> Any: ... # type: ignore[misc]
@overload
@@ -1928,7 +1924,6 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType_co]):
def __neg__(self: NDArray[object_]) -> Any: ...
# Binary ops
- # NOTE: `ndarray` does not implement `__imatmul__`
@overload
def __matmul__(self: NDArray[bool_], other: _ArrayLikeBool_co) -> NDArray[bool_]: ... # type: ignore[misc]
@overload
@@ -2515,6 +2510,19 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType_co]):
@overload
def __ior__(self: NDArray[object_], other: Any) -> NDArray[object_]: ...
+ @overload
+ def __imatmul__(self: NDArray[bool_], other: _ArrayLikeBool_co) -> NDArray[bool_]: ...
+ @overload
+ def __imatmul__(self: NDArray[unsignedinteger[_NBit1]], other: _ArrayLikeUInt_co) -> NDArray[unsignedinteger[_NBit1]]: ...
+ @overload
+ def __imatmul__(self: NDArray[signedinteger[_NBit1]], other: _ArrayLikeInt_co) -> NDArray[signedinteger[_NBit1]]: ...
+ @overload
+ def __imatmul__(self: NDArray[floating[_NBit1]], other: _ArrayLikeFloat_co) -> NDArray[floating[_NBit1]]: ...
+ @overload
+ def __imatmul__(self: NDArray[complexfloating[_NBit1, _NBit1]], other: _ArrayLikeComplex_co) -> NDArray[complexfloating[_NBit1, _NBit1]]: ...
+ @overload
+ def __imatmul__(self: NDArray[object_], other: Any) -> NDArray[object_]: ...
+
def __dlpack__(self: NDArray[number[Any]], *, stream: None = ...) -> _PyCapsule: ...
def __dlpack_device__(self) -> tuple[int, L[0]]: ...
@@ -2674,8 +2682,7 @@ class number(generic, Generic[_NBit1]): # type: ignore
def real(self: _ArraySelf) -> _ArraySelf: ...
@property
def imag(self: _ArraySelf) -> _ArraySelf: ...
- if sys.version_info >= (3, 9):
- def __class_getitem__(self, item: Any) -> GenericAlias: ...
+ def __class_getitem__(self, item: Any) -> GenericAlias: ...
def __int__(self) -> int: ...
def __float__(self) -> float: ...
def __complex__(self) -> complex: ...
@@ -2829,20 +2836,20 @@ class integer(number[_NBit1]): # type: ignore
def __index__(self) -> int: ...
__truediv__: _IntTrueDiv[_NBit1]
__rtruediv__: _IntTrueDiv[_NBit1]
- def __mod__(self, value: _IntLike_co) -> integer: ...
- def __rmod__(self, value: _IntLike_co) -> integer: ...
+ def __mod__(self, value: _IntLike_co) -> integer[Any]: ...
+ def __rmod__(self, value: _IntLike_co) -> integer[Any]: ...
def __invert__(self: _IntType) -> _IntType: ...
# Ensure that objects annotated as `integer` support bit-wise operations
- def __lshift__(self, other: _IntLike_co) -> integer: ...
- def __rlshift__(self, other: _IntLike_co) -> integer: ...
- def __rshift__(self, other: _IntLike_co) -> integer: ...
- def __rrshift__(self, other: _IntLike_co) -> integer: ...
- def __and__(self, other: _IntLike_co) -> integer: ...
- def __rand__(self, other: _IntLike_co) -> integer: ...
- def __or__(self, other: _IntLike_co) -> integer: ...
- def __ror__(self, other: _IntLike_co) -> integer: ...
- def __xor__(self, other: _IntLike_co) -> integer: ...
- def __rxor__(self, other: _IntLike_co) -> integer: ...
+ def __lshift__(self, other: _IntLike_co) -> integer[Any]: ...
+ def __rlshift__(self, other: _IntLike_co) -> integer[Any]: ...
+ def __rshift__(self, other: _IntLike_co) -> integer[Any]: ...
+ def __rrshift__(self, other: _IntLike_co) -> integer[Any]: ...
+ def __and__(self, other: _IntLike_co) -> integer[Any]: ...
+ def __rand__(self, other: _IntLike_co) -> integer[Any]: ...
+ def __or__(self, other: _IntLike_co) -> integer[Any]: ...
+ def __ror__(self, other: _IntLike_co) -> integer[Any]: ...
+ def __xor__(self, other: _IntLike_co) -> integer[Any]: ...
+ def __rxor__(self, other: _IntLike_co) -> integer[Any]: ...
class signedinteger(integer[_NBit1]):
def __init__(self, value: _IntValue = ..., /) -> None: ...
@@ -2967,8 +2974,8 @@ ulonglong = unsignedinteger[_NBitLongLong]
class inexact(number[_NBit1]): # type: ignore
def __getnewargs__(self: inexact[_64Bit]) -> tuple[float, ...]: ...
-_IntType = TypeVar("_IntType", bound=integer)
-_FloatType = TypeVar('_FloatType', bound=floating)
+_IntType = TypeVar("_IntType", bound=integer[Any])
+_FloatType = TypeVar('_FloatType', bound=floating[Any])
class floating(inexact[_NBit1]):
def __init__(self, value: _FloatValue = ..., /) -> None: ...
@@ -2982,9 +2989,8 @@ class floating(inexact[_NBit1]):
@classmethod
def fromhex(cls: type[float64], string: str, /) -> float64: ...
def as_integer_ratio(self) -> tuple[int, int]: ...
- if sys.version_info >= (3, 9):
- def __ceil__(self: float64) -> int: ...
- def __floor__(self: float64) -> int: ...
+ def __ceil__(self: float64) -> int: ...
+ def __floor__(self: float64) -> int: ...
def __trunc__(self: float64) -> int: ...
def __getnewargs__(self: float64) -> tuple[float]: ...
def __getformat__(self: float64, typestr: L["double", "float"], /) -> str: ...
@@ -3142,10 +3148,6 @@ infty: Final[float]
nan: Final[float]
pi: Final[float]
-CLIP: L[0]
-WRAP: L[1]
-RAISE: L[2]
-
ERR_IGNORE: L[0]
ERR_WARN: L[1]
ERR_RAISE: L[2]
@@ -3214,7 +3216,7 @@ class ufunc:
# can't type them very precisely.
reduce: Any
accumulate: Any
- reduce: Any
+ reduceat: Any
outer: Any
# Similarly at won't be defined for ufuncs that return multiple
# outputs, so we can't type it very precisely.
@@ -3320,22 +3322,8 @@ class _CopyMode(enum.Enum):
NEVER: L[2]
# Warnings
-class ModuleDeprecationWarning(DeprecationWarning): ...
-class VisibleDeprecationWarning(UserWarning): ...
-class ComplexWarning(RuntimeWarning): ...
class RankWarning(UserWarning): ...
-# Errors
-class TooHardError(RuntimeError): ...
-
-class AxisError(ValueError, IndexError):
- axis: None | int
- ndim: None | int
- @overload
- def __init__(self, axis: str, ndim: None = ..., msg_prefix: None = ...) -> None: ...
- @overload
- def __init__(self, axis: int, ndim: int, msg_prefix: None | str = ...) -> None: ...
-
_CallType = TypeVar("_CallType", bound=_ErrFunc | _SupportsWrite[str])
class errstate(Generic[_CallType], ContextDecorator):
diff --git a/numpy/_pyinstaller/hook-numpy.py b/numpy/_pyinstaller/hook-numpy.py
index 69992bdeb..6f24318ad 100644
--- a/numpy/_pyinstaller/hook-numpy.py
+++ b/numpy/_pyinstaller/hook-numpy.py
@@ -20,16 +20,15 @@ if is_pure_conda:
from PyInstaller.utils.hooks import conda_support
datas = conda_support.collect_dynamic_libs("numpy", dependencies=True)
-# Submodules PyInstaller cannot detect (probably because they are only imported
-# by extension modules, which PyInstaller cannot read).
-hiddenimports = ['numpy.core._dtype_ctypes']
+# Submodules PyInstaller cannot detect. `_dtype_ctypes` is only imported
+# from C and `_multiarray_tests` is used in tests (which are not packed).
+hiddenimports = ['numpy.core._dtype_ctypes', 'numpy.core._multiarray_tests']
# Remove testing and building code and packages that are referenced throughout
# NumPy but are not really dependencies.
excludedimports = [
"scipy",
"pytest",
- "nose",
"f2py",
"setuptools",
"numpy.f2py",
diff --git a/numpy/_typing/__init__.py b/numpy/_typing/__init__.py
index a4e763050..29922d958 100644
--- a/numpy/_typing/__init__.py
+++ b/numpy/_typing/__init__.py
@@ -180,6 +180,7 @@ from ._dtype_like import (
_DTypeLikeComplex_co as _DTypeLikeComplex_co,
)
from ._array_like import (
+ NDArray as NDArray,
ArrayLike as ArrayLike,
_ArrayLike as _ArrayLike,
_FiniteNestedSequence as _FiniteNestedSequence,
@@ -201,11 +202,6 @@ from ._array_like import (
_ArrayLikeUnknown as _ArrayLikeUnknown,
_UnknownType as _UnknownType,
)
-from ._generic_alias import (
- NDArray as NDArray,
- _DType as _DType,
- _GenericAlias as _GenericAlias,
-)
if TYPE_CHECKING:
from ._ufunc import (
diff --git a/numpy/_typing/_add_docstring.py b/numpy/_typing/_add_docstring.py
index 10d77f516..f84d19271 100644
--- a/numpy/_typing/_add_docstring.py
+++ b/numpy/_typing/_add_docstring.py
@@ -3,7 +3,7 @@
import re
import textwrap
-from ._generic_alias import NDArray
+from ._array_like import NDArray
_docstrings_list = []
diff --git a/numpy/_typing/_array_like.py b/numpy/_typing/_array_like.py
index 67d67ce19..cba6fffaf 100644
--- a/numpy/_typing/_array_like.py
+++ b/numpy/_typing/_array_like.py
@@ -1,9 +1,7 @@
from __future__ import annotations
-# NOTE: Import `Sequence` from `typing` as we it is needed for a type-alias,
-# not an annotation
-from collections.abc import Collection, Callable
-from typing import Any, Sequence, Protocol, Union, TypeVar, runtime_checkable
+from collections.abc import Collection, Callable, Sequence
+from typing import Any, Protocol, Union, TypeVar, runtime_checkable
from numpy import (
ndarray,
dtype,
@@ -25,8 +23,11 @@ from ._nested_sequence import _NestedSequence
_T = TypeVar("_T")
_ScalarType = TypeVar("_ScalarType", bound=generic)
-_DType = TypeVar("_DType", bound="dtype[Any]")
-_DType_co = TypeVar("_DType_co", covariant=True, bound="dtype[Any]")
+_ScalarType_co = TypeVar("_ScalarType_co", bound=generic, covariant=True)
+_DType = TypeVar("_DType", bound=dtype[Any])
+_DType_co = TypeVar("_DType_co", covariant=True, bound=dtype[Any])
+
+NDArray = ndarray[Any, dtype[_ScalarType_co]]
# The `_SupportsArray` protocol only cares about the default dtype
# (i.e. `dtype=None` or no `dtype` parameter at all) of the to-be returned
@@ -61,8 +62,8 @@ _FiniteNestedSequence = Union[
# A subset of `npt.ArrayLike` that can be parametrized w.r.t. `np.generic`
_ArrayLike = Union[
- _SupportsArray["dtype[_ScalarType]"],
- _NestedSequence[_SupportsArray["dtype[_ScalarType]"]],
+ _SupportsArray[dtype[_ScalarType]],
+ _NestedSequence[_SupportsArray[dtype[_ScalarType]]],
]
# A union representing array-like objects; consists of two typevars:
@@ -83,64 +84,69 @@ _DualArrayLike = Union[
#
# https://github.com/python/typing/issues/593
ArrayLike = _DualArrayLike[
- dtype,
+ dtype[Any],
Union[bool, int, float, complex, str, bytes],
]
# `ArrayLike<X>_co`: array-like objects that can be coerced into `X`
# given the casting rules `same_kind`
_ArrayLikeBool_co = _DualArrayLike[
- "dtype[bool_]",
+ dtype[bool_],
bool,
]
_ArrayLikeUInt_co = _DualArrayLike[
- "dtype[Union[bool_, unsignedinteger[Any]]]",
+ dtype[Union[bool_, unsignedinteger[Any]]],
bool,
]
_ArrayLikeInt_co = _DualArrayLike[
- "dtype[Union[bool_, integer[Any]]]",
+ dtype[Union[bool_, integer[Any]]],
Union[bool, int],
]
_ArrayLikeFloat_co = _DualArrayLike[
- "dtype[Union[bool_, integer[Any], floating[Any]]]",
+ dtype[Union[bool_, integer[Any], floating[Any]]],
Union[bool, int, float],
]
_ArrayLikeComplex_co = _DualArrayLike[
- "dtype[Union[bool_, integer[Any], floating[Any], complexfloating[Any, Any]]]",
+ dtype[Union[
+ bool_,
+ integer[Any],
+ floating[Any],
+ complexfloating[Any, Any],
+ ]],
Union[bool, int, float, complex],
]
_ArrayLikeNumber_co = _DualArrayLike[
- "dtype[Union[bool_, number[Any]]]",
+ dtype[Union[bool_, number[Any]]],
Union[bool, int, float, complex],
]
_ArrayLikeTD64_co = _DualArrayLike[
- "dtype[Union[bool_, integer[Any], timedelta64]]",
+ dtype[Union[bool_, integer[Any], timedelta64]],
Union[bool, int],
]
_ArrayLikeDT64_co = Union[
- _SupportsArray["dtype[datetime64]"],
- _NestedSequence[_SupportsArray["dtype[datetime64]"]],
+ _SupportsArray[dtype[datetime64]],
+ _NestedSequence[_SupportsArray[dtype[datetime64]]],
]
_ArrayLikeObject_co = Union[
- _SupportsArray["dtype[object_]"],
- _NestedSequence[_SupportsArray["dtype[object_]"]],
+ _SupportsArray[dtype[object_]],
+ _NestedSequence[_SupportsArray[dtype[object_]]],
]
_ArrayLikeVoid_co = Union[
- _SupportsArray["dtype[void]"],
- _NestedSequence[_SupportsArray["dtype[void]"]],
+ _SupportsArray[dtype[void]],
+ _NestedSequence[_SupportsArray[dtype[void]]],
]
_ArrayLikeStr_co = _DualArrayLike[
- "dtype[str_]",
+ dtype[str_],
str,
]
_ArrayLikeBytes_co = _DualArrayLike[
- "dtype[bytes_]",
+ dtype[bytes_],
bytes,
]
_ArrayLikeInt = _DualArrayLike[
- "dtype[integer[Any]]",
+ dtype[integer[Any]],
int,
]
@@ -153,6 +159,6 @@ class _UnknownType:
_ArrayLikeUnknown = _DualArrayLike[
- "dtype[_UnknownType]",
+ dtype[_UnknownType],
_UnknownType,
]
diff --git a/numpy/_typing/_callable.pyi b/numpy/_typing/_callable.pyi
index 5dfe9e7c2..ee818e905 100644
--- a/numpy/_typing/_callable.pyi
+++ b/numpy/_typing/_callable.pyi
@@ -43,7 +43,7 @@ from ._scalars import (
_NumberLike_co,
)
from . import NBitBase
-from ._generic_alias import NDArray
+from ._array_like import NDArray
from ._nested_sequence import _NestedSequence
_T1 = TypeVar("_T1")
diff --git a/numpy/_typing/_dtype_like.py b/numpy/_typing/_dtype_like.py
index e92e17dd2..207a99c56 100644
--- a/numpy/_typing/_dtype_like.py
+++ b/numpy/_typing/_dtype_like.py
@@ -1,10 +1,8 @@
+from collections.abc import Sequence
from typing import (
Any,
- List,
Sequence,
- Tuple,
Union,
- Type,
TypeVar,
Protocol,
TypedDict,
@@ -14,7 +12,6 @@ from typing import (
import numpy as np
from ._shape import _ShapeLike
-from ._generic_alias import _DType as DType
from ._char_codes import (
_BoolCodes,
@@ -59,7 +56,7 @@ from ._char_codes import (
)
_SCT = TypeVar("_SCT", bound=np.generic)
-_DType_co = TypeVar("_DType_co", covariant=True, bound=DType[Any])
+_DType_co = TypeVar("_DType_co", covariant=True, bound=np.dtype[Any])
_DTypeLikeNested = Any # TODO: wait for support for recursive types
@@ -89,41 +86,41 @@ class _SupportsDType(Protocol[_DType_co]):
# A subset of `npt.DTypeLike` that can be parametrized w.r.t. `np.generic`
_DTypeLike = Union[
- "np.dtype[_SCT]",
- Type[_SCT],
- _SupportsDType["np.dtype[_SCT]"],
+ np.dtype[_SCT],
+ type[_SCT],
+ _SupportsDType[np.dtype[_SCT]],
]
# Would create a dtype[np.void]
_VoidDTypeLike = Union[
# (flexible_dtype, itemsize)
- Tuple[_DTypeLikeNested, int],
+ tuple[_DTypeLikeNested, int],
# (fixed_dtype, shape)
- Tuple[_DTypeLikeNested, _ShapeLike],
+ tuple[_DTypeLikeNested, _ShapeLike],
# [(field_name, field_dtype, field_shape), ...]
#
# The type here is quite broad because NumPy accepts quite a wide
# range of inputs inside the list; see the tests for some
# examples.
- List[Any],
+ list[Any],
# {'names': ..., 'formats': ..., 'offsets': ..., 'titles': ...,
# 'itemsize': ...}
_DTypeDict,
# (base_dtype, new_dtype)
- Tuple[_DTypeLikeNested, _DTypeLikeNested],
+ tuple[_DTypeLikeNested, _DTypeLikeNested],
]
# Anything that can be coerced into numpy.dtype.
# Reference: https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html
DTypeLike = Union[
- DType[Any],
+ np.dtype[Any],
# default data type (float64)
None,
# array-scalar types and generic types
- Type[Any], # NOTE: We're stuck with `Type[Any]` due to object dtypes
+ type[Any], # NOTE: We're stuck with `type[Any]` due to object dtypes
# anything with a dtype attribute
- _SupportsDType[DType[Any]],
+ _SupportsDType[np.dtype[Any]],
# character codes, type strings or comma-separated fields, e.g., 'float64'
str,
_VoidDTypeLike,
@@ -139,16 +136,16 @@ DTypeLike = Union[
# Aliases for commonly used dtype-like objects.
# Note that the precision of `np.number` subclasses is ignored herein.
_DTypeLikeBool = Union[
- Type[bool],
- Type[np.bool_],
- DType[np.bool_],
- _SupportsDType[DType[np.bool_]],
+ type[bool],
+ type[np.bool_],
+ np.dtype[np.bool_],
+ _SupportsDType[np.dtype[np.bool_]],
_BoolCodes,
]
_DTypeLikeUInt = Union[
- Type[np.unsignedinteger],
- DType[np.unsignedinteger],
- _SupportsDType[DType[np.unsignedinteger]],
+ type[np.unsignedinteger],
+ np.dtype[np.unsignedinteger],
+ _SupportsDType[np.dtype[np.unsignedinteger]],
_UInt8Codes,
_UInt16Codes,
_UInt32Codes,
@@ -161,10 +158,10 @@ _DTypeLikeUInt = Union[
_ULongLongCodes,
]
_DTypeLikeInt = Union[
- Type[int],
- Type[np.signedinteger],
- DType[np.signedinteger],
- _SupportsDType[DType[np.signedinteger]],
+ type[int],
+ type[np.signedinteger],
+ np.dtype[np.signedinteger],
+ _SupportsDType[np.dtype[np.signedinteger]],
_Int8Codes,
_Int16Codes,
_Int32Codes,
@@ -177,10 +174,10 @@ _DTypeLikeInt = Union[
_LongLongCodes,
]
_DTypeLikeFloat = Union[
- Type[float],
- Type[np.floating],
- DType[np.floating],
- _SupportsDType[DType[np.floating]],
+ type[float],
+ type[np.floating],
+ np.dtype[np.floating],
+ _SupportsDType[np.dtype[np.floating]],
_Float16Codes,
_Float32Codes,
_Float64Codes,
@@ -190,10 +187,10 @@ _DTypeLikeFloat = Union[
_LongDoubleCodes,
]
_DTypeLikeComplex = Union[
- Type[complex],
- Type[np.complexfloating],
- DType[np.complexfloating],
- _SupportsDType[DType[np.complexfloating]],
+ type[complex],
+ type[np.complexfloating],
+ np.dtype[np.complexfloating],
+ _SupportsDType[np.dtype[np.complexfloating]],
_Complex64Codes,
_Complex128Codes,
_CSingleCodes,
@@ -201,42 +198,42 @@ _DTypeLikeComplex = Union[
_CLongDoubleCodes,
]
_DTypeLikeDT64 = Union[
- Type[np.timedelta64],
- DType[np.timedelta64],
- _SupportsDType[DType[np.timedelta64]],
+ type[np.timedelta64],
+ np.dtype[np.timedelta64],
+ _SupportsDType[np.dtype[np.timedelta64]],
_TD64Codes,
]
_DTypeLikeTD64 = Union[
- Type[np.datetime64],
- DType[np.datetime64],
- _SupportsDType[DType[np.datetime64]],
+ type[np.datetime64],
+ np.dtype[np.datetime64],
+ _SupportsDType[np.dtype[np.datetime64]],
_DT64Codes,
]
_DTypeLikeStr = Union[
- Type[str],
- Type[np.str_],
- DType[np.str_],
- _SupportsDType[DType[np.str_]],
+ type[str],
+ type[np.str_],
+ np.dtype[np.str_],
+ _SupportsDType[np.dtype[np.str_]],
_StrCodes,
]
_DTypeLikeBytes = Union[
- Type[bytes],
- Type[np.bytes_],
- DType[np.bytes_],
- _SupportsDType[DType[np.bytes_]],
+ type[bytes],
+ type[np.bytes_],
+ np.dtype[np.bytes_],
+ _SupportsDType[np.dtype[np.bytes_]],
_BytesCodes,
]
_DTypeLikeVoid = Union[
- Type[np.void],
- DType[np.void],
- _SupportsDType[DType[np.void]],
+ type[np.void],
+ np.dtype[np.void],
+ _SupportsDType[np.dtype[np.void]],
_VoidCodes,
_VoidDTypeLike,
]
_DTypeLikeObject = Union[
type,
- DType[np.object_],
- _SupportsDType[DType[np.object_]],
+ np.dtype[np.object_],
+ _SupportsDType[np.dtype[np.object_]],
_ObjectCodes,
]
diff --git a/numpy/_typing/_extended_precision.py b/numpy/_typing/_extended_precision.py
index edc1778ce..7246b47d0 100644
--- a/numpy/_typing/_extended_precision.py
+++ b/numpy/_typing/_extended_precision.py
@@ -5,8 +5,6 @@ The subclasses are defined here (instead of ``__init__.pyi``) such
that they can be imported conditionally via the numpy's mypy plugin.
"""
-from typing import TYPE_CHECKING
-
import numpy as np
from . import (
_80Bit,
@@ -15,29 +13,15 @@ from . import (
_256Bit,
)
-if TYPE_CHECKING:
- uint128 = np.unsignedinteger[_128Bit]
- uint256 = np.unsignedinteger[_256Bit]
- int128 = np.signedinteger[_128Bit]
- int256 = np.signedinteger[_256Bit]
- float80 = np.floating[_80Bit]
- float96 = np.floating[_96Bit]
- float128 = np.floating[_128Bit]
- float256 = np.floating[_256Bit]
- complex160 = np.complexfloating[_80Bit, _80Bit]
- complex192 = np.complexfloating[_96Bit, _96Bit]
- complex256 = np.complexfloating[_128Bit, _128Bit]
- complex512 = np.complexfloating[_256Bit, _256Bit]
-else:
- uint128 = Any
- uint256 = Any
- int128 = Any
- int256 = Any
- float80 = Any
- float96 = Any
- float128 = Any
- float256 = Any
- complex160 = Any
- complex192 = Any
- complex256 = Any
- complex512 = Any
+uint128 = np.unsignedinteger[_128Bit]
+uint256 = np.unsignedinteger[_256Bit]
+int128 = np.signedinteger[_128Bit]
+int256 = np.signedinteger[_256Bit]
+float80 = np.floating[_80Bit]
+float96 = np.floating[_96Bit]
+float128 = np.floating[_128Bit]
+float256 = np.floating[_256Bit]
+complex160 = np.complexfloating[_80Bit, _80Bit]
+complex192 = np.complexfloating[_96Bit, _96Bit]
+complex256 = np.complexfloating[_128Bit, _128Bit]
+complex512 = np.complexfloating[_256Bit, _256Bit]
diff --git a/numpy/_typing/_generic_alias.py b/numpy/_typing/_generic_alias.py
deleted file mode 100644
index 01cd224ad..000000000
--- a/numpy/_typing/_generic_alias.py
+++ /dev/null
@@ -1,245 +0,0 @@
-from __future__ import annotations
-
-import sys
-import types
-from collections.abc import Generator, Iterable, Iterator
-from typing import (
- Any,
- ClassVar,
- NoReturn,
- TypeVar,
- TYPE_CHECKING,
-)
-
-import numpy as np
-
-__all__ = ["_GenericAlias", "NDArray"]
-
-_T = TypeVar("_T", bound="_GenericAlias")
-
-
-def _to_str(obj: object) -> str:
- """Helper function for `_GenericAlias.__repr__`."""
- if obj is Ellipsis:
- return '...'
- elif isinstance(obj, type) and not isinstance(obj, _GENERIC_ALIAS_TYPE):
- if obj.__module__ == 'builtins':
- return obj.__qualname__
- else:
- return f'{obj.__module__}.{obj.__qualname__}'
- else:
- return repr(obj)
-
-
-def _parse_parameters(args: Iterable[Any]) -> Generator[TypeVar, None, None]:
- """Search for all typevars and typevar-containing objects in `args`.
-
- Helper function for `_GenericAlias.__init__`.
-
- """
- for i in args:
- if hasattr(i, "__parameters__"):
- yield from i.__parameters__
- elif isinstance(i, TypeVar):
- yield i
-
-
-def _reconstruct_alias(alias: _T, parameters: Iterator[TypeVar]) -> _T:
- """Recursively replace all typevars with those from `parameters`.
-
- Helper function for `_GenericAlias.__getitem__`.
-
- """
- args = []
- for i in alias.__args__:
- if isinstance(i, TypeVar):
- value: Any = next(parameters)
- elif isinstance(i, _GenericAlias):
- value = _reconstruct_alias(i, parameters)
- elif hasattr(i, "__parameters__"):
- prm_tup = tuple(next(parameters) for _ in i.__parameters__)
- value = i[prm_tup]
- else:
- value = i
- args.append(value)
-
- cls = type(alias)
- return cls(alias.__origin__, tuple(args), alias.__unpacked__)
-
-
-class _GenericAlias:
- """A python-based backport of the `types.GenericAlias` class.
-
- E.g. for ``t = list[int]``, ``t.__origin__`` is ``list`` and
- ``t.__args__`` is ``(int,)``.
-
- See Also
- --------
- :pep:`585`
- The PEP responsible for introducing `types.GenericAlias`.
-
- """
-
- __slots__ = (
- "__weakref__",
- "_origin",
- "_args",
- "_parameters",
- "_hash",
- "_starred",
- )
-
- @property
- def __origin__(self) -> type:
- return super().__getattribute__("_origin")
-
- @property
- def __args__(self) -> tuple[object, ...]:
- return super().__getattribute__("_args")
-
- @property
- def __parameters__(self) -> tuple[TypeVar, ...]:
- """Type variables in the ``GenericAlias``."""
- return super().__getattribute__("_parameters")
-
- @property
- def __unpacked__(self) -> bool:
- return super().__getattribute__("_starred")
-
- @property
- def __typing_unpacked_tuple_args__(self) -> tuple[object, ...] | None:
- # NOTE: This should return `__args__` if `__origin__` is a tuple,
- # which should never be the case with how `_GenericAlias` is used
- # within numpy
- return None
-
- def __init__(
- self,
- origin: type,
- args: object | tuple[object, ...],
- starred: bool = False,
- ) -> None:
- self._origin = origin
- self._args = args if isinstance(args, tuple) else (args,)
- self._parameters = tuple(_parse_parameters(self.__args__))
- self._starred = starred
-
- @property
- def __call__(self) -> type[Any]:
- return self.__origin__
-
- def __reduce__(self: _T) -> tuple[
- type[_T],
- tuple[type[Any], tuple[object, ...], bool],
- ]:
- cls = type(self)
- return cls, (self.__origin__, self.__args__, self.__unpacked__)
-
- def __mro_entries__(self, bases: Iterable[object]) -> tuple[type[Any]]:
- return (self.__origin__,)
-
- def __dir__(self) -> list[str]:
- """Implement ``dir(self)``."""
- cls = type(self)
- dir_origin = set(dir(self.__origin__))
- return sorted(cls._ATTR_EXCEPTIONS | dir_origin)
-
- def __hash__(self) -> int:
- """Return ``hash(self)``."""
- # Attempt to use the cached hash
- try:
- return super().__getattribute__("_hash")
- except AttributeError:
- self._hash: int = (
- hash(self.__origin__) ^
- hash(self.__args__) ^
- hash(self.__unpacked__)
- )
- return super().__getattribute__("_hash")
-
- def __instancecheck__(self, obj: object) -> NoReturn:
- """Check if an `obj` is an instance."""
- raise TypeError("isinstance() argument 2 cannot be a "
- "parameterized generic")
-
- def __subclasscheck__(self, cls: type) -> NoReturn:
- """Check if a `cls` is a subclass."""
- raise TypeError("issubclass() argument 2 cannot be a "
- "parameterized generic")
-
- def __repr__(self) -> str:
- """Return ``repr(self)``."""
- args = ", ".join(_to_str(i) for i in self.__args__)
- origin = _to_str(self.__origin__)
- prefix = "*" if self.__unpacked__ else ""
- return f"{prefix}{origin}[{args}]"
-
- def __getitem__(self: _T, key: object | tuple[object, ...]) -> _T:
- """Return ``self[key]``."""
- key_tup = key if isinstance(key, tuple) else (key,)
-
- if len(self.__parameters__) == 0:
- raise TypeError(f"There are no type variables left in {self}")
- elif len(key_tup) > len(self.__parameters__):
- raise TypeError(f"Too many arguments for {self}")
- elif len(key_tup) < len(self.__parameters__):
- raise TypeError(f"Too few arguments for {self}")
-
- key_iter = iter(key_tup)
- return _reconstruct_alias(self, key_iter)
-
- def __eq__(self, value: object) -> bool:
- """Return ``self == value``."""
- if not isinstance(value, _GENERIC_ALIAS_TYPE):
- return NotImplemented
- return (
- self.__origin__ == value.__origin__ and
- self.__args__ == value.__args__ and
- self.__unpacked__ == getattr(
- value, "__unpacked__", self.__unpacked__
- )
- )
-
- def __iter__(self: _T) -> Generator[_T, None, None]:
- """Return ``iter(self)``."""
- cls = type(self)
- yield cls(self.__origin__, self.__args__, True)
-
- _ATTR_EXCEPTIONS: ClassVar[frozenset[str]] = frozenset({
- "__origin__",
- "__args__",
- "__parameters__",
- "__mro_entries__",
- "__reduce__",
- "__reduce_ex__",
- "__copy__",
- "__deepcopy__",
- "__unpacked__",
- "__typing_unpacked_tuple_args__",
- "__class__",
- })
-
- def __getattribute__(self, name: str) -> Any:
- """Return ``getattr(self, name)``."""
- # Pull the attribute from `__origin__` unless its
- # name is in `_ATTR_EXCEPTIONS`
- cls = type(self)
- if name in cls._ATTR_EXCEPTIONS:
- return super().__getattribute__(name)
- return getattr(self.__origin__, name)
-
-
-# See `_GenericAlias.__eq__`
-if sys.version_info >= (3, 9):
- _GENERIC_ALIAS_TYPE = (_GenericAlias, types.GenericAlias)
-else:
- _GENERIC_ALIAS_TYPE = (_GenericAlias,)
-
-ScalarType = TypeVar("ScalarType", bound=np.generic, covariant=True)
-
-if TYPE_CHECKING or sys.version_info >= (3, 9):
- _DType = np.dtype[ScalarType]
- NDArray = np.ndarray[Any, np.dtype[ScalarType]]
-else:
- _DType = _GenericAlias(np.dtype, (ScalarType,))
- NDArray = _GenericAlias(np.ndarray, (Any, _DType))
diff --git a/numpy/_typing/_nested_sequence.py b/numpy/_typing/_nested_sequence.py
index 789bf3844..4b6cafc51 100644
--- a/numpy/_typing/_nested_sequence.py
+++ b/numpy/_typing/_nested_sequence.py
@@ -2,9 +2,9 @@
from __future__ import annotations
+from collections.abc import Iterator
from typing import (
Any,
- Iterator,
overload,
TypeVar,
Protocol,
diff --git a/numpy/_typing/_scalars.py b/numpy/_typing/_scalars.py
index 516b996dc..e46ff04a0 100644
--- a/numpy/_typing/_scalars.py
+++ b/numpy/_typing/_scalars.py
@@ -1,4 +1,4 @@
-from typing import Union, Tuple, Any
+from typing import Union, Any
import numpy as np
@@ -10,13 +10,13 @@ _CharLike_co = Union[str, bytes]
# The 6 `<X>Like_co` type-aliases below represent all scalars that can be
# coerced into `<X>` (with the casting rule `same_kind`)
_BoolLike_co = Union[bool, np.bool_]
-_UIntLike_co = Union[_BoolLike_co, np.unsignedinteger]
-_IntLike_co = Union[_BoolLike_co, int, np.integer]
-_FloatLike_co = Union[_IntLike_co, float, np.floating]
-_ComplexLike_co = Union[_FloatLike_co, complex, np.complexfloating]
+_UIntLike_co = Union[_BoolLike_co, np.unsignedinteger[Any]]
+_IntLike_co = Union[_BoolLike_co, int, np.integer[Any]]
+_FloatLike_co = Union[_IntLike_co, float, np.floating[Any]]
+_ComplexLike_co = Union[_FloatLike_co, complex, np.complexfloating[Any, Any]]
_TD64Like_co = Union[_IntLike_co, np.timedelta64]
-_NumberLike_co = Union[int, float, complex, np.number, np.bool_]
+_NumberLike_co = Union[int, float, complex, np.number[Any], np.bool_]
_ScalarLike_co = Union[
int,
float,
@@ -27,4 +27,4 @@ _ScalarLike_co = Union[
]
# `_VoidLike_co` is technically not a scalar, but it's close enough
-_VoidLike_co = Union[Tuple[Any, ...], np.void]
+_VoidLike_co = Union[tuple[Any, ...], np.void]
diff --git a/numpy/_typing/_shape.py b/numpy/_typing/_shape.py
index c28859b19..4f1204e47 100644
--- a/numpy/_typing/_shape.py
+++ b/numpy/_typing/_shape.py
@@ -1,6 +1,7 @@
-from typing import Sequence, Tuple, Union, SupportsIndex
+from collections.abc import Sequence
+from typing import Union, SupportsIndex
-_Shape = Tuple[int, ...]
+_Shape = tuple[int, ...]
# Anything that can be coerced to a shape tuple
_ShapeLike = Union[SupportsIndex, Sequence[SupportsIndex]]
diff --git a/numpy/array_api/__init__.py b/numpy/array_api/__init__.py
index 5e58ee0a8..e154b9952 100644
--- a/numpy/array_api/__init__.py
+++ b/numpy/array_api/__init__.py
@@ -333,6 +333,10 @@ __all__ += [
"trunc",
]
+from ._indexing_functions import take
+
+__all__ += ["take"]
+
# linalg is an extension in the array API spec, which is a sub-namespace. Only
# a subset of functions in it are imported into the top-level namespace.
from . import linalg
diff --git a/numpy/array_api/_array_object.py b/numpy/array_api/_array_object.py
index c4746fad9..a949b5977 100644
--- a/numpy/array_api/_array_object.py
+++ b/numpy/array_api/_array_object.py
@@ -56,7 +56,7 @@ class Array:
functions, such as asarray().
"""
- _array: np.ndarray
+ _array: np.ndarray[Any, Any]
# Use a custom constructor instead of __init__, as manually initializing
# this class is not supported API.
@@ -850,23 +850,13 @@ class Array:
"""
Performs the operation __imatmul__.
"""
- # Note: NumPy does not implement __imatmul__.
-
# matmul is not defined for scalars, but without this, we may get
# the wrong error message from asarray.
other = self._check_allowed_dtypes(other, "numeric", "__imatmul__")
if other is NotImplemented:
return other
-
- # __imatmul__ can only be allowed when it would not change the shape
- # of self.
- other_shape = other.shape
- if self.shape == () or other_shape == ():
- raise ValueError("@= requires at least one dimension")
- if len(other_shape) == 1 or other_shape[-1] != other_shape[-2]:
- raise ValueError("@= cannot change the shape of the input array")
- self._array[:] = self._array.__matmul__(other._array)
- return self
+ res = self._array.__imatmul__(other._array)
+ return self.__class__._new(res)
def __rmatmul__(self: Array, other: Array, /) -> Array:
"""
diff --git a/numpy/array_api/_indexing_functions.py b/numpy/array_api/_indexing_functions.py
new file mode 100644
index 000000000..ba56bcd6f
--- /dev/null
+++ b/numpy/array_api/_indexing_functions.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+from ._array_object import Array
+from ._dtypes import _integer_dtypes
+
+import numpy as np
+
+def take(x: Array, indices: Array, /, *, axis: int) -> Array:
+ """
+ Array API compatible wrapper for :py:func:`np.take <numpy.take>`.
+
+ See its docstring for more information.
+ """
+ if indices.dtype not in _integer_dtypes:
+ raise TypeError("Only integer dtypes are allowed in indexing")
+ if indices.ndim != 1:
+ raise ValueError("Only 1-dim indices array is supported")
+ return Array._new(np.take(x._array, indices._array, axis=axis))
diff --git a/numpy/array_api/_typing.py b/numpy/array_api/_typing.py
index dfa87b358..3f9b7186a 100644
--- a/numpy/array_api/_typing.py
+++ b/numpy/array_api/_typing.py
@@ -17,14 +17,12 @@ __all__ = [
"PyCapsule",
]
-import sys
from typing import (
Any,
Literal,
Sequence,
Type,
Union,
- TYPE_CHECKING,
TypeVar,
Protocol,
)
@@ -51,21 +49,20 @@ class NestedSequence(Protocol[_T_co]):
def __len__(self, /) -> int: ...
Device = Literal["cpu"]
-if TYPE_CHECKING or sys.version_info >= (3, 9):
- Dtype = dtype[Union[
- int8,
- int16,
- int32,
- int64,
- uint8,
- uint16,
- uint32,
- uint64,
- float32,
- float64,
- ]]
-else:
- Dtype = dtype
+
+Dtype = dtype[Union[
+ int8,
+ int16,
+ int32,
+ int64,
+ uint8,
+ uint16,
+ uint32,
+ uint64,
+ float32,
+ float64,
+]]
+
SupportsBufferProtocol = Any
PyCapsule = Any
diff --git a/numpy/array_api/tests/test_indexing_functions.py b/numpy/array_api/tests/test_indexing_functions.py
new file mode 100644
index 000000000..9e05c6386
--- /dev/null
+++ b/numpy/array_api/tests/test_indexing_functions.py
@@ -0,0 +1,24 @@
+import pytest
+
+from numpy import array_api as xp
+
+
+@pytest.mark.parametrize(
+ "x, indices, axis, expected",
+ [
+ ([2, 3], [1, 1, 0], 0, [3, 3, 2]),
+ ([2, 3], [1, 1, 0], -1, [3, 3, 2]),
+ ([[2, 3]], [1], -1, [[3]]),
+ ([[2, 3]], [0, 0], 0, [[2, 3], [2, 3]]),
+ ],
+)
+def test_take_function(x, indices, axis, expected):
+ """
+ Indices respect relative order of a descending stable-sort
+
+ See https://github.com/numpy/numpy/issues/20778
+ """
+ x = xp.asarray(x)
+ indices = xp.asarray(indices)
+ out = xp.take(x, indices, axis=axis)
+ assert xp.all(out == xp.asarray(expected))
diff --git a/numpy/compat/py3k.py b/numpy/compat/py3k.py
index 3d10bb988..d02c9f8fe 100644
--- a/numpy/compat/py3k.py
+++ b/numpy/compat/py3k.py
@@ -47,7 +47,15 @@ def asstr(s):
return str(s)
def isfileobj(f):
- return isinstance(f, (io.FileIO, io.BufferedReader, io.BufferedWriter))
+ if not isinstance(f, (io.FileIO, io.BufferedReader, io.BufferedWriter)):
+ return False
+ try:
+ # BufferedReader/Writer may raise OSError when
+ # fetching `fileno()` (e.g. when wrapping BytesIO).
+ f.fileno()
+ return True
+ except OSError:
+ return False
def open_latin1(filename, mode='r'):
return open(filename, mode=mode, encoding='iso-8859-1')
diff --git a/numpy/compat/tests/test_compat.py b/numpy/compat/tests/test_compat.py
index 2b8acbaa0..d4391565e 100644
--- a/numpy/compat/tests/test_compat.py
+++ b/numpy/compat/tests/test_compat.py
@@ -1,4 +1,5 @@
from os.path import join
+from io import BufferedReader, BytesIO
from numpy.compat import isfileobj
from numpy.testing import assert_
@@ -17,3 +18,5 @@ def test_isfileobj():
with open(filename, 'rb') as f:
assert_(isfileobj(f))
+
+ assert_(isfileobj(BufferedReader(BytesIO())) is False)
diff --git a/numpy/conftest.py b/numpy/conftest.py
index 8aa6587ee..f1a3eda98 100644
--- a/numpy/conftest.py
+++ b/numpy/conftest.py
@@ -30,7 +30,7 @@ hypothesis.settings.register_profile(
hypothesis.settings.register_profile(
name="np.test() profile",
deadline=None, print_blob=True, database=None, derandomize=True,
- suppress_health_check=hypothesis.HealthCheck.all(),
+ suppress_health_check=list(hypothesis.HealthCheck),
)
# Note that the default profile is chosen based on the presence
# of pytest.ini, but can be overridden by passing the
@@ -40,6 +40,8 @@ hypothesis.settings.load_profile(
"numpy-profile" if os.path.isfile(_pytest_ini) else "np.test() profile"
)
+# The experimentalAPI is used in _umath_tests
+os.environ["NUMPY_EXPERIMENTAL_DTYPE_API"] = "1"
def pytest_configure(config):
config.addinivalue_line("markers",
diff --git a/numpy/core/__init__.py b/numpy/core/__init__.py
index 4375aa9c5..08e717363 100644
--- a/numpy/core/__init__.py
+++ b/numpy/core/__init__.py
@@ -92,7 +92,6 @@ from . import einsumfunc
from .einsumfunc import *
del nt
-from .fromnumeric import amax as max, amin as min, round_ as round
from .numeric import absolute as abs
# do this after everything else, to minimize the chance of this misleadingly
@@ -143,11 +142,14 @@ def _DType_reconstruct(scalar_type):
def _DType_reduce(DType):
- # To pickle a DType without having to add top-level names, pickle the
- # scalar type for now (and assume that reconstruction will be possible).
- if DType is dtype:
- return "dtype" # must pickle `np.dtype` as a singleton.
- scalar_type = DType.type # pickle the scalar type for reconstruction
+ # As types/classes, most DTypes can simply be pickled by their name:
+ if not DType._legacy or DType.__module__ == "numpy.dtypes":
+ return DType.__name__
+
+ # However, user defined legacy dtypes (like rational) do not end up in
+ # `numpy.dtypes` as module and do not have a public class at all.
+ # For these, we pickle them by reconstructing them from the scalar type:
+ scalar_type = DType.type
return _DType_reconstruct, (scalar_type,)
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index 3bbc3c5ed..bd7c4f519 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -12,6 +12,7 @@ NOTE: Many of the methods of ndarray have corresponding functions.
from numpy.core.function_base import add_newdoc
from numpy.core.overrides import array_function_like_doc
+
###############################################################################
#
# flatiter
@@ -2703,6 +2704,12 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('nbytes',
Does not include memory consumed by non-element attributes of the
array object.
+ See Also
+ --------
+ sys.getsizeof
 Memory consumed by the object itself without parents in case of a view.
+ This does include memory consumed by non-element attributes.
+
Examples
--------
>>> x = np.zeros((3,5,2), dtype=np.complex128)
@@ -2928,7 +2935,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('T',
add_newdoc('numpy.core.multiarray', 'ndarray', ('__array__',
- """ a.__array__([dtype], /) -> reference if type unchanged, copy otherwise.
+ """ a.__array__([dtype], /)
Returns either a new reference to self if dtype is not given or a new array
of provided data type if dtype is different from the current dtype of the
@@ -2991,10 +2998,6 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('__class_getitem__',
>>> np.ndarray[Any, np.dtype[Any]]
numpy.ndarray[typing.Any, numpy.dtype[typing.Any]]
- Notes
- -----
- This method is only available for python 3.9 and later.
-
See Also
--------
:pep:`585` : Type hinting generics in standard collections.
@@ -3005,7 +3008,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('__class_getitem__',
add_newdoc('numpy.core.multiarray', 'ndarray', ('__deepcopy__',
- """a.__deepcopy__(memo, /) -> Deep copy of array.
+ """a.__deepcopy__(memo, /)
Used if :func:`copy.deepcopy` is called on an array.
@@ -6493,10 +6496,6 @@ add_newdoc('numpy.core.multiarray', 'dtype', ('__class_getitem__',
>>> np.dtype[np.int64]
numpy.dtype[numpy.int64]
- Notes
- -----
- This method is only available for python 3.9 and later.
-
See Also
--------
:pep:`585` : Type hinting generics in standard collections.
@@ -7006,10 +7005,6 @@ add_newdoc('numpy.core.numerictypes', 'number', ('__class_getitem__',
>>> np.signedinteger[Any]
numpy.signedinteger[typing.Any]
- Notes
- -----
- This method is only available for python 3.9 and later.
-
See Also
--------
:pep:`585` : Type hinting generics in standard collections.
diff --git a/numpy/core/_add_newdocs_scalars.py b/numpy/core/_add_newdocs_scalars.py
index 15d37522a..f9a6ad963 100644
--- a/numpy/core/_add_newdocs_scalars.py
+++ b/numpy/core/_add_newdocs_scalars.py
@@ -93,7 +93,7 @@ def add_newdoc_for_scalar_type(obj, fixed_aliases, doc):
add_newdoc('numpy.core.numerictypes', obj, docstring)
-add_newdoc_for_scalar_type('bool_', ['bool8'],
+add_newdoc_for_scalar_type('bool_', [],
"""
Boolean type (True or False), stored as a byte.
@@ -204,7 +204,11 @@ add_newdoc_for_scalar_type('str_', ['unicode_'],
r"""
A unicode string.
- When used in arrays, this type strips trailing null codepoints.
+ This type strips trailing null codepoints.
+
+ >>> s = np.str_("abc\x00")
+ >>> s
+ 'abc'
Unlike the builtin `str`, this supports the :ref:`python:bufferobjects`, exposing its
contents as UCS4:
@@ -255,7 +259,7 @@ add_newdoc_for_scalar_type('void', [],
``\0`` bytes. The 5 can be a Python or NumPy integer.
2. ``np.void(b"bytes-like")`` creates a void scalar from the byte string.
The dtype itemsize will match the byte string length, here ``"V10"``.
- 3. When a ``dtype=`` is passed the call is rougly the same as an
+ 3. When a ``dtype=`` is passed the call is roughly the same as an
array creation. However, a void scalar rather than array is returned.
Please see the examples which show all three different conventions.
diff --git a/numpy/core/_asarray.py b/numpy/core/_asarray.py
index cbaab8c3f..a9abc5a88 100644
--- a/numpy/core/_asarray.py
+++ b/numpy/core/_asarray.py
@@ -24,10 +24,6 @@ POSSIBLE_FLAGS = {
}
-def _require_dispatcher(a, dtype=None, requirements=None, *, like=None):
- return (like,)
-
-
@set_array_function_like_doc
@set_module('numpy')
def require(a, dtype=None, requirements=None, *, like=None):
@@ -100,10 +96,10 @@ def require(a, dtype=None, requirements=None, *, like=None):
"""
if like is not None:
return _require_with_like(
+ like,
a,
dtype=dtype,
requirements=requirements,
- like=like,
)
if not requirements:
@@ -135,6 +131,4 @@ def require(a, dtype=None, requirements=None, *, like=None):
return arr
-_require_with_like = array_function_dispatch(
- _require_dispatcher
-)(require)
+_require_with_like = array_function_dispatch()(require)
diff --git a/numpy/core/_asarray.pyi b/numpy/core/_asarray.pyi
index 473bc037c..69d1528d4 100644
--- a/numpy/core/_asarray.pyi
+++ b/numpy/core/_asarray.pyi
@@ -1,10 +1,10 @@
from collections.abc import Iterable
-from typing import TypeVar, Union, overload, Literal
+from typing import Any, TypeVar, Union, overload, Literal
from numpy import ndarray
from numpy._typing import DTypeLike, _SupportsArrayFunc
-_ArrayType = TypeVar("_ArrayType", bound=ndarray)
+_ArrayType = TypeVar("_ArrayType", bound=ndarray[Any, Any])
_Requirements = Literal[
"C", "C_CONTIGUOUS", "CONTIGUOUS",
@@ -31,7 +31,7 @@ def require(
requirements: _E | Iterable[_RequirementsWithE] = ...,
*,
like: _SupportsArrayFunc = ...
-) -> ndarray: ...
+) -> ndarray[Any, Any]: ...
@overload
def require(
a: object,
@@ -39,4 +39,4 @@ def require(
requirements: None | _Requirements | Iterable[_Requirements] = ...,
*,
like: _SupportsArrayFunc = ...
-) -> ndarray: ...
+) -> ndarray[Any, Any]: ...
diff --git a/numpy/core/_dtype.py b/numpy/core/_dtype.py
index 3db80c17e..ff50f5199 100644
--- a/numpy/core/_dtype.py
+++ b/numpy/core/_dtype.py
@@ -114,13 +114,13 @@ def _scalar_str(dtype, short):
# platforms, so it should never include the itemsize here.
return "'O'"
- elif dtype.type == np.string_:
+ elif dtype.type == np.bytes_:
if _isunsized(dtype):
return "'S'"
else:
return "'S%d'" % dtype.itemsize
- elif dtype.type == np.unicode_:
+ elif dtype.type == np.str_:
if _isunsized(dtype):
return "'%sU'" % byteorder
else:
@@ -334,6 +334,8 @@ def _name_includes_bit_suffix(dtype):
elif dtype.type == np.bool_:
# implied
return False
+ elif dtype.type is None:
+ return True
elif np.issubdtype(dtype, np.flexible) and _isunsized(dtype):
# unspecified
return False
@@ -348,7 +350,9 @@ def _name_get(dtype):
# user dtypes don't promise to do anything special
return dtype.type.__name__
- if issubclass(dtype.type, np.void):
+ if dtype.kind == '\x00':
+ name = type(dtype).__name__
+ elif issubclass(dtype.type, np.void):
# historically, void subclasses preserve their name, eg `record64`
name = dtype.type.__name__
else:
diff --git a/numpy/core/_methods.py b/numpy/core/_methods.py
index 040f02a9d..0fc070b34 100644
--- a/numpy/core/_methods.py
+++ b/numpy/core/_methods.py
@@ -87,79 +87,16 @@ def _count_reduce_items(arr, axis, keepdims=False, where=True):
keepdims)
return items
-# Numpy 1.17.0, 2019-02-24
-# Various clip behavior deprecations, marked with _clip_dep as a prefix.
-
-def _clip_dep_is_scalar_nan(a):
- # guarded to protect circular imports
- from numpy.core.fromnumeric import ndim
- if ndim(a) != 0:
- return False
- try:
- return um.isnan(a)
- except TypeError:
- return False
-
-def _clip_dep_is_byte_swapped(a):
- if isinstance(a, mu.ndarray):
- return not a.dtype.isnative
- return False
-
-def _clip_dep_invoke_with_casting(ufunc, *args, out=None, casting=None, **kwargs):
- # normal path
- if casting is not None:
- return ufunc(*args, out=out, casting=casting, **kwargs)
-
- # try to deal with broken casting rules
- try:
- return ufunc(*args, out=out, **kwargs)
- except _exceptions._UFuncOutputCastingError as e:
- # Numpy 1.17.0, 2019-02-24
- warnings.warn(
- "Converting the output of clip from {!r} to {!r} is deprecated. "
- "Pass `casting=\"unsafe\"` explicitly to silence this warning, or "
- "correct the type of the variables.".format(e.from_, e.to),
- DeprecationWarning,
- stacklevel=2
- )
- return ufunc(*args, out=out, casting="unsafe", **kwargs)
-
-def _clip(a, min=None, max=None, out=None, *, casting=None, **kwargs):
+def _clip(a, min=None, max=None, out=None, **kwargs):
if min is None and max is None:
raise ValueError("One of max or min must be given")
- # Numpy 1.17.0, 2019-02-24
- # This deprecation probably incurs a substantial slowdown for small arrays,
- # it will be good to get rid of it.
- if not _clip_dep_is_byte_swapped(a) and not _clip_dep_is_byte_swapped(out):
- using_deprecated_nan = False
- if _clip_dep_is_scalar_nan(min):
- min = -float('inf')
- using_deprecated_nan = True
- if _clip_dep_is_scalar_nan(max):
- max = float('inf')
- using_deprecated_nan = True
- if using_deprecated_nan:
- warnings.warn(
- "Passing `np.nan` to mean no clipping in np.clip has always "
- "been unreliable, and is now deprecated. "
- "In future, this will always return nan, like it already does "
- "when min or max are arrays that contain nan. "
- "To skip a bound, pass either None or an np.inf of an "
- "appropriate sign.",
- DeprecationWarning,
- stacklevel=2
- )
-
if min is None:
- return _clip_dep_invoke_with_casting(
- um.minimum, a, max, out=out, casting=casting, **kwargs)
+ return um.minimum(a, max, out=out, **kwargs)
elif max is None:
- return _clip_dep_invoke_with_casting(
- um.maximum, a, min, out=out, casting=casting, **kwargs)
+ return um.maximum(a, min, out=out, **kwargs)
else:
- return _clip_dep_invoke_with_casting(
- um.clip, a, min, max, out=out, casting=casting, **kwargs)
+ return um.clip(a, min, max, out=out, **kwargs)
def _mean(a, axis=None, dtype=None, out=None, keepdims=False, *, where=True):
arr = asanyarray(a)
diff --git a/numpy/core/_type_aliases.pyi b/numpy/core/_type_aliases.pyi
index bbead0cb5..c0b6f1a80 100644
--- a/numpy/core/_type_aliases.pyi
+++ b/numpy/core/_type_aliases.pyi
@@ -1,12 +1,12 @@
-from typing import TypedDict
+from typing import Any, TypedDict
from numpy import generic, signedinteger, unsignedinteger, floating, complexfloating
class _SCTypes(TypedDict):
- int: list[type[signedinteger]]
- uint: list[type[unsignedinteger]]
- float: list[type[floating]]
- complex: list[type[complexfloating]]
+ int: list[type[signedinteger[Any]]]
+ uint: list[type[unsignedinteger[Any]]]
+ float: list[type[floating[Any]]]
+ complex: list[type[complexfloating[Any, Any]]]
others: list[type]
sctypeDict: dict[int | str, type[generic]]
diff --git a/numpy/core/arrayprint.py b/numpy/core/arrayprint.py
index 957cecf1c..62cd52707 100644
--- a/numpy/core/arrayprint.py
+++ b/numpy/core/arrayprint.py
@@ -169,7 +169,7 @@ def set_printoptions(precision=None, threshold=None, edgeitems=None,
- 'longfloat' : 128-bit floats
- 'complexfloat'
- 'longcomplexfloat' : composed of two 128-bit floats
- - 'numpystr' : types `numpy.string_` and `numpy.unicode_`
+ - 'numpystr' : types `numpy.bytes_` and `numpy.str_`
- 'object' : `np.object_` arrays
Other keys that can be used to set a group of types at once are:
@@ -475,7 +475,7 @@ def _get_format_function(data, **options):
return formatdict['longcomplexfloat']()
else:
return formatdict['complexfloat']()
- elif issubclass(dtypeobj, (_nt.unicode_, _nt.string_)):
+ elif issubclass(dtypeobj, (_nt.str_, _nt.bytes_)):
return formatdict['numpystr']()
elif issubclass(dtypeobj, _nt.datetime64):
return formatdict['datetime']()
@@ -616,7 +616,7 @@ def array2string(a, max_line_width=None, precision=None,
- 'complexfloat'
- 'longcomplexfloat' : composed of two 128-bit floats
- 'void' : type `numpy.void`
- - 'numpystr' : types `numpy.string_` and `numpy.unicode_`
+ - 'numpystr' : types `numpy.bytes_` and `numpy.str_`
Other keys that can be used to set a group of types at once are:
@@ -724,7 +724,7 @@ def array2string(a, max_line_width=None, precision=None,
# Deprecation 11-9-2017 v1.14
warnings.warn("'style' argument is deprecated and no longer functional"
" except in 1.13 'legacy' mode",
- DeprecationWarning, stacklevel=3)
+ DeprecationWarning, stacklevel=2)
if options['legacy'] > 113:
options['linewidth'] -= len(suffix)
@@ -1096,7 +1096,7 @@ def format_float_scientific(x, precision=None, unique=True, trim='k',
identify the value may be printed and rounded unbiased.
-- versionadded:: 1.21.0
-
+
Returns
-------
rep : string
@@ -1181,7 +1181,7 @@ def format_float_positional(x, precision=None, unique=True,
Minimum number of digits to print. Only has an effect if `unique=True`
in which case additional digits past those necessary to uniquely
identify the value may be printed, rounding the last additional digit.
-
+
-- versionadded:: 1.21.0
Returns
@@ -1339,13 +1339,29 @@ class TimedeltaFormat(_TimelikeFormat):
class SubArrayFormat:
- def __init__(self, format_function):
+ def __init__(self, format_function, **options):
self.format_function = format_function
+ self.threshold = options['threshold']
+ self.edge_items = options['edgeitems']
+
+ def __call__(self, a):
+ self.summary_insert = "..." if a.size > self.threshold else ""
+ return self.format_array(a)
+
+ def format_array(self, a):
+ if np.ndim(a) == 0:
+ return self.format_function(a)
+
+ if self.summary_insert and a.shape[0] > 2*self.edge_items:
+ formatted = (
+ [self.format_array(a_) for a_ in a[:self.edge_items]]
+ + [self.summary_insert]
+ + [self.format_array(a_) for a_ in a[-self.edge_items:]]
+ )
+ else:
+ formatted = [self.format_array(a_) for a_ in a]
- def __call__(self, arr):
- if arr.ndim <= 1:
- return "[" + ", ".join(self.format_function(a) for a in arr) + "]"
- return "[" + ", ".join(self.__call__(a) for a in arr) + "]"
+ return "[" + ", ".join(formatted) + "]"
class StructuredVoidFormat:
@@ -1369,7 +1385,7 @@ class StructuredVoidFormat:
for field_name in data.dtype.names:
format_function = _get_format_function(data[field_name], **options)
if data.dtype[field_name].shape != ():
- format_function = SubArrayFormat(format_function)
+ format_function = SubArrayFormat(format_function, **options)
format_functions.append(format_function)
return cls(format_functions)
@@ -1429,6 +1445,10 @@ def dtype_is_implied(dtype):
if dtype.names is not None:
return False
+ # should care about endianness *unless size is 1* (e.g., int8, bool)
+ if not dtype.isnative:
+ return False
+
return dtype.type in _typelessdata
@@ -1453,10 +1473,14 @@ def dtype_short_repr(dtype):
return "'%s'" % str(dtype)
typename = dtype.name
+ if not dtype.isnative:
+ # deal with cases like dtype('<u2') that are identical to an
+ # established dtype (in this case uint16)
+ # except that they have a different endianness.
+ return "'%s'" % str(dtype)
# quote typenames which can't be represented as python variable names
if typename and not (typename[0].isalpha() and typename.isalnum()):
typename = repr(typename)
-
return typename
diff --git a/numpy/core/code_generators/cversions.txt b/numpy/core/code_generators/cversions.txt
index bd4b71180..e52193d7a 100644
--- a/numpy/core/code_generators/cversions.txt
+++ b/numpy/core/code_generators/cversions.txt
@@ -67,5 +67,7 @@
# Version 16 (NumPy 1.23)
# NonNull attributes removed from numpy_api.py
# Version 16 (NumPy 1.24) No change.
-# Version 16 (NumPy 1.25) No change.
0x00000010 = 04a7bf1e65350926a0e528798da263c0
+
+# Version 17 (NumPy 1.25) No actual change.
+0x00000011 = ca1aebdad799358149567d9d93cbca09
diff --git a/numpy/core/code_generators/genapi.py b/numpy/core/code_generators/genapi.py
index c23fd6c72..2cdaba52d 100644
--- a/numpy/core/code_generators/genapi.py
+++ b/numpy/core/code_generators/genapi.py
@@ -11,8 +11,8 @@ import io
import os
import re
import sys
-import textwrap
import importlib.util
+import textwrap
from os.path import join
@@ -99,6 +99,27 @@ def _repl(str):
return str.replace('Bool', 'npy_bool')
+class MinVersion:
+ def __init__(self, version):
+ """ Version should be the normal NumPy version, e.g. "1.25" """
+ major, minor = version.split(".")
+ self.version = f"NPY_{major}_{minor}_API_VERSION"
+
+ def __str__(self):
+ # Used by version hashing:
+ return self.version
+
+ def add_guard(self, name, normal_define):
+ """Wrap a definition behind a version guard"""
+ wrap = textwrap.dedent(f"""
+ #if NPY_FEATURE_VERSION >= {self.version}
+ {{define}}
+ #endif""")
+
+ # we only insert `define` later to avoid confusing dedent:
+ return wrap.format(define=normal_define)
+
+
class StealRef:
def __init__(self, arg):
self.arg = arg # counting from 1
@@ -131,21 +152,6 @@ class Function:
doccomment = ''
return '%s%s %s(%s)' % (doccomment, self.return_type, self.name, argstr)
- def to_ReST(self):
- lines = ['::', '', ' ' + self.return_type]
- argstr = ',\000'.join([self._format_arg(*a) for a in self.args])
- name = ' %s' % (self.name,)
- s = textwrap.wrap('(%s)' % (argstr,), width=72,
- initial_indent=name,
- subsequent_indent=' ' * (len(name)+1),
- break_long_words=False)
- for l in s:
- lines.append(l.replace('\000', ' ').rstrip())
- lines.append('')
- if self.doc:
- lines.append(textwrap.dedent(self.doc))
- return '\n'.join(lines)
-
def api_hash(self):
m = hashlib.md5()
m.update(remove_whitespace(self.return_type))
@@ -407,7 +413,20 @@ class FunctionApi:
def __init__(self, name, index, annotations, return_type, args, api_name):
self.name = name
self.index = index
- self.annotations = annotations
+
+ self.min_version = None
+ self.annotations = []
+ for annotation in annotations:
+ # String checks, because manual import breaks isinstance
+ if type(annotation).__name__ == "StealRef":
+ self.annotations.append(annotation)
+ elif type(annotation).__name__ == "MinVersion":
+ if self.min_version is not None:
+ raise ValueError("Two minimum versions specified!")
+ self.min_version = annotation
+ else:
+ raise ValueError(f"unknown annotation {annotation}")
+
self.return_type = return_type
self.args = args
self.api_name = api_name
@@ -419,13 +438,14 @@ class FunctionApi:
return argstr
def define_from_array_api_string(self):
- define = """\
-#define %s \\\n (*(%s (*)(%s)) \\
- %s[%d])""" % (self.name,
- self.return_type,
- self._argtypes_string(),
- self.api_name,
- self.index)
+ arguments = self._argtypes_string()
+ define = textwrap.dedent(f"""\
+ #define {self.name} \\
+ (*({self.return_type} (*)({arguments})) \\
+ {self.api_name}[{self.index}])""")
+
+ if self.min_version is not None:
+ define = self.min_version.add_guard(self.name, define)
return define
def array_api_define(self):
@@ -516,7 +536,7 @@ def get_versions_hash():
d = []
file = os.path.join(os.path.dirname(__file__), 'cversions.txt')
- with open(file, 'r') as fid:
+ with open(file) as fid:
for line in fid:
m = VERRE.match(line)
if m:
diff --git a/numpy/core/code_generators/generate_numpy_api.py b/numpy/core/code_generators/generate_numpy_api.py
index bdfd635e4..bfcb0d0e5 100644
--- a/numpy/core/code_generators/generate_numpy_api.py
+++ b/numpy/core/code_generators/generate_numpy_api.py
@@ -141,19 +141,12 @@ void *PyArray_API[] = {
};
"""
-c_api_header = """
-===========
-NumPy C-API
-===========
-"""
-
def generate_api(output_dir, force=False):
basename = 'multiarray_api'
h_file = os.path.join(output_dir, '__%s.h' % basename)
c_file = os.path.join(output_dir, '__%s.c' % basename)
- d_file = os.path.join(output_dir, '%s.txt' % basename)
- targets = (h_file, c_file, d_file)
+ targets = (h_file, c_file)
sources = numpy_api.multiarray_api
@@ -167,7 +160,6 @@ def generate_api(output_dir, force=False):
def do_generate_api(targets, sources):
header_file = targets[0]
c_file = targets[1]
- doc_file = targets[2]
global_vars = sources[0]
scalar_bool_values = sources[1]
@@ -236,13 +228,6 @@ def do_generate_api(targets, sources):
s = c_template % ',\n'.join(init_list)
genapi.write_file(c_file, s)
- # write to documentation
- s = c_api_header
- for func in numpyapi_list:
- s += func.to_ReST()
- s += '\n\n'
- genapi.write_file(doc_file, s)
-
return targets
diff --git a/numpy/core/code_generators/generate_ufunc_api.py b/numpy/core/code_generators/generate_ufunc_api.py
index ef3cb0bb5..e03299a52 100644
--- a/numpy/core/code_generators/generate_ufunc_api.py
+++ b/numpy/core/code_generators/generate_ufunc_api.py
@@ -122,8 +122,7 @@ def generate_api(output_dir, force=False):
h_file = os.path.join(output_dir, '__%s.h' % basename)
c_file = os.path.join(output_dir, '__%s.c' % basename)
- d_file = os.path.join(output_dir, '%s.txt' % basename)
- targets = (h_file, c_file, d_file)
+ targets = (h_file, c_file)
sources = ['ufunc_api_order.txt']
@@ -137,7 +136,6 @@ def generate_api(output_dir, force=False):
def do_generate_api(targets, sources):
header_file = targets[0]
c_file = targets[1]
- doc_file = targets[2]
ufunc_api_index = genapi.merge_api_dicts((
numpy_api.ufunc_funcs_api,
@@ -179,17 +177,6 @@ def do_generate_api(targets, sources):
s = c_template % ',\n'.join(init_list)
genapi.write_file(c_file, s)
- # Write to documentation
- s = '''
-=================
-NumPy Ufunc C-API
-=================
-'''
- for func in ufunc_api_list:
- s += func.to_ReST()
- s += '\n\n'
- genapi.write_file(doc_file, s)
-
return targets
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 768c8deee..a170f83be 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -1,3 +1,8 @@
+"""
+Generate the code to build all the internal ufuncs. At the base is the defdict:
+a dictionary of Ufunc classes. This is fed to make_code to generate
+__umath_generated.c
+"""
import os
import re
import struct
@@ -5,6 +10,7 @@ import sys
import textwrap
import argparse
+# identity objects
Zero = "PyLong_FromLong(0)"
One = "PyLong_FromLong(1)"
True_ = "(Py_INCREF(Py_True), Py_True)"
@@ -55,9 +61,6 @@ class TypeDescription:
cfunc_alias : str or none, optional
Appended to inner loop C function name, e.g., FLOAT_{cfunc_alias}. See make_arrays.
NOTE: it doesn't support 'astype'
- simd : list
- Available SIMD ufunc loops, dispatched at runtime in specified order
- Currently only supported for simples types (see make_arrays)
dispatch : str or None, optional
Dispatch-able source name without its extension '.dispatch.c' that
contains the definition of ufunc, dispatched at runtime depending on the
@@ -65,7 +68,7 @@ class TypeDescription:
NOTE: it doesn't support 'astype'
"""
def __init__(self, type, f=None, in_=None, out=None, astype=None, cfunc_alias=None,
- simd=None, dispatch=None):
+ dispatch=None):
self.type = type
self.func_data = f
if astype is None:
@@ -78,7 +81,6 @@ class TypeDescription:
out = out.replace('P', type)
self.out = out
self.cfunc_alias = cfunc_alias
- self.simd = simd
self.dispatch = dispatch
def finish_signature(self, nin, nout):
@@ -90,7 +92,47 @@ class TypeDescription:
assert len(self.out) == nout
self.astype = self.astype_dict.get(self.type, None)
-_fdata_map = dict(
+
+def _check_order(types1, types2):
+ dtype_order = allP + "O"
+ for t1, t2 in zip(types1, types2):
+ # We have no opinion on object or time ordering for now:
+ if t1 in "OP" or t2 in "OP":
+ return True
+ if t1 in "mM" or t2 in "mM":
+ return True
+
+ t1i = dtype_order.index(t1)
+ t2i = dtype_order.index(t2)
+ if t1i < t2i:
+ return
+ if t2i > t1i:
+ break
+
+ if types1 == "QQ?" and types2 == "qQ?":
+ # Explicitly allow this mixed case, rather than figure out what order
+ # is nicer or how to encode it.
+ return
+
+ raise TypeError(
+ f"Input dtypes are unsorted or duplicate: {types1} and {types2}")
+
+
+def check_td_order(tds):
+ # A quick check for whether the signatures make sense, it happened too
+ # often that SIMD additions added loops that do not even make some sense.
+ # TODO: This should likely be a test and it would be nice if it rejected
+ # duplicate entries as well (but we have many as of writing this).
+ signatures = [t.in_+t.out for t in tds]
+
+ for prev_i, sign in enumerate(signatures[1:]):
+ if sign in signatures[:prev_i+1]:
+ continue # allow duplicates...
+
+ _check_order(signatures[prev_i], sign)
+
+
+_floatformat_map = dict(
e='npy_%sf',
f='npy_%sf',
d='npy_%s',
@@ -101,11 +143,14 @@ _fdata_map = dict(
)
def build_func_data(types, f):
- func_data = [_fdata_map.get(t, '%s') % (f,) for t in types]
+ func_data = [_floatformat_map.get(t, '%s') % (f,) for t in types]
return func_data
def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None,
- simd=None, dispatch=None):
+ dispatch=None):
+ """
+ Generate a TypeDescription instance for each item in types
+ """
if f is not None:
if isinstance(f, str):
func_data = build_func_data(types, f)
@@ -129,12 +174,6 @@ def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None,
raise ValueError("Number of types and outputs do not match")
tds = []
for t, fd, i, o in zip(types, func_data, in_, out):
- # [(simd-name, list of types)]
- if simd is not None:
- simdt = [k for k, v in simd if t in v]
- else:
- simdt = []
-
# [(dispatch file name without extension '.dispatch.c*', list of types)]
if dispatch:
dispt = ([k for k, v in dispatch if t in v]+[None])[0]
@@ -142,7 +181,7 @@ def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None,
dispt = None
tds.append(TypeDescription(
t, f=fd, in_=i, out=o, astype=astype, cfunc_alias=cfunc_alias,
- simd=simdt, dispatch=dispt
+ dispatch=dispt
))
return tds
@@ -153,12 +192,15 @@ class Ufunc:
----------
nin : number of input arguments
nout : number of output arguments
- identity : identity element for a two-argument function
+ identity : identity element for a two-argument function (like Zero)
docstring : docstring for the ufunc
- type_descriptions : list of TypeDescription objects
+ typereso: type resolver function of type PyUFunc_TypeResolutionFunc
+ type_descriptions : TypeDescription objects
+ signature: a generalized ufunc signature (like for matmul)
+ indexed: add indexed loops (ufunc.at) for these type characters
"""
def __init__(self, nin, nout, identity, docstring, typereso,
- *type_descriptions, signature=None):
+ *type_descriptions, signature=None, indexed=''):
self.nin = nin
self.nout = nout
if identity is None:
@@ -168,11 +210,15 @@ class Ufunc:
self.typereso = typereso
self.type_descriptions = []
self.signature = signature
+ self.indexed = indexed
for td in type_descriptions:
self.type_descriptions.extend(td)
for td in self.type_descriptions:
td.finish_signature(self.nin, self.nout)
+ check_td_order(self.type_descriptions)
+
+
# String-handling utilities to avoid locale-dependence.
import string
@@ -246,7 +292,8 @@ chartoname = {
'P': 'OBJECT',
}
-noobj = '?bBhHiIlLqQefdgFDGmM'
+no_obj_bool = 'bBhHiIlLqQefdgFDGmM'
+noobj = '?' + no_obj_bool
all = '?bBhHiIlLqQefdgFDGOmM'
O = 'O'
@@ -280,6 +327,7 @@ nocmplxO = nocmplx+O
nocmplxP = nocmplx+P
notimes_or_obj = bints + inexact
nodatetime_or_obj = bints + inexact
+no_bool_times_obj = ints + inexact
# Find which code corresponds to int64.
int64 = ''
@@ -299,35 +347,50 @@ defdict = {
Ufunc(2, 1, Zero,
docstrings.get('numpy.core.umath.add'),
'PyUFunc_AdditionTypeResolver',
- TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
+ TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]),
+ TD(no_bool_times_obj, dispatch=[
+ ('loops_arithm_fp', 'fdFD'),
+ ('loops_autovec', ints),
+ ]),
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
TypeDescription('M', FullTypeDescr, 'mM', 'M'),
],
TD(O, f='PyNumber_Add'),
+ indexed=intfltcmplx
),
'subtract':
Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
docstrings.get('numpy.core.umath.subtract'),
'PyUFunc_SubtractionTypeResolver',
- TD(ints + inexact, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
+ TD(no_bool_times_obj, dispatch=[
+ ('loops_arithm_fp', 'fdFD'),
+ ('loops_autovec', ints),
+ ]),
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
TypeDescription('M', FullTypeDescr, 'MM', 'm'),
],
TD(O, f='PyNumber_Subtract'),
+ indexed=intfltcmplx
),
'multiply':
Ufunc(2, 1, One,
docstrings.get('numpy.core.umath.multiply'),
'PyUFunc_MultiplicationTypeResolver',
- TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
+ TD('?', cfunc_alias='logical_and',
+ dispatch=[('loops_logical', '?')]),
+ TD(no_bool_times_obj, dispatch=[
+ ('loops_arithm_fp', 'fdFD'),
+ ('loops_autovec', ints),
+ ]),
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
TypeDescription('m', FullTypeDescr, 'qm', 'm'),
TypeDescription('m', FullTypeDescr, 'md', 'm'),
TypeDescription('m', FullTypeDescr, 'dm', 'm'),
],
TD(O, f='PyNumber_Multiply'),
+ indexed=intfltcmplx
),
#'true_divide' : aliased to divide in umathmodule.c:initumath
'floor_divide':
@@ -342,6 +405,7 @@ defdict = {
TypeDescription('m', FullTypeDescr, 'mm', 'q'),
],
TD(O, f='PyNumber_FloorDivide'),
+ indexed=flts + ints
),
'divide':
Ufunc(2, 1, None, # One is only a unit to the right, not the left
@@ -353,12 +417,16 @@ defdict = {
TypeDescription('m', FullTypeDescr, 'mm', 'd', cfunc_alias='divide'),
],
TD(O, f='PyNumber_TrueDivide'),
+ indexed=flts
),
'conjugate':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.conjugate'),
None,
- TD(ints+flts+cmplx, simd=[('avx2', ints), ('avx512f', cmplxvec)]),
+ TD(ints+flts+cmplx, dispatch=[
+ ('loops_arithm_fp', 'FD'),
+ ('loops_autovec', ints),
+ ]),
TD(P, f='conjugate'),
),
'fmod':
@@ -373,14 +441,21 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.square'),
None,
- TD(ints+inexact, simd=[('avx2', ints), ('avx512f', 'FD')], dispatch=[('loops_unary_fp', 'fd')]),
+ TD(ints+inexact, dispatch=[
+ ('loops_unary_fp', 'fd'),
+ ('loops_arithm_fp', 'FD'),
+ ('loops_autovec', ints),
+ ]),
TD(O, f='Py_square'),
),
'reciprocal':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.reciprocal'),
None,
- TD(ints+inexact, simd=[('avx2', ints)], dispatch=[('loops_unary_fp', 'fd')]),
+ TD(ints+inexact, dispatch=[
+ ('loops_unary_fp', 'fd'),
+ ('loops_autovec', ints),
+ ]),
TD(O, f='Py_reciprocal'),
),
# This is no longer used as numpy.ones_like, however it is
@@ -412,8 +487,13 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.absolute'),
'PyUFunc_AbsoluteTypeResolver',
- TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd')]),
- TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')),
+ TD(bints+flts+timedeltaonly, dispatch=[
+ ('loops_unary_fp', 'fd'),
+ ('loops_logical', '?'),
+ ('loops_autovec', ints + 'e'),
+ ]),
+ TD(cmplx, dispatch=[('loops_unary_complex', 'FD')],
+ out=('f', 'd', 'g')),
TD(O, f='PyNumber_Absolute'),
),
'_arg':
@@ -442,82 +522,113 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.sign'),
'PyUFunc_SimpleUniformOperationTypeResolver',
- TD(nobool_or_datetime),
+ TD(nobool_or_datetime, dispatch=[('loops_autovec', ints)]),
),
'greater':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.greater'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
- [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+ TD(bints, out='?'),
+ [TypeDescription('q', FullTypeDescr, 'qQ', '?'),
+ TypeDescription('q', FullTypeDescr, 'Qq', '?')],
+ TD(inexact+times, out='?', dispatch=[('loops_comparison', bints+'fd')]),
TD('O', out='?'),
+ [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
),
'greater_equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.greater_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
- [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+ TD(bints, out='?'),
+ [TypeDescription('q', FullTypeDescr, 'qQ', '?'),
+ TypeDescription('q', FullTypeDescr, 'Qq', '?')],
+ TD(inexact+times, out='?', dispatch=[('loops_comparison', bints+'fd')]),
TD('O', out='?'),
+ [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
),
'less':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.less'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
- [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+ TD(bints, out='?'),
+ [TypeDescription('q', FullTypeDescr, 'qQ', '?'),
+ TypeDescription('q', FullTypeDescr, 'Qq', '?')],
+ TD(inexact+times, out='?', dispatch=[('loops_comparison', bints+'fd')]),
TD('O', out='?'),
+ [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
),
'less_equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.less_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
- [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+ TD(bints, out='?'),
+ [TypeDescription('q', FullTypeDescr, 'qQ', '?'),
+ TypeDescription('q', FullTypeDescr, 'Qq', '?')],
+ TD(inexact+times, out='?', dispatch=[('loops_comparison', bints+'fd')]),
TD('O', out='?'),
+ [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
),
'equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
- [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+ TD(bints, out='?'),
+ [TypeDescription('q', FullTypeDescr, 'qQ', '?'),
+ TypeDescription('q', FullTypeDescr, 'Qq', '?')],
+ TD(inexact+times, out='?', dispatch=[('loops_comparison', bints+'fd')]),
TD('O', out='?'),
+ [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
),
'not_equal':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.not_equal'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
- [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+ TD(bints, out='?'),
+ [TypeDescription('q', FullTypeDescr, 'qQ', '?'),
+ TypeDescription('q', FullTypeDescr, 'Qq', '?')],
+ TD(inexact+times, out='?', dispatch=[('loops_comparison', bints+'fd')]),
TD('O', out='?'),
+ [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
),
'logical_and':
Ufunc(2, 1, True_,
docstrings.get('numpy.core.umath.logical_and'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+ TD(nodatetime_or_obj, out='?', dispatch=[
+ ('loops_logical', '?'),
+ ('loops_autovec', ints),
+ ]),
TD(O, f='npy_ObjectLogicalAnd'),
),
'logical_not':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.logical_not'),
None,
- TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+ TD(nodatetime_or_obj, out='?', dispatch=[
+ ('loops_logical', '?'),
+ ('loops_autovec', ints),
+ ]),
TD(O, f='npy_ObjectLogicalNot'),
),
'logical_or':
Ufunc(2, 1, False_,
docstrings.get('numpy.core.umath.logical_or'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+ TD(nodatetime_or_obj, out='?', dispatch=[
+ ('loops_logical', '?'),
+ ('loops_autovec', ints),
+ ]),
TD(O, f='npy_ObjectLogicalOr'),
),
'logical_xor':
Ufunc(2, 1, False_,
docstrings.get('numpy.core.umath.logical_xor'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(nodatetime_or_obj, out='?'),
+ TD('?', out='?', cfunc_alias='not_equal',
+ dispatch=[('loops_comparison', '?')]),
+ TD(no_bool_times_obj, out='?', dispatch=[
+ ('loops_autovec', ints),
+ ]),
# TODO: using obj.logical_xor() seems pretty much useless:
TD(P, f='logical_xor'),
),
@@ -525,15 +636,20 @@ defdict = {
Ufunc(2, 1, ReorderableNone,
docstrings.get('numpy.core.umath.maximum'),
'PyUFunc_SimpleUniformOperationTypeResolver',
- TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]),
- TD(O, f='npy_ObjectMax')
+ TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]),
+ TD(no_obj_bool, dispatch=[('loops_minmax', ints+'fdg')]),
+ TD(O, f='npy_ObjectMax'),
+ indexed=flts + ints,
),
'minimum':
Ufunc(2, 1, ReorderableNone,
docstrings.get('numpy.core.umath.minimum'),
'PyUFunc_SimpleUniformOperationTypeResolver',
- TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]),
- TD(O, f='npy_ObjectMin')
+ TD('?', cfunc_alias='logical_and',
+ dispatch=[('loops_logical', '?')]),
+ TD(no_obj_bool, dispatch=[('loops_minmax', ints+'fdg')]),
+ TD(O, f='npy_ObjectMin'),
+ indexed=flts + ints,
),
'clip':
Ufunc(3, 1, ReorderableNone,
@@ -546,15 +662,20 @@ defdict = {
Ufunc(2, 1, ReorderableNone,
docstrings.get('numpy.core.umath.fmax'),
'PyUFunc_SimpleUniformOperationTypeResolver',
- TD(noobj, dispatch=[('loops_minmax', 'fdg')]),
- TD(O, f='npy_ObjectMax')
+ TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]),
+ TD(no_obj_bool, dispatch=[('loops_minmax', 'fdg')]),
+ TD(O, f='npy_ObjectMax'),
+ indexed=flts + ints,
),
'fmin':
Ufunc(2, 1, ReorderableNone,
docstrings.get('numpy.core.umath.fmin'),
'PyUFunc_SimpleUniformOperationTypeResolver',
- TD(noobj, dispatch=[('loops_minmax', 'fdg')]),
- TD(O, f='npy_ObjectMin')
+ TD('?', cfunc_alias='logical_and',
+ dispatch=[('loops_logical', '?')]),
+ TD(no_obj_bool, dispatch=[('loops_minmax', 'fdg')]),
+ TD(O, f='npy_ObjectMin'),
+ indexed=flts + ints,
),
'logaddexp':
Ufunc(2, 1, MinusInfinity,
@@ -572,42 +693,49 @@ defdict = {
Ufunc(2, 1, AllOnes,
docstrings.get('numpy.core.umath.bitwise_and'),
None,
- TD(bints, simd=[('avx2', ints)]),
+ TD('?', cfunc_alias='logical_and',
+ dispatch=[('loops_logical', '?')]),
+ TD(ints, dispatch=[('loops_autovec', ints)]),
TD(O, f='PyNumber_And'),
),
'bitwise_or':
Ufunc(2, 1, Zero,
docstrings.get('numpy.core.umath.bitwise_or'),
None,
- TD(bints, simd=[('avx2', ints)]),
+ TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]),
+ TD(ints, dispatch=[('loops_autovec', ints)]),
TD(O, f='PyNumber_Or'),
),
'bitwise_xor':
Ufunc(2, 1, Zero,
docstrings.get('numpy.core.umath.bitwise_xor'),
None,
- TD(bints, simd=[('avx2', ints)]),
+ TD('?', cfunc_alias='not_equal',
+ dispatch=[('loops_comparison', '?')]),
+ TD(ints, dispatch=[('loops_autovec', ints)]),
TD(O, f='PyNumber_Xor'),
),
'invert':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.invert'),
None,
- TD(bints, simd=[('avx2', ints)]),
+ TD('?', cfunc_alias='logical_not',
+ dispatch=[('loops_logical', '?')]),
+ TD(ints, dispatch=[('loops_autovec', ints)]),
TD(O, f='PyNumber_Invert'),
),
'left_shift':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.left_shift'),
None,
- TD(ints, simd=[('avx2', ints)]),
+ TD(ints, dispatch=[('loops_autovec', ints)]),
TD(O, f='PyNumber_Lshift'),
),
'right_shift':
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.right_shift'),
None,
- TD(ints, simd=[('avx2', ints)]),
+ TD(ints, dispatch=[('loops_autovec', ints)]),
TD(O, f='PyNumber_Rshift'),
),
'heaviside':
@@ -692,18 +820,20 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.cos'),
None,
+ TD('e', dispatch=[('loops_umath_fp', 'e')]),
TD('f', dispatch=[('loops_trigonometric', 'f')]),
- TD('ed', dispatch=[('loops_umath_fp', 'ed')]),
- TD('fdg' + cmplx, f='cos'),
+ TD('d', dispatch=[('loops_trigonometric', 'd')]),
+ TD('g' + cmplx, f='cos'),
TD(P, f='cos'),
),
'sin':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.sin'),
None,
+ TD('e', dispatch=[('loops_umath_fp', 'e')]),
TD('f', dispatch=[('loops_trigonometric', 'f')]),
- TD('ed', dispatch=[('loops_umath_fp', 'ed')]),
- TD('fdg' + cmplx, f='sin'),
+ TD('d', dispatch=[('loops_trigonometric', 'd')]),
+ TD('g' + cmplx, f='sin'),
TD(P, f='sin'),
),
'tan':
@@ -861,8 +991,9 @@ defdict = {
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.arctan2'),
None,
+ TD('e', f='atan2', astype={'e': 'f'}),
TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
- TD(flts, f='atan2', astype={'e': 'f'}),
+ TD('g', f='atan2', astype={'e': 'f'}),
TD(P, f='arctan2'),
),
'remainder':
@@ -894,7 +1025,10 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.isnan'),
'PyUFunc_IsFiniteTypeResolver',
- TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
+ TD(noobj, out='?', dispatch=[
+ ('loops_unary_fp_le', inexactvec),
+ ('loops_autovec', bints),
+ ]),
),
'isnat':
Ufunc(1, 1, None,
@@ -906,19 +1040,25 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.isinf'),
'PyUFunc_IsFiniteTypeResolver',
- TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
+ TD(noobj, out='?', dispatch=[
+ ('loops_unary_fp_le', inexactvec),
+ ('loops_autovec', bints + 'mM'),
+ ]),
),
'isfinite':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.isfinite'),
'PyUFunc_IsFiniteTypeResolver',
- TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
+ TD(noobj, out='?', dispatch=[
+ ('loops_unary_fp_le', inexactvec),
+ ('loops_autovec', bints),
+ ]),
),
'signbit':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.signbit'),
None,
- TD(flts, simd=[('avx512_skx', 'fd')], out='?'),
+ TD(flts, out='?', dispatch=[('loops_unary_fp_le', inexactvec)]),
),
'copysign':
Ufunc(2, 1, None,
@@ -1055,7 +1195,10 @@ def make_arrays(funcdict):
if t.func_data is FullTypeDescr:
tname = english_upper(chartoname[t.type])
datalist.append('(void *)NULL')
- cfunc_fname = f"{tname}_{t.in_}_{t.out}_{cfunc_alias}"
+ if t.out == "?":
+ cfunc_fname = f"{tname}_{t.in_}_bool_{cfunc_alias}"
+ else:
+ cfunc_fname = f"{tname}_{t.in_}_{t.out}_{cfunc_alias}"
elif isinstance(t.func_data, FuncNameSuffix):
datalist.append('(void *)NULL')
tname = english_upper(chartoname[t.type])
@@ -1064,18 +1207,6 @@ def make_arrays(funcdict):
datalist.append('(void *)NULL')
tname = english_upper(chartoname[t.type])
cfunc_fname = f"{tname}_{cfunc_alias}"
- if t.simd is not None:
- for vt in t.simd:
- code2list.append(textwrap.dedent("""\
- #ifdef HAVE_ATTRIBUTE_TARGET_{ISA}
- if (NPY_CPU_HAVE({ISA})) {{
- {fname}_functions[{idx}] = {cname}_{isa};
- }}
- #endif
- """).format(
- ISA=vt.upper(), isa=vt,
- fname=name, cname=cfunc_fname, idx=k
- ))
else:
try:
thedict = arity_lookup[uf.nin, uf.nout]
@@ -1187,6 +1318,40 @@ def make_ufuncs(funcdict):
if uf.typereso is not None:
mlist.append(
r"((PyUFuncObject *)f)->type_resolver = &%s;" % uf.typereso)
+ for c in uf.indexed:
+ # Handle indexed loops by getting the underlying ArrayMethodObject
+ # from the list in f._loops and setting its field appropriately
+ fmt = textwrap.dedent("""
+ {{
+ PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum({typenum});
+ PyObject *info = get_info_no_cast((PyUFuncObject *)f,
+ dtype, {count});
+ if (info == NULL) {{
+ return -1;
+ }}
+ if (info == Py_None) {{
+ PyErr_SetString(PyExc_RuntimeError,
+ "cannot add indexed loop to ufunc "
+ "{name} with {typenum}");
+ return -1;
+ }}
+ if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {{
+ PyErr_SetString(PyExc_RuntimeError,
+ "Not a PyArrayMethodObject in ufunc "
+ "{name} with {typenum}");
+ }}
+ ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+ {funcname};
+ /* info is borrowed, no need to decref */
+ }}
+ """)
+ mlist.append(fmt.format(
+ typenum=f"NPY_{english_upper(chartoname[c])}",
+ count=uf.nin+uf.nout,
+ name=name,
+ funcname = f"{english_upper(chartoname[c])}_{name}_indexed",
+ ))
+
mlist.append(r"""PyDict_SetItemString(dictionary, "%s", f);""" % name)
mlist.append(r"""Py_DECREF(f);""")
code3list.append('\n'.join(mlist))
@@ -1209,8 +1374,14 @@ def make_code(funcdict, filename):
#include "loops.h"
#include "matmul.h"
#include "clip.h"
+ #include "dtypemeta.h"
#include "_umath_doc_generated.h"
+
%s
+ /* Returns a borrowed ref of the second value in the matching info tuple */
+ PyObject *
+ get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype,
+ int ndtypes);
static int
InitOperators(PyObject *dictionary) {
diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py
index 6b9080403..bb3b15806 100644
--- a/numpy/core/code_generators/numpy_api.py
+++ b/numpy/core/code_generators/numpy_api.py
@@ -18,17 +18,17 @@ import os
import importlib.util
-def get_StealRef():
+def get_annotations():
# Convoluted because we can't import from numpy.distutils
# (numpy is not yet built)
genapi_py = os.path.join(os.path.dirname(__file__), 'genapi.py')
spec = importlib.util.spec_from_file_location('conv_template', genapi_py)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
- return mod.StealRef
+ return mod.StealRef, mod.MinVersion
-StealRef = get_StealRef()
+StealRef, MinVersion = get_annotations()
#from code_generators.genapi import StealRef
# index, type
@@ -367,8 +367,8 @@ multiarray_funcs_api = {
'PyArray_ResolveWritebackIfCopy': (302,),
'PyArray_SetWritebackIfCopyBase': (303,),
# End 1.14 API
- 'PyDataMem_SetHandler': (304,),
- 'PyDataMem_GetHandler': (305,),
+ 'PyDataMem_SetHandler': (304, MinVersion("1.22")),
+ 'PyDataMem_GetHandler': (305, MinVersion("1.22")),
# End 1.22 API
}
@@ -422,7 +422,7 @@ ufunc_funcs_api = {
# End 1.7 API
'PyUFunc_RegisterLoopForDescr': (41,),
# End 1.8 API
- 'PyUFunc_FromFuncAndDataAndSignatureAndIdentity': (42,),
+ 'PyUFunc_FromFuncAndDataAndSignatureAndIdentity': (42, MinVersion("1.16")),
# End 1.16 API
}
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index efc43b01c..1695b4d27 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -667,7 +667,7 @@ add_newdoc('numpy.core.umath', 'bitwise_or',
--------
The number 13 has the binary representation ``00001101``. Likewise,
16 is represented by ``00010000``. The bit-wise OR of 13 and 16 is
- then ``000111011``, or 29:
+ then ``00011101``, or 29:
>>> np.bitwise_or(13, 16)
29
@@ -2021,7 +2021,7 @@ add_newdoc('numpy.core.umath', 'log',
has a branch cut `[-inf, 0]` and is continuous from above on it. `log`
handles the floating-point negative zero as an infinitesimal negative
number, conforming to the C99 standard.
-
+
In the cases where the input has a negative real part and a very small
negative complex part (approaching 0), the result is so close to `-pi`
that it evaluates to exactly `-pi`.
@@ -2457,7 +2457,7 @@ add_newdoc('numpy.core.umath', 'maximum',
"""
Element-wise maximum of array elements.
- Compare two arrays and returns a new array containing the element-wise
+ Compare two arrays and return a new array containing the element-wise
maxima. If one of the elements being compared is a NaN, then that
element is returned. If both elements are NaNs then the first is
returned. The latter distinction is important for complex NaNs, which
@@ -2516,7 +2516,7 @@ add_newdoc('numpy.core.umath', 'minimum',
"""
Element-wise minimum of array elements.
- Compare two arrays and returns a new array containing the element-wise
+ Compare two arrays and return a new array containing the element-wise
minima. If one of the elements being compared is a NaN, then that
element is returned. If both elements are NaNs then the first is
returned. The latter distinction is important for complex NaNs, which
@@ -2575,7 +2575,7 @@ add_newdoc('numpy.core.umath', 'fmax',
"""
Element-wise maximum of array elements.
- Compare two arrays and returns a new array containing the element-wise
+ Compare two arrays and return a new array containing the element-wise
maxima. If one of the elements being compared is a NaN, then the
non-nan element is returned. If both elements are NaNs then the first
is returned. The latter distinction is important for complex NaNs,
@@ -2633,7 +2633,7 @@ add_newdoc('numpy.core.umath', 'fmin',
"""
Element-wise minimum of array elements.
- Compare two arrays and returns a new array containing the element-wise
+ Compare two arrays and return a new array containing the element-wise
minima. If one of the elements being compared is a NaN, then the
non-nan element is returned. If both elements are NaNs then the first
is returned. The latter distinction is important for complex NaNs,
@@ -3833,7 +3833,7 @@ add_newdoc('numpy.core.umath', 'sqrt',
--------
emath.sqrt
A version which returns complex numbers when given negative reals.
- Note: 0.0 and -0.0 are handled differently for complex inputs.
+ Note that 0.0 and -0.0 are handled differently for complex inputs.
Notes
-----
diff --git a/numpy/core/config.h.in b/numpy/core/config.h.in
index a47968a7d..e3b559753 100644
--- a/numpy/core/config.h.in
+++ b/numpy/core/config.h.in
@@ -29,28 +29,12 @@
#mesondefine HAVE___BUILTIN_BSWAP64
#mesondefine HAVE___BUILTIN_EXPECT
#mesondefine HAVE___BUILTIN_MUL_OVERFLOW
-#mesondefine HAVE__M_FROM_INT64
-#mesondefine HAVE__MM_LOAD_PS
-#mesondefine HAVE__MM_PREFETCH
-#mesondefine HAVE__MM_LOAD_PD
#mesondefine HAVE___BUILTIN_PREFETCH
-#mesondefine HAVE_LINK_AVX
-#mesondefine HAVE_LINK_AVX2
-#mesondefine HAVE_LINK_AVX512F
-#mesondefine HAVE_LINK_AVX512_SKX
-#mesondefine HAVE_XGETBV
#mesondefine HAVE_ATTRIBUTE_OPTIMIZE_UNROLL_LOOPS
#mesondefine HAVE_ATTRIBUTE_OPTIMIZE_OPT_3
#mesondefine HAVE_ATTRIBUTE_OPTIMIZE_OPT_2
#mesondefine HAVE_ATTRIBUTE_NONNULL
-#mesondefine HAVE_ATTRIBUTE_TARGET_AVX
-#mesondefine HAVE_ATTRIBUTE_TARGET_AVX2
-#mesondefine HAVE_ATTRIBUTE_TARGET_AVX512F
-#mesondefine HAVE_ATTRIBUTE_TARGET_AVX512_SKX
-#mesondefine HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-#mesondefine HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
-#mesondefine HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS
/* C99 complex support and complex.h are not universal */
#mesondefine HAVE_COMPLEX_H
@@ -90,6 +74,25 @@
#mesondefine HAVE_CLOGL
#mesondefine HAVE_CPOWL
#mesondefine HAVE_CSQRTL
+/* FreeBSD */
+#mesondefine HAVE_CSINF
+#mesondefine HAVE_CSINHF
+#mesondefine HAVE_CCOSF
+#mesondefine HAVE_CCOSHF
+#mesondefine HAVE_CTANF
+#mesondefine HAVE_CTANHF
+#mesondefine HAVE_CSIN
+#mesondefine HAVE_CSINH
+#mesondefine HAVE_CCOS
+#mesondefine HAVE_CCOSH
+#mesondefine HAVE_CTAN
+#mesondefine HAVE_CTANH
+#mesondefine HAVE_CSINL
+#mesondefine HAVE_CSINHL
+#mesondefine HAVE_CCOSL
+#mesondefine HAVE_CCOSHL
+#mesondefine HAVE_CTANL
+#mesondefine HAVE_CTANHL
#mesondefine NPY_CAN_LINK_SVML
#mesondefine NPY_RELAXED_STRIDES_DEBUG
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index d312506ff..11c5a30bf 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -6,7 +6,7 @@ operations and methods.
The `chararray` class exists for backwards compatibility with
Numarray, it is not recommended for new development. Starting from numpy
1.4, if one needs arrays of strings, it is recommended to use arrays of
- `dtype` `object_`, `string_` or `unicode_`, and use the free functions
+ `dtype` `object_`, `bytes_` or `str_`, and use the free functions
in the `numpy.char` module for fast vectorized string operations.
Some methods will only be available if the corresponding string method is
@@ -19,7 +19,7 @@ import functools
from .._utils import set_module
from .numerictypes import (
- string_, unicode_, integer, int_, object_, bool_, character)
+ bytes_, str_, integer, int_, object_, bool_, character)
from .numeric import ndarray, compare_chararrays
from .numeric import array as narray
from numpy.core.multiarray import _vec_string
@@ -46,26 +46,29 @@ array_function_dispatch = functools.partial(
overrides.array_function_dispatch, module='numpy.char')
-def _use_unicode(*args):
- """
- Helper function for determining the output type of some string
- operations.
+def _is_unicode(arr):
+ """Returns True if arr is a string or a string array with a dtype that
+ represents a unicode string, otherwise returns False.
- For an operation on two ndarrays, if at least one is unicode, the
- result should be unicode.
"""
- for x in args:
- if (isinstance(x, str) or
- issubclass(numpy.asarray(x).dtype.type, unicode_)):
- return unicode_
- return string_
+ if (isinstance(arr, str) or
+ issubclass(numpy.asarray(arr).dtype.type, str)):
+ return True
+ return False
+
-def _to_string_or_unicode_array(result):
+def _to_bytes_or_str_array(result, output_dtype_like=None):
"""
- Helper function to cast a result back into a string or unicode array
- if an object array must be used as an intermediary.
+ Helper function to cast a result back into an array
+ with the appropriate dtype if an object array must be used
+ as an intermediary.
"""
- return numpy.asarray(result.tolist())
+ ret = numpy.asarray(result.tolist())
+ dtype = getattr(output_dtype_like, 'dtype', None)
+ if dtype is not None:
+ return ret.astype(type(dtype)(_get_num_chars(ret)), copy=False)
+ return ret
+
def _clean_args(*args):
"""
@@ -89,7 +92,7 @@ def _get_num_chars(a):
a string or unicode array. This is to abstract out the fact that
for a unicode array this is itemsize / 4.
"""
- if issubclass(a.dtype.type, unicode_):
+ if issubclass(a.dtype.type, str_):
return a.itemsize // 4
return a.itemsize
@@ -275,7 +278,7 @@ def str_len(a):
See Also
--------
- builtins.len
+ len
Examples
--------
@@ -312,16 +315,26 @@ def add(x1, x2):
Returns
-------
add : ndarray
- Output array of `string_` or `unicode_`, depending on input types
+ Output array of `bytes_` or `str_`, depending on input types
of the same shape as `x1` and `x2`.
"""
arr1 = numpy.asarray(x1)
arr2 = numpy.asarray(x2)
out_size = _get_num_chars(arr1) + _get_num_chars(arr2)
- dtype = _use_unicode(arr1, arr2)
- return _vec_string(arr1, (dtype, out_size), '__add__', (arr2,))
+ if type(arr1.dtype) != type(arr2.dtype):
+ # Enforce this for now. The solution to it will be to implement add
+ # as a ufunc. It never worked right on Python 3: bytes + unicode gave
+ # nonsense, unicode + bytes errored, and unicode + object used the
+ # object dtype itemsize as num chars (worked on short strings).
+ # bytes + void worked but promoting void->bytes is dubious also.
+ raise TypeError(
+ "np.char.add() requires both arrays of the same dtype kind, but "
+ f"got dtypes: '{arr1.dtype}' and '{arr2.dtype}' (the few cases "
+ "where this used to work often lead to incorrect results).")
+
+ return _vec_string(arr1, type(arr1.dtype)(out_size), '__add__', (arr2,))
def _multiply_dispatcher(a, i):
return (a,)
@@ -371,7 +384,7 @@ def multiply(a, i):
raise ValueError("Can only multiply by integers")
out_size = _get_num_chars(a_arr) * max(int(i_arr.max()), 0)
return _vec_string(
- a_arr, (a_arr.dtype.type, out_size), '__mul__', (i_arr,))
+ a_arr, type(a_arr.dtype)(out_size), '__mul__', (i_arr,))
def _mod_dispatcher(a, values):
@@ -402,8 +415,8 @@ def mod(a, values):
str.__mod__
"""
- return _to_string_or_unicode_array(
- _vec_string(a, object_, '__mod__', (values,)))
+ return _to_bytes_or_str_array(
+ _vec_string(a, object_, '__mod__', (values,)), a)
@array_function_dispatch(_unary_op_dispatcher)
@@ -496,10 +509,10 @@ def center(a, width, fillchar=' '):
a_arr = numpy.asarray(a)
width_arr = numpy.asarray(width)
size = int(numpy.max(width_arr.flat))
- if numpy.issubdtype(a_arr.dtype, numpy.string_):
+ if numpy.issubdtype(a_arr.dtype, numpy.bytes_):
fillchar = asbytes(fillchar)
return _vec_string(
- a_arr, (a_arr.dtype.type, size), 'center', (width_arr, fillchar))
+ a_arr, type(a_arr.dtype)(size), 'center', (width_arr, fillchar))
def _count_dispatcher(a, sub, start=None, end=None):
@@ -598,7 +611,7 @@ def decode(a, encoding=None, errors=None):
array(['aAaAaA', ' aA ', 'abBABba'], dtype='<U7')
"""
- return _to_string_or_unicode_array(
+ return _to_bytes_or_str_array(
_vec_string(a, object_, 'decode', _clean_args(encoding, errors)))
@@ -634,7 +647,7 @@ def encode(a, encoding=None, errors=None):
The type of the result will depend on the encoding specified.
"""
- return _to_string_or_unicode_array(
+ return _to_bytes_or_str_array(
_vec_string(a, object_, 'encode', _clean_args(encoding, errors)))
@@ -722,8 +735,8 @@ def expandtabs(a, tabsize=8):
str.expandtabs
"""
- return _to_string_or_unicode_array(
- _vec_string(a, object_, 'expandtabs', (tabsize,)))
+ return _to_bytes_or_str_array(
+ _vec_string(a, object_, 'expandtabs', (tabsize,)), a)
@array_function_dispatch(_count_dispatcher)
@@ -1042,8 +1055,8 @@ def join(sep, seq):
array(['g-h-c', 'o.s.d'], dtype='<U5')
"""
- return _to_string_or_unicode_array(
- _vec_string(sep, object_, 'join', (seq,)))
+ return _to_bytes_or_str_array(
+ _vec_string(sep, object_, 'join', (seq,)), seq)
@@ -1081,10 +1094,10 @@ def ljust(a, width, fillchar=' '):
a_arr = numpy.asarray(a)
width_arr = numpy.asarray(width)
size = int(numpy.max(width_arr.flat))
- if numpy.issubdtype(a_arr.dtype, numpy.string_):
+ if numpy.issubdtype(a_arr.dtype, numpy.bytes_):
fillchar = asbytes(fillchar)
return _vec_string(
- a_arr, (a_arr.dtype.type, size), 'ljust', (width_arr, fillchar))
+ a_arr, type(a_arr.dtype)(size), 'ljust', (width_arr, fillchar))
@array_function_dispatch(_unary_op_dispatcher)
@@ -1217,8 +1230,8 @@ def partition(a, sep):
str.partition
"""
- return _to_string_or_unicode_array(
- _vec_string(a, object_, 'partition', (sep,)))
+ return _to_bytes_or_str_array(
+ _vec_string(a, object_, 'partition', (sep,)), a)
def _replace_dispatcher(a, old, new, count=None):
@@ -1262,9 +1275,8 @@ def replace(a, old, new, count=None):
>>> np.char.replace(a, 'is', 'was')
array(['The dwash was fresh', 'Thwas was it'], dtype='<U19')
"""
- return _to_string_or_unicode_array(
- _vec_string(
- a, object_, 'replace', [old, new] + _clean_args(count)))
+ return _to_bytes_or_str_array(
+ _vec_string(a, object_, 'replace', [old, new] + _clean_args(count)), a)
@array_function_dispatch(_count_dispatcher)
@@ -1360,10 +1372,10 @@ def rjust(a, width, fillchar=' '):
a_arr = numpy.asarray(a)
width_arr = numpy.asarray(width)
size = int(numpy.max(width_arr.flat))
- if numpy.issubdtype(a_arr.dtype, numpy.string_):
+ if numpy.issubdtype(a_arr.dtype, numpy.bytes_):
fillchar = asbytes(fillchar)
return _vec_string(
- a_arr, (a_arr.dtype.type, size), 'rjust', (width_arr, fillchar))
+ a_arr, type(a_arr.dtype)(size), 'rjust', (width_arr, fillchar))
@array_function_dispatch(_partition_dispatcher)
@@ -1398,8 +1410,8 @@ def rpartition(a, sep):
str.rpartition
"""
- return _to_string_or_unicode_array(
- _vec_string(a, object_, 'rpartition', (sep,)))
+ return _to_bytes_or_str_array(
+ _vec_string(a, object_, 'rpartition', (sep,)), a)
def _split_dispatcher(a, sep=None, maxsplit=None):
@@ -1754,7 +1766,7 @@ def translate(a, table, deletechars=None):
"""
a_arr = numpy.asarray(a)
- if issubclass(a_arr.dtype.type, unicode_):
+ if issubclass(a_arr.dtype.type, str_):
return _vec_string(
a_arr, a_arr.dtype, 'translate', (table,))
else:
@@ -1829,7 +1841,7 @@ def zfill(a, width):
width_arr = numpy.asarray(width)
size = int(numpy.max(width_arr.flat))
return _vec_string(
- a_arr, (a_arr.dtype.type, size), 'zfill', (width_arr,))
+ a_arr, type(a_arr.dtype)(size), 'zfill', (width_arr,))
@array_function_dispatch(_unary_op_dispatcher)
@@ -1838,7 +1850,7 @@ def isnumeric(a):
For each element, return True if there are only numeric
characters in the element.
- Calls `unicode.isnumeric` element-wise.
+ Calls `str.isnumeric` element-wise.
Numeric characters include digit characters, and all characters
that have the Unicode numeric value property, e.g. ``U+2155,
@@ -1856,7 +1868,7 @@ def isnumeric(a):
See Also
--------
- unicode.isnumeric
+ str.isnumeric
Examples
--------
@@ -1864,7 +1876,7 @@ def isnumeric(a):
array([ True, False, False, False, False])
"""
- if _use_unicode(a) != unicode_:
+ if not _is_unicode(a):
raise TypeError("isnumeric is only available for Unicode strings and arrays")
return _vec_string(a, bool_, 'isnumeric')
@@ -1875,7 +1887,7 @@ def isdecimal(a):
For each element, return True if there are only decimal
characters in the element.
- Calls `unicode.isdecimal` element-wise.
+ Calls `str.isdecimal` element-wise.
Decimal characters include digit characters, and all characters
that can be used to form decimal-radix numbers,
@@ -1893,7 +1905,7 @@ def isdecimal(a):
See Also
--------
- unicode.isdecimal
+ str.isdecimal
Examples
--------
@@ -1901,8 +1913,9 @@ def isdecimal(a):
array([ True, False, False, False])
"""
- if _use_unicode(a) != unicode_:
- raise TypeError("isnumeric is only available for Unicode strings and arrays")
+ if not _is_unicode(a):
+ raise TypeError(
+ "isdecimal is only available for Unicode strings and arrays")
return _vec_string(a, bool_, 'isdecimal')
@@ -1918,7 +1931,7 @@ class chararray(ndarray):
The `chararray` class exists for backwards compatibility with
Numarray, it is not recommended for new development. Starting from numpy
1.4, if one needs arrays of strings, it is recommended to use arrays of
- `dtype` `object_`, `string_` or `unicode_`, and use the free functions
+ `dtype` `object_`, `bytes_` or `str_`, and use the free functions
in the `numpy.char` module for fast vectorized string operations.
Versus a regular NumPy array of type `str` or `unicode`, this
@@ -2052,9 +2065,9 @@ class chararray(ndarray):
global _globalvar
if unicode:
- dtype = unicode_
+ dtype = str_
else:
- dtype = string_
+ dtype = bytes_
# force itemsize to be a Python int, since using NumPy integer
# types results in itemsize.itemsize being used as the size of
@@ -2178,7 +2191,7 @@ class chararray(ndarray):
def __radd__(self, other):
"""
Return (other + self), that is string concatenation,
- element-wise for a pair of array_likes of `string_` or `unicode_`.
+ element-wise for a pair of array_likes of `bytes_` or `str_`.
See Also
--------
@@ -2211,8 +2224,8 @@ class chararray(ndarray):
def __mod__(self, i):
"""
Return (self % i), that is pre-Python 2.6 string formatting
- (interpolation), element-wise for a pair of array_likes of `string_`
- or `unicode_`.
+ (interpolation), element-wise for a pair of array_likes of `bytes_`
+ or `str_`.
See Also
--------
@@ -2722,7 +2735,7 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None):
.. note::
This class is provided for numarray backward-compatibility.
New code (not concerned with numarray compatibility) should use
- arrays of type `string_` or `unicode_` and use the free functions
+ arrays of type `bytes_` or `str_` and use the free functions
in :mod:`numpy.char <numpy.core.defchararray>` for fast
vectorized string operations instead.
@@ -2805,26 +2818,26 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None):
# itemsize is in 8-bit chars, so for Unicode, we need
# to divide by the size of a single Unicode character,
# which for NumPy is always 4
- if issubclass(obj.dtype.type, unicode_):
+ if issubclass(obj.dtype.type, str_):
itemsize //= 4
if unicode is None:
- if issubclass(obj.dtype.type, unicode_):
+ if issubclass(obj.dtype.type, str_):
unicode = True
else:
unicode = False
if unicode:
- dtype = unicode_
+ dtype = str_
else:
- dtype = string_
+ dtype = bytes_
if order is not None:
obj = numpy.asarray(obj, order=order)
if (copy or
(itemsize != obj.itemsize) or
- (not unicode and isinstance(obj, unicode_)) or
- (unicode and isinstance(obj, string_))):
+ (not unicode and isinstance(obj, str_)) or
+ (unicode and isinstance(obj, bytes_))):
obj = obj.astype((dtype, int(itemsize)))
return obj
@@ -2837,9 +2850,9 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None):
# Fall through to the default case
if unicode:
- dtype = unicode_
+ dtype = str_
else:
- dtype = string_
+ dtype = bytes_
if itemsize is None:
val = narray(obj, dtype=dtype, order=order, subok=True)
diff --git a/numpy/core/einsumfunc.py b/numpy/core/einsumfunc.py
index d6c5885b8..01966f0fe 100644
--- a/numpy/core/einsumfunc.py
+++ b/numpy/core/einsumfunc.py
@@ -728,7 +728,7 @@ def einsum_path(*operands, optimize='greedy', einsum_call=False):
* if False no optimization is taken
* if True defaults to the 'greedy' algorithm
* 'optimal' An algorithm that combinatorially explores all possible
- ways of contracting the listed tensors and choosest the least costly
+ ways of contracting the listed tensors and chooses the least costly
path. Scales exponentially with the number of terms in the
contraction.
* 'greedy' An algorithm that chooses the best pair contraction
diff --git a/numpy/core/einsumfunc.pyi b/numpy/core/einsumfunc.pyi
index c811a5783..ad483bb90 100644
--- a/numpy/core/einsumfunc.pyi
+++ b/numpy/core/einsumfunc.pyi
@@ -5,10 +5,6 @@ from numpy import (
ndarray,
dtype,
bool_,
- unsignedinteger,
- signedinteger,
- floating,
- complexfloating,
number,
_OrderKACF,
)
@@ -18,12 +14,14 @@ from numpy._typing import (
_ArrayLikeInt_co,
_ArrayLikeFloat_co,
_ArrayLikeComplex_co,
+ _ArrayLikeObject_co,
_DTypeLikeBool,
_DTypeLikeUInt,
_DTypeLikeInt,
_DTypeLikeFloat,
_DTypeLikeComplex,
_DTypeLikeComplex_co,
+ _DTypeLikeObject,
)
_ArrayType = TypeVar(
@@ -132,6 +130,51 @@ def einsum(
optimize: _OptimizeKind = ...,
) -> _ArrayType: ...
+@overload
+def einsum(
+ subscripts: str | _ArrayLikeInt_co,
+ /,
+ *operands: _ArrayLikeObject_co,
+ out: None = ...,
+ dtype: None | _DTypeLikeObject = ...,
+ order: _OrderKACF = ...,
+ casting: _CastingSafe = ...,
+ optimize: _OptimizeKind = ...,
+) -> Any: ...
+@overload
+def einsum(
+ subscripts: str | _ArrayLikeInt_co,
+ /,
+ *operands: Any,
+ casting: _CastingUnsafe,
+ dtype: None | _DTypeLikeObject = ...,
+ out: None = ...,
+ order: _OrderKACF = ...,
+ optimize: _OptimizeKind = ...,
+) -> Any: ...
+@overload
+def einsum(
+ subscripts: str | _ArrayLikeInt_co,
+ /,
+ *operands: _ArrayLikeObject_co,
+ out: _ArrayType,
+ dtype: None | _DTypeLikeObject = ...,
+ order: _OrderKACF = ...,
+ casting: _CastingSafe = ...,
+ optimize: _OptimizeKind = ...,
+) -> _ArrayType: ...
+@overload
+def einsum(
+ subscripts: str | _ArrayLikeInt_co,
+ /,
+ *operands: Any,
+ out: _ArrayType,
+ casting: _CastingUnsafe,
+ dtype: None | _DTypeLikeObject = ...,
+ order: _OrderKACF = ...,
+ optimize: _OptimizeKind = ...,
+) -> _ArrayType: ...
+
# NOTE: `einsum_call` is a hidden kwarg unavailable for public use.
# It is therefore excluded from the signatures below.
# NOTE: In practice the list consists of a `str` (first element)
@@ -139,6 +182,6 @@ def einsum(
def einsum_path(
subscripts: str | _ArrayLikeInt_co,
/,
- *operands: _ArrayLikeComplex_co,
+ *operands: _ArrayLikeComplex_co | _DTypeLikeObject,
optimize: _OptimizeKind = ...,
) -> tuple[list[Any], str]: ...
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index ca9d55cb7..69cabb33e 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -6,6 +6,7 @@ import types
import warnings
import numpy as np
+from .._utils import set_module
from . import multiarray as mu
from . import overrides
from . import umath as um
@@ -20,8 +21,9 @@ __all__ = [
'all', 'alltrue', 'amax', 'amin', 'any', 'argmax',
'argmin', 'argpartition', 'argsort', 'around', 'choose', 'clip',
'compress', 'cumprod', 'cumproduct', 'cumsum', 'diagonal', 'mean',
+ 'max', 'min',
'ndim', 'nonzero', 'partition', 'prod', 'product', 'ptp', 'put',
- 'ravel', 'repeat', 'reshape', 'resize', 'round_',
+ 'ravel', 'repeat', 'reshape', 'resize', 'round', 'round_',
'searchsorted', 'shape', 'size', 'sometrue', 'sort', 'squeeze',
'std', 'sum', 'swapaxes', 'take', 'trace', 'transpose', 'var',
]
@@ -236,24 +238,9 @@ def reshape(a, newshape, order='C'):
Notes
-----
- It is not always possible to change the shape of an array without
- copying the data. If you want an error to be raised when the data is copied,
- you should assign the new shape to the shape attribute of the array::
-
- >>> a = np.zeros((10, 2))
-
- # A transpose makes the array non-contiguous
- >>> b = a.T
-
- # Taking a view makes it possible to modify the shape without modifying
- # the initial object.
- >>> c = b.view()
- >>> c.shape = (20)
- Traceback (most recent call last):
- ...
- AttributeError: Incompatible shape for in-place modification. Use
- `.reshape()` to make a copy with the desired shape.
-
+ It is not always possible to change the shape of an array without copying
+ the data.
+
The `order` keyword gives the index ordering both for *fetching* the values
from `a`, and then *placing* the values into the output array.
For example, let's say you have an array:
@@ -436,7 +423,7 @@ def _repeat_dispatcher(a, repeats, axis=None):
@array_function_dispatch(_repeat_dispatcher)
def repeat(a, repeats, axis=None):
"""
- Repeat elements of an array.
+ Repeat each element of an array after itself
Parameters
----------
@@ -1814,9 +1801,10 @@ def ravel(a, order='C'):
Returns
-------
y : array_like
- y is an array of the same subtype as `a`, with shape ``(a.size,)``.
- Note that matrices are special cased for backward compatibility, if `a`
- is a matrix, then y is a 1-D ndarray.
+ y is a contiguous 1-D array of the same subtype as `a`,
+ with shape ``(a.size,)``.
+ Note that matrices are special cased for backward compatibility,
+ if `a` is a matrix, then y is a 1-D ndarray.
See Also
--------
@@ -1835,7 +1823,8 @@ def ravel(a, order='C'):
column-major, Fortran-style index ordering.
When a view is desired in as many cases as possible, ``arr.reshape(-1)``
- may be preferable.
+ may be preferable. However, ``ravel`` supports ``K`` in the optional
+ ``order`` argument while ``reshape`` does not.
Examples
--------
@@ -2313,7 +2302,7 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue,
warnings.warn(
"Calling np.sum(generator) is deprecated, and in the future will give a different result. "
"Use np.sum(np.fromiter(generator)) or the python sum builtin instead.",
- DeprecationWarning, stacklevel=3)
+ DeprecationWarning, stacklevel=2)
res = _sum_(a)
if out is not None:
@@ -2695,13 +2684,14 @@ def ptp(a, axis=None, out=None, keepdims=np._NoValue):
return _methods._ptp(a, axis=axis, out=out, **kwargs)
-def _amax_dispatcher(a, axis=None, out=None, keepdims=None, initial=None,
- where=None):
+def _max_dispatcher(a, axis=None, out=None, keepdims=None, initial=None,
+ where=None):
return (a, out)
-@array_function_dispatch(_amax_dispatcher)
-def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
+@array_function_dispatch(_max_dispatcher)
+@set_module('numpy')
+def max(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
where=np._NoValue):
"""
Return the maximum of an array or maximum along an axis.
@@ -2729,7 +2719,7 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
the result will broadcast correctly against the input array.
If the default value is passed, then `keepdims` will not be
- passed through to the `amax` method of sub-classes of
+ passed through to the ``max`` method of sub-classes of
`ndarray`, however any non-default value will be. If the
sub-class' method does not implement `keepdims` any
exceptions will be raised.
@@ -2748,7 +2738,7 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
Returns
-------
- amax : ndarray or scalar
+ max : ndarray or scalar
Maximum of `a`. If `axis` is None, the result is a scalar value.
If `axis` is an int, the result is an array of dimension
``a.ndim - 1``. If `axis` is a tuple, the result is an array of
@@ -2775,9 +2765,9 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
corresponding max value will be NaN as well. To ignore NaN values
(MATLAB behavior), please use nanmax.
- Don't use `amax` for element-wise comparison of 2 arrays; when
+ Don't use `~numpy.max` for element-wise comparison of 2 arrays; when
``a.shape[0]`` is 2, ``maximum(a[0], a[1])`` is faster than
- ``amax(a, axis=0)``.
+ ``max(a, axis=0)``.
Examples
--------
@@ -2785,19 +2775,19 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
>>> a
array([[0, 1],
[2, 3]])
- >>> np.amax(a) # Maximum of the flattened array
+ >>> np.max(a) # Maximum of the flattened array
3
- >>> np.amax(a, axis=0) # Maxima along the first axis
+ >>> np.max(a, axis=0) # Maxima along the first axis
array([2, 3])
- >>> np.amax(a, axis=1) # Maxima along the second axis
+ >>> np.max(a, axis=1) # Maxima along the second axis
array([1, 3])
- >>> np.amax(a, where=[False, True], initial=-1, axis=0)
+ >>> np.max(a, where=[False, True], initial=-1, axis=0)
array([-1, 3])
>>> b = np.arange(5, dtype=float)
>>> b[2] = np.NaN
- >>> np.amax(b)
+ >>> np.max(b)
nan
- >>> np.amax(b, where=~np.isnan(b), initial=-1)
+ >>> np.max(b, where=~np.isnan(b), initial=-1)
4.0
>>> np.nanmax(b)
4.0
@@ -2805,14 +2795,14 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
You can use an initial value to compute the maximum of an empty slice, or
to initialize it to a different value:
- >>> np.amax([[-50], [10]], axis=-1, initial=0)
+ >>> np.max([[-50], [10]], axis=-1, initial=0)
array([ 0, 10])
Notice that the initial value is used as one of the elements for which the
maximum is determined, unlike for the default argument Python's max
function, which is only used for empty iterables.
- >>> np.amax([5], initial=6)
+ >>> np.max([5], initial=6)
6
>>> max([5], default=6)
5
@@ -2821,14 +2811,31 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
keepdims=keepdims, initial=initial, where=where)
-def _amin_dispatcher(a, axis=None, out=None, keepdims=None, initial=None,
- where=None):
+@array_function_dispatch(_max_dispatcher)
+def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
+ where=np._NoValue):
+ """
+ Return the maximum of an array or maximum along an axis.
+
+ `amax` is an alias of `~numpy.max`.
+
+ See Also
+ --------
+ max : equivalent function; `amax` is an alias of `max`
+ ndarray.max : equivalent method
+ """
+ return _wrapreduction(a, np.maximum, 'max', axis, None, out,
+ keepdims=keepdims, initial=initial, where=where)
+
+
+def _min_dispatcher(a, axis=None, out=None, keepdims=None, initial=None,
+ where=None):
return (a, out)
-@array_function_dispatch(_amin_dispatcher)
-def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
- where=np._NoValue):
+@array_function_dispatch(_min_dispatcher)
+def min(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
+ where=np._NoValue):
"""
Return the minimum of an array or minimum along an axis.
@@ -2855,7 +2862,7 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
the result will broadcast correctly against the input array.
If the default value is passed, then `keepdims` will not be
- passed through to the `amin` method of sub-classes of
+ passed through to the ``min`` method of sub-classes of
`ndarray`, however any non-default value will be. If the
sub-class' method does not implement `keepdims` any
exceptions will be raised.
@@ -2874,7 +2881,7 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
Returns
-------
- amin : ndarray or scalar
+ min : ndarray or scalar
Minimum of `a`. If `axis` is None, the result is a scalar value.
If `axis` is an int, the result is an array of dimension
``a.ndim - 1``. If `axis` is a tuple, the result is an array of
@@ -2901,9 +2908,9 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
corresponding min value will be NaN as well. To ignore NaN values
(MATLAB behavior), please use nanmin.
- Don't use `amin` for element-wise comparison of 2 arrays; when
+ Don't use `~numpy.min` for element-wise comparison of 2 arrays; when
``a.shape[0]`` is 2, ``minimum(a[0], a[1])`` is faster than
- ``amin(a, axis=0)``.
+ ``min(a, axis=0)``.
Examples
--------
@@ -2911,25 +2918,25 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
>>> a
array([[0, 1],
[2, 3]])
- >>> np.amin(a) # Minimum of the flattened array
+ >>> np.min(a) # Minimum of the flattened array
0
- >>> np.amin(a, axis=0) # Minima along the first axis
+ >>> np.min(a, axis=0) # Minima along the first axis
array([0, 1])
- >>> np.amin(a, axis=1) # Minima along the second axis
+ >>> np.min(a, axis=1) # Minima along the second axis
array([0, 2])
- >>> np.amin(a, where=[False, True], initial=10, axis=0)
+ >>> np.min(a, where=[False, True], initial=10, axis=0)
array([10, 1])
>>> b = np.arange(5, dtype=float)
>>> b[2] = np.NaN
- >>> np.amin(b)
+ >>> np.min(b)
nan
- >>> np.amin(b, where=~np.isnan(b), initial=10)
+ >>> np.min(b, where=~np.isnan(b), initial=10)
0.0
>>> np.nanmin(b)
0.0
- >>> np.amin([[-50], [10]], axis=-1, initial=0)
+ >>> np.min([[-50], [10]], axis=-1, initial=0)
array([-50, 0])
Notice that the initial value is used as one of the elements for which the
@@ -2938,7 +2945,7 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
Notice that this isn't the same as Python's ``default`` argument.
- >>> np.amin([6], initial=5)
+ >>> np.min([6], initial=5)
5
>>> min([6], default=5)
6
@@ -2947,6 +2954,23 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
keepdims=keepdims, initial=initial, where=where)
+@array_function_dispatch(_min_dispatcher)
+def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
+ where=np._NoValue):
+ """
+ Return the minimum of an array or minimum along an axis.
+
+ `amin` is an alias of `~numpy.min`.
+
+ See Also
+ --------
+ min : equivalent function; `amin` is an alias of `min`
+ ndarray.min : equivalent method
+ """
+ return _wrapreduction(a, np.minimum, 'min', axis, None, out,
+ keepdims=keepdims, initial=initial, where=where)
+
+
def _prod_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None,
initial=None, where=None):
return (a, out)
@@ -3238,12 +3262,12 @@ def size(a, axis=None):
return asarray(a).shape[axis]
-def _around_dispatcher(a, decimals=None, out=None):
+def _round_dispatcher(a, decimals=None, out=None):
return (a, out)
-@array_function_dispatch(_around_dispatcher)
-def around(a, decimals=0, out=None):
+@array_function_dispatch(_round_dispatcher)
+def round(a, decimals=0, out=None):
"""
Evenly round to the given number of decimals.
@@ -3274,18 +3298,17 @@ def around(a, decimals=0, out=None):
See Also
--------
ndarray.round : equivalent method
+ around : an alias for this function
ceil, fix, floor, rint, trunc
Notes
-----
- `~numpy.round` is often used as an alias for `~numpy.around`.
-
For values exactly halfway between rounded decimal values, NumPy
rounds to the nearest even value. Thus 1.5 and 2.5 round to 2.0,
-0.5 and 0.5 round to 0.0, etc.
- ``np.around`` uses a fast but sometimes inexact algorithm to round
+ ``np.round`` uses a fast but sometimes inexact algorithm to round
floating-point datatypes. For positive `decimals` it is equivalent to
``np.true_divide(np.rint(a * 10**decimals), 10**decimals)``, which has
error due to the inexact representation of decimal fractions in the IEEE
@@ -3322,21 +3345,38 @@ def around(a, decimals=0, out=None):
Examples
--------
- >>> np.around([0.37, 1.64])
+ >>> np.round([0.37, 1.64])
array([0., 2.])
- >>> np.around([0.37, 1.64], decimals=1)
+ >>> np.round([0.37, 1.64], decimals=1)
array([0.4, 1.6])
- >>> np.around([.5, 1.5, 2.5, 3.5, 4.5]) # rounds to nearest even value
+ >>> np.round([.5, 1.5, 2.5, 3.5, 4.5]) # rounds to nearest even value
array([0., 2., 2., 4., 4.])
- >>> np.around([1,2,3,11], decimals=1) # ndarray of ints is returned
+ >>> np.round([1,2,3,11], decimals=1) # ndarray of ints is returned
array([ 1, 2, 3, 11])
- >>> np.around([1,2,3,11], decimals=-1)
+ >>> np.round([1,2,3,11], decimals=-1)
array([ 0, 0, 0, 10])
"""
return _wrapfunc(a, 'round', decimals=decimals, out=out)
+@array_function_dispatch(_round_dispatcher)
+def around(a, decimals=0, out=None):
+ """
+ Round an array to the given number of decimals.
+
+ `around` is an alias of `~numpy.round`.
+
+ See Also
+ --------
+ ndarray.round : equivalent method
+ round : alias for this function
+ ceil, fix, floor, rint, trunc
+
+ """
+ return _wrapfunc(a, 'round', decimals=decimals, out=out)
+
+
def _mean_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None, *,
where=None):
return (a, where, out)
@@ -3748,14 +3788,31 @@ def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue, *,
**kwargs)
-# Aliases of other functions. These have their own definitions only so that
-# they can have unique docstrings.
+# Aliases of other functions. Provided unique docstrings
+# are for reference purposes only. Wherever possible,
+# avoid using them.
+
+
+def _round__dispatcher(a, decimals=None, out=None):
+ # 2023-02-28, 1.25.0
+ warnings.warn("`round_` is deprecated as of NumPy 1.25.0, and will be "
+ "removed in NumPy 2.0. Please use `round` instead.",
+ DeprecationWarning, stacklevel=3)
+ return (a, out)
+
-@array_function_dispatch(_around_dispatcher)
+@array_function_dispatch(_round__dispatcher)
def round_(a, decimals=0, out=None):
"""
Round an array to the given number of decimals.
+ `~numpy.round_` is a discouraged backwards-compatibility
+ alias of `~numpy.around` and `~numpy.round`.
+
+ .. deprecated:: 1.25.0
+ ``round_`` is deprecated as of NumPy 1.25.0, and will be
+ removed in NumPy 2.0. Please use `round` instead.
+
See Also
--------
around : equivalent function; see for details.
@@ -3763,11 +3820,24 @@ def round_(a, decimals=0, out=None):
return around(a, decimals=decimals, out=out)
-@array_function_dispatch(_prod_dispatcher, verify=False)
+def _product_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None,
+ initial=None, where=None):
+ # 2023-03-02, 1.25.0
+ warnings.warn("`product` is deprecated as of NumPy 1.25.0, and will be "
+ "removed in NumPy 2.0. Please use `prod` instead.",
+ DeprecationWarning, stacklevel=3)
+ return (a, out)
+
+
+@array_function_dispatch(_product_dispatcher, verify=False)
def product(*args, **kwargs):
"""
Return the product of array elements over a given axis.
+ .. deprecated:: 1.25.0
+ ``product`` is deprecated as of NumPy 1.25.0, and will be
+ removed in NumPy 2.0. Please use `prod` instead.
+
See Also
--------
prod : equivalent function; see for details.
@@ -3775,11 +3845,23 @@ def product(*args, **kwargs):
return prod(*args, **kwargs)
-@array_function_dispatch(_cumprod_dispatcher, verify=False)
+def _cumproduct_dispatcher(a, axis=None, dtype=None, out=None):
+ # 2023-03-02, 1.25.0
+ warnings.warn("`cumproduct` is deprecated as of NumPy 1.25.0, and will be "
+ "removed in NumPy 2.0. Please use `cumprod` instead.",
+ DeprecationWarning, stacklevel=3)
+ return (a, out)
+
+
+@array_function_dispatch(_cumproduct_dispatcher, verify=False)
def cumproduct(*args, **kwargs):
"""
Return the cumulative product over the given axis.
+ .. deprecated:: 1.25.0
+ ``cumproduct`` is deprecated as of NumPy 1.25.0, and will be
+ removed in NumPy 2.0. Please use `cumprod` instead.
+
See Also
--------
cumprod : equivalent function; see for details.
@@ -3787,13 +3869,26 @@ def cumproduct(*args, **kwargs):
return cumprod(*args, **kwargs)
-@array_function_dispatch(_any_dispatcher, verify=False)
+def _sometrue_dispatcher(a, axis=None, out=None, keepdims=None, *,
+ where=np._NoValue):
+ # 2023-03-02, 1.25.0
+ warnings.warn("`sometrue` is deprecated as of NumPy 1.25.0, and will be "
+ "removed in NumPy 2.0. Please use `any` instead.",
+ DeprecationWarning, stacklevel=3)
+ return (a, where, out)
+
+
+@array_function_dispatch(_sometrue_dispatcher, verify=False)
def sometrue(*args, **kwargs):
"""
Check whether some values are true.
Refer to `any` for full documentation.
+ .. deprecated:: 1.25.0
+ ``sometrue`` is deprecated as of NumPy 1.25.0, and will be
+ removed in NumPy 2.0. Please use `any` instead.
+
See Also
--------
any : equivalent function; see for details.
@@ -3801,11 +3896,23 @@ def sometrue(*args, **kwargs):
return any(*args, **kwargs)
-@array_function_dispatch(_all_dispatcher, verify=False)
+def _alltrue_dispatcher(a, axis=None, out=None, keepdims=None, *, where=None):
+ # 2023-03-02, 1.25.0
+ warnings.warn("`alltrue` is deprecated as of NumPy 1.25.0, and will be "
+ "removed in NumPy 2.0. Please use `all` instead.",
+ DeprecationWarning, stacklevel=3)
+ return (a, where, out)
+
+
+@array_function_dispatch(_alltrue_dispatcher, verify=False)
def alltrue(*args, **kwargs):
"""
Check if all elements of input array are true.
+ .. deprecated:: 1.25.0
+ ``alltrue`` is deprecated as of NumPy 1.25.0, and will be
+ removed in NumPy 2.0. Please use `all` instead.
+
See Also
--------
numpy.all : Equivalent function; see for details.
diff --git a/numpy/core/fromnumeric.pyi b/numpy/core/fromnumeric.pyi
index 17b17819d..43d178557 100644
--- a/numpy/core/fromnumeric.pyi
+++ b/numpy/core/fromnumeric.pyi
@@ -1047,3 +1047,7 @@ def var(
*,
where: _ArrayLikeBool_co = ...,
) -> _ArrayType: ...
+
+max = amax
+min = amin
+round = around
diff --git a/numpy/core/function_base.py b/numpy/core/function_base.py
index 3dc51a81b..00e4e6b0e 100644
--- a/numpy/core/function_base.py
+++ b/numpy/core/function_base.py
@@ -3,6 +3,7 @@ import warnings
import operator
import types
+import numpy as np
from . import numeric as _nx
from .numeric import result_type, NaN, asanyarray, ndim
from numpy.core.multiarray import add_docstring
@@ -167,7 +168,7 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None,
y += start
if endpoint and num > 1:
- y[-1] = stop
+ y[-1, ...] = stop
if axis != 0:
y = _nx.moveaxis(y, 0, axis)
@@ -183,7 +184,7 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None,
def _logspace_dispatcher(start, stop, num=None, endpoint=None, base=None,
dtype=None, axis=None):
- return (start, stop)
+ return (start, stop, base)
@array_function_dispatch(_logspace_dispatcher)
@@ -199,6 +200,9 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None,
.. versionchanged:: 1.16.0
Non-scalar `start` and `stop` are now supported.
+ .. versionchanged:: 1.25.0
+ Non-scalar `base` is now supported
+
Parameters
----------
start : array_like
@@ -223,9 +227,10 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None,
an integer; `float` is chosen even if the arguments would produce an
array of integers.
axis : int, optional
- The axis in the result to store the samples. Relevant only if start
- or stop are array-like. By default (0), the samples will be along a
- new axis inserted at the beginning. Use -1 to get an axis at the end.
+ The axis in the result to store the samples. Relevant only if start,
+ stop, or base are array-like. By default (0), the samples will be
+ along a new axis inserted at the beginning. Use -1 to get an axis at
+ the end.
.. versionadded:: 1.16.0
@@ -247,7 +252,7 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None,
Notes
-----
- Logspace is equivalent to the code
+ If base is a scalar, logspace is equivalent to the code
>>> y = np.linspace(start, stop, num=num, endpoint=endpoint)
... # doctest: +SKIP
@@ -262,6 +267,9 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None,
array([100. , 177.827941 , 316.22776602, 562.34132519])
>>> np.logspace(2.0, 3.0, num=4, base=2.0)
array([4. , 5.0396842 , 6.34960421, 8. ])
+ >>> np.logspace(2.0, 3.0, num=4, base=[2.0, 3.0], axis=-1)
+ array([[ 4. , 5.0396842 , 6.34960421, 8. ],
+ [ 9. , 12.98024613, 18.72075441, 27. ]])
Graphical illustration:
@@ -279,7 +287,13 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None,
>>> plt.show()
"""
+ ndmax = np.broadcast(start, stop, base).ndim
+ start, stop, base = (
+ np.array(a, copy=False, subok=True, ndmin=ndmax)
+ for a in (start, stop, base)
+ )
y = linspace(start, stop, num=num, endpoint=endpoint, axis=axis)
+ base = np.expand_dims(base, axis=axis)
if dtype is None:
return _nx.power(base, y)
return _nx.power(base, y).astype(dtype, copy=False)
diff --git a/numpy/core/getlimits.py b/numpy/core/getlimits.py
index 5baeb97fe..da9e1d7f3 100644
--- a/numpy/core/getlimits.py
+++ b/numpy/core/getlimits.py
@@ -147,8 +147,12 @@ _MACHAR_PARAMS = {
# Key to identify the floating point type. Key is result of
# ftype('-0.1').newbyteorder('<').tobytes()
+#
+# 20230201 - use (ftype(-1.0) / ftype(10.0)).newbyteorder('<').tobytes()
+# instead because stold may have deficiencies on some platforms.
# See:
# https://perl5.git.perl.org/perl.git/blob/3118d7d684b56cbeb702af874f4326683c45f045:/Configure
+
_KNOWN_TYPES = {}
def _register_type(machar, bytepat):
_KNOWN_TYPES[bytepat] = machar
@@ -240,8 +244,6 @@ def _register_known_types():
# IEEE 754 128-bit binary float
_register_type(float128_ma,
b'\x9a\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\xfb\xbf')
- _register_type(float128_ma,
- b'\x9a\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\xfb\xbf')
_float_ma[128] = float128_ma
# Known parameters for float80 (Intel 80-bit extended precision)
@@ -329,7 +331,9 @@ def _get_machar(ftype):
if params is None:
raise ValueError(repr(ftype))
# Detect known / suspected types
- key = ftype('-0.1').newbyteorder('<').tobytes()
+ # ftype(-1.0) / ftype(10.0) is better than ftype('-0.1') because stold
+ # may be deficient
+ key = (ftype(-1.0) / ftype(10.)).newbyteorder('<').tobytes()
ma_like = None
if ftype == ntypes.longdouble:
# Could be 80 bit == 10 byte extended precision, where last bytes can
@@ -338,7 +342,14 @@ def _get_machar(ftype):
# random garbage.
ma_like = _KNOWN_TYPES.get(key[:10])
if ma_like is None:
+ # see if the full key is known.
ma_like = _KNOWN_TYPES.get(key)
+ if ma_like is None and len(key) == 16:
+ # machine limits could be f80 masquerading as np.float128,
+ # find all keys with length 16 and make new dict, but make the keys
+ # only 10 bytes long, the last bytes can be random garbage
+ _kt = {k[:10]: v for k, v in _KNOWN_TYPES.items() if len(k) == 16}
+ ma_like = _kt.get(key[:10])
if ma_like is not None:
return ma_like
# Fall back to parameter discovery
@@ -471,13 +482,26 @@ class finfo:
_finfo_cache = {}
def __new__(cls, dtype):
+ obj = cls._finfo_cache.get(dtype) # most common path
+ if obj is not None:
+ return obj
+
+ if dtype is None:
+ # Deprecated in NumPy 1.25, 2023-01-16
+ warnings.warn(
+ "finfo() dtype cannot be None. This behavior will "
+ "raise an error in the future. (Deprecated in NumPy 1.25)",
+ DeprecationWarning,
+ stacklevel=2
+ )
+
try:
dtype = numeric.dtype(dtype)
except TypeError:
# In case a float instance was given
dtype = numeric.dtype(type(dtype))
- obj = cls._finfo_cache.get(dtype, None)
+ obj = cls._finfo_cache.get(dtype)
if obj is not None:
return obj
dtypes = [dtype]
@@ -487,17 +511,24 @@ class finfo:
dtype = newdtype
if not issubclass(dtype, numeric.inexact):
raise ValueError("data type %r not inexact" % (dtype))
- obj = cls._finfo_cache.get(dtype, None)
+ obj = cls._finfo_cache.get(dtype)
if obj is not None:
return obj
if not issubclass(dtype, numeric.floating):
newdtype = _convert_to_float[dtype]
if newdtype is not dtype:
+ # dtype changed, for example from complex128 to float64
dtypes.append(newdtype)
dtype = newdtype
- obj = cls._finfo_cache.get(dtype, None)
- if obj is not None:
- return obj
+
+ obj = cls._finfo_cache.get(dtype, None)
+ if obj is not None:
+ # the original dtype was not in the cache, but the new
+ # dtype is in the cache. we add the original dtypes to
+ # the cache and return the result
+ for dt in dtypes:
+ cls._finfo_cache[dt] = obj
+ return obj
obj = object.__new__(cls)._init(dtype)
for dt in dtypes:
cls._finfo_cache[dt] = obj
diff --git a/numpy/core/include/meson.build b/numpy/core/include/meson.build
index 0f9fbe76b..be07a07b8 100644
--- a/numpy/core/include/meson.build
+++ b/numpy/core/include/meson.build
@@ -2,6 +2,7 @@ installed_headers = [
'numpy/_neighborhood_iterator_imp.h',
'numpy/arrayobject.h',
'numpy/arrayscalars.h',
+ 'numpy/_dtype_api.h',
'numpy/experimental_dtype_api.h',
'numpy/halffloat.h',
'numpy/ndarrayobject.h',
diff --git a/numpy/core/include/numpy/_dtype_api.h b/numpy/core/include/numpy/_dtype_api.h
new file mode 100644
index 000000000..c397699c7
--- /dev/null
+++ b/numpy/core/include/numpy/_dtype_api.h
@@ -0,0 +1,408 @@
+/*
+ * DType related API shared by the (experimental) public API And internal API.
+ */
+
+#ifndef NUMPY_CORE_INCLUDE_NUMPY___DTYPE_API_H_
+#define NUMPY_CORE_INCLUDE_NUMPY___DTYPE_API_H_
+
+#define __EXPERIMENTAL_DTYPE_API_VERSION 10
+
+struct PyArrayMethodObject_tag;
+
+/*
+ * Largely opaque struct for DType classes (i.e. metaclass instances).
+ * The internal definition is currently in `ndarraytypes.h` (export is a bit
+ * more complex because `PyArray_Descr` is a DTypeMeta internally but not
+ * externally).
+ */
+#if !(defined(NPY_INTERNAL_BUILD) && NPY_INTERNAL_BUILD)
+
+ typedef struct PyArray_DTypeMeta_tag {
+ PyHeapTypeObject super;
+
+ /*
+ * Most DTypes will have a singleton default instance, for the
+ * parametric legacy DTypes (bytes, string, void, datetime) this
+ * may be a pointer to the *prototype* instance?
+ */
+ PyArray_Descr *singleton;
+ /* Copy of the legacy DTypes type number, usually invalid. */
+ int type_num;
+
+ /* The type object of the scalar instances (may be NULL?) */
+ PyTypeObject *scalar_type;
+ /*
+ * DType flags to signal legacy, parametric, or
+ * abstract. But plenty of space for additional information/flags.
+ */
+ npy_uint64 flags;
+
+ /*
+ * Use indirection in order to allow a fixed size for this struct.
+ * A stable ABI size makes creating a static DType less painful
+ * while also ensuring flexibility for all opaque API (with one
+     * indirection due to the pointer lookup).
+ */
+ void *dt_slots;
+ /* Allow growing (at the moment also beyond this) */
+ void *reserved[3];
+ } PyArray_DTypeMeta;
+
+#endif /* not internal build */
+
+/*
+ * ******************************************************
+ * ArrayMethod API (Casting and UFuncs)
+ * ******************************************************
+ */
+/*
+ * NOTE: Expected changes:
+ * * probably split runtime and general flags into two
+ * * should possibly not use an enum for typedef for more stable ABI?
+ */
+typedef enum {
+ /* Flag for whether the GIL is required */
+ NPY_METH_REQUIRES_PYAPI = 1 << 0,
+ /*
+ * Some functions cannot set floating point error flags, this flag
+ * gives us the option (not requirement) to skip floating point error
+ * setup/check. No function should set error flags and ignore them
+ * since it would interfere with chaining operations (e.g. casting).
+ */
+ NPY_METH_NO_FLOATINGPOINT_ERRORS = 1 << 1,
+ /* Whether the method supports unaligned access (not runtime) */
+ NPY_METH_SUPPORTS_UNALIGNED = 1 << 2,
+ /*
+ * Used for reductions to allow reordering the operation. At this point
+ * assume that if set, it also applies to normal operations though!
+ */
+ NPY_METH_IS_REORDERABLE = 1 << 3,
+ /*
+ * Private flag for now for *logic* functions. The logical functions
+ * `logical_or` and `logical_and` can always cast the inputs to booleans
+ * "safely" (because that is how the cast to bool is defined).
+ * @seberg: I am not sure this is the best way to handle this, so its
+ * private for now (also it is very limited anyway).
+ * There is one "exception". NA aware dtypes cannot cast to bool
+ * (hopefully), so the `??->?` loop should error even with this flag.
+ * But a second NA fallback loop will be necessary.
+ */
+ _NPY_METH_FORCE_CAST_INPUTS = 1 << 17,
+
+ /* All flags which can change at runtime */
+ NPY_METH_RUNTIME_FLAGS = (
+ NPY_METH_REQUIRES_PYAPI |
+ NPY_METH_NO_FLOATINGPOINT_ERRORS),
+} NPY_ARRAYMETHOD_FLAGS;
+
+
+typedef struct PyArrayMethod_Context_tag {
+ /* The caller, which is typically the original ufunc. May be NULL */
+ PyObject *caller;
+ /* The method "self". Publically currentl an opaque object. */
+ struct PyArrayMethodObject_tag *method;
+
+ /* Operand descriptors, filled in by resolve_descriptors */
+ PyArray_Descr **descriptors;
+ /* Structure may grow (this is harmless for DType authors) */
+} PyArrayMethod_Context;
+
+
+/*
+ * The main object for creating a new ArrayMethod. We use the typical `slots`
+ * mechanism used by the Python limited API (see below for the slot defs).
+ */
+typedef struct {
+ const char *name;
+ int nin, nout;
+ NPY_CASTING casting;
+ NPY_ARRAYMETHOD_FLAGS flags;
+ PyArray_DTypeMeta **dtypes;
+ PyType_Slot *slots;
+} PyArrayMethod_Spec;
+
+
+/*
+ * ArrayMethod slots
+ * -----------------
+ *
+ * SLOTS IDs For the ArrayMethod creation, once fully public, IDs are fixed
+ * but can be deprecated and arbitrarily extended.
+ */
+#define NPY_METH_resolve_descriptors 1
+/* We may want to adapt the `get_loop` signature a bit: */
+#define _NPY_METH_get_loop 2
+#define NPY_METH_get_reduction_initial 3
+/* specific loops for constructions/default get_loop: */
+#define NPY_METH_strided_loop 4
+#define NPY_METH_contiguous_loop 5
+#define NPY_METH_unaligned_strided_loop 6
+#define NPY_METH_unaligned_contiguous_loop 7
+#define NPY_METH_contiguous_indexed_loop 8
+
+/*
+ * The resolve descriptors function, must be able to handle NULL values for
+ * all output (but not input) `given_descrs` and fill `loop_descrs`.
+ * Return -1 on error or 0 if the operation is not possible without an error
+ * set. (This may still be in flux.)
+ * Otherwise must return the "casting safety", for normal functions, this is
+ * almost always "safe" (or even "equivalent"?).
+ *
+ * `resolve_descriptors` is optional if all output DTypes are non-parametric.
+ */
+typedef NPY_CASTING (resolve_descriptors_function)(
+ /* "method" is currently opaque (necessary e.g. to wrap Python) */
+ struct PyArrayMethodObject_tag *method,
+ /* DTypes the method was created for */
+ PyArray_DTypeMeta **dtypes,
+ /* Input descriptors (instances). Outputs may be NULL. */
+ PyArray_Descr **given_descrs,
+ /* Exact loop descriptors to use, must not hold references on error */
+ PyArray_Descr **loop_descrs,
+ npy_intp *view_offset);
+
+
+typedef int (PyArrayMethod_StridedLoop)(PyArrayMethod_Context *context,
+ char *const *data, const npy_intp *dimensions, const npy_intp *strides,
+ NpyAuxData *transferdata);
+
+
+typedef int (get_loop_function)(
+ PyArrayMethod_Context *context,
+ int aligned, int move_references,
+ const npy_intp *strides,
+ PyArrayMethod_StridedLoop **out_loop,
+ NpyAuxData **out_transferdata,
+ NPY_ARRAYMETHOD_FLAGS *flags);
+
+/**
+ * Query an ArrayMethod for the initial value for use in reduction.
+ *
+ * @param context The arraymethod context, mainly to access the descriptors.
+ * @param reduction_is_empty Whether the reduction is empty. When it is, the
+ * value returned may differ. In this case it is a "default" value that
+ * may differ from the "identity" value normally used. For example:
+ * - `0.0` is the default for `sum([])`. But `-0.0` is the correct
+ * identity otherwise as it preserves the sign for `sum([-0.0])`.
+ * - We use no identity for object, but return the default of `0` and `1`
+ * for the empty `sum([], dtype=object)` and `prod([], dtype=object)`.
+ * This allows `np.sum(np.array(["a", "b"], dtype=object))` to work.
+ * - `-inf` or `INT_MIN` for `max` is an identity, but at least `INT_MIN`
+ *       is not a good *default* when there are no items.
+ * @param initial Pointer to initial data to be filled (if possible)
+ *
+ * @returns -1, 0, or 1 indicating error, no initial value, and initial being
+ * successfully filled. Errors must not be given where 0 is correct, NumPy
+ * may call this even when not strictly necessary.
+ */
+typedef int (get_reduction_initial_function)(
+ PyArrayMethod_Context *context, npy_bool reduction_is_empty,
+ char *initial);
+
+/*
+ * The following functions are only used by the wrapping array method defined
+ * in umath/wrapping_array_method.c
+ */
+
+/*
+ * The function to convert the given descriptors (passed in to
+ * `resolve_descriptors`) and translates them for the wrapped loop.
+ * The new descriptors MUST be viewable with the old ones, `NULL` must be
+ * supported (for outputs) and should normally be forwarded.
+ *
+ * The function must clean up on error.
+ *
+ * NOTE: We currently assume that this translation gives "viewable" results.
+ * I.e. there is no additional casting related to the wrapping process.
+ * In principle that could be supported, but not sure it is useful.
+ * This currently also means that e.g. alignment must apply identically
+ * to the new dtypes.
+ *
+ * TODO: Due to the fact that `resolve_descriptors` is also used for `can_cast`
+ * there is no way to "pass out" the result of this function. This means
+ * it will be called twice for every ufunc call.
+ * (I am considering including `auxdata` as an "optional" parameter to
+ * `resolve_descriptors`, so that it can be filled there if not NULL.)
+ */
+typedef int translate_given_descrs_func(int nin, int nout,
+ PyArray_DTypeMeta *wrapped_dtypes[],
+ PyArray_Descr *given_descrs[], PyArray_Descr *new_descrs[]);
+
+/**
+ * The function to convert the actual loop descriptors (as returned by the
+ * original `resolve_descriptors` function) to the ones the output array
+ * should use.
+ * This function must return "viewable" types, it must not mutate them in any
+ * form that would break the inner-loop logic. Does not need to support NULL.
+ *
+ * The function must clean up on error.
+ *
+ * @param nin, nout Number of input and output arguments
+ * @param new_dtypes The DTypes of the output (usually probably not needed)
+ * @param given_descrs Original given_descrs to the resolver, necessary to
+ * fetch any information related to the new dtypes from the original.
+ * @param original_descrs The `loop_descrs` returned by the wrapped loop.
+ * @param loop_descrs The output descriptors, compatible to `original_descrs`.
+ *
+ * @returns 0 on success, -1 on failure.
+ */
+typedef int translate_loop_descrs_func(int nin, int nout,
+ PyArray_DTypeMeta *new_dtypes[], PyArray_Descr *given_descrs[],
+ PyArray_Descr *original_descrs[], PyArray_Descr *loop_descrs[]);
+
+
+/*
+ * A traverse loop working on a single array. This is similar to the general
+ * strided-loop function. This is designed for loops that need to visit every
+ * element of a single array.
+ *
+ * Currently this is used for array clearing, via the NPY_DT_get_clear_loop
+ * API hook, and zero-filling, via the NPY_DT_get_fill_zero_loop API hook.
+ * These are most useful for handling arrays storing embedded references to
+ * python objects or heap-allocated data.
+ *
+ * The `void *traverse_context` is passed in because we may need to pass in
+ * Interpreter state or similar in the future, but we don't want to pass in
+ * a full context (with pointers to dtypes, method, caller which all make
+ * no sense for a traverse function).
+ *
+ * We assume for now that this context can be just passed through in
+ * the future (for structured dtypes).
+ *
+ */
+typedef int (traverse_loop_function)(
+ void *traverse_context, PyArray_Descr *descr, char *data,
+ npy_intp size, npy_intp stride, NpyAuxData *auxdata);
+
+
+/*
+ * Simplified get_loop function specific to dtype traversal
+ *
+ * It should set the flags needed for the traversal loop and set out_loop to the
+ * loop function, which must be a valid traverse_loop_function
+ * pointer. Currently this is used for zero-filling and clearing arrays storing
+ * embedded references.
+ *
+ */
+typedef int (get_traverse_loop_function)(
+ void *traverse_context, PyArray_Descr *descr,
+ int aligned, npy_intp fixed_stride,
+ traverse_loop_function **out_loop, NpyAuxData **out_auxdata,
+ NPY_ARRAYMETHOD_FLAGS *flags);
+
+
+/*
+ * ****************************
+ * DTYPE API
+ * ****************************
+ */
+
+#define NPY_DT_ABSTRACT 1 << 1
+#define NPY_DT_PARAMETRIC 1 << 2
+#define NPY_DT_NUMERIC 1 << 3
+
+/*
+ * These correspond to slots in the NPY_DType_Slots struct and must
+ * be in the same order as the members of that struct. If new slots
+ * get added or old slots get removed NPY_NUM_DTYPE_SLOTS must also
+ * be updated
+ */
+
+#define NPY_DT_discover_descr_from_pyobject 1
+// this slot is considered private because its API hasn't been decided
+#define _NPY_DT_is_known_scalar_type 2
+#define NPY_DT_default_descr 3
+#define NPY_DT_common_dtype 4
+#define NPY_DT_common_instance 5
+#define NPY_DT_ensure_canonical 6
+#define NPY_DT_setitem 7
+#define NPY_DT_getitem 8
+#define NPY_DT_get_clear_loop 9
+#define NPY_DT_get_fill_zero_loop 10
+
+// These PyArray_ArrFunc slots will be deprecated and replaced eventually
+// getitem and setitem can be defined as a performance optimization;
+// by default the user dtypes call `legacy_getitem_using_DType` and
+// `legacy_setitem_using_DType`, respectively. This functionality is
+// only supported for basic NumPy DTypes.
+
+
+// used to separate dtype slots from arrfuncs slots
+// intended only for internal use but defined here for clarity
+#define _NPY_DT_ARRFUNCS_OFFSET (1 << 10)
+
+// Cast is disabled
+// #define NPY_DT_PyArray_ArrFuncs_cast 0 + _NPY_DT_ARRFUNCS_OFFSET
+
+#define NPY_DT_PyArray_ArrFuncs_getitem 1 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_setitem 2 + _NPY_DT_ARRFUNCS_OFFSET
+
+#define NPY_DT_PyArray_ArrFuncs_copyswapn 3 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_copyswap 4 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_compare 5 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_argmax 6 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_dotfunc 7 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_scanfunc 8 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_fromstr 9 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_nonzero 10 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_fill 11 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_fillwithscalar 12 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_sort 13 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_argsort 14 + _NPY_DT_ARRFUNCS_OFFSET
+
+// Casting related slots are disabled. See
+// https://github.com/numpy/numpy/pull/23173#discussion_r1101098163
+// #define NPY_DT_PyArray_ArrFuncs_castdict 15 + _NPY_DT_ARRFUNCS_OFFSET
+// #define NPY_DT_PyArray_ArrFuncs_scalarkind 16 + _NPY_DT_ARRFUNCS_OFFSET
+// #define NPY_DT_PyArray_ArrFuncs_cancastscalarkindto 17 + _NPY_DT_ARRFUNCS_OFFSET
+// #define NPY_DT_PyArray_ArrFuncs_cancastto 18 + _NPY_DT_ARRFUNCS_OFFSET
+
+// These are deprecated in NumPy 1.19, so are disabled here.
+// #define NPY_DT_PyArray_ArrFuncs_fastclip 19 + _NPY_DT_ARRFUNCS_OFFSET
+// #define NPY_DT_PyArray_ArrFuncs_fastputmask 20 + _NPY_DT_ARRFUNCS_OFFSET
+// #define NPY_DT_PyArray_ArrFuncs_fasttake 21 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_argmin 22 + _NPY_DT_ARRFUNCS_OFFSET
+
+// TODO: These slots probably still need some thought, and/or a way to "grow"?
+typedef struct {
+ PyTypeObject *typeobj; /* type of python scalar or NULL */
+ int flags; /* flags, including parametric and abstract */
+ /* NULL terminated cast definitions. Use NULL for the newly created DType */
+ PyArrayMethod_Spec **casts;
+ PyType_Slot *slots;
+ /* Baseclass or NULL (will always subclass `np.dtype`) */
+ PyTypeObject *baseclass;
+} PyArrayDTypeMeta_Spec;
+
+
+typedef PyArray_Descr *(discover_descr_from_pyobject_function)(
+ PyArray_DTypeMeta *cls, PyObject *obj);
+
+/*
+ * Before making this public, we should decide whether it should pass
+ * the type, or allow looking at the object. A possible use-case:
+ * `np.array(np.array([0]), dtype=np.ndarray)`
+ * Could consider arrays that are not `dtype=ndarray` "scalars".
+ */
+typedef int (is_known_scalar_type_function)(
+ PyArray_DTypeMeta *cls, PyTypeObject *obj);
+
+typedef PyArray_Descr *(default_descr_function)(PyArray_DTypeMeta *cls);
+typedef PyArray_DTypeMeta *(common_dtype_function)(
+ PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2);
+typedef PyArray_Descr *(common_instance_function)(
+ PyArray_Descr *dtype1, PyArray_Descr *dtype2);
+typedef PyArray_Descr *(ensure_canonical_function)(PyArray_Descr *dtype);
+
+/*
+ * TODO: These two functions are currently only used for experimental DType
+ * API support. Their relation should be "reversed": NumPy should
+ * always use them internally.
+ * There are open points about "casting safety" though, e.g. setting
+ * elements is currently always unsafe.
+ */
+typedef int(setitemfunction)(PyArray_Descr *, PyObject *, char *);
+typedef PyObject *(getitemfunction)(PyArray_Descr *, char *);
+
+
+#endif /* NUMPY_CORE_INCLUDE_NUMPY___DTYPE_API_H_ */
diff --git a/numpy/core/include/numpy/arrayscalars.h b/numpy/core/include/numpy/arrayscalars.h
index a20a68016..258bf95b6 100644
--- a/numpy/core/include/numpy/arrayscalars.h
+++ b/numpy/core/include/numpy/arrayscalars.h
@@ -139,7 +139,9 @@ typedef struct {
/* note that the PyObject_HEAD macro lives right here */
PyUnicodeObject base;
Py_UCS4 *obval;
+ #if NPY_FEATURE_VERSION >= NPY_1_20_API_VERSION
char *buffer_fmt;
+ #endif
} PyUnicodeScalarObject;
@@ -149,7 +151,9 @@ typedef struct {
PyArray_Descr *descr;
int flags;
PyObject *base;
+ #if NPY_FEATURE_VERSION >= NPY_1_20_API_VERSION
void *_buffer_info; /* private buffer info, tagged to allow warning */
+ #endif
} PyVoidScalarObject;
/* Macros
diff --git a/numpy/core/include/numpy/experimental_dtype_api.h b/numpy/core/include/numpy/experimental_dtype_api.h
index 5f5f24317..120d33324 100644
--- a/numpy/core/include/numpy/experimental_dtype_api.h
+++ b/numpy/core/include/numpy/experimental_dtype_api.h
@@ -3,6 +3,9 @@
* NEPs 41 to 43. For background, please check these NEPs. Otherwise,
* this header also serves as documentation for the time being.
*
+ * The header includes `_dtype_api.h` which holds most definitions while this
+ * header mainly wraps functions for public consumption.
+ *
* Please do not hesitate to contact @seberg with questions. This is
* developed together with https://github.com/seberg/experimental_user_dtypes
* and those interested in experimenting are encouraged to contribute there.
@@ -114,7 +117,13 @@
#include <Python.h>
#include "ndarraytypes.h"
+#include "_dtype_api.h"
+/*
+ * The contents of PyArrayMethodObject are currently opaque (is there a
+ * good way to make them be `PyObject *`?)
+ */
+typedef struct PyArrayMethodObject_tag PyArrayMethodObject;
/*
* There must be a better way?! -- Oh well, this is experimental
@@ -154,66 +163,6 @@
#endif
-/*
- * DTypeMeta struct, the content may be made fully opaque (except the size).
- * We may also move everything into a single `void *dt_slots`.
- */
-typedef struct {
- PyHeapTypeObject super;
- PyArray_Descr *singleton;
- int type_num;
- PyTypeObject *scalar_type;
- npy_uint64 flags;
- void *dt_slots;
- void *reserved[3];
-} PyArray_DTypeMeta;
-
-
-/*
- * ******************************************************
- * ArrayMethod API (Casting and UFuncs)
- * ******************************************************
- */
-/*
- * NOTE: Expected changes:
- * * invert logic of floating point error flag
- * * probably split runtime and general flags into two
- * * should possibly not use an enum for typedef for more stable ABI?
- */
-typedef enum {
- /* Flag for whether the GIL is required */
- NPY_METH_REQUIRES_PYAPI = 1 << 1,
- /*
- * Some functions cannot set floating point error flags, this flag
- * gives us the option (not requirement) to skip floating point error
- * setup/check. No function should set error flags and ignore them
- * since it would interfere with chaining operations (e.g. casting).
- */
- NPY_METH_NO_FLOATINGPOINT_ERRORS = 1 << 2,
- /* Whether the method supports unaligned access (not runtime) */
- NPY_METH_SUPPORTS_UNALIGNED = 1 << 3,
-
- /* All flags which can change at runtime */
- NPY_METH_RUNTIME_FLAGS = (
- NPY_METH_REQUIRES_PYAPI |
- NPY_METH_NO_FLOATINGPOINT_ERRORS),
-} NPY_ARRAYMETHOD_FLAGS;
-
-
-/*
- * The main object for creating a new ArrayMethod. We use the typical `slots`
- * mechanism used by the Python limited API (see below for the slot defs).
- */
-typedef struct {
- const char *name;
- int nin, nout;
- NPY_CASTING casting;
- NPY_ARRAYMETHOD_FLAGS flags;
- PyArray_DTypeMeta **dtypes;
- PyType_Slot *slots;
-} PyArrayMethod_Spec;
-
-
typedef int _ufunc_addloop_fromspec_func(
PyObject *ufunc, PyArrayMethod_Spec *spec);
/*
@@ -267,92 +216,6 @@ typedef int _ufunc_addpromoter_func(
#define PyUFunc_AddPromoter \
(*(_ufunc_addpromoter_func *)(__experimental_dtype_api_table[1]))
-
-/*
- * The resolve descriptors function, must be able to handle NULL values for
- * all output (but not input) `given_descrs` and fill `loop_descrs`.
- * Return -1 on error or 0 if the operation is not possible without an error
- * set. (This may still be in flux.)
- * Otherwise must return the "casting safety", for normal functions, this is
- * almost always "safe" (or even "equivalent"?).
- *
- * `resolve_descriptors` is optional if all output DTypes are non-parametric.
- */
-#define NPY_METH_resolve_descriptors 1
-typedef NPY_CASTING (resolve_descriptors_function)(
- /* "method" is currently opaque (necessary e.g. to wrap Python) */
- PyObject *method,
- /* DTypes the method was created for */
- PyObject **dtypes,
- /* Input descriptors (instances). Outputs may be NULL. */
- PyArray_Descr **given_descrs,
- /* Exact loop descriptors to use, must not hold references on error */
- PyArray_Descr **loop_descrs,
- npy_intp *view_offset);
-
-/* NOT public yet: Signature needs adapting as external API. */
-#define _NPY_METH_get_loop 2
-
-/*
- * Current public API to define fast inner-loops. You must provide a
- * strided loop. If this is a cast between two "versions" of the same dtype
- * you must also provide an unaligned strided loop.
- * Other loops are useful to optimize the very common contiguous case.
- *
- * NOTE: As of now, NumPy will NOT use unaligned loops in ufuncs!
- */
-#define NPY_METH_strided_loop 3
-#define NPY_METH_contiguous_loop 4
-#define NPY_METH_unaligned_strided_loop 5
-#define NPY_METH_unaligned_contiguous_loop 6
-
-
-typedef struct {
- PyObject *caller; /* E.g. the original ufunc, may be NULL */
- PyObject *method; /* The method "self". Currently an opaque object */
-
- /* Operand descriptors, filled in by resolve_descriptors */
- PyArray_Descr **descriptors;
- /* Structure may grow (this is harmless for DType authors) */
-} PyArrayMethod_Context;
-
-typedef int (PyArrayMethod_StridedLoop)(PyArrayMethod_Context *context,
- char *const *data, const npy_intp *dimensions, const npy_intp *strides,
- NpyAuxData *transferdata);
-
-
-
-/*
- * ****************************
- * DTYPE API
- * ****************************
- */
-
-#define NPY_DT_ABSTRACT 1 << 1
-#define NPY_DT_PARAMETRIC 1 << 2
-
-#define NPY_DT_discover_descr_from_pyobject 1
-#define _NPY_DT_is_known_scalar_type 2
-#define NPY_DT_default_descr 3
-#define NPY_DT_common_dtype 4
-#define NPY_DT_common_instance 5
-#define NPY_DT_ensure_canonical 6
-#define NPY_DT_setitem 7
-#define NPY_DT_getitem 8
-
-
-// TODO: These slots probably still need some thought, and/or a way to "grow"?
-typedef struct{
- PyTypeObject *typeobj; /* type of python scalar or NULL */
- int flags; /* flags, including parametric and abstract */
- /* NULL terminated cast definitions. Use NULL for the newly created DType */
- PyArrayMethod_Spec **casts;
- PyType_Slot *slots;
- /* Baseclass or NULL (will always subclass `np.dtype`) */
- PyTypeObject *baseclass;
-} PyArrayDTypeMeta_Spec;
-
-
#define PyArrayDTypeMeta_Type \
(*(PyTypeObject *)__experimental_dtype_api_table[2])
typedef int __dtypemeta_fromspec(
@@ -458,16 +321,14 @@ PyArray_GetDefaultDescr(PyArray_DTypeMeta *DType)
*/
#if !defined(NO_IMPORT) && !defined(NO_IMPORT_ARRAY)
-#define __EXPERIMENTAL_DTYPE_VERSION 5
-
static int
import_experimental_dtype_api(int version)
{
- if (version != __EXPERIMENTAL_DTYPE_VERSION) {
+ if (version != __EXPERIMENTAL_DTYPE_API_VERSION) {
PyErr_Format(PyExc_RuntimeError,
"DType API version %d did not match header version %d. Please "
"update the import statement and check for API changes.",
- version, __EXPERIMENTAL_DTYPE_VERSION);
+ version, __EXPERIMENTAL_DTYPE_API_VERSION);
return -1;
}
if (__experimental_dtype_api_table != __uninitialized_table) {
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index 45ecb6955..742ba5261 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -44,23 +44,6 @@
#define NPY_FAIL 0
#define NPY_SUCCEED 1
-/*
- * Binary compatibility version number. This number is increased
- * whenever the C-API is changed such that binary compatibility is
- * broken, i.e. whenever a recompile of extension modules is needed.
- */
-#define NPY_VERSION NPY_ABI_VERSION
-
-/*
- * Minor API version. This number is increased whenever a change is
- * made to the C-API -- whether it breaks binary compatibility or not.
- * Some changes, such as adding a function pointer to the end of the
- * function table, can be made without breaking binary compatibility.
- * In this case, only the NPY_FEATURE_VERSION (*not* NPY_VERSION)
- * would be increased. Whenever binary compatibility is broken, both
- * NPY_VERSION and NPY_FEATURE_VERSION should be increased.
- */
-#define NPY_FEATURE_VERSION NPY_API_VERSION
enum NPY_TYPES { NPY_BOOL=0,
NPY_BYTE, NPY_UBYTE,
@@ -729,11 +712,15 @@ typedef struct tagPyArrayObject_fields {
int flags;
/* For weak references */
PyObject *weakreflist;
+#if NPY_FEATURE_VERSION >= NPY_1_20_API_VERSION
void *_buffer_info; /* private buffer info, tagged to allow warning */
+#endif
/*
* For malloc/calloc/realloc/free per object
*/
+#if NPY_FEATURE_VERSION >= NPY_1_22_API_VERSION
PyObject *mem_handler;
+#endif
} PyArrayObject_fields;
/*
@@ -1671,11 +1658,13 @@ PyArray_CLEARFLAGS(PyArrayObject *arr, int flags)
((PyArrayObject_fields *)arr)->flags &= ~flags;
}
-static inline NPY_RETURNS_BORROWED_REF PyObject *
-PyArray_HANDLER(PyArrayObject *arr)
-{
- return ((PyArrayObject_fields *)arr)->mem_handler;
-}
+#if NPY_FEATURE_VERSION >= NPY_1_22_API_VERSION
+ static inline NPY_RETURNS_BORROWED_REF PyObject *
+ PyArray_HANDLER(PyArrayObject *arr)
+ {
+ return ((PyArrayObject_fields *)arr)->mem_handler;
+ }
+#endif
#define PyTypeNum_ISBOOL(type) ((type) == NPY_BOOL)
diff --git a/numpy/core/include/numpy/npy_3kcompat.h b/numpy/core/include/numpy/npy_3kcompat.h
index 8e5e4000a..62fde943a 100644
--- a/numpy/core/include/numpy/npy_3kcompat.h
+++ b/numpy/core/include/numpy/npy_3kcompat.h
@@ -167,6 +167,26 @@ static inline int PyInt_Check(PyObject *op) {
#endif /* NPY_PY3K */
+/*
+ * Macros to protect CRT calls against instant termination when passed an
+ * invalid parameter (https://bugs.python.org/issue23524).
+ */
+#if defined _MSC_VER && _MSC_VER >= 1900
+
+#include <stdlib.h>
+
+extern _invalid_parameter_handler _Py_silent_invalid_parameter_handler;
+#define NPY_BEGIN_SUPPRESS_IPH { _invalid_parameter_handler _Py_old_handler = \
+ _set_thread_local_invalid_parameter_handler(_Py_silent_invalid_parameter_handler);
+#define NPY_END_SUPPRESS_IPH _set_thread_local_invalid_parameter_handler(_Py_old_handler); }
+
+#else
+
+#define NPY_BEGIN_SUPPRESS_IPH
+#define NPY_END_SUPPRESS_IPH
+
+#endif /* _MSC_VER >= 1900 */
+
static inline void
PyUnicode_ConcatAndDel(PyObject **left, PyObject *right)
@@ -242,13 +262,17 @@ npy_PyFile_Dup2(PyObject *file, char *mode, npy_off_t *orig_pos)
/* Convert to FILE* handle */
#ifdef _WIN32
+ NPY_BEGIN_SUPPRESS_IPH
handle = _fdopen(fd2, mode);
+ NPY_END_SUPPRESS_IPH
#else
handle = fdopen(fd2, mode);
#endif
if (handle == NULL) {
PyErr_SetString(PyExc_IOError,
- "Getting a FILE* from a Python file object failed");
+ "Getting a FILE* from a Python file object via "
+ "_fdopen failed. If you built NumPy, you probably "
+ "linked with the wrong debug/release runtime");
return NULL;
}
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index ea4a818c8..fb976aa6a 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -40,39 +40,6 @@
#define NPY_GCC_OPT_3
#endif
-/* compile target attributes */
-#if defined HAVE_ATTRIBUTE_TARGET_AVX && defined HAVE_LINK_AVX
-#define NPY_GCC_TARGET_AVX __attribute__((target("avx")))
-#else
-#define NPY_GCC_TARGET_AVX
-#endif
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-#define HAVE_ATTRIBUTE_TARGET_FMA
-#define NPY_GCC_TARGET_FMA __attribute__((target("avx2,fma")))
-#endif
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX2 && defined HAVE_LINK_AVX2
-#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
-#else
-#define NPY_GCC_TARGET_AVX2
-#endif
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F && defined HAVE_LINK_AVX512F
-#define NPY_GCC_TARGET_AVX512F __attribute__((target("avx512f")))
-#elif defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
-#define NPY_GCC_TARGET_AVX512F __attribute__((target("avx512f")))
-#else
-#define NPY_GCC_TARGET_AVX512F
-#endif
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX && defined HAVE_LINK_AVX512_SKX
-#define NPY_GCC_TARGET_AVX512_SKX __attribute__((target("avx512f,avx512dq,avx512vl,avx512bw,avx512cd")))
-#elif defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS
-#define NPY_GCC_TARGET_AVX512_SKX __attribute__((target("avx512f,avx512dq,avx512vl,avx512bw,avx512cd")))
-#else
-#define NPY_GCC_TARGET_AVX512_SKX
-#endif
/*
* mark an argument (starting from 1) that must not be NULL and is not checked
* DO NOT USE IF FUNCTION CHECKS FOR NULL!! the compiler will remove the check
@@ -83,21 +50,6 @@
#define NPY_GCC_NONNULL(n)
#endif
-#if defined HAVE_XMMINTRIN_H && defined HAVE__MM_LOAD_PS
-#define NPY_HAVE_SSE_INTRINSICS
-#endif
-
-#if defined HAVE_EMMINTRIN_H && defined HAVE__MM_LOAD_PD
-#define NPY_HAVE_SSE2_INTRINSICS
-#endif
-
-#if defined HAVE_IMMINTRIN_H && defined HAVE_LINK_AVX2
-#define NPY_HAVE_AVX2_INTRINSICS
-#endif
-
-#if defined HAVE_IMMINTRIN_H && defined HAVE_LINK_AVX512F
-#define NPY_HAVE_AVX512F_INTRINSICS
-#endif
/*
* give a hint to the compiler which branch is more likely or unlikely
* to occur, e.g. rare error cases:
@@ -120,7 +72,7 @@
/* unlike _mm_prefetch also works on non-x86 */
#define NPY_PREFETCH(x, rw, loc) __builtin_prefetch((x), (rw), (loc))
#else
-#ifdef HAVE__MM_PREFETCH
+#ifdef NPY_HAVE_SSE
/* _MM_HINT_ET[01] (rw = 1) unsupported, only available in gcc >= 4.9 */
#define NPY_PREFETCH(x, rw, loc) _mm_prefetch((x), loc == 0 ? _MM_HINT_NTA : \
(loc == 1 ? _MM_HINT_T2 : \
@@ -153,6 +105,14 @@
#define NPY_FINLINE static
#endif
+#if defined(_MSC_VER)
+ #define NPY_NOINLINE static __declspec(noinline)
+#elif defined(__GNUC__) || defined(__clang__)
+ #define NPY_NOINLINE static __attribute__((noinline))
+#else
+ #define NPY_NOINLINE static
+#endif
+
#ifdef HAVE___THREAD
#define NPY_TLS __thread
#else
@@ -177,7 +137,7 @@
#define NPY_STEALS_REF_TO_ARG(n)
#endif
-/* 64 bit file position support, also on win-amd64. Ticket #1660 */
+/* 64 bit file position support, also on win-amd64. Issue gh-2256 */
#if defined(_MSC_VER) && defined(_WIN64) && (_MSC_VER > 1400) || \
defined(__MINGW32__) || defined(__MINGW64__)
#include <io.h>
diff --git a/numpy/core/include/numpy/npy_cpu.h b/numpy/core/include/numpy/npy_cpu.h
index 78d229e7d..a19f8e6bb 100644
--- a/numpy/core/include/numpy/npy_cpu.h
+++ b/numpy/core/include/numpy/npy_cpu.h
@@ -77,7 +77,7 @@
#elif defined(__ARMEL__) || defined(__AARCH64EL__) || defined(_M_ARM64)
#if defined(__ARM_32BIT_STATE)
#define NPY_CPU_ARMEL_AARCH32
- #elif defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
+ #elif defined(__ARM_64BIT_STATE) || defined(_M_ARM64) || defined(__AARCH64EL__)
#define NPY_CPU_ARMEL_AARCH64
#else
#define NPY_CPU_ARMEL
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index a1fd11396..2fcd41eb0 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -184,30 +184,22 @@ NPY_INPLACE double npy_atan2(double x, double y);
#define npy_fmod fmod
#define npy_floor floor
#define npy_expm1 expm1
+#define npy_log1p log1p
#define npy_acosh acosh
+#define npy_asinh asinh
#define npy_atanh atanh
#define npy_rint rint
#define npy_trunc trunc
#define npy_exp2 exp2
#define npy_frexp frexp
#define npy_ldexp ldexp
+#define npy_copysign copysign
#define npy_exp exp
#define npy_sqrt sqrt
#define npy_pow pow
#define npy_modf modf
#define npy_nextafter nextafter
-#if defined(__arm64__) && defined(__APPLE__)
-/* due to a build problem with scipy, export these as functions */
-NPY_INPLACE double npy_asinh(double x);
-NPY_INPLACE double npy_copysign(double y, double x);
-NPY_INPLACE double npy_log1p(double x);
-#else
-#define npy_asinh asinh
-#define npy_copysign copysign
-#define npy_log1p log1p
-#endif
-
double npy_spacing(double x);
/*
diff --git a/numpy/core/include/numpy/numpyconfig.h b/numpy/core/include/numpy/numpyconfig.h
index 308923b12..1c25aa5fc 100644
--- a/numpy/core/include/numpy/numpyconfig.h
+++ b/numpy/core/include/numpy/numpyconfig.h
@@ -3,31 +3,136 @@
#include "_numpyconfig.h"
+/*
+ * On Mac OS X, because there is only one configuration stage for all the archs
+ * in universal builds, any macro which depends on the arch needs to be
+ * hardcoded.
+ *
+ * Note that distutils/pip will attempt a universal2 build when Python itself
+ * is built as universal2, hence this hardcoding is needed even if we do not
+ * support universal2 wheels anymore (see gh-22796).
+ * This code block can be removed after we have dropped the setup.py based
+ * build completely.
+ */
+#ifdef __APPLE__
+ #undef NPY_SIZEOF_LONG
+ #undef NPY_SIZEOF_PY_INTPTR_T
+
+ #ifdef __LP64__
+ #define NPY_SIZEOF_LONG 8
+ #define NPY_SIZEOF_PY_INTPTR_T 8
+ #else
+ #define NPY_SIZEOF_LONG 4
+ #define NPY_SIZEOF_PY_INTPTR_T 4
+ #endif
+
+ #undef NPY_SIZEOF_LONGDOUBLE
+ #undef NPY_SIZEOF_COMPLEX_LONGDOUBLE
+ #ifdef HAVE_LDOUBLE_IEEE_DOUBLE_LE
+ #undef HAVE_LDOUBLE_IEEE_DOUBLE_LE
+ #endif
+ #ifdef HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE
+ #undef HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE
+ #endif
+
+ #if defined(__arm64__)
+ #define NPY_SIZEOF_LONGDOUBLE 8
+ #define NPY_SIZEOF_COMPLEX_LONGDOUBLE 16
+ #define HAVE_LDOUBLE_IEEE_DOUBLE_LE 1
+ #elif defined(__x86_64)
+ #define NPY_SIZEOF_LONGDOUBLE 16
+ #define NPY_SIZEOF_COMPLEX_LONGDOUBLE 32
+ #define HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE 1
+ #elif defined (__i386)
+ #define NPY_SIZEOF_LONGDOUBLE 12
+ #define NPY_SIZEOF_COMPLEX_LONGDOUBLE 24
+ #elif defined(__ppc__) || defined (__ppc64__)
+ #define NPY_SIZEOF_LONGDOUBLE 16
+ #define NPY_SIZEOF_COMPLEX_LONGDOUBLE 32
+ #else
+ #error "unknown architecture"
+ #endif
+#endif
+
+
/**
- * To help with the NPY_NO_DEPRECATED_API macro, we include API version
- * numbers for specific versions of NumPy. To exclude all API that was
- * deprecated as of 1.7, add the following before #including any NumPy
- * headers:
+ * To help with both NPY_TARGET_VERSION and the NPY_NO_DEPRECATED_API macro,
+ * we include API version numbers for specific versions of NumPy.
+ * To exclude all API that was deprecated as of 1.7, add the following before
+ * #including any NumPy headers:
* #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+ * The same is true for NPY_TARGET_VERSION, although NumPy will default to
+ * a backwards compatible build anyway.
*/
#define NPY_1_7_API_VERSION 0x00000007
#define NPY_1_8_API_VERSION 0x00000008
-#define NPY_1_9_API_VERSION 0x00000008
-#define NPY_1_10_API_VERSION 0x00000008
-#define NPY_1_11_API_VERSION 0x00000008
-#define NPY_1_12_API_VERSION 0x00000008
-#define NPY_1_13_API_VERSION 0x00000008
-#define NPY_1_14_API_VERSION 0x00000008
-#define NPY_1_15_API_VERSION 0x00000008
-#define NPY_1_16_API_VERSION 0x00000008
-#define NPY_1_17_API_VERSION 0x00000008
-#define NPY_1_18_API_VERSION 0x00000008
-#define NPY_1_19_API_VERSION 0x00000008
+#define NPY_1_9_API_VERSION 0x00000009
+#define NPY_1_10_API_VERSION 0x0000000a
+#define NPY_1_11_API_VERSION 0x0000000a
+#define NPY_1_12_API_VERSION 0x0000000a
+#define NPY_1_13_API_VERSION 0x0000000b
+#define NPY_1_14_API_VERSION 0x0000000c
+#define NPY_1_15_API_VERSION 0x0000000c
+#define NPY_1_16_API_VERSION 0x0000000d
+#define NPY_1_17_API_VERSION 0x0000000d
+#define NPY_1_18_API_VERSION 0x0000000d
+#define NPY_1_19_API_VERSION 0x0000000d
#define NPY_1_20_API_VERSION 0x0000000e
#define NPY_1_21_API_VERSION 0x0000000e
#define NPY_1_22_API_VERSION 0x0000000f
#define NPY_1_23_API_VERSION 0x00000010
#define NPY_1_24_API_VERSION 0x00000010
-#define NPY_1_25_API_VERSION 0x00000010
+#define NPY_1_25_API_VERSION 0x00000011
+
+
+/*
+ * Binary compatibility version number. This number is increased
+ * whenever the C-API is changed such that binary compatibility is
+ * broken, i.e. whenever a recompile of extension modules is needed.
+ */
+#define NPY_VERSION NPY_ABI_VERSION
+
+/*
+ * Minor API version we are compiling to be compatible with. The version
+ * Number is always increased when the API changes via: `NPY_API_VERSION`
+ * (and should maybe just track the NumPy version).
+ *
+ * If we have an internal build, we always target the current version of
+ * course.
+ *
+ * For downstream users, we default to an older version to provide them with
+ * maximum compatibility by default. Downstream can choose to extend that
+ * default, or narrow it down if they wish to use newer API. If you adjust
+ * this, consider the Python version support (example for 1.25.x):
+ *
+ * NumPy 1.25.x supports Python: 3.9 3.10 3.11 (3.12)
+ * NumPy 1.19.x supports Python: 3.6 3.7 3.8 3.9
+ * NumPy 1.17.x supports Python: 3.5 3.6 3.7 3.8
+ * NumPy 1.15.x supports Python: ... 3.6 3.7
+ *
+ * Users of the stable ABI may wish to target the last Python that is not
+ * end of life. This would be 3.8 at NumPy 1.25 release time.
+ * 1.17 as default was the choice of oldest-support-numpy at the time and
+ * has in practice no limit (compared to 1.19). Even earlier becomes legacy.
+ */
+#if defined(NPY_INTERNAL_BUILD) && NPY_INTERNAL_BUILD
+ /* NumPy internal build, always use current version. */
+ #define NPY_FEATURE_VERSION NPY_API_VERSION
+#elif defined(NPY_TARGET_VERSION) && NPY_TARGET_VERSION
+ /* user provided a target version, use it */
+ #define NPY_FEATURE_VERSION NPY_TARGET_VERSION
+#else
+ /* Use the default (increase when dropping Python 3.9 support) */
+ #define NPY_FEATURE_VERSION NPY_1_19_API_VERSION
+#endif
+
+/* Sanity check the (requested) feature version */
+#if NPY_FEATURE_VERSION > NPY_API_VERSION
+ #error "NPY_TARGET_VERSION higher than NumPy headers!"
+#elif NPY_FEATURE_VERSION < NPY_1_15_API_VERSION
+ /* No support for irrelevant old targets, no need for error, but warn. */
+ #warning "Requested NumPy target lower than supported NumPy 1.15."
+#endif
+
#endif /* NUMPY_CORE_INCLUDE_NUMPY_NPY_NUMPYCONFIG_H_ */
diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h
index bb0633100..9e00f2e56 100644
--- a/numpy/core/include/numpy/ufuncobject.h
+++ b/numpy/core/include/numpy/ufuncobject.h
@@ -197,7 +197,7 @@ typedef struct _tagPyUFuncObject {
npy_uint32 iter_flags;
/* New in NPY_API_VERSION 0x0000000D and above */
-
+ #if NPY_FEATURE_VERSION >= NPY_1_16_API_VERSION
/*
* for each core_num_dim_ix distinct dimension names,
* the possible "frozen" size (-1 if not frozen).
@@ -211,13 +211,15 @@ typedef struct _tagPyUFuncObject {
/* Identity for reduction, when identity == PyUFunc_IdentityValue */
PyObject *identity_value;
+ #endif /* NPY_FEATURE_VERSION >= NPY_1_16_API_VERSION */
/* New in NPY_API_VERSION 0x0000000F and above */
-
+ #if NPY_FEATURE_VERSION >= NPY_1_22_API_VERSION
/* New private fields related to dispatching */
void *_dispatch_cache;
/* A PyListObject of `(tuple of DTypes, ArrayMethod/Promoter)` */
PyObject *_loops;
+ #endif
} PyUFuncObject;
#include "arrayobject.h"
diff --git a/numpy/core/meson.build b/numpy/core/meson.build
index 50cd8ccc5..f375e9647 100644
--- a/numpy/core/meson.build
+++ b/numpy/core/meson.build
@@ -44,7 +44,8 @@ C_ABI_VERSION = '0x01000009'
# 0x0000000f - 1.22.x
# 0x00000010 - 1.23.x
# 0x00000010 - 1.24.x
-C_API_VERSION = '0x00000010'
+# 0x00000011 - 1.25.x
+C_API_VERSION = '0x00000011'
# Check whether we have a mismatch between the set C API VERSION and the
# actual C API VERSION. Will raise a MismatchCAPIError if so.
@@ -83,6 +84,9 @@ if use_svml
error('Missing the `SVML` git submodule! Run `git submodule update --init` to fix this.')
endif
endif
+if not fs.exists('src/npysort/x86-simd-sort/README.md')
+ error('Missing the `x86-simd-sort` git submodule! Run `git submodule update --init` to fix this.')
+endif
# Check sizes of types. Note, some of these landed in config.h before, but were
# unused. So clean that up and only define the NPY_SIZEOF flavors rather than
@@ -109,11 +113,19 @@ cdata.set('NPY_SIZEOF_PY_LONG_LONG',
if cc.has_header('complex.h')
cdata.set10('HAVE_COMPLEX_H', true)
cdata.set10('NPY_USE_C99_COMPLEX', true)
- complex_types_to_check = [
- ['NPY_HAVE_COMPLEX_FLOAT', 'NPY_SIZEOF_COMPLEX_FLOAT', 'complex float', 'float'],
- ['NPY_HAVE_COMPLEX_DOUBLE', 'NPY_SIZEOF_COMPLEX_DOUBLE', 'complex double', 'double'],
- ['NPY_HAVE_COMPLEX_LONG_DOUBLE', 'NPY_SIZEOF_COMPLEX_LONGDOUBLE', 'complex long double', 'long double'],
- ]
+ if cc.get_argument_syntax() == 'msvc'
+ complex_types_to_check = [
+ ['NPY_HAVE_COMPLEX_FLOAT', 'NPY_SIZEOF_COMPLEX_FLOAT', '_Fcomplex', 'float'],
+ ['NPY_HAVE_COMPLEX_DOUBLE', 'NPY_SIZEOF_COMPLEX_DOUBLE', '_Dcomplex', 'double'],
+ ['NPY_HAVE_COMPLEX_LONG_DOUBLE', 'NPY_SIZEOF_COMPLEX_LONGDOUBLE', '_Lcomplex', 'long double'],
+ ]
+ else
+ complex_types_to_check = [
+ ['NPY_HAVE_COMPLEX_FLOAT', 'NPY_SIZEOF_COMPLEX_FLOAT', 'complex float', 'float'],
+ ['NPY_HAVE_COMPLEX_DOUBLE', 'NPY_SIZEOF_COMPLEX_DOUBLE', 'complex double', 'double'],
+ ['NPY_HAVE_COMPLEX_LONG_DOUBLE', 'NPY_SIZEOF_COMPLEX_LONGDOUBLE', 'complex long double', 'long double'],
+ ]
+ endif
foreach symbol_type: complex_types_to_check
if cc.has_type(symbol_type[2], prefix: '#include <complex.h>')
cdata.set10(symbol_type[0], true)
@@ -133,7 +145,7 @@ mandatory_funcs = [
'floor', 'ceil', 'sqrt', 'log10', 'log', 'exp', 'asin',
'acos', 'atan', 'fmod', 'modf', 'frexp', 'ldexp',
'expm1', 'log1p', 'acosh', 'asinh', 'atanh',
- 'rint', 'trunc', 'exp2',
+ 'rint', 'trunc', 'exp2',
'copysign', 'nextafter', 'strtoll', 'strtoull', 'cbrt',
'log2', 'pow', 'hypot', 'atan2',
'csin', 'csinh', 'ccos', 'ccosh', 'ctan', 'ctanh',
@@ -147,7 +159,11 @@ endforeach
c99_complex_funcs = [
'cabs', 'cacos', 'cacosh', 'carg', 'casin', 'casinh', 'catan',
- 'catanh', 'cexp', 'clog', 'cpow', 'csqrt'
+ 'catanh', 'cexp', 'clog', 'cpow', 'csqrt',
+ # The long double variants (like csinl) should be mandatory on C11,
+ # but are missing in FreeBSD. Issue gh-22850
+ 'csin', 'csinh', 'ccos', 'ccosh', 'ctan', 'ctanh',
+
]
foreach func: c99_complex_funcs
func_single = func + 'f'
@@ -246,7 +262,9 @@ else
# function is not available in CI. For the latter there is a fallback path,
# but that is broken because we don't have the exact long double
# representation checks.
- cdata.set10('HAVE_STRTOLD_L', false)
+ if cc.get_argument_syntax() != 'msvc'
+ cdata.set10('HAVE_STRTOLD_L', false)
+ endif
endif
# Other optional functions
@@ -289,16 +307,7 @@ optional_function_attributes = [
['optimize("O3")', 'OPTIMIZE_OPT_3'],
['optimize("O2")', 'OPTIMIZE_OPT_2'],
['optimize("nonnull (1)")', 'NONNULL'],
- ]
-if host_machine.cpu_family() in ['x86', 'x86_64']
- optional_function_attributes += [
- ['target("avx")', 'TARGET_AVX'],
- ['target("avx2")', 'TARGET_AVX2'],
- ['target("avx512f")', 'TARGET_AVX512F'],
- ['target("avx512f,avx512dq,avx512bw,avx512vl,avx512cd")', 'TARGET_AVX512_SKX'],
- ]
- # TODO: add the _WITH_INTRINSICS_AVX list
-endif
+]
#foreach attr: optional_function_attributes
# if cc.has_function_attribute(attr[0])
# cdata.set10('HAVE_ATTRIBUTE_' + attr[1], true)
@@ -315,27 +324,10 @@ optional_intrinsics = [
['__builtin_bswap32', '5u', [], []],
['__builtin_bswap64', '5u', [], []],
['__builtin_expect', '5, 0', [], []],
- ['__builtin_mul_overflow', '5, 5, (int*)5', [], []],
+ # Test `long long` for arm+clang 13 (gh-22811, but we use all versions):
+ ['__builtin_mul_overflow', '(long long)5, 5, (int*)5', [], []],
+ ['__builtin_prefetch', '(float*)0, 0, 3', [], []],
]
-if host_machine.cpu_family() in ['x86', 'x86_64']
- optional_intrinsics += [
- # MMX only needed for icc, but some clang's don't have it
- ['_m_from_int64', '0', ['emmintrin.h'], []],
- ['_mm_load_ps', '(float*)0', ['xmmintrin.h'], []], # SSE
- ['_mm_prefetch', '(float*)0, _MM_HINT_NTA', ['xmmintrin.h'], []], # SSE
- ['_mm_load_pd', '(double*)0', ['emmintrin.h'], []], # SSE2
- ['__builtin_prefetch', '(float*)0, 0, 3', [], []],
- # Check that the linker can handle AVX
- ['__asm__ volatile', '"vpand %xmm1, %xmm2, %xmm3"', ['stdio.h'], ['HAVE_LINK_AVX']],
- ['__asm__ volatile', '"vpand %ymm1, %ymm2, %ymm3"', ['stdio.h'], ['HAVE_LINK_AVX2']],
- ['__asm__ volatile', '"vpaddd %zmm1, %zmm2, %zmm3"', ['stdio.h'], ['HAVE_LINK_AVX512F']],
- ['__asm__ volatile',
- '"vfpclasspd $0x40, %zmm15, %k6\\n vmovdqu8 %xmm0, %xmm1\\n vpbroadcastmb2q %k0, %xmm0"',
- ['stdio.h'], ['HAVE_LINK_AVX512_SKX']
- ],
- ['__asm__ volatile', '"xgetbv"', ['stdio.h'], ['HAVE_XGETBV']],
- ]
-endif
foreach intrin: optional_intrinsics
func = intrin[0]
func_args = intrin[1]
@@ -439,6 +431,7 @@ _numpyconfig_h = configure_file(
# ----------------------------
staticlib_cflags = []
+staticlib_cppflags = []
if cc.get_id() == 'msvc'
# Disable voltbl section for vc142 to allow link using mingw-w64; see:
# https://github.com/matthew-brett/dll_investigation/issues/1#issuecomment-1100468171
@@ -446,7 +439,13 @@ if cc.get_id() == 'msvc'
# libnpymath and libnpyrandom)
if cc.has_argument('-d2VolatileMetadata-')
staticlib_cflags += '-d2VolatileMetadata-'
- endif
+ endif
+endif
+# TODO: change to "feature" option in meson_options.txt? See
+# https://mesonbuild.com/Build-options.html#build-options
+if get_option('disable-simd-optimizations')
+ staticlib_cflags += '-DNPY_DISABLE_OPTIMIZATION'
+ staticlib_cppflags += '-DNPY_DISABLE_OPTIMIZATION'
endif
npy_math_internal_h = custom_target(
@@ -459,12 +458,13 @@ npymath_sources = [
src_file.process('src/npymath/ieee754.c.src'),
src_file.process('src/npymath/npy_math_complex.c.src'),
npy_math_internal_h,
- 'src/npymath/halffloat.c',
+ 'src/npymath/halffloat.cpp',
'src/npymath/npy_math.c',
]
npymath_lib = static_library('npymath',
npymath_sources,
c_args: staticlib_cflags,
+ cpp_args: staticlib_cppflags,
include_directories: ['include', 'src/npymath', 'src/common'],
dependencies: py_dep,
install: true,
@@ -534,7 +534,7 @@ src_umath_doc_h = custom_target('_umath_doc_generated',
)
src_numpy_api = custom_target('__multiarray_api',
- output : ['__multiarray_api.c', '__multiarray_api.h', 'multiarray_api.txt'],
+ output : ['__multiarray_api.c', '__multiarray_api.h'],
input : 'code_generators/generate_numpy_api.py',
command: [py, '@INPUT@', '-o', '@OUTDIR@', '--ignore', src_umath_api_c],
install: true, # NOTE: setup.py build installs all, but just need .h?
@@ -542,7 +542,7 @@ src_numpy_api = custom_target('__multiarray_api',
)
src_ufunc_api = custom_target('__ufunc_api',
- output : ['__ufunc_api.c', '__ufunc_api.h', 'ufunc_api.txt'],
+ output : ['__ufunc_api.c', '__ufunc_api.h'],
input : 'code_generators/generate_ufunc_api.py',
command: [py, '@INPUT@', '-o', '@OUTDIR@'],
install: true, # NOTE: setup.py build installs all, but just need .h?
@@ -571,9 +571,13 @@ c_args_common = [
# Same as NPY_CXX_FLAGS (TODO: extend for what ccompiler_opt adds)
cpp_args_common = c_args_common + [
'-D__STDC_VERSION__=0', # for compatibility with C headers
- '-fno-exceptions', # no exception support
- '-fno-rtti', # no runtime type information
]
+if cc.get_argument_syntax() != 'msvc'
+ cpp_args_common += [
+ '-fno-exceptions', # no exception support
+ '-fno-rtti', # no runtime type information
+ ]
+endif
# Other submodules depend on generated headers and include directories from
# core, wrap those up into a reusable dependency. Also useful for some test
@@ -589,7 +593,8 @@ np_core_dep = declare_dependency(
'.',
'include',
'src/common',
- ]
+ ],
+ compile_args: disable_simd_optimizations
)
@@ -684,6 +689,7 @@ src_multiarray = [
'src/multiarray/dtypemeta.c',
'src/multiarray/dragon4.c',
'src/multiarray/dtype_transfer.c',
+ 'src/multiarray/dtype_traversal.c',
src_file.process('src/multiarray/einsum.c.src'),
src_file.process('src/multiarray/einsum_sumprod.c.src'),
'src/multiarray/experimental_public_dtype_api.c',
@@ -713,7 +719,8 @@ src_multiarray = [
'src/multiarray/usertypes.c',
'src/multiarray/vdot.c',
src_file.process('src/common/npy_sort.h.src'),
- 'src/npysort/x86-qsort.dispatch.cpp',
+ 'src/npysort/simd_qsort.dispatch.cpp',
+ 'src/npysort/simd_qsort_16bit.dispatch.cpp',
'src/npysort/quicksort.cpp',
'src/npysort/mergesort.cpp',
'src/npysort/timsort.cpp',
@@ -731,6 +738,9 @@ src_multiarray = [
'src/multiarray/textreading/stream_pyobject.c',
'src/multiarray/textreading/str_to_int.c',
'src/multiarray/textreading/tokenize.cpp',
+ # Remove this `arm64_exports.c` file once scipy macos arm64 build correctly
+ # links to the arm64 npymath library, see gh-22673
+ 'src/npymath/arm64_exports.c',
]
src_umath = [
@@ -743,15 +753,18 @@ src_umath = [
src_file.process('src/umath/loops_comparison.dispatch.c.src'),
src_file.process('src/umath/loops_exponent_log.dispatch.c.src'),
src_file.process('src/umath/loops_hyperbolic.dispatch.c.src'),
+ src_file.process('src/umath/loops_logical.dispatch.c.src'),
src_file.process('src/umath/loops_minmax.dispatch.c.src'),
src_file.process('src/umath/loops_modulo.dispatch.c.src'),
src_file.process('src/umath/loops_trigonometric.dispatch.c.src'),
src_file.process('src/umath/loops_umath_fp.dispatch.c.src'),
src_file.process('src/umath/loops_unary.dispatch.c.src'),
src_file.process('src/umath/loops_unary_fp.dispatch.c.src'),
+ src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'),
+ src_file.process('src/umath/loops_unary_complex.dispatch.c.src'),
+ src_file.process('src/umath/loops_autovec.dispatch.c.src'),
src_file.process('src/umath/matmul.c.src'),
src_file.process('src/umath/matmul.h.src'),
- src_file.process('src/umath/simd.inc.src'),
'src/umath/ufunc_type_resolution.c',
'src/umath/clip.cpp',
'src/umath/clip.h',
@@ -864,9 +877,9 @@ py.extension_module('_simd',
src_file.process('src/_simd/_simd.dispatch.c.src'),
],
c_args: c_args_common,
- #include_directories: ['src/multiarray', 'src/npymath'],
- include_directories: ['src/_simd'],
+ include_directories: ['src/_simd', 'src/npymath'],
dependencies: np_core_dep,
+ link_with: npymath_lib,
install: true,
subdir: 'numpy/core',
)
diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py
index 636a6fbbd..d11283345 100644
--- a/numpy/core/multiarray.py
+++ b/numpy/core/multiarray.py
@@ -14,10 +14,10 @@ from ._multiarray_umath import * # noqa: F403
# do not change them. issue gh-15518
# _get_ndarray_c_version is semi-public, on purpose not added to __all__
from ._multiarray_umath import (
- fastCopyAndTranspose, _flagdict, from_dlpack, _insert, _reconstruct,
+ fastCopyAndTranspose, _flagdict, from_dlpack, _place, _reconstruct,
_vec_string, _ARRAY_API, _monotonicity, _get_ndarray_c_version,
_get_madvise_hugepage, _set_madvise_hugepage,
- _get_promotion_state, _set_promotion_state,
+ _get_promotion_state, _set_promotion_state, _using_numpy2_behavior
)
__all__ = [
@@ -25,7 +25,7 @@ __all__ = [
'ITEM_HASOBJECT', 'ITEM_IS_POINTER', 'LIST_PICKLE', 'MAXDIMS',
'MAY_SHARE_BOUNDS', 'MAY_SHARE_EXACT', 'NEEDS_INIT', 'NEEDS_PYAPI',
'RAISE', 'USE_GETITEM', 'USE_SETITEM', 'WRAP',
- '_flagdict', 'from_dlpack', '_insert', '_reconstruct', '_vec_string',
+ '_flagdict', 'from_dlpack', '_place', '_reconstruct', '_vec_string',
'_monotonicity', 'add_docstring', 'arange', 'array', 'asarray',
'asanyarray', 'ascontiguousarray', 'asfortranarray', 'bincount',
'broadcast', 'busday_count', 'busday_offset', 'busdaycalendar', 'can_cast',
@@ -42,7 +42,7 @@ __all__ = [
'set_legacy_print_mode', 'set_numeric_ops', 'set_string_function',
'set_typeDict', 'shares_memory', 'tracemalloc_domain', 'typeinfo',
'unpackbits', 'unravel_index', 'vdot', 'where', 'zeros',
- '_get_promotion_state', '_set_promotion_state']
+ '_get_promotion_state', '_set_promotion_state', '_using_numpy2_behavior']
# For backward compatibility, make sure pickle imports these functions from here
_reconstruct.__module__ = 'numpy.core.multiarray'
@@ -72,6 +72,7 @@ seterrobj.__module__ = 'numpy'
zeros.__module__ = 'numpy'
_get_promotion_state.__module__ = 'numpy'
_set_promotion_state.__module__ = 'numpy'
+_using_numpy2_behavior.__module__ = 'numpy'
# We can't verify dispatcher signatures because NumPy's C functions don't
@@ -714,7 +715,7 @@ def result_type(*arrays_and_dtypes):
the data types are combined with :func:`promote_types`
to produce the return value.
- Otherwise, `min_scalar_type` is called on each array, and
+ Otherwise, `min_scalar_type` is called on each scalar, and
the resulting data types are all combined with :func:`promote_types`
to produce the return value.
@@ -1128,7 +1129,7 @@ def copyto(dst, src, casting=None, where=None):
@array_function_from_c_func_and_dispatcher(_multiarray_umath.putmask)
-def putmask(a, mask, values):
+def putmask(a, /, mask, values):
"""
putmask(a, mask, values)
diff --git a/numpy/core/multiarray.pyi b/numpy/core/multiarray.pyi
index 0701085b7..dc05f8126 100644
--- a/numpy/core/multiarray.pyi
+++ b/numpy/core/multiarray.pyi
@@ -428,6 +428,7 @@ def copyto(
def putmask(
a: NDArray[Any],
+ /,
mask: _ArrayLikeBool_co,
values: ArrayLike,
) -> None: ...
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 577f8e7cd..91ac3f860 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -4,6 +4,7 @@ import operator
import sys
import warnings
import numbers
+import builtins
import numpy as np
from . import multiarray
@@ -17,7 +18,8 @@ from .multiarray import (
fromstring, inner, lexsort, matmul, may_share_memory,
min_scalar_type, ndarray, nditer, nested_iters, promote_types,
putmask, result_type, set_numeric_ops, shares_memory, vdot, where,
- zeros, normalize_axis_index, _get_promotion_state, _set_promotion_state)
+ zeros, normalize_axis_index, _get_promotion_state, _set_promotion_state,
+ _using_numpy2_behavior)
from . import overrides
from . import umath
@@ -54,7 +56,8 @@ __all__ = [
'False_', 'True_', 'bitwise_not', 'CLIP', 'RAISE', 'WRAP', 'MAXDIMS',
'BUFSIZE', 'ALLOW_THREADS', 'full', 'full_like',
'matmul', 'shares_memory', 'may_share_memory', 'MAY_SHARE_BOUNDS',
- 'MAY_SHARE_EXACT', '_get_promotion_state', '_set_promotion_state']
+ 'MAY_SHARE_EXACT', '_get_promotion_state', '_set_promotion_state',
+ '_using_numpy2_behavior']
def _zeros_like_dispatcher(a, dtype=None, order=None, subok=None, shape=None):
@@ -130,10 +133,6 @@ def zeros_like(a, dtype=None, order='K', subok=True, shape=None):
return res
-def _ones_dispatcher(shape, dtype=None, order=None, *, like=None):
- return(like,)
-
-
@set_array_function_like_doc
@set_module('numpy')
def ones(shape, dtype=None, order='C', *, like=None):
@@ -187,16 +186,14 @@ def ones(shape, dtype=None, order='C', *, like=None):
"""
if like is not None:
- return _ones_with_like(shape, dtype=dtype, order=order, like=like)
+ return _ones_with_like(like, shape, dtype=dtype, order=order)
a = empty(shape, dtype, order)
multiarray.copyto(a, 1, casting='unsafe')
return a
-_ones_with_like = array_function_dispatch(
- _ones_dispatcher
-)(ones)
+_ones_with_like = array_function_dispatch()(ones)
def _ones_like_dispatcher(a, dtype=None, order=None, subok=None, shape=None):
@@ -323,7 +320,8 @@ def full(shape, fill_value, dtype=None, order='C', *, like=None):
"""
if like is not None:
- return _full_with_like(shape, fill_value, dtype=dtype, order=order, like=like)
+ return _full_with_like(
+ like, shape, fill_value, dtype=dtype, order=order)
if dtype is None:
fill_value = asarray(fill_value)
@@ -333,9 +331,7 @@ def full(shape, fill_value, dtype=None, order='C', *, like=None):
return a
-_full_with_like = array_function_dispatch(
- _full_dispatcher
-)(full)
+_full_with_like = array_function_dispatch()(full)
def _full_like_dispatcher(a, fill_value, dtype=None, order=None, subok=None, shape=None):
@@ -847,14 +843,13 @@ def outer(a, b, out=None):
"""
Compute the outer product of two vectors.
- Given two vectors, ``a = [a0, a1, ..., aM]`` and
- ``b = [b0, b1, ..., bN]``,
+ Given two vectors `a` and `b` of length ``M`` and ``N``, respectively,
the outer product [1]_ is::
- [[a0*b0 a0*b1 ... a0*bN ]
- [a1*b0 .
+ [[a_0*b_0 a_0*b_1 ... a_0*b_{N-1} ]
+ [a_1*b_0 .
[ ... .
- [aM*b0 aM*bN ]]
+ [a_{M-1}*b_0 a_{M-1}*b_{N-1} ]]
Parameters
----------
@@ -886,9 +881,9 @@ def outer(a, b, out=None):
References
----------
- .. [1] : G. H. Golub and C. F. Van Loan, *Matrix Computations*, 3rd
- ed., Baltimore, MD, Johns Hopkins University Press, 1996,
- pg. 8.
+ .. [1] G. H. Golub and C. F. Van Loan, *Matrix Computations*, 3rd
+ ed., Baltimore, MD, Johns Hopkins University Press, 1996,
+ pg. 8.
Examples
--------
@@ -1778,10 +1773,6 @@ def indices(dimensions, dtype=int, sparse=False):
return res
-def _fromfunction_dispatcher(function, shape, *, dtype=None, like=None, **kwargs):
- return (like,)
-
-
@set_array_function_like_doc
@set_module('numpy')
def fromfunction(function, shape, *, dtype=float, like=None, **kwargs):
@@ -1847,15 +1838,14 @@ def fromfunction(function, shape, *, dtype=float, like=None, **kwargs):
"""
if like is not None:
- return _fromfunction_with_like(function, shape, dtype=dtype, like=like, **kwargs)
+ return _fromfunction_with_like(
+ like, function, shape, dtype=dtype, **kwargs)
args = indices(shape, dtype=dtype)
return function(*args, **kwargs)
-_fromfunction_with_like = array_function_dispatch(
- _fromfunction_dispatcher
-)(fromfunction)
+_fromfunction_with_like = array_function_dispatch()(fromfunction)
def _frombuffer(buf, dtype, shape, order):
@@ -2033,7 +2023,7 @@ def binary_repr(num, width=None):
binary = bin(num)[2:]
binwidth = len(binary)
outwidth = (binwidth if width is None
- else max(binwidth, width))
+ else builtins.max(binwidth, width))
warn_if_insufficient(width, binwidth)
return binary.zfill(outwidth)
@@ -2053,7 +2043,7 @@ def binary_repr(num, width=None):
binary = bin(twocomp)[2:]
binwidth = len(binary)
- outwidth = max(binwidth, width)
+ outwidth = builtins.max(binwidth, width)
warn_if_insufficient(width, binwidth)
return '1' * (outwidth - binwidth) + binary
@@ -2130,10 +2120,6 @@ def _maketup(descr, val):
return tuple(res)
-def _identity_dispatcher(n, dtype=None, *, like=None):
- return (like,)
-
-
@set_array_function_like_doc
@set_module('numpy')
def identity(n, dtype=None, *, like=None):
@@ -2168,15 +2154,13 @@ def identity(n, dtype=None, *, like=None):
"""
if like is not None:
- return _identity_with_like(n, dtype=dtype, like=like)
+ return _identity_with_like(like, n, dtype=dtype)
from numpy import eye
return eye(n, dtype=dtype, like=like)
-_identity_with_like = array_function_dispatch(
- _identity_dispatcher
-)(identity)
+_identity_with_like = array_function_dispatch()(identity)
def _allclose_dispatcher(a, b, rtol=None, atol=None, equal_nan=None):
diff --git a/numpy/core/numerictypes.py b/numpy/core/numerictypes.py
index 7a5948025..aea41bc2e 100644
--- a/numpy/core/numerictypes.py
+++ b/numpy/core/numerictypes.py
@@ -80,6 +80,7 @@ Exported symbols include:
"""
import numbers
+import warnings
from .multiarray import (
ndarray, array, dtype, datetime_data, datetime_as_string,
@@ -599,6 +600,16 @@ def find_common_type(array_types, scalar_types):
"""
Determine common type following standard coercion rules.
+ .. deprecated:: NumPy 1.25
+
+ This function is deprecated, use `numpy.promote_types` or
+ `numpy.result_type` instead. To achieve semantics for the
+ `scalar_types` argument, use `numpy.result_type` and pass the Python
+ values `0`, `0.0`, or `0j`.
+ This will give the same results in almost all cases.
+ More information and rare exception can be found in the
+ `NumPy 1.25 release notes <https://numpy.org/devdocs/release/1.25.0-notes.html>`_.
+
Parameters
----------
array_types : sequence
@@ -646,6 +657,14 @@ def find_common_type(array_types, scalar_types):
dtype('complex128')
"""
+ # Deprecated 2022-11-07, NumPy 1.25
+ warnings.warn(
+ "np.find_common_type is deprecated. Please use `np.result_type` "
+ "or `np.promote_types`.\n"
+ "See https://numpy.org/devdocs/release/1.25.0-notes.html and the "
+ "docs for more information. (Deprecated NumPy 1.25)",
+ DeprecationWarning, stacklevel=2)
+
array_types = [dtype(x) for x in array_types]
scalar_types = [dtype(x) for x in scalar_types]
diff --git a/numpy/core/numerictypes.pyi b/numpy/core/numerictypes.pyi
index d10e4822a..d05861b2e 100644
--- a/numpy/core/numerictypes.pyi
+++ b/numpy/core/numerictypes.pyi
@@ -118,11 +118,6 @@ def issubdtype(arg1: DTypeLike, arg2: DTypeLike) -> bool: ...
def sctype2char(sctype: DTypeLike) -> str: ...
-def find_common_type(
- array_types: Iterable[DTypeLike],
- scalar_types: Iterable[DTypeLike],
-) -> dtype[Any]: ...
-
cast: _typedict[_CastFunc]
nbytes: _typedict[int]
typecodes: _TypeCodes
diff --git a/numpy/core/overrides.py b/numpy/core/overrides.py
index 46e1fbe2c..6403e65b0 100644
--- a/numpy/core/overrides.py
+++ b/numpy/core/overrides.py
@@ -6,14 +6,11 @@ import os
from .._utils import set_module
from .._utils._inspect import getargspec
from numpy.core._multiarray_umath import (
- add_docstring, implement_array_function, _get_implementing_args)
+ add_docstring, _get_implementing_args, _ArrayFunctionDispatcher)
ARRAY_FUNCTIONS = set()
-ARRAY_FUNCTION_ENABLED = bool(
- int(os.environ.get('NUMPY_EXPERIMENTAL_ARRAY_FUNCTION', 1)))
-
array_function_like_doc = (
"""like : array_like, optional
Reference object to allow the creation of arrays which are not
@@ -33,40 +30,35 @@ def set_array_function_like_doc(public_api):
add_docstring(
- implement_array_function,
+ _ArrayFunctionDispatcher,
"""
- Implement a function with checks for __array_function__ overrides.
+ Class to wrap functions with checks for __array_function__ overrides.
All arguments are required, and can only be passed by position.
Parameters
----------
+ dispatcher : function or None
+ The dispatcher function that returns a single sequence-like object
+ of all arguments relevant. It must have the same signature (except
+ the default values) as the actual implementation.
+ If ``None``, this is a ``like=`` dispatcher and the
+ ``_ArrayFunctionDispatcher`` must be called with ``like`` as the
+ first (additional and positional) argument.
implementation : function
- Function that implements the operation on NumPy array without
- overrides when called like ``implementation(*args, **kwargs)``.
- public_api : function
- Function exposed by NumPy's public API originally called like
- ``public_api(*args, **kwargs)`` on which arguments are now being
- checked.
- relevant_args : iterable
- Iterable of arguments to check for __array_function__ methods.
- args : tuple
- Arbitrary positional arguments originally passed into ``public_api``.
- kwargs : dict
- Arbitrary keyword arguments originally passed into ``public_api``.
-
- Returns
- -------
- Result from calling ``implementation()`` or an ``__array_function__``
- method, as appropriate.
+ Function that implements the operation on NumPy arrays without
+ overrides. Arguments passed calling the ``_ArrayFunctionDispatcher``
+ will be forwarded to this (and the ``dispatcher``) as if using
+ ``*args, **kwargs``.
- Raises
- ------
- TypeError : if no implementation is found.
+ Attributes
+ ----------
+ _implementation : function
+ The original implementation passed in.
""")
-# exposed for testing purposes; used internally by implement_array_function
+# exposed for testing purposes; used internally by _ArrayFunctionDispatcher
add_docstring(
_get_implementing_args,
"""
@@ -110,7 +102,7 @@ def verify_matching_signatures(implementation, dispatcher):
'default argument values')
-def array_function_dispatch(dispatcher, module=None, verify=True,
+def array_function_dispatch(dispatcher=None, module=None, verify=True,
docs_from_dispatcher=False):
"""Decorator for adding dispatch with the __array_function__ protocol.
@@ -118,10 +110,14 @@ def array_function_dispatch(dispatcher, module=None, verify=True,
Parameters
----------
- dispatcher : callable
+ dispatcher : callable or None
Function that when called like ``dispatcher(*args, **kwargs)`` with
arguments from the NumPy function call returns an iterable of
array-like arguments to check for ``__array_function__``.
+
+ If `None`, the first argument is used as the single `like=` argument
+ and not passed on. A function implementing `like=` must call its
+ dispatcher with `like` as the first non-keyword argument.
module : str, optional
__module__ attribute to set on new function, e.g., ``module='numpy'``.
By default, module is copied from the decorated function.
@@ -141,58 +137,32 @@ def array_function_dispatch(dispatcher, module=None, verify=True,
Returns
-------
Function suitable for decorating the implementation of a NumPy function.
- """
-
- if not ARRAY_FUNCTION_ENABLED:
- def decorator(implementation):
- if docs_from_dispatcher:
- add_docstring(implementation, dispatcher.__doc__)
- if module is not None:
- implementation.__module__ = module
- return implementation
- return decorator
+ """
def decorator(implementation):
if verify:
- verify_matching_signatures(implementation, dispatcher)
+ if dispatcher is not None:
+ verify_matching_signatures(implementation, dispatcher)
+ else:
+ # Using __code__ directly similar to verify_matching_signature
+ co = implementation.__code__
+ last_arg = co.co_argcount + co.co_kwonlyargcount - 1
+ last_arg = co.co_varnames[last_arg]
+ if last_arg != "like" or co.co_kwonlyargcount == 0:
+ raise RuntimeError(
+ "__array_function__ expects `like=` to be the last "
+ "argument and a keyword-only argument. "
+ f"{implementation} does not seem to comply.")
if docs_from_dispatcher:
add_docstring(implementation, dispatcher.__doc__)
- @functools.wraps(implementation)
- def public_api(*args, **kwargs):
- try:
- relevant_args = dispatcher(*args, **kwargs)
- except TypeError as exc:
- # Try to clean up a signature related TypeError. Such an
- # error will be something like:
- # dispatcher.__name__() got an unexpected keyword argument
- #
- # So replace the dispatcher name in this case. In principle
- # TypeErrors may be raised from _within_ the dispatcher, so
- # we check that the traceback contains a string that starts
- # with the name. (In principle we could also check the
- # traceback length, as it would be deeper.)
- msg = exc.args[0]
- disp_name = dispatcher.__name__
- if not isinstance(msg, str) or not msg.startswith(disp_name):
- raise
-
- # Replace with the correct name and re-raise:
- new_msg = msg.replace(disp_name, public_api.__name__)
- raise TypeError(new_msg) from None
-
- return implement_array_function(
- implementation, public_api, relevant_args, args, kwargs)
-
- public_api.__code__ = public_api.__code__.replace(
- co_name=implementation.__name__,
- co_filename='<__array_function__ internals>')
+ public_api = _ArrayFunctionDispatcher(dispatcher, implementation)
+ public_api = functools.wraps(implementation)(public_api)
+
if module is not None:
public_api.__module__ = module
- public_api._implementation = implementation
-
ARRAY_FUNCTIONS.add(public_api)
return public_api
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index da5bc64c0..8ef012f70 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -20,7 +20,7 @@ from setup_common import * # noqa: F403
NPY_RELAXED_STRIDES_CHECKING = (os.environ.get('NPY_RELAXED_STRIDES_CHECKING', "1") != "0")
if not NPY_RELAXED_STRIDES_CHECKING:
raise SystemError(
- "Support for NPY_RELAXED_STRIDES_CHECKING=0 has been remove as of "
+ "Support for NPY_RELAXED_STRIDES_CHECKING=0 has been removed as of "
"NumPy 1.23. This error will eventually be removed entirely.")
# Put NPY_RELAXED_STRIDES_DEBUG=1 in the environment if you want numpy to use a
@@ -79,11 +79,13 @@ def can_link_svml():
and "linux" in platform
and sys.maxsize > 2**31)
-def check_svml_submodule(svmlpath):
- if not os.path.exists(svmlpath + "/README.md"):
- raise RuntimeError("Missing `SVML` submodule! Run `git submodule "
- "update --init` to fix this.")
- return True
+def check_git_submodules():
+ out = os.popen("git submodule status")
+ modules = out.readlines()
+ for submodule in modules:
+ if (submodule.strip()[0] == "-"):
+ raise RuntimeError("git submodules are not initialized. "
+ "Please run `git submodule update --init` to fix this.")
def pythonlib_dir():
"""return path where libpython* is."""
@@ -171,18 +173,6 @@ def check_math_capabilities(config, ext, moredefs, mathlibs):
else:
return 1
- # NOTE: not needed in Meson build, we set the minimum
- # compiler version to 8.4 to avoid this bug
- # GH-14787: Work around GCC<8.4 bug when compiling with AVX512
- # support on Windows-based platforms
- def check_gh14787(fn):
- if fn == 'attribute_target_avx512f':
- if (sys.platform in ('win32', 'cygwin') and
- config.check_compiler_gcc() and
- not config.check_gcc_version_at_least(8, 4)):
- ext.extra_compile_args.extend(
- ['-ffixed-xmm%s' % n for n in range(16, 32)])
-
#use_msvc = config.check_decl("_MSC_VER")
if not check_funcs_once(MANDATORY_FUNCS, add_to_moredefs=False):
raise SystemError("One of the required function to build numpy is not"
@@ -233,20 +223,8 @@ def check_math_capabilities(config, ext, moredefs, mathlibs):
for dec, fn in OPTIONAL_FUNCTION_ATTRIBUTES:
if config.check_gcc_function_attribute(dec, fn):
moredefs.append((fname2def(fn), 1))
- check_gh14787(fn)
platform = sysconfig.get_platform()
- if ("x86_64" in platform):
- for dec, fn in OPTIONAL_FUNCTION_ATTRIBUTES_AVX:
- if config.check_gcc_function_attribute(dec, fn):
- moredefs.append((fname2def(fn), 1))
- check_gh14787(fn)
- for dec, fn, code, header in (
- OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS_AVX):
- if config.check_gcc_function_attribute_with_intrinsics(
- dec, fn, code, header):
- moredefs.append((fname2def(fn), 1))
-
for fn in OPTIONAL_VARIABLE_ATTRIBUTES:
if config.check_gcc_variable_attribute(fn):
m = fn.replace("(", "_").replace(")", "_")
@@ -427,7 +405,6 @@ def configuration(parent_package='',top_path=None):
exec_mod_from_location)
from numpy.distutils.system_info import (get_info, blas_opt_info,
lapack_opt_info)
- from numpy.distutils.ccompiler_opt import NPY_CXX_FLAGS
from numpy.version import release as is_released
config = Configuration('core', parent_package, top_path)
@@ -438,6 +415,8 @@ def configuration(parent_package='',top_path=None):
# actual C API VERSION. Will raise a MismatchCAPIError if so.
check_api_version(C_API_VERSION, codegen_dir)
+ check_git_submodules()
+
generate_umath_py = join(codegen_dir, 'generate_umath.py')
n = dot_join(config.name, 'generate_umath')
generate_umath = exec_mod_from_location('_'.join(n.split('.')),
@@ -630,11 +609,11 @@ def configuration(parent_package='',top_path=None):
try:
m = __import__(module_name)
log.info('executing %s', script)
- h_file, c_file, doc_file = m.generate_api(os.path.join(build_dir, header_dir))
+ h_file, c_file = m.generate_api(os.path.join(build_dir, header_dir))
finally:
del sys.path[0]
config.add_data_files((header_dir, h_file),
- (header_dir, doc_file))
+ )
return (h_file,)
return generate_api
@@ -678,44 +657,6 @@ def configuration(parent_package='',top_path=None):
# but we cannot use add_installed_pkg_config here either, so we only
# update the substitution dictionary during npymath build
config_cmd = config.get_config_cmd()
- # Check that the toolchain works, to fail early if it doesn't
- # (avoid late errors with MATHLIB which are confusing if the
- # compiler does not work).
- for lang, test_code, note in (
- ('c', 'int main(void) { return 0;}', ''),
- ('c++', (
- 'int main(void)'
- '{ auto x = 0.0; return static_cast<int>(x); }'
- ), (
- 'note: A compiler with support for C++11 language '
- 'features is required.'
- )
- ),
- ):
- is_cpp = lang == 'c++'
- if is_cpp:
- # this a workaround to get rid of invalid c++ flags
- # without doing big changes to config.
- # c tested first, compiler should be here
- bk_c = config_cmd.compiler
- config_cmd.compiler = bk_c.cxx_compiler()
-
- # Check that Linux compiler actually support the default flags
- if hasattr(config_cmd.compiler, 'compiler'):
- config_cmd.compiler.compiler.extend(NPY_CXX_FLAGS)
- config_cmd.compiler.compiler_so.extend(NPY_CXX_FLAGS)
-
- st = config_cmd.try_link(test_code, lang=lang)
- if not st:
- # rerun the failing command in verbose mode
- config_cmd.compiler.verbose = True
- config_cmd.try_link(test_code, lang=lang)
- raise RuntimeError(
- f"Broken toolchain: cannot link a simple {lang.upper()} "
- f"program. {note}"
- )
- if is_cpp:
- config_cmd.compiler = bk_c
mlibs = check_mathlib(config_cmd)
posix_mlib = ' '.join(['-l%s' % l for l in mlibs])
@@ -728,11 +669,7 @@ def configuration(parent_package='',top_path=None):
# join('src', 'npymath', 'ieee754.cpp'),
join('src', 'npymath', 'ieee754.c.src'),
join('src', 'npymath', 'npy_math_complex.c.src'),
- join('src', 'npymath', 'halffloat.c'),
- # Remove this once scipy macos arm64 build correctly
- # links to the arm64 npymath library,
- # see gh-22673
- join('src', 'npymath', 'arm64_exports.c'),
+ join('src', 'npymath', 'halffloat.cpp'),
]
config.add_installed_library('npymath',
@@ -790,7 +727,8 @@ def configuration(parent_package='',top_path=None):
join('src', 'common', 'numpyos.h'),
join('src', 'common', 'npy_cpu_dispatch.h'),
join('src', 'common', 'simd', 'simd.h'),
- ]
+ join('src', 'common', 'common.hpp'),
+ ]
common_src = [
join('src', 'common', 'array_assign.c'),
@@ -844,6 +782,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'descriptor.h'),
join('src', 'multiarray', 'dtypemeta.h'),
join('src', 'multiarray', 'dtype_transfer.h'),
+ join('src', 'multiarray', 'dtype_traversal.h'),
join('src', 'multiarray', 'dragon4.h'),
join('src', 'multiarray', 'einsum_debug.h'),
join('src', 'multiarray', 'einsum_sumprod.h'),
@@ -917,6 +856,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'dtypemeta.c'),
join('src', 'multiarray', 'dragon4.c'),
join('src', 'multiarray', 'dtype_transfer.c'),
+ join('src', 'multiarray', 'dtype_traversal.c'),
join('src', 'multiarray', 'einsum.c.src'),
join('src', 'multiarray', 'einsum_sumprod.c.src'),
join('src', 'multiarray', 'experimental_public_dtype_api.c'),
@@ -946,7 +886,6 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'usertypes.c'),
join('src', 'multiarray', 'vdot.c'),
join('src', 'common', 'npy_sort.h.src'),
- join('src', 'npysort', 'x86-qsort.dispatch.cpp'),
join('src', 'npysort', 'quicksort.cpp'),
join('src', 'npysort', 'mergesort.cpp'),
join('src', 'npysort', 'timsort.cpp'),
@@ -964,6 +903,12 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'textreading', 'stream_pyobject.c'),
join('src', 'multiarray', 'textreading', 'str_to_int.c'),
join('src', 'multiarray', 'textreading', 'tokenize.cpp'),
+ # Remove this once scipy macos arm64 build correctly
+ # links to the arm64 npymath library,
+ # see gh-22673
+ join('src', 'npymath', 'arm64_exports.c'),
+ join('src', 'npysort', 'simd_qsort.dispatch.cpp'),
+ join('src', 'npysort', 'simd_qsort_16bit.dispatch.cpp'),
]
#######################################################################
@@ -1001,14 +946,15 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'umathmodule.c'),
join('src', 'umath', 'reduction.c'),
join('src', 'umath', 'funcs.inc.src'),
- join('src', 'umath', 'simd.inc.src'),
join('src', 'umath', 'loops.h.src'),
join('src', 'umath', 'loops_utils.h.src'),
join('src', 'umath', 'loops.c.src'),
join('src', 'umath', 'loops_unary.dispatch.c.src'),
join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
+ join('src', 'umath', 'loops_unary_fp_le.dispatch.c.src'),
join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
+ join('src', 'umath', 'loops_logical.dispatch.c.src'),
join('src', 'umath', 'loops_minmax.dispatch.c.src'),
join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
join('src', 'umath', 'loops_umath_fp.dispatch.c.src'),
@@ -1016,6 +962,8 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'loops_hyperbolic.dispatch.c.src'),
join('src', 'umath', 'loops_modulo.dispatch.c.src'),
join('src', 'umath', 'loops_comparison.dispatch.c.src'),
+ join('src', 'umath', 'loops_unary_complex.dispatch.c.src'),
+ join('src', 'umath', 'loops_autovec.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'clip.h'),
@@ -1040,7 +988,6 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'common.h'),
join('src', 'multiarray', 'number.h'),
join('src', 'common', 'templ_common.h.src'),
- join('src', 'umath', 'simd.inc.src'),
join('src', 'umath', 'override.h'),
join(codegen_dir, 'generate_ufunc_api.py'),
join(codegen_dir, 'ufunc_docstrings.py'),
@@ -1055,7 +1002,7 @@ def configuration(parent_package='',top_path=None):
# after all maintainable code.
svml_filter = (
)
- if can_link_svml() and check_svml_submodule(svml_path):
+ if can_link_svml():
svml_objs = glob.glob(svml_path + '/**/*.s', recursive=True)
svml_objs = [o for o in svml_objs if not o.endswith(svml_filter)]
@@ -1064,9 +1011,6 @@ def configuration(parent_package='',top_path=None):
svml_objs.sort()
config.add_extension('_multiarray_umath',
- # Forcing C language even though we have C++ sources.
- # It forces the C linker and don't link C++ runtime.
- language = 'c',
sources=multiarray_src + umath_src +
common_src +
[generate_config_h,
@@ -1082,8 +1026,7 @@ def configuration(parent_package='',top_path=None):
common_deps,
libraries=['npymath'],
extra_objects=svml_objs,
- extra_info=extra_info,
- extra_cxx_compile_args=NPY_CXX_FLAGS)
+ extra_info=extra_info)
#######################################################################
# umath_tests module #
@@ -1121,23 +1064,26 @@ def configuration(parent_package='',top_path=None):
# SIMD module #
#######################################################################
- config.add_extension('_simd', sources=[
- join('src', 'common', 'npy_cpu_features.c'),
- join('src', '_simd', '_simd.c'),
- join('src', '_simd', '_simd_inc.h.src'),
- join('src', '_simd', '_simd_data.inc.src'),
- join('src', '_simd', '_simd.dispatch.c.src'),
- ], depends=[
- join('src', 'common', 'npy_cpu_dispatch.h'),
- join('src', 'common', 'simd', 'simd.h'),
- join('src', '_simd', '_simd.h'),
- join('src', '_simd', '_simd_inc.h.src'),
- join('src', '_simd', '_simd_data.inc.src'),
- join('src', '_simd', '_simd_arg.inc'),
- join('src', '_simd', '_simd_convert.inc'),
- join('src', '_simd', '_simd_easyintrin.inc'),
- join('src', '_simd', '_simd_vector.inc'),
- ])
+ config.add_extension('_simd',
+ sources=[
+ join('src', 'common', 'npy_cpu_features.c'),
+ join('src', '_simd', '_simd.c'),
+ join('src', '_simd', '_simd_inc.h.src'),
+ join('src', '_simd', '_simd_data.inc.src'),
+ join('src', '_simd', '_simd.dispatch.c.src'),
+ ], depends=[
+ join('src', 'common', 'npy_cpu_dispatch.h'),
+ join('src', 'common', 'simd', 'simd.h'),
+ join('src', '_simd', '_simd.h'),
+ join('src', '_simd', '_simd_inc.h.src'),
+ join('src', '_simd', '_simd_data.inc.src'),
+ join('src', '_simd', '_simd_arg.inc'),
+ join('src', '_simd', '_simd_convert.inc'),
+ join('src', '_simd', '_simd_easyintrin.inc'),
+ join('src', '_simd', '_simd_vector.inc'),
+ ],
+ libraries=['npymath']
+ )
config.add_subpackage('tests')
config.add_data_dir('tests/data')
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 085c0baf5..b5bc0dec1 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -48,7 +48,13 @@ C_ABI_VERSION = 0x01000009
# 0x0000000f - 1.22.x
# 0x00000010 - 1.23.x
# 0x00000010 - 1.24.x
-C_API_VERSION = 0x00000010
+# 0x00000011 - 1.25.x
+C_API_VERSION = 0x00000011
+
+# By default, when compiling downstream libraries against NumPy,
+# pick an older feature version. For example, for 1.25.x we default to the
+# 1.19 API and support going back all the way to 1.15.x (if so desired).
+# This is set up in `numpyconfig.h`.
class MismatchCAPIError(ValueError):
pass
@@ -91,6 +97,10 @@ def check_api_version(apiversion, codegen_dir):
f"checksum in core/codegen_dir/cversions.txt is {api_hash}. If "
"functions were added in the C API, you have to update "
f"C_API_VERSION in {__file__}."
+ "\n"
+ "Please make sure that new additions are guarded with "
+ "MinVersion() to make them unavailable when wishing support "
+ "for older NumPy versions."
)
raise MismatchCAPIError(msg)
@@ -129,10 +139,9 @@ MANDATORY_FUNCS = [
"floor", "ceil", "sqrt", "log10", "log", "exp", "asin",
"acos", "atan", "fmod", 'modf', 'frexp', 'ldexp',
"expm1", "log1p", "acosh", "asinh", "atanh",
- "rint", "trunc", "exp2",
+ "rint", "trunc", "exp2",
"copysign", "nextafter", "strtoll", "strtoull", "cbrt",
"log2", "pow", "hypot", "atan2",
- "csin", "csinh", "ccos", "ccosh", "ctan", "ctanh",
"creal", "cimag", "conj"
]
@@ -154,6 +163,9 @@ C99_COMPLEX_TYPES = [
C99_COMPLEX_FUNCS = [
"cabs", "cacos", "cacosh", "carg", "casin", "casinh", "catan",
"catanh", "cexp", "clog", "cpow", "csqrt",
+ # The long double variants (like csinl) should be mandatory on C11,
+ # but are missing in FreeBSD. Issue gh-22850
+ "csin", "csinh", "ccos", "ccosh", "ctan", "ctanh",
]
OPTIONAL_HEADERS = [
@@ -178,26 +190,10 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
("__builtin_bswap32", '5u'),
("__builtin_bswap64", '5u'),
("__builtin_expect", '5, 0'),
- ("__builtin_mul_overflow", '5, 5, (int*)5'),
- # MMX only needed for icc, but some clangs don't have it
- ("_m_from_int64", '0', "emmintrin.h"),
- ("_mm_load_ps", '(float*)0', "xmmintrin.h"), # SSE
- ("_mm_prefetch", '(float*)0, _MM_HINT_NTA',
- "xmmintrin.h"), # SSE
- ("_mm_load_pd", '(double*)0', "emmintrin.h"), # SSE2
+ # Test `long long` for arm+clang 13 (gh-22811,
+ # but we use all versions of __builtin_mul_overflow):
+ ("__builtin_mul_overflow", '(long long)5, 5, (int*)5'),
("__builtin_prefetch", "(float*)0, 0, 3"),
- # check that the linker can handle avx
- ("__asm__ volatile", '"vpand %xmm1, %xmm2, %xmm3"',
- "stdio.h", "LINK_AVX"),
- ("__asm__ volatile", '"vpand %ymm1, %ymm2, %ymm3"',
- "stdio.h", "LINK_AVX2"),
- ("__asm__ volatile", '"vpaddd %zmm1, %zmm2, %zmm3"',
- "stdio.h", "LINK_AVX512F"),
- ("__asm__ volatile", '"vfpclasspd $0x40, %zmm15, %k6\\n"\
- "vmovdqu8 %xmm0, %xmm1\\n"\
- "vpbroadcastmb2q %k0, %xmm0\\n"',
- "stdio.h", "LINK_AVX512_SKX"),
- ("__asm__ volatile", '"xgetbv"', "stdio.h", "XGETBV"),
]
# function attributes
@@ -212,44 +208,6 @@ OPTIONAL_FUNCTION_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
('__attribute__((nonnull (1)))',
'attribute_nonnull'),
]
-
-OPTIONAL_FUNCTION_ATTRIBUTES_AVX = [('__attribute__((target ("avx")))',
- 'attribute_target_avx'),
- ('__attribute__((target ("avx2")))',
- 'attribute_target_avx2'),
- ('__attribute__((target ("avx512f")))',
- 'attribute_target_avx512f'),
- ('__attribute__((target ("avx512f,avx512dq,avx512bw,avx512vl,avx512cd")))',
- 'attribute_target_avx512_skx'),
- ]
-
-# function attributes with intrinsics
-# To ensure your compiler can compile avx intrinsics with just the attributes
-# gcc 4.8.4 support attributes but not with intrisics
-# tested via "#include<%s> int %s %s(void *){code; return 0;};" % (header, attribute, name, code)
-# function name will be converted to HAVE_<upper-case-name> preprocessor macro
-# The _mm512_castps_si512 instruction is specific check for AVX-512F support
-# in gcc-4.9 which is missing a subset of intrinsics. See
-# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61878
-OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS_AVX = [
- ('__attribute__((target("avx2,fma")))',
- 'attribute_target_avx2_with_intrinsics',
- '__m256 temp = _mm256_set1_ps(1.0); temp = \
- _mm256_fmadd_ps(temp, temp, temp)',
- 'immintrin.h'),
- ('__attribute__((target("avx512f")))',
- 'attribute_target_avx512f_with_intrinsics',
- '__m512i temp = _mm512_castps_si512(_mm512_set1_ps(1.0))',
- 'immintrin.h'),
- ('__attribute__((target ("avx512f,avx512dq,avx512bw,avx512vl,avx512cd")))',
- 'attribute_target_avx512_skx_with_intrinsics',
- '__mmask8 temp = _mm512_fpclass_pd_mask(_mm512_set1_pd(1.0), 0x01);\
- __m512i unused_temp = \
- _mm512_castps_si512(_mm512_set1_ps(1.0));\
- _mm_mask_storeu_epi8(NULL, 0xFF, _mm_broadcastmb_epi64(temp))',
- 'immintrin.h'),
- ]
-
def fname2def(name):
return "HAVE_%s" % name.upper()
diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py
index 84a6bd671..250fffd42 100644
--- a/numpy/core/shape_base.py
+++ b/numpy/core/shape_base.py
@@ -204,19 +204,15 @@ def atleast_3d(*arys):
return res
-def _arrays_for_stack_dispatcher(arrays, stacklevel=4):
- if not hasattr(arrays, '__getitem__') and hasattr(arrays, '__iter__'):
- warnings.warn('arrays to stack must be passed as a "sequence" type '
- 'such as list or tuple. Support for non-sequence '
- 'iterables such as generators is deprecated as of '
- 'NumPy 1.16 and will raise an error in the future.',
- FutureWarning, stacklevel=stacklevel)
- return ()
- return arrays
+def _arrays_for_stack_dispatcher(arrays):
+ if not hasattr(arrays, "__getitem__"):
+ raise TypeError('arrays to stack must be passed as a "sequence" type '
+ 'such as list or tuple.')
+
+ return tuple(arrays)
-def _vhstack_dispatcher(tup, *,
- dtype=None, casting=None):
+def _vhstack_dispatcher(tup, *, dtype=None, casting=None):
return _arrays_for_stack_dispatcher(tup)
@@ -287,9 +283,6 @@ def vstack(tup, *, dtype=None, casting="same_kind"):
[6]])
"""
- if not overrides.ARRAY_FUNCTION_ENABLED:
- # raise warning if necessary
- _arrays_for_stack_dispatcher(tup, stacklevel=2)
arrs = atleast_2d(*tup)
if not isinstance(arrs, list):
arrs = [arrs]
@@ -356,10 +349,6 @@ def hstack(tup, *, dtype=None, casting="same_kind"):
[3, 6]])
"""
- if not overrides.ARRAY_FUNCTION_ENABLED:
- # raise warning if necessary
- _arrays_for_stack_dispatcher(tup, stacklevel=2)
-
arrs = atleast_1d(*tup)
if not isinstance(arrs, list):
arrs = [arrs]
@@ -372,7 +361,7 @@ def hstack(tup, *, dtype=None, casting="same_kind"):
def _stack_dispatcher(arrays, axis=None, out=None, *,
dtype=None, casting=None):
- arrays = _arrays_for_stack_dispatcher(arrays, stacklevel=6)
+ arrays = _arrays_for_stack_dispatcher(arrays)
if out is not None:
# optimize for the typical case where only arrays is provided
arrays = list(arrays)
@@ -451,10 +440,6 @@ def stack(arrays, axis=0, out=None, *, dtype=None, casting="same_kind"):
[3, 6]])
"""
- if not overrides.ARRAY_FUNCTION_ENABLED:
- # raise warning if necessary
- _arrays_for_stack_dispatcher(arrays, stacklevel=2)
-
arrays = [asanyarray(arr) for arr in arrays]
if not arrays:
raise ValueError('need at least one array to stack')
diff --git a/numpy/core/src/_simd/_simd.c b/numpy/core/src/_simd/_simd.c
index b1fdd4478..52b66e765 100644
--- a/numpy/core/src/_simd/_simd.c
+++ b/numpy/core/src/_simd/_simd.c
@@ -1,11 +1,33 @@
#include "_simd.h"
+#include "numpy/npy_math.h"
+
+static PyObject *
+get_floatstatus(PyObject* NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
+{
+ return PyLong_FromLong(npy_get_floatstatus());
+}
+
+static PyObject *
+clear_floatstatus(PyObject* NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
+{
+ npy_clear_floatstatus();
+ Py_RETURN_NONE;
+}
+
+static PyMethodDef _simd_methods[] = {
+ {"get_floatstatus", get_floatstatus, METH_NOARGS, NULL},
+ {"clear_floatstatus", clear_floatstatus, METH_NOARGS, NULL},
+ {NULL, NULL, 0, NULL}
+};
+
PyMODINIT_FUNC PyInit__simd(void)
{
static struct PyModuleDef defs = {
.m_base = PyModuleDef_HEAD_INIT,
.m_name = "numpy.core._simd",
- .m_size = -1
+ .m_size = -1,
+ .m_methods = _simd_methods
};
if (npy_cpu_init() < 0) {
return NULL;
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 48023af80..f532c9e02 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -42,23 +42,26 @@
*/
SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, q@sfx@)
/**end repeat1**/
+SIMD_IMPL_INTRIN_1(load_@sfx@x2, v@sfx@x2, q@sfx@)
+
/**begin repeat1
- * # intrin = store, storea, stores, storel, storeh#
+ * # intrin = store, storea, stores, storel, storeh, store#
+ * # x = ,,,,, x2#
*/
// special definition due to the nature of @intrin@
static PyObject *
-simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+simd__intrin_@intrin@_@sfx@@x@(PyObject* NPY_UNUSED(self), PyObject *args)
{
simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
- simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
+ simd_arg vec_arg = {.dtype = simd_data_v@sfx@@x@};
if (!PyArg_ParseTuple(
- args, "O&O&:@intrin@_@sfx@",
+ args, "O&O&:@intrin@_@sfx@@x@",
simd_arg_converter, &seq_arg,
simd_arg_converter, &vec_arg
)) {
return NULL;
}
- npyv_@intrin@_@sfx@(seq_arg.data.q@sfx@, vec_arg.data.v@sfx@);
+ npyv_@intrin@_@sfx@@x@(seq_arg.data.q@sfx@, vec_arg.data.v@sfx@@x@);
// write-back
if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
simd_arg_free(&seq_arg);
@@ -76,23 +79,35 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
// Partial Load
SIMD_IMPL_INTRIN_3(load_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@)
SIMD_IMPL_INTRIN_2(load_tillz_@sfx@, v@sfx@, q@sfx@, u32)
+#if @size@ == 32
+ SIMD_IMPL_INTRIN_4(load2_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@, @sfx@)
+ SIMD_IMPL_INTRIN_2(load2_tillz_@sfx@, v@sfx@, q@sfx@, u32)
+#else
+ SIMD_IMPL_INTRIN_4(load2_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@, @sfx@)
+ SIMD_IMPL_INTRIN_2(load2_tillz_@sfx@, v@sfx@, q@sfx@, u32)
+#endif
// Partial Store
+/**begin repeat1
+ * #intrin = store_till, store2_till, store2_till#
+ * #chksize= 0, 32, 64#
+ */
+#if !@chksize@ || @chksize@ == @size@
static PyObject *
-simd__intrin_store_till_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
{
simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
simd_arg nlane_arg = {.dtype = simd_data_u32};
simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
if (!PyArg_ParseTuple(
- args, "O&O&O&:store_till_@sfx@",
+ args, "O&O&O&:@intrin@_@sfx@",
simd_arg_converter, &seq_arg,
simd_arg_converter, &nlane_arg,
simd_arg_converter, &vec_arg
)) {
return NULL;
}
- npyv_store_till_@sfx@(
+ npyv_@intrin@_@sfx@(
seq_arg.data.q@sfx@, nlane_arg.data.u32, vec_arg.data.v@sfx@
);
// write-back
@@ -103,14 +118,22 @@ simd__intrin_store_till_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
simd_arg_free(&seq_arg);
Py_RETURN_NONE;
}
+#endif // chksize
+/**end repeat1**/
// Non-contiguous Load
/**begin repeat1
- * #intrin = loadn, loadn_till, loadn_tillz#
- * #till = 0, 1, 1#
- * #fill = 0, 1, 0#
- * #format = , O&O&, O&#
- */
+ * #intrin = loadn, loadn2, loadn2,
+ * loadn_till, loadn2_till, loadn2_till,
+ * loadn_tillz, loadn2_tillz, loadn2_tillz#
+ * #scale = 1,2,2, 1,2,2, 1,2,2#
+ * #till = 0*3, 1*3, 1*3#
+ * #fill = 0*3, 1*3, 0*3#
+ * #fill2 = 0*3, 0,1,1, 0*3#
+ * #format = ,,, O&O&, O&O&O&*2,O&*3#
+ * #chksize= 0,32,64, 0,32,64, 0,32,64#
+ */
+#if !@chksize@ || @chksize@ == @size@
static PyObject *
simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
{
@@ -122,6 +145,9 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
#if @fill@
simd_arg fill_arg = {.dtype = simd_data_@sfx@};
#endif
+#if @fill2@
+ simd_arg fill2_arg = {.dtype = simd_data_@sfx@};
+#endif
if (!PyArg_ParseTuple(
args, "@format@O&O&:@intrin@_@sfx@",
simd_arg_converter, &seq_arg,
@@ -132,6 +158,9 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
#if @fill@
,simd_arg_converter, &fill_arg
#endif
+#if @fill2@
+ ,simd_arg_converter, &fill2_arg
+#endif
)) {
return NULL;
}
@@ -140,7 +169,7 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
if (stride < 0) {
- seq_ptr += cur_seq_len -1;
+ seq_ptr += cur_seq_len - 1 * @scale@;
min_seq_len = -min_seq_len;
}
if (cur_seq_len < min_seq_len) {
@@ -159,6 +188,9 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
#if @fill@
, fill_arg.data.@sfx@
#endif
+ #if @fill2@
+ , fill2_arg.data.@sfx@
+ #endif
);
simd_arg ret = {
.dtype = simd_data_v@sfx@, .data = {.v@sfx@=rvec}
@@ -169,14 +201,19 @@ err:
simd_arg_free(&seq_arg);
return NULL;
}
+#endif // chksize
/**end repeat1**/
// Non-contiguous Store
/**begin repeat1
- * #intrin = storen, storen_till#
- * #till = 0, 1#
- * #format = , O&#
+ * #intrin = storen, storen2, storen2,
+ storen_till, storen2_till, storen2_till#
+ * #scale = 1,2,2, 1,2,2#
+ * #till = 0*3, 1*3#
+ * #format = ,,, O&*3#
+ * #chksize= 0,32,64, 0,32,64#
*/
+#if !@chksize@ || @chksize@ == @size@
static PyObject *
simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
{
@@ -202,7 +239,7 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
if (stride < 0) {
- seq_ptr += cur_seq_len -1;
+ seq_ptr += cur_seq_len - 1*@scale@;
min_seq_len = -min_seq_len;
}
// overflow guard
@@ -231,6 +268,7 @@ err:
simd_arg_free(&seq_arg);
return NULL;
}
+#endif // chksize
/**end repeat1**/
#endif // @ncont_sup@
@@ -300,7 +338,7 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/
/**begin repeat1
- * # intrin = combine, zip#
+ * # intrin = combine, zip, unzip#
*/
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@)
/**end repeat1**/
@@ -309,6 +347,60 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_1(rev64_@sfx@, v@sfx@, v@sfx@)
#endif
+// special implementation to convert runtime constants to immediate values
+#if @size@ == 32
+// one call for element index then gather them within one vector
+// instead of unroll the 255 possible cases.
+NPY_FINLINE npyv_@sfx@
+npyv_permi128_@sfx@_(npyv_@sfx@ a, unsigned e0, unsigned e1, unsigned e2, unsigned e3)
+{
+ /**begin repeat1
+ * # en = e0, e1, e2, e3#
+ */
+ npyv_@sfx@ v@en@;
+ npyv_lanetype_@sfx@ d@en@[npyv_nlanes_@sfx@];
+ if (0) {}
+ /**begin repeat2
+ * # imm = 1, 2, 3#
+ */
+ else if (@en@ == @imm@) {
+ v@en@ = npyv_permi128_@sfx@(a, @imm@, @imm@, @imm@, @imm@);
+ }
+ /**end repeat2**/
+ else {
+ v@en@ = npyv_permi128_@sfx@(a, 0, 0, 0, 0);
+ }
+ npyv_store_@sfx@(d@en@, v@en@);
+ /**end repeat1**/
+ if (e0 == e1 && e0 == e2 && e0 == e3) {
+ return ve0;
+ }
+ for (int i = 0; i < npyv_nlanes_@sfx@; i += 4) {
+ de0[i+1] = de1[i+1];
+ de0[i+2] = de2[i+2];
+ de0[i+3] = de3[i+3];
+ }
+ return npyv_load_@sfx@(de0);
+}
+SIMD_IMPL_INTRIN_5(permi128_@sfx@_, v@sfx@, v@sfx@, u8, u8, u8, u8)
+#elif @size@ == 64
+NPY_FINLINE npyv_@sfx@
+npyv_permi128_@sfx@_(npyv_@sfx@ a, unsigned e0, unsigned e1)
+{
+ if (e0 == 1 && e1 == 0) {
+ return npyv_permi128_@sfx@(a, 1, 0);
+ }
+ else if (e0 == 0 && e1 == 1) {
+ return npyv_permi128_@sfx@(a, 0, 1);
+ }
+ else if (e0 == 1 && e1 == 1) {
+ return npyv_permi128_@sfx@(a, 1, 1);
+ }
+ return npyv_permi128_@sfx@(a, 0, 0);
+}
+SIMD_IMPL_INTRIN_3(permi128_@sfx@_, v@sfx@, v@sfx@, u8, u8)
+#endif
+
/***************************
* Operators
***************************/
@@ -387,7 +479,7 @@ SIMD_IMPL_INTRIN_2(divc_@sfx@, v@sfx@, v@sfx@, v@sfx@x3)
#if @fused_sup@
/**begin repeat1
- * #intrin = muladd, mulsub, nmuladd, nmulsub#
+ * #intrin = muladd, mulsub, nmuladd, nmulsub, muladdsub#
*/
SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/
@@ -438,6 +530,11 @@ SIMD_IMPL_INTRIN_1(reduce_@intrin@_@sfx@, @sfx@, v@sfx@)
SIMD_IMPL_INTRIN_4(@intrin@_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/
+#if @fp_only@
+SIMD_IMPL_INTRIN_4(ifdiv_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@, v@sfx@)
+SIMD_IMPL_INTRIN_3(ifdivz_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@)
+#endif
+
#endif // simd_sup
/**end repeat**/
/*************************************************************************
@@ -541,6 +638,12 @@ static PyMethodDef simd__intrinsics_methods[] = {
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
+/**begin repeat1
+ * # intrin = load, store#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@x2)
+/**end repeat1**/
+
/****************************************
* Non-contiguous/Partial Memory access
****************************************/
@@ -551,6 +654,21 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
+#if @size@ == 32
+ /**begin repeat1
+ * #intrin = load2_till, load2_tillz, loadn2, loadn2_till, loadn2_tillz,
+ * store2_till, storen2, storen2_till#
+ */
+ SIMD_INTRIN_DEF(@intrin@_@sfx@)
+ /**end repeat1**/
+#else
+ /**begin repeat1
+ * #intrin = load2_till, load2_tillz, loadn2, loadn2_till, loadn2_tillz,
+ * store2_till, storen2, storen2_till#
+ */
+ SIMD_INTRIN_DEF(@intrin@_@sfx@)
+ /**end repeat1**/
+#endif
#endif // ncont_sup
/****************************
@@ -584,7 +702,7 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
* Reorder
***************************/
/**begin repeat1
- * # intrin = combinel, combineh, combine, zip#
+ * # intrin = combinel, combineh, combine, zip, unzip#
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
@@ -593,6 +711,10 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
SIMD_INTRIN_DEF(rev64_@sfx@)
#endif
+#if @size@ > 16
+{ "permi128_@sfx@", simd__intrin_permi128_@sfx@_, METH_VARARGS, NULL },
+#endif
+
/***************************
* Operators
***************************/
@@ -658,7 +780,7 @@ SIMD_INTRIN_DEF(divc_@sfx@)
#if @fused_sup@
/**begin repeat1
- * #intrin = muladd, mulsub, nmuladd, nmulsub#
+ * #intrin = muladd, mulsub, nmuladd, nmulsub, muladdsub#
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
@@ -708,6 +830,14 @@ SIMD_INTRIN_DEF(reduce_@intrin@_@sfx@)
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
+#if @fp_only@
+/**begin repeat1
+ * #intrin = ifdiv, ifdivz#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif
+
#endif // simd_sup
/**end repeat**/
/*************************************************************************
diff --git a/numpy/core/src/_simd/_simd_easyintrin.inc b/numpy/core/src/_simd/_simd_easyintrin.inc
index f2e0da26e..e300e5484 100644
--- a/numpy/core/src/_simd/_simd_easyintrin.inc
+++ b/numpy/core/src/_simd/_simd_easyintrin.inc
@@ -197,6 +197,39 @@
return simd_arg_to_obj(&ret); \
}
+#define SIMD_IMPL_INTRIN_5(NAME, RET, IN0, IN1, IN2, IN3, IN4) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg1 = {.dtype = simd_data_##IN0}; \
+ simd_arg arg2 = {.dtype = simd_data_##IN1}; \
+ simd_arg arg3 = {.dtype = simd_data_##IN2}; \
+ simd_arg arg4 = {.dtype = simd_data_##IN3}; \
+ simd_arg arg5 = {.dtype = simd_data_##IN4}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&O&O&O&O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg1, \
+ simd_arg_converter, &arg2, \
+ simd_arg_converter, &arg3, \
+ simd_arg_converter, &arg4, \
+ simd_arg_converter, &arg5 \
+ )) return NULL; \
+ simd_data data = {.RET = npyv_##NAME( \
+ arg1.data.IN0, arg2.data.IN1, \
+ arg3.data.IN2, arg4.data.IN3, \
+ arg5.data.IN4 \
+ )}; \
+ simd_arg_free(&arg1); \
+ simd_arg_free(&arg2); \
+ simd_arg_free(&arg3); \
+ simd_arg_free(&arg4); \
+ simd_arg_free(&arg5); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+
/**
* Helper macros for repeating and expand a certain macro.
* Mainly used for converting a scalar to an immediate constant.
diff --git a/numpy/core/src/common/common.hpp b/numpy/core/src/common/common.hpp
new file mode 100644
index 000000000..44ba449d8
--- /dev/null
+++ b/numpy/core/src/common/common.hpp
@@ -0,0 +1,14 @@
+#ifndef NUMPY_CORE_SRC_COMMON_COMMON_HPP
+#define NUMPY_CORE_SRC_COMMON_COMMON_HPP
+/*
+ * The following C++ headers are safe to be used standalone, however,
+ * they are gathered to make it easy for us and for the future need to support PCH.
+ */
+#include "npdef.hpp"
+#include "utils.hpp"
+#include "npstd.hpp"
+#include "half.hpp"
+#include "meta.hpp"
+#include "float_status.hpp"
+
+#endif // NUMPY_CORE_SRC_COMMON_COMMON_HPP
diff --git a/numpy/core/src/common/dlpack/dlpack.h b/numpy/core/src/common/dlpack/dlpack.h
index 29209aee1..f0cbf6136 100644
--- a/numpy/core/src/common/dlpack/dlpack.h
+++ b/numpy/core/src/common/dlpack/dlpack.h
@@ -1,5 +1,5 @@
// Taken from:
-// https://github.com/dmlc/dlpack/blob/9b6176fdecb55e9bf39b16f08b96913ed3f275b4/include/dlpack/dlpack.h
+// https://github.com/dmlc/dlpack/blob/ca4d00ad3e2e0f410eeab3264d21b8a39397f362/include/dlpack/dlpack.h
/*!
* Copyright (c) 2017 by Contributors
* \file dlpack.h
@@ -8,14 +8,20 @@
#ifndef DLPACK_DLPACK_H_
#define DLPACK_DLPACK_H_
+/**
+ * \brief Compatibility with C++
+ */
#ifdef __cplusplus
#define DLPACK_EXTERN_C extern "C"
#else
#define DLPACK_EXTERN_C
#endif
-/*! \brief The current version of dlpack */
-#define DLPACK_VERSION 050
+/*! \brief The current major version of dlpack */
+#define DLPACK_MAJOR_VERSION 1
+
+/*! \brief The current minor version of dlpack */
+#define DLPACK_MINOR_VERSION 0
/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
@@ -34,10 +40,41 @@
#ifdef __cplusplus
extern "C" {
#endif
+
+/*!
+ * \brief The DLPack version.
+ *
+ * A change in major version indicates that we have changed the
+ * data layout of the ABI - DLManagedTensorVersioned.
+ *
+ * A change in minor version indicates that we have added new
+ * code, such as a new device type, but the ABI is kept the same.
+ *
+ * If an obtained DLPack tensor has a major version that disagrees
+ * with the version number specified in this header file
+ * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter
+ * (and it is safe to do so). It is not safe to access any other fields
+ * as the memory layout will have changed.
+ *
+ * In the case of a minor version mismatch, the tensor can be safely used as
+ * long as the consumer knows how to interpret all fields. Minor version
+ * updates indicate the addition of enumeration values.
+ */
+typedef struct {
+ /*! \brief DLPack major version. */
+ uint32_t major;
+ /*! \brief DLPack minor version. */
+ uint32_t minor;
+} DLPackVersion;
+
/*!
* \brief The device type in DLDevice.
*/
+#ifdef __cplusplus
+typedef enum : int32_t {
+#else
typedef enum {
+#endif
/*! \brief CPU device */
kDLCPU = 1,
/*! \brief CUDA GPU device */
@@ -70,6 +107,17 @@ typedef enum {
* \brief CUDA managed/unified memory allocated by cudaMallocManaged
*/
kDLCUDAManaged = 13,
+ /*!
+ * \brief Unified shared memory allocated on a oneAPI non-partititioned
+ * device. Call to oneAPI runtime is required to determine the device
+ * type, the USM allocation type and the sycl context it is bound to.
+ *
+ */
+ kDLOneAPI = 14,
+ /*! \brief GPU support for next generation WebGPU standard. */
+ kDLWebGPU = 15,
+ /*! \brief Qualcomm Hexagon DSP */
+ kDLHexagon = 16,
} DLDeviceType;
/*!
@@ -82,7 +130,7 @@ typedef struct {
* \brief The device index.
* For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
*/
- int device_id;
+ int32_t device_id;
} DLDevice;
/*!
@@ -107,16 +155,21 @@ typedef enum {
* (C/C++/Python layout: compact struct per complex number)
*/
kDLComplex = 5U,
+ /*! \brief boolean */
+ kDLBool = 6U,
} DLDataTypeCode;
/*!
- * \brief The data type the tensor can hold.
+ * \brief The data type the tensor can hold. The data type is assumed to follow the
+ * native endian-ness. An explicit error message should be raised when attempting to
+ * export an array with non-native endianness
*
* Examples
- * - float: type_code = 2, bits = 32, lanes=1
- * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
- * - int8: type_code = 0, bits = 8, lanes=1
+ * - float: type_code = 2, bits = 32, lanes = 1
+ * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
+ * - int8: type_code = 0, bits = 8, lanes = 1
* - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+ * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
*/
typedef struct {
/*!
@@ -138,9 +191,16 @@ typedef struct {
*/
typedef struct {
/*!
- * \brief The opaque data pointer points to the allocated data. This will be
- * CUDA device pointer or cl_mem handle in OpenCL. This pointer is always
- * aligned to 256 bytes as in CUDA.
+ * \brief The data pointer points to the allocated data. This will be CUDA
+ * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
+ * types. This pointer is always aligned to 256 bytes as in CUDA. The
+ * `byte_offset` field should be used to point to the beginning of the data.
+ *
+ * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
+ * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
+ * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
+ * (after which this note will be updated); at the moment it is recommended
+ * to not rely on the data pointer being correctly aligned.
*
* For given DLTensor, the size of memory required to store the contents of
* data is calculated as follows:
@@ -160,7 +220,7 @@ typedef struct {
/*! \brief The device of the tensor */
DLDevice device;
/*! \brief Number of dimensions */
- int ndim;
+ int32_t ndim;
/*! \brief The data type of the pointer*/
DLDataType dtype;
/*! \brief The shape of the tensor */
@@ -180,6 +240,13 @@ typedef struct {
* not meant to transfer the tensor. When the borrowing framework doesn't need
* the tensor, it should call the deleter to notify the host that the resource
* is no longer needed.
+ *
+ * \note This data structure is used as Legacy DLManagedTensor
+ * in DLPack exchange and is deprecated after DLPack v0.8
+ * Use DLManagedTensorVersioned instead.
+ * This data structure may get renamed or deleted in future versions.
+ *
+ * \sa DLManagedTensorVersioned
*/
typedef struct DLManagedTensor {
/*! \brief DLTensor which is being memory managed */
@@ -188,13 +255,65 @@ typedef struct DLManagedTensor {
* which DLManagedTensor is used in the framework. It can also be NULL.
*/
void * manager_ctx;
- /*! \brief Destructor signature void (*)(void*) - this should be called
- * to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
- * if there is no way for the caller to provide a reasonable destructor.
- * The destructors deletes the argument self as well.
+ /*!
+ * \brief Destructor - this should be called
+ * to destruct the manager_ctx which backs the DLManagedTensor. It can be
+ * NULL if there is no way for the caller to provide a reasonable destructor.
+ * The destructors deletes the argument self as well.
*/
void (*deleter)(struct DLManagedTensor * self);
} DLManagedTensor;
+
+// bit masks used in in the DLManagedTensorVersioned
+
+/*! \brief bit mask to indicate that the tensor is read only. */
+#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
+
+/*!
+ * \brief A versioned and managed C Tensor object, manage memory of DLTensor.
+ *
+ * This data structure is intended to facilitate the borrowing of DLTensor by
+ * another framework. It is not meant to transfer the tensor. When the borrowing
+ * framework doesn't need the tensor, it should call the deleter to notify the
+ * host that the resource is no longer needed.
+ *
+ * \note This is the current standard DLPack exchange data structure.
+ */
+struct DLManagedTensorVersioned {
+ /*!
+ * \brief The API and ABI version of the current managed Tensor
+ */
+ DLPackVersion version;
+ /*!
+ * \brief the context of the original host framework.
+ *
+ * Stores DLManagedTensorVersioned is used in the
+ * framework. It can also be NULL.
+ */
+ void *manager_ctx;
+ /*!
+ * \brief Destructor.
+ *
+ * This should be called to destruct manager_ctx which holds the DLManagedTensorVersioned.
+ * It can be NULL if there is no way for the caller to provide a reasonable
+ * destructor. The destructors deletes the argument self as well.
+ */
+ void (*deleter)(struct DLManagedTensorVersioned *self);
+ /*!
+ * \brief Additional bitmask flags information about the tensor.
+ *
+ * By default the flags should be set to 0.
+ *
+ * \note Future ABI changes should keep everything until this field
+ * stable, to ensure that deleter can be correctly called.
+ *
+ * \sa DLPACK_FLAG_BITMASK_READ_ONLY
+ */
+ uint64_t flags;
+ /*! \brief DLTensor which is being memory managed */
+ DLTensor dl_tensor;
+};
+
#ifdef __cplusplus
} // DLPACK_EXTERN_C
#endif
diff --git a/numpy/core/src/common/float_status.hpp b/numpy/core/src/common/float_status.hpp
new file mode 100644
index 000000000..8e4d5e06a
--- /dev/null
+++ b/numpy/core/src/common/float_status.hpp
@@ -0,0 +1,134 @@
+#ifndef NUMPY_CORE_SRC_COMMON_FLOAT_STATUS_HPP
+#define NUMPY_CORE_SRC_COMMON_FLOAT_STATUS_HPP
+
+#include "npstd.hpp"
+
+#include <fenv.h>
+
+namespace np {
+
+/// @addtogroup cpp_core_utility
+/// @{
+/**
+ * Class wraps floating-point environment operations,
+ * provides lazy access to its functionality.
+ */
+class FloatStatus {
+ public:
+/*
+ * According to the C99 standard FE_DIVBYZERO, etc. may not be provided when
+ * unsupported. In such cases NumPy will not report these correctly, but we
+ * should still allow compiling (whether tests pass or not).
+ * By defining them as 0 locally, we make them no-ops. Unlike these defines,
+ * for example `musl` still defines all of the functions (as no-ops):
+ * https://git.musl-libc.org/cgit/musl/tree/src/fenv/fenv.c
+ * and does similar replacement in its tests:
+ * http://nsz.repo.hu/git/?p=libc-test;a=blob;f=src/common/mtest.h;h=706c1ba23ea8989b17a2f72ed1a919e187c06b6a;hb=HEAD#l30
+ */
+#ifdef FE_DIVBYZERO
+ static constexpr int kDivideByZero = FE_DIVBYZERO;
+#else
+ static constexpr int kDivideByZero = 0;
+#endif
+#ifdef FE_INVALID
+ static constexpr int kInvalid = FE_INVALID;
+#else
+ static constexpr int kInvalid = 0;
+#endif
+#ifdef FE_INEXACT
+ static constexpr int kInexact = FE_INEXACT;
+#else
+ static constexpr int kInexact = 0;
+#endif
+#ifdef FE_OVERFLOW
+ static constexpr int kOverflow = FE_OVERFLOW;
+#else
+ static constexpr int kOverflow = 0;
+#endif
+#ifdef FE_UNDERFLOW
+ static constexpr int kUnderflow = FE_UNDERFLOW;
+#else
+ static constexpr int kUnderflow = 0;
+#endif
+ static constexpr int kAllExcept = (kDivideByZero | kInvalid | kInexact |
+ kOverflow | kUnderflow);
+
+ FloatStatus(bool clear_on_dst=true)
+ : clear_on_dst_(clear_on_dst)
+ {
+ if constexpr (kAllExcept != 0) {
+ fpstatus_ = fetestexcept(kAllExcept);
+ }
+ else {
+ fpstatus_ = 0;
+ }
+ }
+ ~FloatStatus()
+ {
+ if constexpr (kAllExcept != 0) {
+ if (fpstatus_ != 0 && clear_on_dst_) {
+ feclearexcept(kAllExcept);
+ }
+ }
+ }
+ constexpr bool IsDivideByZero() const
+ {
+ return (fpstatus_ & kDivideByZero) != 0;
+ }
+ constexpr bool IsInexact() const
+ {
+ return (fpstatus_ & kInexact) != 0;
+ }
+ constexpr bool IsInvalid() const
+ {
+ return (fpstatus_ & kInvalid) != 0;
+ }
+ constexpr bool IsOverFlow() const
+ {
+ return (fpstatus_ & kOverflow) != 0;
+ }
+ constexpr bool IsUnderFlow() const
+ {
+ return (fpstatus_ & kUnderflow) != 0;
+ }
+ static void RaiseDivideByZero()
+ {
+ if constexpr (kDivideByZero != 0) {
+ feraiseexcept(kDivideByZero);
+ }
+ }
+ static void RaiseInexact()
+ {
+ if constexpr (kInexact != 0) {
+ feraiseexcept(kInexact);
+ }
+ }
+ static void RaiseInvalid()
+ {
+ if constexpr (kInvalid != 0) {
+ feraiseexcept(kInvalid);
+ }
+ }
+ static void RaiseOverflow()
+ {
+ if constexpr (kOverflow != 0) {
+ feraiseexcept(kOverflow);
+ }
+ }
+ static void RaiseUnderflow()
+ {
+ if constexpr (kUnderflow != 0) {
+ feraiseexcept(kUnderflow);
+ }
+ }
+
+ private:
+ bool clear_on_dst_;
+ int fpstatus_;
+};
+
+/// @} cpp_core_utility
+} // namespace np
+
+#endif // NUMPY_CORE_SRC_COMMON_FLOAT_STATUS_HPP
+
diff --git a/numpy/core/src/common/half.hpp b/numpy/core/src/common/half.hpp
new file mode 100644
index 000000000..4d16e3bcc
--- /dev/null
+++ b/numpy/core/src/common/half.hpp
@@ -0,0 +1,264 @@
+#ifndef NUMPY_CORE_SRC_COMMON_HALF_HPP
+#define NUMPY_CORE_SRC_COMMON_HALF_HPP
+
+#include "npstd.hpp"
+
+#include "npy_cpu_dispatch.h" // NPY_HAVE_CPU_FEATURES
+#include "half_private.hpp"
+
+// TODO(@seiko2plus):
+// - covers half-precision operations that are supported by numpy/halffloat.h
+// - add support for arithmetic operations
+// - enables __fp16 causes massive FP exceptions on aarch64,
+// needs a deep investigation
+
+namespace np {
+
+/// @addtogroup cpp_core_types
+/// @{
+
+/// Provides a type that implements 16-bit floating point (half-precision).
+/// This type is ensured to be 16-bit size.
+#if 1 // ndef __ARM_FP16_FORMAT_IEEE
+class Half final {
+ public:
+ /// Whether `Half` has a full native HW support.
+ static constexpr bool kNative = false;
+ /// Whether `Half` has a native HW support for single/double conversion.
+ template<typename T>
+ static constexpr bool kNativeConversion = (
+ (
+ std::is_same_v<T, float> &&
+ #if defined(NPY_HAVE_FP16) || defined(NPY_HAVE_VSX3)
+ true
+ #else
+ false
+ #endif
+ ) || (
+ std::is_same_v<T, double> &&
+ #if defined(NPY_HAVE_AVX512FP16) || defined(NPY_HAVE_VSX3)
+ true
+ #else
+ false
+ #endif
+ )
+ );
+
+ /// Default constructor. initialize nothing.
+ Half() = default;
+
+ /// Construct from float
+ /// If there are no hardware optimization available, rounding will always
+ /// be set to ties to even.
+ explicit Half(float f)
+ {
+ #if defined(NPY_HAVE_FP16)
+ __m128 mf = _mm_load_ss(&f);
+ bits_ = static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_cvtps_ph(mf, _MM_FROUND_TO_NEAREST_INT)));
+ #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+ __vector float vf32 = vec_splats(f);
+ __vector unsigned short vf16;
+ __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wa" (vf32));
+ bits_ = vec_extract(vf16, 0);
+ #else
+ bits_ = half_private::FromFloatBits(BitCast<uint32_t>(f));
+ #endif
+ }
+
+ /// Construct from double.
+ /// If there are no hardware optimization available, rounding will always
+ /// be set to ties to even.
+ explicit Half(double f)
+ {
+ #if defined(NPY_HAVE_AVX512FP16)
+ __m128d md = _mm_load_sd(&f);
+ bits_ = static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_castph_si128(_mm_cvtpd_ph(md))));
+ #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+ __vector double vf64 = vec_splats(f);
+ __vector unsigned short vf16;
+ __asm__ __volatile__ ("xvcvdphp %x0,%x1" : "=wa" (vf16) : "wa" (vf64));
+ bits_ = vec_extract(vf16, 0);
+ #else
+ bits_ = half_private::FromDoubleBits(BitCast<uint64_t>(f));
+ #endif
+ }
+
+ /// Cast to float
+ explicit operator float() const
+ {
+ #if defined(NPY_HAVE_FP16)
+ float ret;
+ _mm_store_ss(&ret, _mm_cvtph_ps(_mm_cvtsi32_si128(bits_)));
+ return ret;
+ #elif defined(NPY_HAVE_VSX3) && defined(vec_extract_fp_from_shorth)
+ return vec_extract(vec_extract_fp_from_shorth(vec_splats(bits_)), 0);
+ #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+ __vector float vf32;
+ __asm__ __volatile__("xvcvhpsp %x0,%x1"
+ : "=wa"(vf32)
+ : "wa"(vec_splats(bits_)));
+ return vec_extract(vf32, 0);
+ #else
+ return BitCast<float>(half_private::ToFloatBits(bits_));
+ #endif
+ }
+
+ /// Cast to double
+ explicit operator double() const
+ {
+ #if defined(NPY_HAVE_AVX512FP16)
+ double ret;
+ _mm_store_sd(&ret, _mm_cvtph_pd(_mm_castsi128_ph(_mm_cvtsi32_si128(bits_))));
+ return ret;
+ #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+ __vector double vf64;
+ __asm__ __volatile__("xvcvhpdp %x0,%x1"
+ : "=wa"(vf64)
+ : "wa"(vec_splats(bits_)));
+ return vec_extract(vf64, 0);
+ #else
+ return BitCast<double>(half_private::ToDoubleBits(bits_));
+ #endif
+ }
+
+ /// Returns a new Half constructed from the IEEE 754 binary16.
+ static constexpr Half FromBits(uint16_t bits)
+ {
+ Half h{};
+ h.bits_ = bits;
+ return h;
+ }
+ /// Returns the IEEE 754 binary16 representation.
+ constexpr uint16_t Bits() const
+ {
+ return bits_;
+ }
+
+ /// @name Comparison operators (ordered)
+ /// @{
+ constexpr bool operator==(Half r) const
+ {
+ return !(IsNaN() || r.IsNaN()) && Equal(r);
+ }
+ constexpr bool operator<(Half r) const
+ {
+ return !(IsNaN() || r.IsNaN()) && Less(r);
+ }
+ constexpr bool operator<=(Half r) const
+ {
+ return !(IsNaN() || r.IsNaN()) && LessEqual(r);
+ }
+ constexpr bool operator>(Half r) const
+ {
+ return r < *this;
+ }
+ constexpr bool operator>=(Half r) const
+ {
+ return r <= *this;
+ }
+ /// @}
+
+ /// @name Comparison operators (unordered)
+ /// @{
+ constexpr bool operator!=(Half r) const
+ {
+ return !(*this == r);
+ }
+ /// @} Comparison operators
+
+ /// @name Comparison with no guarantee of NaN behavior
+ /// @{
+ constexpr bool Less(Half r) const
+ {
+ uint_fast16_t a = static_cast<uint_fast16_t>(bits_),
+ b = static_cast<uint_fast16_t>(r.bits_);
+ bool sign_a = (a & 0x8000u) == 0x8000u;
+ bool sign_b = (b & 0x8000u) == 0x8000u;
+ // if both `a` and `b` have same sign
+ // Test if `a` > `b` when `a` has the sign
+ // or `a` < `b` when is not.
+ // And make sure they are not equal to each other
+ // in case of both are equal to +-0
+ // else
+ // Test if `a` has the sign.
+ // and `a` != -0.0 and `b` != 0.0
+ return (sign_a == sign_b) ? (sign_a ^ (a < b)) && (a != b)
+ : sign_a && ((a | b) != 0x8000u);
+ }
+ constexpr bool LessEqual(Half r) const
+ {
+ uint_fast16_t a = static_cast<uint_fast16_t>(bits_),
+ b = static_cast<uint_fast16_t>(r.bits_);
+ bool sign_a = (a & 0x8000u) == 0x8000u;
+ bool sign_b = (b & 0x8000u) == 0x8000u;
+ // if both `a` and `b` have same sign
+ // Test if `a` > `b` when `a` has the sign
+ // or `a` < `b` when is not.
+ // or a == b (needed even if we used <= above instead
+ // since testing +-0 still required)
+ // else
+ // Test if `a` has the sign
+ // or `a` and `b` equal to +-0.0
+ return (sign_a == sign_b) ? (sign_a ^ (a < b)) || (a == b)
+ : sign_a || ((a | b) == 0x8000u);
+ }
+ constexpr bool Equal(Half r) const
+ {
+ // fast16 cast is not worth it, since unpack op should involved.
+ uint16_t a = bits_, b = r.bits_;
+ return a == b || ((a | b) == 0x8000u);
+ }
+ /// @} Comparison
+
+ /// @name Properties
+ // @{
+ constexpr bool IsNaN() const
+ {
+ return ((bits_ & 0x7c00u) == 0x7c00u) &&
+ ((bits_ & 0x03ffu) != 0);
+ }
+ /// @} Properties
+
+ private:
+ uint16_t bits_;
+};
+#else // __ARM_FP16_FORMAT_IEEE
+class Half final {
+ public:
+ static constexpr bool kNative = true;
+ template<typename T>
+ static constexpr bool kNativeConversion = (
+ std::is_same_v<T, float> || std::is_same_v<T, double>
+ );
+ Half() = default;
+ constexpr Half(__fp16 h) : half_(h)
+ {}
+ constexpr operator __fp16() const
+ { return half_; }
+ static Half FromBits(uint16_t bits)
+ {
+ Half h;
+ h.half_ = BitCast<__fp16>(bits);
+ return h;
+ }
+ uint16_t Bits() const
+ { return BitCast<uint16_t>(half_); }
+ constexpr bool Less(Half r) const
+ { return half_ < r.half_; }
+ constexpr bool LessEqual(Half r) const
+ { return half_ <= r.half_; }
+ constexpr bool Equal(Half r) const
+ { return half_ == r.half_; }
+ constexpr bool IsNaN() const
+ { return half_ != half_; }
+
+ private:
+ __fp16 half_;
+};
+#endif // __ARM_FP16_FORMAT_IEEE
+
+/// @} cpp_core_types
+
+} // namespace np
+
+#endif // NUMPY_CORE_SRC_COMMON_HALF_HPP
diff --git a/numpy/core/src/common/half_private.hpp b/numpy/core/src/common/half_private.hpp
new file mode 100644
index 000000000..7a64eb397
--- /dev/null
+++ b/numpy/core/src/common/half_private.hpp
@@ -0,0 +1,330 @@
+#ifndef NUMPY_CORE_SRC_COMMON_HALF_PRIVATE_HPP
+#define NUMPY_CORE_SRC_COMMON_HALF_PRIVATE_HPP
+
+#include "npstd.hpp"
+#include "float_status.hpp"
+
+/*
+ * The following functions that emulating float/double/half conversions
+ * are copied from npymath without any changes to its functionalty.
+ */
+namespace np { namespace half_private {
+
+template<bool gen_overflow=true, bool gen_underflow=true, bool round_even=true>
+inline uint16_t FromFloatBits(uint32_t f)
+{
+ uint32_t f_exp, f_sig;
+ uint16_t h_sgn, h_exp, h_sig;
+
+ h_sgn = (uint16_t) ((f&0x80000000u) >> 16);
+ f_exp = (f&0x7f800000u);
+
+ /* Exponent overflow/NaN converts to signed inf/NaN */
+ if (f_exp >= 0x47800000u) {
+ if (f_exp == 0x7f800000u) {
+ /* Inf or NaN */
+ f_sig = (f&0x007fffffu);
+ if (f_sig != 0) {
+ /* NaN - propagate the flag in the significand... */
+ uint16_t ret = (uint16_t) (0x7c00u + (f_sig >> 13));
+ /* ...but make sure it stays a NaN */
+ if (ret == 0x7c00u) {
+ ret++;
+ }
+ return h_sgn + ret;
+ } else {
+ /* signed inf */
+ return (uint16_t) (h_sgn + 0x7c00u);
+ }
+ } else {
+ if constexpr (gen_overflow) {
+ /* overflow to signed inf */
+ FloatStatus::RaiseOverflow();
+ }
+ return (uint16_t) (h_sgn + 0x7c00u);
+ }
+ }
+
+ /* Exponent underflow converts to a subnormal half or signed zero */
+ if (f_exp <= 0x38000000u) {
+ /*
+ * Signed zeros, subnormal floats, and floats with small
+ * exponents all convert to signed zero half-floats.
+ */
+ if (f_exp < 0x33000000u) {
+ if constexpr (gen_underflow) {
+ /* If f != 0, it underflowed to 0 */
+ if ((f&0x7fffffff) != 0) {
+ FloatStatus::RaiseUnderflow();
+ }
+ }
+ return h_sgn;
+ }
+ /* Make the subnormal significand */
+ f_exp >>= 23;
+ f_sig = (0x00800000u + (f&0x007fffffu));
+ if constexpr (gen_underflow) {
+ /* If it's not exactly represented, it underflowed */
+ if ((f_sig&(((uint32_t)1 << (126 - f_exp)) - 1)) != 0) {
+ FloatStatus::RaiseUnderflow();
+ }
+ }
+ /*
+ * Usually the significand is shifted by 13. For subnormals an
+ * additional shift needs to occur. This shift is one for the largest
+ * exponent giving a subnormal `f_exp = 0x38000000 >> 23 = 112`, which
+ * offsets the new first bit. At most the shift can be 1+10 bits.
+ */
+ f_sig >>= (113 - f_exp);
+ /* Handle rounding by adding 1 to the bit beyond half precision */
+ if constexpr (round_even) {
+ /*
+ * If the last bit in the half significand is 0 (already even), and
+ * the remaining bit pattern is 1000...0, then we do not add one
+ * to the bit after the half significand. However, the (113 - f_exp)
+ * shift can lose up to 11 bits, so the || checks them in the original.
+ * In all other cases, we can just add one.
+ */
+ if (((f_sig&0x00003fffu) != 0x00001000u) || (f&0x000007ffu)) {
+ f_sig += 0x00001000u;
+ }
+ }
+ else {
+ f_sig += 0x00001000u;
+ }
+ h_sig = (uint16_t) (f_sig >> 13);
+ /*
+ * If the rounding causes a bit to spill into h_exp, it will
+ * increment h_exp from zero to one and h_sig will be zero.
+ * This is the correct result.
+ */
+ return (uint16_t) (h_sgn + h_sig);
+ }
+
+ /* Regular case with no overflow or underflow */
+ h_exp = (uint16_t) ((f_exp - 0x38000000u) >> 13);
+ /* Handle rounding by adding 1 to the bit beyond half precision */
+ f_sig = (f&0x007fffffu);
+ if constexpr (round_even) {
+ /*
+ * If the last bit in the half significand is 0 (already even), and
+ * the remaining bit pattern is 1000...0, then we do not add one
+ * to the bit after the half significand. In all other cases, we do.
+ */
+ if ((f_sig&0x00003fffu) != 0x00001000u) {
+ f_sig += 0x00001000u;
+ }
+ }
+ else {
+ f_sig += 0x00001000u;
+ }
+ h_sig = (uint16_t) (f_sig >> 13);
+ /*
+ * If the rounding causes a bit to spill into h_exp, it will
+ * increment h_exp by one and h_sig will be zero. This is the
+ * correct result. h_exp may increment to 15, at greatest, in
+ * which case the result overflows to a signed inf.
+ */
+ if constexpr (gen_overflow) {
+ h_sig += h_exp;
+ if (h_sig == 0x7c00u) {
+ FloatStatus::RaiseOverflow();
+ }
+ return h_sgn + h_sig;
+ }
+ else {
+ return h_sgn + h_exp + h_sig;
+ }
+}
+
+template<bool gen_overflow=true, bool gen_underflow=true, bool round_even=true>
+inline uint16_t FromDoubleBits(uint64_t d)
+{
+ uint64_t d_exp, d_sig;
+ uint16_t h_sgn, h_exp, h_sig;
+
+ h_sgn = (d&0x8000000000000000ULL) >> 48;
+ d_exp = (d&0x7ff0000000000000ULL);
+
+ /* Exponent overflow/NaN converts to signed inf/NaN */
+ if (d_exp >= 0x40f0000000000000ULL) {
+ if (d_exp == 0x7ff0000000000000ULL) {
+ /* Inf or NaN */
+ d_sig = (d&0x000fffffffffffffULL);
+ if (d_sig != 0) {
+ /* NaN - propagate the flag in the significand... */
+ uint16_t ret = (uint16_t) (0x7c00u + (d_sig >> 42));
+ /* ...but make sure it stays a NaN */
+ if (ret == 0x7c00u) {
+ ret++;
+ }
+ return h_sgn + ret;
+ } else {
+ /* signed inf */
+ return h_sgn + 0x7c00u;
+ }
+ } else {
+ /* overflow to signed inf */
+ if constexpr (gen_overflow) {
+ FloatStatus::RaiseOverflow();
+ }
+ return h_sgn + 0x7c00u;
+ }
+ }
+
+ /* Exponent underflow converts to subnormal half or signed zero */
+ if (d_exp <= 0x3f00000000000000ULL) {
+ /*
+ * Signed zeros, subnormal floats, and floats with small
+ * exponents all convert to signed zero half-floats.
+ */
+ if (d_exp < 0x3e60000000000000ULL) {
+ if constexpr (gen_underflow) {
+ /* If d != 0, it underflowed to 0 */
+ if ((d&0x7fffffffffffffffULL) != 0) {
+ FloatStatus::RaiseUnderflow();
+ }
+ }
+ return h_sgn;
+ }
+ /* Make the subnormal significand */
+ d_exp >>= 52;
+ d_sig = (0x0010000000000000ULL + (d&0x000fffffffffffffULL));
+ if constexpr (gen_underflow) {
+ /* If it's not exactly represented, it underflowed */
+ if ((d_sig&(((uint64_t)1 << (1051 - d_exp)) - 1)) != 0) {
+ FloatStatus::RaiseUnderflow();
+ }
+ }
+ /*
+ * Unlike floats, doubles have enough room to shift left to align
+ * the subnormal significand leading to no loss of the last bits.
+ * The smallest possible exponent giving a subnormal is:
+ * `d_exp = 0x3e60000000000000 >> 52 = 998`. All larger subnormals are
+ * shifted with respect to it. This adds a shift of 10+1 bits the final
+ * right shift when comparing it to the one in the normal branch.
+ */
+ assert(d_exp >= 998);
+ d_sig <<= (d_exp - 998);
+ /* Handle rounding by adding 1 to the bit beyond half precision */
+ if constexpr (round_even) {
+ /*
+ * If the last bit in the half significand is 0 (already even), and
+ * the remaining bit pattern is 1000...0, then we do not add one
+ * to the bit after the half significand. In all other cases, we do.
+ */
+ if ((d_sig&0x003fffffffffffffULL) != 0x0010000000000000ULL) {
+ d_sig += 0x0010000000000000ULL;
+ }
+ }
+ else {
+ d_sig += 0x0010000000000000ULL;
+ }
+ h_sig = (uint16_t) (d_sig >> 53);
+ /*
+ * If the rounding causes a bit to spill into h_exp, it will
+ * increment h_exp from zero to one and h_sig will be zero.
+ * This is the correct result.
+ */
+ return h_sgn + h_sig;
+ }
+
+ /* Regular case with no overflow or underflow */
+ h_exp = (uint16_t) ((d_exp - 0x3f00000000000000ULL) >> 42);
+ /* Handle rounding by adding 1 to the bit beyond half precision */
+ d_sig = (d&0x000fffffffffffffULL);
+ if constexpr (round_even) {
+ /*
+ * If the last bit in the half significand is 0 (already even), and
+ * the remaining bit pattern is 1000...0, then we do not add one
+ * to the bit after the half significand. In all other cases, we do.
+ */
+ if ((d_sig&0x000007ffffffffffULL) != 0x0000020000000000ULL) {
+ d_sig += 0x0000020000000000ULL;
+ }
+ }
+ else {
+ d_sig += 0x0000020000000000ULL;
+ }
+ h_sig = (uint16_t) (d_sig >> 42);
+
+ /*
+ * If the rounding causes a bit to spill into h_exp, it will
+ * increment h_exp by one and h_sig will be zero. This is the
+ * correct result. h_exp may increment to 15, at greatest, in
+ * which case the result overflows to a signed inf.
+ */
+ if constexpr (gen_overflow) {
+ h_sig += h_exp;
+ if (h_sig == 0x7c00u) {
+ FloatStatus::RaiseOverflow();
+ }
+ return h_sgn + h_sig;
+ }
+ else {
+ return h_sgn + h_exp + h_sig;
+ }
+}
+
+constexpr uint32_t ToFloatBits(uint16_t h)
+{
+ uint16_t h_exp = (h&0x7c00u);
+ uint32_t f_sgn = ((uint32_t)h&0x8000u) << 16;
+ switch (h_exp) {
+ case 0x0000u: { // 0 or subnormal
+ uint16_t h_sig = (h&0x03ffu);
+ // Signed zero
+ if (h_sig == 0) {
+ return f_sgn;
+ }
+ // Subnormal
+ h_sig <<= 1;
+ while ((h_sig&0x0400u) == 0) {
+ h_sig <<= 1;
+ h_exp++;
+ }
+ uint32_t f_exp = ((uint32_t)(127 - 15 - h_exp)) << 23;
+ uint32_t f_sig = ((uint32_t)(h_sig&0x03ffu)) << 13;
+ return f_sgn + f_exp + f_sig;
+ }
+ case 0x7c00u: // inf or NaN
+ // All-ones exponent and a copy of the significand
+ return f_sgn + 0x7f800000u + (((uint32_t)(h&0x03ffu)) << 13);
+ default: // normalized
+ // Just need to adjust the exponent and shift
+ return f_sgn + (((uint32_t)(h&0x7fffu) + 0x1c000u) << 13);
+ }
+}
+
+constexpr uint64_t ToDoubleBits(uint16_t h)
+{
+ uint16_t h_exp = (h&0x7c00u);
+ uint64_t d_sgn = ((uint64_t)h&0x8000u) << 48;
+ switch (h_exp) {
+ case 0x0000u: { // 0 or subnormal
+ uint16_t h_sig = (h&0x03ffu);
+ // Signed zero
+ if (h_sig == 0) {
+ return d_sgn;
+ }
+ // Subnormal
+ h_sig <<= 1;
+ while ((h_sig&0x0400u) == 0) {
+ h_sig <<= 1;
+ h_exp++;
+ }
+ uint64_t d_exp = ((uint64_t)(1023 - 15 - h_exp)) << 52;
+ uint64_t d_sig = ((uint64_t)(h_sig&0x03ffu)) << 42;
+ return d_sgn + d_exp + d_sig;
+ }
+ case 0x7c00u: // inf or NaN
+ // All-ones exponent and a copy of the significand
+ return d_sgn + 0x7ff0000000000000ULL + (((uint64_t)(h&0x03ffu)) << 42);
+ default: // normalized
+ // Just need to adjust the exponent and shift
+ return d_sgn + (((uint64_t)(h&0x7fffu) + 0xfc000u) << 42);
+ }
+}
+
+}} // namespace np::half_private
+#endif // NUMPY_CORE_SRC_COMMON_HALF_PRIVATE_HPP
diff --git a/numpy/core/src/common/meta.hpp b/numpy/core/src/common/meta.hpp
new file mode 100644
index 000000000..27ea1857e
--- /dev/null
+++ b/numpy/core/src/common/meta.hpp
@@ -0,0 +1,54 @@
+#ifndef NUMPY_CORE_SRC_COMMON_META_HPP
+#define NUMPY_CORE_SRC_COMMON_META_HPP
+
+#include "npstd.hpp"
+
+namespace np { namespace meta {
+/// @addtogroup cpp_core_meta
+/// @{
+
+namespace details {
+template<int size, bool unsig>
+struct IntBySize;
+
+template<bool unsig>
+struct IntBySize<sizeof(uint8_t), unsig> {
+ using Type = typename std::conditional<
+ unsig, uint8_t, int8_t>::type;
+};
+template<bool unsig>
+struct IntBySize<sizeof(uint16_t), unsig> {
+ using Type = typename std::conditional<
+ unsig, uint16_t, int16_t>::type;
+};
+template<bool unsig>
+struct IntBySize<sizeof(uint32_t), unsig> {
+ using Type = typename std::conditional<
+ unsig, uint32_t, int32_t>::type;
+};
+template<bool unsig>
+struct IntBySize<sizeof(uint64_t), unsig> {
+ using Type = typename std::conditional<
+ unsig, uint64_t, int64_t>::type;
+};
+} // namespace details
+
+/// Provides safe conversion of any integer type synonyms
+/// to a fixed-width integer type.
+template<typename T>
+struct FixedWidth {
+ using TF_ = typename details::IntBySize<
+ sizeof(T), std::is_unsigned<T>::value
+ >::Type;
+
+ using Type = typename std::conditional<
+ std::is_integral<T>::value, TF_, T
+ >::type;
+};
+
+/// @} cpp_core_meta
+
+}} // namespace np::meta
+
+#endif // NUMPY_CORE_SRC_COMMON_META_HPP
+
diff --git a/numpy/core/src/common/npdef.hpp b/numpy/core/src/common/npdef.hpp
new file mode 100644
index 000000000..56a0df52e
--- /dev/null
+++ b/numpy/core/src/common/npdef.hpp
@@ -0,0 +1,28 @@
+#ifndef NUMPY_CORE_SRC_COMMON_NPDEF_HPP
+#define NUMPY_CORE_SRC_COMMON_NPDEF_HPP
+
+#if !defined(__cplusplus) || __cplusplus < 201703L
+ #error "NumPy requires a compiler with at least C++17 enabled"
+#endif
+
+/// @addtogroup cpp_core_defs
+/// @{
+
+/// Whether compiler supports C++20
+#if __cplusplus > 202002L
+ #define NP_HAS_CPP20 1
+#else
+ #define NP_HAS_CPP20 0
+#endif
+
+/// Wraps `__has_builtin`
+#if defined(__has_builtin)
+ #define NP_HAS_BUILTIN(INTRIN) __has_builtin(INTRIN)
+#else
+ #define NP_HAS_BUILTIN(INTRIN) 0
+#endif
+
+/// @} cpp_core_defs
+
+#endif // NUMPY_CORE_SRC_COMMON_NPDEF_HPP
+
diff --git a/numpy/core/src/common/npstd.hpp b/numpy/core/src/common/npstd.hpp
new file mode 100644
index 000000000..92e3d5cc5
--- /dev/null
+++ b/numpy/core/src/common/npstd.hpp
@@ -0,0 +1,57 @@
+#ifndef NUMPY_CORE_SRC_COMMON_NPSTD_HPP
+#define NUMPY_CORE_SRC_COMMON_NPSTD_HPP
+
+#include <cstddef>
+#include <cstring>
+#include <cctype>
+#include <cstdint>
+
+#include <string>
+#include <algorithm>
+#include <utility>
+#include <cstdlib>
+#include <cmath>
+#include <complex>
+#include <type_traits>
+
+#include <numpy/npy_common.h>
+
+#include "npy_config.h"
+
+namespace np {
+/// @addtogroup cpp_core_types
+/// @{
+using std::uint8_t;
+using std::int8_t;
+using std::uint16_t;
+using std::int16_t;
+using std::uint32_t;
+using std::int32_t;
+using std::uint64_t;
+using std::int64_t;
+using std::uintptr_t;
+using std::intptr_t;
+using std::complex;
+using std::uint_fast16_t;
+using std::uint_fast32_t;
+
+/** Guard for long double.
+ *
+ * The C implementation defines long double as double
+ * on MinGW to provide compatibility with MSVC to unify
+ * one behavior under Windows OS, which makes npy_longdouble
+ * not fit to be used with template specialization or overloading.
+ *
+ * This type will be set to `void` when `npy_longdouble` is not defined
+ * as `long double`.
+ */
+using LongDouble = typename std::conditional<
+ !std::is_same<npy_longdouble, long double>::value,
+ void, npy_longdouble
+>::type;
+/// @} cpp_core_types
+
+} // namespace np
+
+#endif // NUMPY_CORE_SRC_COMMON_NPSTD_HPP
+
diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h
index d6886c5ea..715b17777 100644
--- a/numpy/core/src/common/npy_config.h
+++ b/numpy/core/src/common/npy_config.h
@@ -2,6 +2,7 @@
#define NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_
#include "config.h"
+#include "npy_cpu_dispatch.h" // brings NPY_HAVE_[CPU features]
#include "numpy/numpyconfig.h"
#include "numpy/utils.h"
#include "numpy/npy_os.h"
diff --git a/numpy/core/src/common/npy_cpu_features.c b/numpy/core/src/common/npy_cpu_features.c
index 7563979d1..64a85f6fb 100644
--- a/numpy/core/src/common/npy_cpu_features.c
+++ b/numpy/core/src/common/npy_cpu_features.c
@@ -14,13 +14,15 @@ static unsigned char npy__cpu_have[NPY_CPU_FEATURE_MAX];
static void
npy__cpu_init_features(void);
/*
- * Disable CPU dispatched features at runtime if environment variable
- * 'NPY_DISABLE_CPU_FEATURES' is defined.
+ * Enable or disable CPU dispatched features at runtime if the environment variable
+ * `NPY_ENABLE_CPU_FEATURES` or `NPY_DISABLE_CPU_FEATURES`
+ * depends on the value of boolean parameter `disable`(toggle).
+ *
* Multiple features can be present, and separated by space, comma, or tab.
- * Raises an error if parsing fails or if the feature was not enabled
+ * Raises an error if parsing fails or if the feature was not enabled or disabled
*/
static int
-npy__cpu_try_disable_env(void);
+npy__cpu_check_env(int disable, const char *env);
/* Ensure the build's CPU baseline features are supported at runtime */
static int
@@ -43,9 +45,22 @@ npy_cpu_init(void)
if (npy__cpu_validate_baseline() < 0) {
return -1;
}
- if (npy__cpu_try_disable_env() < 0) {
+ char *enable_env = getenv("NPY_ENABLE_CPU_FEATURES");
+ char *disable_env = getenv("NPY_DISABLE_CPU_FEATURES");
+ int is_enable = enable_env && enable_env[0];
+ int is_disable = disable_env && disable_env[0];
+ if (is_enable & is_disable) {
+ PyErr_Format(PyExc_ImportError,
+ "Both NPY_DISABLE_CPU_FEATURES and NPY_ENABLE_CPU_FEATURES "
+ "environment variables cannot be set simultaneously."
+ );
return -1;
}
+ if (is_enable | is_disable) {
+ if (npy__cpu_check_env(is_disable, is_disable ? disable_env : enable_env) < 0) {
+ return -1;
+ }
+ }
return 0;
}
@@ -81,12 +96,14 @@ static struct {
{NPY_CPU_FEATURE_AVX512VBMI, "AVX512VBMI"},
{NPY_CPU_FEATURE_AVX512VBMI2, "AVX512VBMI2"},
{NPY_CPU_FEATURE_AVX512BITALG, "AVX512BITALG"},
+ {NPY_CPU_FEATURE_AVX512FP16 , "AVX512FP16"},
{NPY_CPU_FEATURE_AVX512_KNL, "AVX512_KNL"},
{NPY_CPU_FEATURE_AVX512_KNM, "AVX512_KNM"},
{NPY_CPU_FEATURE_AVX512_SKX, "AVX512_SKX"},
{NPY_CPU_FEATURE_AVX512_CLX, "AVX512_CLX"},
{NPY_CPU_FEATURE_AVX512_CNL, "AVX512_CNL"},
{NPY_CPU_FEATURE_AVX512_ICL, "AVX512_ICL"},
+ {NPY_CPU_FEATURE_AVX512_SPR, "AVX512_SPR"},
{NPY_CPU_FEATURE_VSX, "VSX"},
{NPY_CPU_FEATURE_VSX2, "VSX2"},
{NPY_CPU_FEATURE_VSX3, "VSX3"},
@@ -208,7 +225,8 @@ npy__cpu_validate_baseline(void)
*(fptr-1) = '\0'; // trim the last space
PyErr_Format(PyExc_RuntimeError,
"NumPy was built with baseline optimizations: \n"
- "(" NPY_WITH_CPU_BASELINE ") but your machine doesn't support:\n(%s).",
+ "(" NPY_WITH_CPU_BASELINE ") but your machine "
+ "doesn't support:\n(%s).",
baseline_failure
);
return -1;
@@ -218,27 +236,31 @@ npy__cpu_validate_baseline(void)
}
static int
-npy__cpu_try_disable_env(void)
-{
- char *disenv = getenv("NPY_DISABLE_CPU_FEATURES");
- if (disenv == NULL || disenv[0] == 0) {
- return 0;
- }
- #define NPY__CPU_ENV_ERR_HEAD \
- "During parsing environment variable 'NPY_DISABLE_CPU_FEATURES':\n"
+npy__cpu_check_env(int disable, const char *env) {
+
+ static const char *names[] = {
+ "enable", "disable",
+ "NPY_ENABLE_CPU_FEATURES", "NPY_DISABLE_CPU_FEATURES",
+ "During parsing environment variable: 'NPY_ENABLE_CPU_FEATURES':\n",
+ "During parsing environment variable: 'NPY_DISABLE_CPU_FEATURES':\n"
+ };
+ disable = disable ? 1 : 0;
+ const char *act_name = names[disable];
+ const char *env_name = names[disable + 2];
+ const char *err_head = names[disable + 4];
#if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_DISPATCH_N > 0
#define NPY__MAX_VAR_LEN 1024 // More than enough for this era
- size_t var_len = strlen(disenv) + 1;
+ size_t var_len = strlen(env) + 1;
if (var_len > NPY__MAX_VAR_LEN) {
PyErr_Format(PyExc_RuntimeError,
- "Length of environment variable 'NPY_DISABLE_CPU_FEATURES' is %d, only %d accepted",
- var_len, NPY__MAX_VAR_LEN - 1
+ "Length of environment variable '%s' is %d, only %d accepted",
+ env_name, var_len, NPY__MAX_VAR_LEN
);
return -1;
}
- char disable_features[NPY__MAX_VAR_LEN];
- memcpy(disable_features, disenv, var_len);
+ char features[NPY__MAX_VAR_LEN];
+ memcpy(features, env, var_len);
char nexist[NPY__MAX_VAR_LEN];
char *nexist_cur = &nexist[0];
@@ -248,17 +270,19 @@ npy__cpu_try_disable_env(void)
//comma and space including (htab, vtab, CR, LF, FF)
const char *delim = ", \t\v\r\n\f";
- char *feature = strtok(disable_features, delim);
+ char *feature = strtok(features, delim);
while (feature) {
- if (npy__cpu_baseline_fid(feature) > 0) {
- PyErr_Format(PyExc_RuntimeError,
- NPY__CPU_ENV_ERR_HEAD
- "You cannot disable CPU feature '%s', since it is part of "
- "the baseline optimizations:\n"
- "(" NPY_WITH_CPU_BASELINE ").",
- feature
- );
- return -1;
+ if (npy__cpu_baseline_fid(feature) > 0){
+ if (disable) {
+ PyErr_Format(PyExc_RuntimeError,
+ "%s"
+ "You cannot disable CPU feature '%s', since it is part of "
+ "the baseline optimizations:\n"
+ "(" NPY_WITH_CPU_BASELINE ").",
+ err_head, feature
+ );
+ return -1;
+ } goto next;
}
// check if the feature is part of dispatched features
int feature_id = npy__cpu_dispatch_fid(feature);
@@ -275,47 +299,58 @@ npy__cpu_try_disable_env(void)
notsupp_cur[flen] = ' '; notsupp_cur += flen + 1;
goto next;
}
- // Finally we can disable it
- npy__cpu_have[feature_id] = 0;
+ // Finally we can disable or mark for enabling
+ npy__cpu_have[feature_id] = disable ? 0:2;
next:
feature = strtok(NULL, delim);
}
+ if (!disable){
+ // Disables any unmarked dispatched feature.
+ #define NPY__CPU_DISABLE_DISPATCH_CB(FEATURE, DUMMY) \
+ if(npy__cpu_have[NPY_CAT(NPY_CPU_FEATURE_, FEATURE)] != 0)\
+ {npy__cpu_have[NPY_CAT(NPY_CPU_FEATURE_, FEATURE)]--;}\
+
+ NPY_WITH_CPU_DISPATCH_CALL(NPY__CPU_DISABLE_DISPATCH_CB, DUMMY) // extra arg for msvc
+ }
*nexist_cur = '\0';
if (nexist[0] != '\0') {
*(nexist_cur-1) = '\0'; // trim the last space
- if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
- NPY__CPU_ENV_ERR_HEAD
- "You cannot disable CPU features (%s), since "
- "they are not part of the dispatched optimizations\n"
- "(" NPY_WITH_CPU_DISPATCH ").",
- nexist
+ if (PyErr_WarnFormat(PyExc_ImportWarning, 1,
+ "%sYou cannot %s CPU features (%s), since "
+ "they are not part of the dispatched optimizations\n"
+ "(" NPY_WITH_CPU_DISPATCH ").",
+ err_head, act_name, nexist
) < 0) {
return -1;
}
+ return 0;
}
+ #define NOTSUPP_BODY \
+ "%s" \
+ "You cannot %s CPU features (%s), since " \
+ "they are not supported by your machine.", \
+ err_head, act_name, notsupp
+
*notsupp_cur = '\0';
if (notsupp[0] != '\0') {
*(notsupp_cur-1) = '\0'; // trim the last space
- if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
- NPY__CPU_ENV_ERR_HEAD
- "You cannot disable CPU features (%s), since "
- "they are not supported by your machine.",
- notsupp
- ) < 0) {
+ if (!disable){
+ PyErr_Format(PyExc_RuntimeError, NOTSUPP_BODY);
return -1;
}
}
#else
- if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
- NPY__CPU_ENV_ERR_HEAD
- "You cannot use environment variable 'NPY_DISABLE_CPU_FEATURES', since "
+ if (PyErr_WarnFormat(PyExc_ImportWarning, 1,
+ "%s"
+ "You cannot use environment variable '%s', since "
#ifdef NPY_DISABLE_OPTIMIZATION
- "the NumPy library was compiled with optimization disabled."
+ "the NumPy library was compiled with optimization disabled.",
#else
- "the NumPy library was compiled without any dispatched optimizations."
+ "the NumPy library was compiled without any dispatched optimizations.",
#endif
+ err_head, env_name, act_name
) < 0) {
return -1;
}
@@ -506,6 +541,11 @@ npy__cpu_init_features(void)
npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI2] &&
npy__cpu_have[NPY_CPU_FEATURE_AVX512BITALG] &&
npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ];
+ // Sapphire Rapids
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512FP16] = (reg[3] & (1 << 23)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512_SPR] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_ICL] &&
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512FP16];
+
}
}
@@ -513,7 +553,10 @@ npy__cpu_init_features(void)
#elif defined(NPY_CPU_PPC64) || defined(NPY_CPU_PPC64LE)
-#ifdef __linux__
+#if defined(__linux__) || defined(__FreeBSD__)
+ #ifdef __FreeBSD__
+ #include <machine/cpu.h> // defines PPC_FEATURE_HAS_VSX
+ #endif
#include <sys/auxv.h>
#ifndef AT_HWCAP2
#define AT_HWCAP2 26
@@ -533,12 +576,21 @@ static void
npy__cpu_init_features(void)
{
memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);
+#if defined(__linux__) || defined(__FreeBSD__)
#ifdef __linux__
unsigned int hwcap = getauxval(AT_HWCAP);
if ((hwcap & PPC_FEATURE_HAS_VSX) == 0)
return;
hwcap = getauxval(AT_HWCAP2);
+#else
+ unsigned long hwcap;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ if ((hwcap & PPC_FEATURE_HAS_VSX) == 0)
+ return;
+
+ elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
+#endif // __linux__
if (hwcap & PPC_FEATURE2_ARCH_3_1)
{
npy__cpu_have[NPY_CPU_FEATURE_VSX] =
@@ -551,7 +603,7 @@ npy__cpu_init_features(void)
npy__cpu_have[NPY_CPU_FEATURE_VSX2] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0;
npy__cpu_have[NPY_CPU_FEATURE_VSX3] = (hwcap & PPC_FEATURE2_ARCH_3_00) != 0;
npy__cpu_have[NPY_CPU_FEATURE_VSX4] = (hwcap & PPC_FEATURE2_ARCH_3_1) != 0;
-// TODO: AIX, FreeBSD
+// TODO: AIX, OpenBSD
#else
npy__cpu_have[NPY_CPU_FEATURE_VSX] = 1;
#if defined(NPY_CPU_PPC64LE) || defined(NPY_HAVE_VSX2)
diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h
index 96c543e70..9180670c4 100644
--- a/numpy/core/src/common/npy_cpu_features.h
+++ b/numpy/core/src/common/npy_cpu_features.h
@@ -43,6 +43,7 @@ enum npy_cpu_features
NPY_CPU_FEATURE_AVX512VNNI = 42,
NPY_CPU_FEATURE_AVX512VBMI2 = 43,
NPY_CPU_FEATURE_AVX512BITALG = 44,
+ NPY_CPU_FEATURE_AVX512FP16 = 45,
// X86 CPU Groups
// Knights Landing (F,CD,ER,PF)
@@ -57,6 +58,8 @@ enum npy_cpu_features
NPY_CPU_FEATURE_AVX512_CNL = 105,
// Ice Lake (F,CD,BW,DQ,VL,IFMA,VBMI,VNNI,VBMI2,BITALG,VPOPCNTDQ)
NPY_CPU_FEATURE_AVX512_ICL = 106,
+ // Sapphire Rapids (Ice Lake, AVX512FP16)
+ NPY_CPU_FEATURE_AVX512_SPR = 107,
// IBM/POWER VSX
// POWER7
@@ -103,13 +106,25 @@ enum npy_cpu_features
* - detects runtime CPU features
* - check that baseline CPU features are present
* - uses 'NPY_DISABLE_CPU_FEATURES' to disable dispatchable features
+ * - uses 'NPY_ENABLE_CPU_FEATURES' to enable dispatchable features
*
* It will set a RuntimeError when
* - CPU baseline features from the build are not supported at runtime
* - 'NPY_DISABLE_CPU_FEATURES' tries to disable a baseline feature
- * and will warn if 'NPY_DISABLE_CPU_FEATURES' tries to disable a feature that
- * is not disabled (the machine or build does not support it, or the project was
- * not built with any feature optimization support)
+ * - 'NPY_DISABLE_CPU_FEATURES' and 'NPY_ENABLE_CPU_FEATURES' are
+ * simultaneously set
+ * - 'NPY_ENABLE_CPU_FEATURES' tries to enable a feature that is not supported
+ * by the machine or build
+ * - 'NPY_ENABLE_CPU_FEATURES' tries to enable a feature when the project was
+ * not built with any feature optimization support
+ *
+ * It will set an ImportWarning when:
+ * - 'NPY_DISABLE_CPU_FEATURES' tries to disable a feature that is not supported
+ * by the machine or build
+ * - 'NPY_DISABLE_CPU_FEATURES' or 'NPY_ENABLE_CPU_FEATURES' tries to
+ * disable/enable a feature when the project was not built with any feature
+ * optimization support
+ *
* return 0 on success otherwise return -1
*/
NPY_VISIBILITY_HIDDEN int
diff --git a/numpy/core/src/common/numpyos.c b/numpy/core/src/common/numpyos.c
index 5ee95191d..2fec06e1c 100644
--- a/numpy/core/src/common/numpyos.c
+++ b/numpy/core/src/common/numpyos.c
@@ -779,3 +779,25 @@ NumPyOS_strtoull(const char *str, char **endptr, int base)
return strtoull(str, endptr, base);
}
+#ifdef _MSC_VER
+
+#include <stdlib.h>
+
+#if _MSC_VER >= 1900
+/* npy3k_compat.h uses this function in the _Py_BEGIN/END_SUPPRESS_IPH
+ * macros. It does not need to be defined when building using MSVC
+ * earlier than 14.0 (_MSC_VER == 1900).
+ */
+
+static void __cdecl _silent_invalid_parameter_handler(
+ wchar_t const* expression,
+ wchar_t const* function,
+ wchar_t const* file,
+ unsigned int line,
+ uintptr_t pReserved) { }
+
+_invalid_parameter_handler _Py_silent_invalid_parameter_handler = _silent_invalid_parameter_handler;
+
+#endif
+
+#endif
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index ad9688338..58d842a6d 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -247,6 +247,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
// negate multiply and subtract, -(a*b) - c
#define npyv_nmulsub_f32 _mm256_fnmsub_ps
#define npyv_nmulsub_f64 _mm256_fnmsub_pd
+ // multiply, add for odd elements and subtract even elements.
+ // (a * b) -+ c
+ #define npyv_muladdsub_f32 _mm256_fmaddsub_ps
+ #define npyv_muladdsub_f64 _mm256_fmaddsub_pd
#else
// multiply and add, a*b + c
NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
@@ -274,6 +278,13 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
npyv_f64 neg_a = npyv_xor_f64(a, npyv_setall_f64(-0.0));
return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
}
+ // multiply, add for odd elements and subtract even elements.
+ // (a * b) -+ c
+ NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return _mm256_addsub_ps(npyv_mul_f32(a, b), c); }
+ NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return _mm256_addsub_pd(npyv_mul_f64(a, b), c); }
+
#endif // !NPY_HAVE_FMA3
/***************************
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
index 8cb74df2b..d64f3c6d6 100644
--- a/numpy/core/src/common/simd/avx2/avx2.h
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -11,6 +11,7 @@
#define NPY_SIMD_FMA3 0 // fast emulated
#endif
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 0
// Enough limit to allow us to use _mm256_i32gather_*
#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8)
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h
index 410c35dc8..81144a36b 100644
--- a/numpy/core/src/common/simd/avx2/memory.h
+++ b/numpy/core/src/common/simd/avx2/memory.h
@@ -84,17 +84,6 @@ NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
{ return _mm256_castsi256_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); }
//// 64
-#if 0 // slower
-NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
-{
- const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
- return _mm256_i64gather_epi64((const void*)ptr, idx, 8);
-}
-NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
-{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
-NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
-{ return _mm256_castsi256_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
-#endif
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
{
__m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr));
@@ -107,6 +96,32 @@ NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{
+ __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr));
+ __m128d a2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2)));
+ __m128d a01 = _mm_loadh_pd(a0, (const double*)(ptr + stride));
+ __m128d a23 = _mm_loadh_pd(a2, (const double*)(ptr + stride*3));
+ return _mm256_castpd_ps(_mm256_insertf128_pd(_mm256_castpd128_pd256(a01), a23, 1));
+}
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return _mm256_castps_si256(npyv_loadn2_f32((const float*)ptr, stride)); }
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return _mm256_castps_si256(npyv_loadn2_f32((const float*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{
+ __m256d a = npyv_loadl_f64(ptr);
+ return _mm256_insertf128_pd(a, _mm_loadu_pd(ptr + stride), 1);
+}
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return _mm256_castpd_si256(npyv_loadn2_f64((const double*)ptr, stride)); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ return _mm256_castpd_si256(npyv_loadn2_f64((const double*)ptr, stride)); }
+
/***************************
* Non-contiguous Store
***************************/
@@ -143,6 +158,32 @@ NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
{ npyv_storen_f64((double*)ptr, stride, _mm256_castsi256_pd(a)); }
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a));
+ __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1);
+ _mm_storel_pd((double*)ptr, a0);
+ _mm_storeh_pd((double*)(ptr + stride), a0);
+ _mm_storel_pd((double*)(ptr + stride*2), a1);
+ _mm_storeh_pd((double*)(ptr + stride*3), a1);
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm256_castps_si256(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{
+ npyv_storel_u64(ptr, a);
+ npyv_storeh_u64(ptr + stride, a);
+}
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, _mm256_castpd_si256(a)); }
+
/*********************************
* Partial Load
*********************************/
@@ -174,7 +215,7 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+ __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
}
// fill zero to rest lanes
@@ -184,7 +225,45 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- return _mm256_maskload_epi64((const void*)ptr, mask);
+ return _mm256_maskload_epi64((const long long*)ptr, mask);
+}
+
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ const __m256i vfill = npyv_set_s32(
+ fill_lo, fill_hi, fill_lo, fill_hi,
+ fill_lo, fill_hi, fill_lo, fill_hi
+ );
+ const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+ __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
+ return _mm256_blendv_epi8(vfill, payload, mask);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+/// 128-bit nlane
+NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ npy_int64 m = -((npy_int64)(nlane > 1));
+ __m256i mask = npyv_set_s64(-1, -1, m, m);
+ return _mm256_maskload_epi64((const long long*)ptr, mask);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{
+ const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi);
+ npy_int64 m = -((npy_int64)(nlane > 1));
+ __m256i mask = npyv_set_s64(-1, -1, m, m);
+ __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
+ return _mm256_blendv_epi8(vfill, payload, mask);
}
/*********************************
* Non-contiguous partial load
@@ -216,12 +295,53 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
+ return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64
npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ const __m256i vfill = npyv_set_s32(
+ fill_lo, fill_hi, fill_lo, fill_hi,
+ fill_lo, fill_hi, fill_lo, fill_hi
+ );
+ const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
+ const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+ __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{
+ assert(nlane > 0);
+ __m256i a = npyv_loadl_s64(ptr);
+#if defined(_MSC_VER) && defined(_M_IX86)
+ __m128i fill =_mm_setr_epi32(
+ (int)fill_lo, (int)(fill_lo >> 32),
+ (int)fill_hi, (int)(fill_hi >> 32)
+ );
+#else
+ __m128i fill = _mm_set_epi64x(fill_hi, fill_lo);
+#endif
+ __m128i b = nlane > 1 ? _mm_loadu_si128((const __m128i*)(ptr + stride)) : fill;
+ return _mm256_inserti128_si256(a, b, 1);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); }
+
/*********************************
* Partial store
*********************************/
@@ -241,7 +361,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- _mm256_maskstore_epi64((void*)ptr, mask, a);
+ _mm256_maskstore_epi64((long long*)ptr, mask, a);
+}
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{ npyv_store_till_s64((npy_int64*)ptr, nlane, a); }
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ npyv_storel_s64(ptr, a);
+ if (nlane > 1) {
+ npyv_storeh_s64(ptr + 2, a);
+ }
}
/*********************************
* Non-contiguous partial store
@@ -252,23 +386,52 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp
assert(nlane > 0);
__m128i a0 = _mm256_castsi256_si128(a);
__m128i a1 = _mm256_extracti128_si256(a, 1);
+
+ ptr[stride*0] = _mm_extract_epi32(a0, 0);
switch(nlane) {
- default:
- ptr[stride*7] = _mm_extract_epi32(a1, 3);
- case 7:
- ptr[stride*6] = _mm_extract_epi32(a1, 2);
- case 6:
- ptr[stride*5] = _mm_extract_epi32(a1, 1);
+ case 1:
+ return;
+ case 2:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ return;
+ case 3:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
+ return;
+ case 4:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
+ ptr[stride*3] = _mm_extract_epi32(a0, 3);
+ return;
case 5:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
+ ptr[stride*3] = _mm_extract_epi32(a0, 3);
ptr[stride*4] = _mm_extract_epi32(a1, 0);
- case 4:
+ return;
+ case 6:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
ptr[stride*3] = _mm_extract_epi32(a0, 3);
- case 3:
+ ptr[stride*4] = _mm_extract_epi32(a1, 0);
+ ptr[stride*5] = _mm_extract_epi32(a1, 1);
+ return;
+ case 7:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
ptr[stride*2] = _mm_extract_epi32(a0, 2);
- case 2:
+ ptr[stride*3] = _mm_extract_epi32(a0, 3);
+ ptr[stride*4] = _mm_extract_epi32(a1, 0);
+ ptr[stride*5] = _mm_extract_epi32(a1, 1);
+ ptr[stride*6] = _mm_extract_epi32(a1, 2);
+ return;
+ default:
ptr[stride*1] = _mm_extract_epi32(a0, 1);
- case 1:
- ptr[stride*0] = _mm_extract_epi32(a0, 0);
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
+ ptr[stride*3] = _mm_extract_epi32(a0, 3);
+ ptr[stride*4] = _mm_extract_epi32(a1, 0);
+ ptr[stride*5] = _mm_extract_epi32(a1, 1);
+ ptr[stride*6] = _mm_extract_epi32(a1, 2);
+ ptr[stride*7] = _mm_extract_epi32(a1, 3);
}
}
//// 64
@@ -277,19 +440,60 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
assert(nlane > 0);
__m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a));
__m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1);
+
double *dptr = (double*)ptr;
+ _mm_storel_pd(dptr, a0);
switch(nlane) {
- default:
- _mm_storeh_pd(dptr + stride * 3, a1);
+ case 1:
+ return;
+ case 2:
+ _mm_storeh_pd(dptr + stride * 1, a0);
+ return;
case 3:
+ _mm_storeh_pd(dptr + stride * 1, a0);
_mm_storel_pd(dptr + stride * 2, a1);
- case 2:
+ return;
+ default:
_mm_storeh_pd(dptr + stride * 1, a0);
+ _mm_storel_pd(dptr + stride * 2, a1);
+ _mm_storeh_pd(dptr + stride * 3, a1);
+ }
+}
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a));
+ __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1);
+
+ _mm_storel_pd((double*)ptr, a0);
+ switch(nlane) {
case 1:
- _mm_storel_pd(dptr + stride * 0, a0);
+ return;
+ case 2:
+ _mm_storeh_pd((double*)(ptr + stride * 1), a0);
+ return;
+ case 3:
+ _mm_storeh_pd((double*)(ptr + stride * 1), a0);
+ _mm_storel_pd((double*)(ptr + stride * 2), a1);
+ return;
+ default:
+ _mm_storeh_pd((double*)(ptr + stride * 1), a0);
+ _mm_storel_pd((double*)(ptr + stride * 2), a1);
+ _mm_storeh_pd((double*)(ptr + stride * 3), a1);
}
}
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ npyv_storel_s64(ptr, a);
+ if (nlane > 1) {
+ npyv_storeh_s64(ptr + stride, a);
+ }
+}
/*****************************************************************************
* Implement partial load/store for u32/f32/u64/f64... via reinterpret cast
*****************************************************************************/
@@ -300,7 +504,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
)); \
@@ -312,7 +517,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
)); \
@@ -353,6 +559,110 @@ NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f32, s32)
NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u64, s64)
NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f64, s64)
+// 128-bit/64-bit stride (load/store pair)
+#define NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+ pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_AVX2_MEM_INTERLEAVE(SFX, ZSFX) \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr \
+ ) { \
+ return npyv_unzip_##ZSFX( \
+ npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+ ) { \
+ npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \
+ npyv_store_##SFX(ptr, zip.val[0]); \
+ npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \
+ }
+
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u8, u8)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s8, u8)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u16, u16)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s16, u16)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u32, u32)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s32, u32)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u64, u64)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s64, u64)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(f32, f32)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(f64, f64)
+
/*********************************
* Lookup tables
*********************************/
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h
index c10267b21..7b9b6a344 100644
--- a/numpy/core/src/common/simd/avx2/operators.h
+++ b/numpy/core/src/common/simd/avx2/operators.h
@@ -205,7 +205,7 @@ NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b)
#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A)
#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A)
-// precision comparison
+// precision comparison (ordered)
#define npyv_cmpeq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_EQ_OQ))
#define npyv_cmpeq_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_EQ_OQ))
#define npyv_cmpneq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_NEQ_UQ))
diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h
index 4d6ec8f75..9ebe0e7f4 100644
--- a/numpy/core/src/common/simd/avx2/reorder.h
+++ b/numpy/core/src/common/simd/avx2/reorder.h
@@ -94,6 +94,75 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b)
return npyv_combine_f64(ab0, ab1);
}
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx);
+ __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx);
+ npyv_u8x2 ab_lh = npyv_combine_u8(ab_03, ab_12);
+ npyv_u8x2 r;
+ r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15,
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+ );
+ __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx);
+ __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx);
+ npyv_u16x2 ab_lh = npyv_combine_u16(ab_03, ab_12);
+ npyv_u16x2 r;
+ r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7);
+ __m256i abl = _mm256_permutevar8x32_epi32(ab0, idx);
+ __m256i abh = _mm256_permutevar8x32_epi32(ab1, idx);
+ return npyv_combine_u32(abl, abh);
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{
+ npyv_u64x2 ab_lh = npyv_combine_u64(ab0, ab1);
+ npyv_u64x2 r;
+ r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+ const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7);
+ __m256 abl = _mm256_permutevar8x32_ps(ab0, idx);
+ __m256 abh = _mm256_permutevar8x32_ps(ab1, idx);
+ return npyv_combine_f32(abl, abh);
+}
+
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{
+ npyv_f64x2 ab_lh = npyv_combine_f64(ab0, ab1);
+ npyv_f64x2 r;
+ r.val[0] = _mm256_unpacklo_pd(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_pd(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
{
@@ -126,4 +195,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
return _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
}
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ _mm256_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+ _mm256_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ _mm256_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+ _mm256_permute_pd(A, ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0))
+
#endif // _NPY_SIMD_AVX2_REORDER_H
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 1290dc0ad..a63da87d0 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -349,6 +349,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
// negate multiply and subtract, -(a*b) - c
#define npyv_nmulsub_f32 _mm512_fnmsub_ps
#define npyv_nmulsub_f64 _mm512_fnmsub_pd
+// multiply, add for odd elements and subtract even elements.
+// (a * b) -+ c
+#define npyv_muladdsub_f32 _mm512_fmaddsub_ps
+#define npyv_muladdsub_f64 _mm512_fmaddsub_pd
/***************************
* Summation: Calculates the sum of all vector elements.
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
index 0946e6443..aa6abe256 100644
--- a/numpy/core/src/common/simd/avx512/avx512.h
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -7,6 +7,7 @@
#define NPY_SIMD_F64 1
#define NPY_SIMD_FMA3 1 // native support
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 0
// Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16)
#define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
diff --git a/numpy/core/src/common/simd/avx512/maskop.h b/numpy/core/src/common/simd/avx512/maskop.h
index d1c188390..88fa4a68c 100644
--- a/numpy/core/src/common/simd/avx512/maskop.h
+++ b/numpy/core/src/common/simd/avx512/maskop.h
@@ -51,4 +51,17 @@ NPYV_IMPL_AVX512_MASK_ADDSUB(s64, b64, epi64)
NPYV_IMPL_AVX512_MASK_ADDSUB(f32, b32, ps)
NPYV_IMPL_AVX512_MASK_ADDSUB(f64, b64, pd)
+// division, m ? a / b : c
+NPY_FINLINE npyv_f32 npyv_ifdiv_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c)
+{ return _mm512_mask_div_ps(c, m, a, b); }
+// conditional division, m ? a / b : 0
+NPY_FINLINE npyv_f32 npyv_ifdivz_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b)
+{ return _mm512_maskz_div_ps(m, a, b); }
+// division, m ? a / b : c
+NPY_FINLINE npyv_f64 npyv_ifdiv_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b, npyv_f64 c)
+{ return _mm512_mask_div_pd(c, m, a, b); }
+// conditional division, m ? a / b : 0
+NPY_FINLINE npyv_f64 npyv_ifdivz_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b)
+{ return _mm512_maskz_div_pd(m, a, b); }
+
#endif // _NPY_SIMD_AVX512_MASKOP_H
diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h
index 03fcb4630..fdf96a92c 100644
--- a/numpy/core/src/common/simd/avx512/memory.h
+++ b/numpy/core/src/common/simd/avx512/memory.h
@@ -120,6 +120,52 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
{ return _mm512_castsi512_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ __m128d a = _mm_loadh_pd(
+ _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr)),
+ (const double*)(ptr + stride)
+ );
+ __m128d b = _mm_loadh_pd(
+ _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2))),
+ (const double*)(ptr + stride*3)
+ );
+ __m128d c = _mm_loadh_pd(
+ _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*4))),
+ (const double*)(ptr + stride*5)
+ );
+ __m128d d = _mm_loadh_pd(
+ _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*6))),
+ (const double*)(ptr + stride*7)
+ );
+ return _mm512_castpd_si512(npyv512_combine_pd256(
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(a), b, 1),
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(c), d, 1)
+ ));
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return npyv_loadn2_u32((const npy_uint32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{ return _mm512_castsi512_ps(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{
+ __m128d a = _mm_loadu_pd(ptr);
+ __m128d b = _mm_loadu_pd(ptr + stride);
+ __m128d c = _mm_loadu_pd(ptr + stride * 2);
+ __m128d d = _mm_loadu_pd(ptr + stride * 3);
+ return npyv512_combine_pd256(
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(a), b, 1),
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(c), d, 1)
+ );
+}
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return npyv_reinterpret_u64_f64(npyv_loadn2_f64((const double*)ptr, stride)); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ return npyv_loadn2_u64((const npy_uint64*)ptr, stride); }
/***************************
* Non-contiguous Store
***************************/
@@ -151,6 +197,48 @@ NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
{ npyv_storen_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); }
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ __m256d lo = _mm512_castpd512_pd256(_mm512_castsi512_pd(a));
+ __m256d hi = _mm512_extractf64x4_pd(_mm512_castsi512_pd(a), 1);
+ __m128d e0 = _mm256_castpd256_pd128(lo);
+ __m128d e1 = _mm256_extractf128_pd(lo, 1);
+ __m128d e2 = _mm256_castpd256_pd128(hi);
+ __m128d e3 = _mm256_extractf128_pd(hi, 1);
+ _mm_storel_pd((double*)(ptr + stride * 0), e0);
+ _mm_storeh_pd((double*)(ptr + stride * 1), e0);
+ _mm_storel_pd((double*)(ptr + stride * 2), e1);
+ _mm_storeh_pd((double*)(ptr + stride * 3), e1);
+ _mm_storel_pd((double*)(ptr + stride * 4), e2);
+ _mm_storeh_pd((double*)(ptr + stride * 5), e2);
+ _mm_storel_pd((double*)(ptr + stride * 6), e3);
+ _mm_storeh_pd((double*)(ptr + stride * 7), e3);
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm512_castps_si512(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{
+ __m256i lo = npyv512_lower_si256(a);
+ __m256i hi = npyv512_higher_si256(a);
+ __m128i e0 = _mm256_castsi256_si128(lo);
+ __m128i e1 = _mm256_extracti128_si256(lo, 1);
+ __m128i e2 = _mm256_castsi256_si128(hi);
+ __m128i e3 = _mm256_extracti128_si256(hi, 1);
+ _mm_storeu_si128((__m128i*)(ptr + stride * 0), e0);
+ _mm_storeu_si128((__m128i*)(ptr + stride * 1), e1);
+ _mm_storeu_si128((__m128i*)(ptr + stride * 2), e2);
+ _mm_storeu_si128((__m128i*)(ptr + stride * 3), e3);
+}
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); }
+
/*********************************
* Partial Load
*********************************/
@@ -159,14 +247,14 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
{
assert(nlane > 0);
const __m512i vfill = _mm512_set1_epi32(fill);
- const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
- const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr);
}
//// 64
@@ -174,14 +262,44 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
{
assert(nlane > 0);
const __m512i vfill = npyv_setall_s64(fill);
- const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
- const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
+}
+
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{
+ assert(nlane > 0);
+ const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
+ const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
+ return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
}
/*********************************
@@ -198,7 +316,7 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_
);
const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
const __m512i vfill = _mm512_set1_epi32(fill);
- const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4);
}
// fill zero to rest lanes
@@ -215,13 +333,48 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
4*stride, 5*stride, 6*stride, 7*stride
);
const __m512i vfill = npyv_setall_s64(fill);
- const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+    const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64
npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ const __m512i idx = npyv_set_s64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{
+ assert(nlane > 0);
+ const __m512i idx = npyv_set_s64(
+ 0, 1, stride, stride+1,
+ stride*2, stride*2+1, stride*3, stride*3+1
+ );
+ const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
+ const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
+ return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); }
+
/*********************************
* Partial store
*********************************/
@@ -229,14 +382,30 @@ npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
{
assert(nlane > 0);
- const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
_mm512_mask_storeu_epi32((__m512i*)ptr, mask, a);
}
//// 64
NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
{
assert(nlane > 0);
- const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a);
+}
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a);
+}
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
_mm512_mask_storeu_epi64((__m512i*)ptr, mask, a);
}
/*********************************
@@ -251,7 +420,7 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
);
const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
- const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
_mm512_mask_i32scatter_epi32((__m512i*)ptr, mask, idx, a, 4);
}
//// 64
@@ -262,7 +431,31 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
- const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8);
+}
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ const __m512i idx = npyv_set_s64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 4);
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ const __m512i idx = npyv_set_s64(
+ 0, 1, stride, stride+1,
+ 2*stride, 2*stride+1, 3*stride, 3*stride+1
+ );
+ const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
_mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8);
}
@@ -331,6 +524,110 @@ NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f32, s32)
NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u64, s64)
NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f64, s64)
+// 128-bit/64-bit stride (pair load/store)
+#define NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+ pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_AVX512_MEM_INTERLEAVE(SFX, ZSFX) \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr \
+ ) { \
+ return npyv_unzip_##ZSFX( \
+ npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+ ) { \
+ npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \
+ npyv_store_##SFX(ptr, zip.val[0]); \
+ npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \
+ }
+
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u8, u8)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s8, u8)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u16, u16)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s16, u16)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u32, u32)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s32, u32)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u64, u64)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s64, u64)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(f32, f32)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(f64, f64)
+
/**************************************************
* Lookup table
*************************************************/
diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h
index c0b2477f3..27e66b5e7 100644
--- a/numpy/core/src/common/simd/avx512/reorder.h
+++ b/numpy/core/src/common/simd/avx512/reorder.h
@@ -167,6 +167,140 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b)
return r;
}
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+ npyv_u8x2 r;
+#ifdef NPY_HAVE_AVX512VBMI
+ const __m512i idx_a = npyv_set_u8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94,
+ 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126
+ );
+ const __m512i idx_b = npyv_set_u8(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+ 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+ 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
+ );
+ r.val[0] = _mm512_permutex2var_epi8(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi8(ab0, idx_b, ab1);
+#else
+ #ifdef NPY_HAVE_AVX512BW
+ const __m512i idx = npyv_set_u8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m512i abl = _mm512_shuffle_epi8(ab0, idx);
+ __m512i abh = _mm512_shuffle_epi8(ab1, idx);
+ #else
+ const __m256i idx = _mm256_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx);
+ __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx);
+ __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx);
+ __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx);
+ __m512i abl = npyv512_combine_si256(abl_lo, abl_hi);
+ __m512i abh = npyv512_combine_si256(abh_lo, abh_hi);
+ #endif
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh);
+ r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh);
+#endif
+ return r;
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+ npyv_u16x2 r;
+#ifdef NPY_HAVE_AVX512BW
+ const __m512i idx_a = npyv_set_u16(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+ );
+ const __m512i idx_b = npyv_set_u16(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ );
+ r.val[0] = _mm512_permutex2var_epi16(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi16(ab0, idx_b, ab1);
+#else
+ const __m256i idx = _mm256_setr_epi8(
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15,
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+ );
+ __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx);
+ __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx);
+ __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx);
+ __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx);
+ __m512i abl = npyv512_combine_si256(abl_lo, abl_hi);
+ __m512i abh = npyv512_combine_si256(abh_lo, abh_hi);
+
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh);
+ r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh);
+#endif
+ return r;
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ const __m512i idx_a = npyv_set_u32(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ );
+ const __m512i idx_b = npyv_set_u32(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ );
+ npyv_u32x2 r;
+ r.val[0] = _mm512_permutex2var_epi32(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi32(ab0, idx_b, ab1);
+ return r;
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ npyv_u64x2 r;
+ r.val[0] = _mm512_permutex2var_epi64(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi64(ab0, idx_b, ab1);
+ return r;
+}
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+ const __m512i idx_a = npyv_set_u32(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ );
+ const __m512i idx_b = npyv_set_u32(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ );
+ npyv_f32x2 r;
+ r.val[0] = _mm512_permutex2var_ps(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_ps(ab0, idx_b, ab1);
+ return r;
+}
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ npyv_f64x2 r;
+ r.val[0] = _mm512_permutex2var_pd(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_pd(ab0, idx_b, ab1);
+ return r;
+}
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
{
@@ -223,4 +357,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
return _mm512_shuffle_ps(a, a, (_MM_PERM_ENUM)_MM_SHUFFLE(2, 3, 0, 1));
}
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ _mm512_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+ _mm512_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ _mm512_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+ _mm512_permute_pd(A, (((E1)<<7) | ((E0)<<6) | ((E1)<<5) | ((E0)<<4) | ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0)))
+
#endif // _NPY_SIMD_AVX512_REORDER_H
diff --git a/numpy/core/src/common/simd/emulate_maskop.h b/numpy/core/src/common/simd/emulate_maskop.h
index 2a808a153..6627e5010 100644
--- a/numpy/core/src/common/simd/emulate_maskop.h
+++ b/numpy/core/src/common/simd/emulate_maskop.h
@@ -42,5 +42,39 @@ NPYV_IMPL_EMULATE_MASK_ADDSUB(s64, b64)
#if NPY_SIMD_F64
NPYV_IMPL_EMULATE_MASK_ADDSUB(f64, b64)
#endif
+#if NPY_SIMD_F32
+ // conditional division, m ? a / b : c
+ NPY_FINLINE npyv_f32
+ npyv_ifdiv_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ {
+ const npyv_f32 one = npyv_setall_f32(1.0f);
+ npyv_f32 div = npyv_div_f32(a, npyv_select_f32(m, b, one));
+ return npyv_select_f32(m, div, c);
+ }
+ // conditional division, m ? a / b : 0
+ NPY_FINLINE npyv_f32
+ npyv_ifdivz_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b)
+ {
+ const npyv_f32 zero = npyv_zero_f32();
+ return npyv_ifdiv_f32(m, a, b, zero);
+ }
+#endif
+#if NPY_SIMD_F64
+ // conditional division, m ? a / b : c
+ NPY_FINLINE npyv_f64
+ npyv_ifdiv_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ {
+ const npyv_f64 one = npyv_setall_f64(1.0);
+ npyv_f64 div = npyv_div_f64(a, npyv_select_f64(m, b, one));
+ return npyv_select_f64(m, div, c);
+ }
+ // conditional division, m ? a / b : 0
+ NPY_FINLINE npyv_f64
+ npyv_ifdivz_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b)
+ {
+ const npyv_f64 zero = npyv_zero_f64();
+ return npyv_ifdiv_f64(m, a, b, zero);
+ }
+#endif
#endif // _NPY_SIMD_EMULATE_MASKOP_H
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 00994806d..683620111 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -265,6 +265,14 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
{ return vmlsq_f32(vnegq_f32(c), a, b); }
#endif
+// multiply, then add c to odd-indexed elements and subtract c from even-indexed elements.
+// (a * b) -+ c
+NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+{
+ const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f);
+ return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c));
+}
+
/***************************
* FUSED F64
***************************/
@@ -277,6 +285,11 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
{ return vfmsq_f64(c, a, b); }
NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
{ return vfmsq_f64(vnegq_f64(c), a, b); }
+ NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ {
+ const npyv_f64 msign = npyv_set_f64(-0.0, 0.0);
+ return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c));
+ }
#endif // NPY_SIMD_F64
/***************************
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
index c0a771b5d..58d14809f 100644
--- a/numpy/core/src/common/simd/neon/math.h
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -278,20 +278,25 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
return vrndnq_f32(a);
#else
// ARMv7 NEON only supports fp to int truncate conversion.
- // a magic trick of adding 1.5 * 2**23 is used for rounding
+ // a magic trick of adding 1.5 * 2^23 is used for rounding
// to nearest even and then subtract this magic number to get
// the integer.
- const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
- const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23
- npyv_f32 round = vsubq_f32(vaddq_f32(a, magic), magic);
- npyv_b32 overflow = vcleq_f32(vabsq_f32(a), vreinterpretq_f32_u32(vdupq_n_u32(0x4b000000)));
- round = vbslq_f32(overflow, round, a);
- // signed zero
- round = vreinterpretq_f32_s32(vorrq_s32(
- vreinterpretq_s32_f32(round),
- vandq_s32(vreinterpretq_s32_f32(a), szero)
- ));
- return round;
+ //
+ const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+ const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+ const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
+ const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
+ npyv_u32 nnan_mask = vceqq_f32(a, a);
+ // eliminate nans to avoid invalid fp errors
+ npyv_f32 abs_x = vabsq_f32(vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))));
+ // round by add magic number 1.5 * 2^23
+ npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+ // copysign
+ round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask ));
+ // a if |a| >= 2^23 or a == NaN
+ npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+ mask = vandq_u32(mask, nnan_mask);
+ return vbslq_f32(mask, round, a);
#endif
}
#if NPY_SIMD_F64
@@ -302,33 +307,30 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
#ifdef NPY_HAVE_ASIMD
#define npyv_ceil_f32 vrndpq_f32
#else
- NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
- {
- const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+ NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
+ {
const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
- const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
- /**
- * On armv7, vcvtq.f32 handles special cases as follows:
- * NaN return 0
- * +inf or +outrange return 0x80000000(-0.0f)
- * -inf or -outrange return 0x7fffffff(nan)
- */
- npyv_s32 roundi = vcvtq_s32_f32(a);
- npyv_f32 round = vcvtq_f32_s32(roundi);
+ const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+ const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+ const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
+ const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
+ npyv_u32 nnan_mask = vceqq_f32(a, a);
+ npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
+ // eliminate nans to avoid invalid fp errors
+ npyv_f32 abs_x = vabsq_f32(x);
+ // round by add magic number 1.5 * 2^23
+ npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+ // copysign
+ round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
npyv_f32 ceil = vaddq_f32(round, vreinterpretq_f32_u32(
- vandq_u32(vcltq_f32(round, a), one))
- );
- // respect signed zero, e.g. -0.5 -> -0.0
- npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
- vreinterpretq_s32_f32(ceil),
- vandq_s32(vreinterpretq_s32_f32(a), szero)
- ));
- // if nan or overflow return a
- npyv_u32 nnan = npyv_notnan_f32(a);
- npyv_u32 overflow = vorrq_u32(
- vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
+ vandq_u32(vcltq_f32(round, x), one))
);
- return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+ // respects signed zero
+ ceil = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(ceil), sign_mask));
+ // a if |a| >= 2^23 or a == NaN
+ npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+ mask = vandq_u32(mask, nnan_mask);
+ return vbslq_f32(mask, ceil, a);
}
#endif
#if NPY_SIMD_F64
@@ -339,29 +341,37 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
#ifdef NPY_HAVE_ASIMD
#define npyv_trunc_f32 vrndq_f32
#else
- NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
- {
- const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+ NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
+ {
const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
+ const npyv_u32 exp_mask = vdupq_n_u32(0xff000000);
+ const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+ const npyv_u32 sign_mask = vandq_u32(
+ vreinterpretq_u32_f32(a), vreinterpretq_u32_s32(szero));
+
+ npyv_u32 nfinite_mask = vshlq_n_u32(vreinterpretq_u32_f32(a), 1);
+ nfinite_mask = vandq_u32(nfinite_mask, exp_mask);
+ nfinite_mask = vceqq_u32(nfinite_mask, exp_mask);
+ // eliminate nans/inf to avoid invalid fp errors
+ npyv_f32 x = vreinterpretq_f32_u32(
+ veorq_u32(nfinite_mask, vreinterpretq_u32_f32(a)));
/**
* On armv7, vcvtq.f32 handles special cases as follows:
* NaN return 0
* +inf or +outrange return 0x80000000(-0.0f)
* -inf or -outrange return 0x7fffffff(nan)
*/
- npyv_s32 roundi = vcvtq_s32_f32(a);
- npyv_f32 round = vcvtq_f32_s32(roundi);
+ npyv_s32 trunci = vcvtq_s32_f32(x);
+ npyv_f32 trunc = vcvtq_f32_s32(trunci);
// respect signed zero, e.g. -0.5 -> -0.0
- npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
- vreinterpretq_s32_f32(round),
- vandq_s32(vreinterpretq_s32_f32(a), szero)
- ));
- // if nan or overflow return a
- npyv_u32 nnan = npyv_notnan_f32(a);
- npyv_u32 overflow = vorrq_u32(
- vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
+ trunc = vreinterpretq_f32_u32(
+ vorrq_u32(vreinterpretq_u32_f32(trunc), sign_mask));
+ // if overflow return a
+ npyv_u32 overflow_mask = vorrq_u32(
+ vceqq_s32(trunci, szero), vceqq_s32(trunci, max_int)
);
- return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+ // return a if it overflows or is nonfinite
+ return vbslq_f32(vorrq_u32(nfinite_mask, overflow_mask), a, trunc);
}
#endif
#if NPY_SIMD_F64
@@ -372,28 +382,31 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
#ifdef NPY_HAVE_ASIMD
#define npyv_floor_f32 vrndmq_f32
#else
- NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
- {
- const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+ NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
+ {
const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
- const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
+ const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+ const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+ const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
+ const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
- npyv_s32 roundi = vcvtq_s32_f32(a);
- npyv_f32 round = vcvtq_f32_s32(roundi);
+ npyv_u32 nnan_mask = vceqq_f32(a, a);
+ npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
+ // eliminate nans to avoid invalid fp errors
+ npyv_f32 abs_x = vabsq_f32(x);
+ // round by add magic number 1.5 * 2^23
+ npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+ // copysign
+ round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32(
- vandq_u32(vcgtq_f32(round, a), one)
- ));
- // respect signed zero
- npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
- vreinterpretq_s32_f32(floor),
- vandq_s32(vreinterpretq_s32_f32(a), szero)
+ vandq_u32(vcgtq_f32(round, x), one)
));
- npyv_u32 nnan = npyv_notnan_f32(a);
- npyv_u32 overflow = vorrq_u32(
- vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
- );
-
- return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+ // respects signed zero
+ floor = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(floor), sign_mask));
+ // a if |a| >= 2^23 or a == NaN
+ npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+ mask = vandq_u32(mask, nnan_mask);
+ return vbslq_f32(mask, floor, a);
}
#endif // NPY_HAVE_ASIMD
#if NPY_SIMD_F64
diff --git a/numpy/core/src/common/simd/neon/memory.h b/numpy/core/src/common/simd/neon/memory.h
index 7060ea628..6163440c3 100644
--- a/numpy/core/src/common/simd/neon/memory.h
+++ b/numpy/core/src/common/simd/neon/memory.h
@@ -102,6 +102,29 @@ NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
);
}
#endif
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ return vcombine_u32(
+ vld1_u32((const uint32_t*)ptr), vld1_u32((const uint32_t*)ptr + stride)
+ );
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return npyv_reinterpret_s32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); }
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{ return npyv_reinterpret_f32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_u64(ptr); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_s64(ptr); }
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_f64(ptr); }
+#endif
+
/***************************
* Non-contiguous Store
***************************/
@@ -131,6 +154,32 @@ NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
{ npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_f64(a)); }
#endif
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+#if NPY_SIMD_F64
+ vst1q_lane_u64((uint64_t*)ptr, npyv_reinterpret_u64_u32(a), 0);
+ vst1q_lane_u64((uint64_t*)(ptr + stride), npyv_reinterpret_u64_u32(a), 1);
+#else
+ // armhf is strict about alignment
+ vst1_u32((uint32_t*)ptr, vget_low_u32(a));
+ vst1_u32((uint32_t*)ptr + stride, vget_high_u32(a));
+#endif
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_s32(a)); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_f32(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ (void)stride; npyv_store_u64(ptr, a); }
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ (void)stride; npyv_store_s64(ptr, a); }
+#if NPY_SIMD_F64
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ (void)stride; npyv_store_f64(ptr, a); }
+#endif
/*********************************
* Partial Load
*********************************/
@@ -168,6 +217,29 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{ return npyv_load_till_s64(ptr, nlane, 0); }
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi};
+ return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill));
+ }
+ return npyv_load_s32(ptr);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return vreinterpretq_s32_s64(npyv_load_tillz_s64((const npy_int64*)ptr, nlane)); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ (void)nlane; return npyv_load_s64(ptr); }
+
/*********************************
* Non-contiguous partial load
*********************************/
@@ -206,6 +278,34 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi};
+ return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill));
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(0));
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); }
+
/*********************************
* Partial store
*********************************/
@@ -238,6 +338,30 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
}
npyv_store_s64(ptr, a);
}
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ // armhf is strict about alignment; a misaligned store may cause a bus error
+ #if NPY_SIMD_F64
+ vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0);
+ #else
+ npyv_storel_s32(ptr, a);
+ #endif
+ return;
+ }
+ npyv_store_s32(ptr, a);
+}
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0); (void)nlane;
+ npyv_store_s64(ptr, a);
+}
+
/*********************************
* Non-contiguous partial store
*********************************/
@@ -245,16 +369,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
{
assert(nlane > 0);
+ vst1q_lane_s32((int32_t*)ptr, a, 0);
switch(nlane) {
- default:
- vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3);
+ case 1:
+ return;
+ case 2:
+ vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
+ return;
case 3:
+ vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2);
- case 2:
+ return;
+ default:
vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
- case 1:
- vst1q_lane_s32((int32_t*)ptr, a, 0);
- break;
+ vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2);
+ vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3);
}
}
//// 64
@@ -268,6 +397,27 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
npyv_storen_s64(ptr, stride, a);
}
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+#if NPY_SIMD_F64
+ vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0);
+ if (nlane > 1) {
+ vst1q_lane_s64((int64_t*)(ptr + stride), npyv_reinterpret_s64_s32(a), 1);
+ }
+#else
+ npyv_storel_s32(ptr, a);
+ if (nlane > 1) {
+ npyv_storeh_s32(ptr + stride, a);
+ }
+#endif
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); }
+
/*****************************************************************
* Implement partial load/store for u32/f32/u64/f64... via casting
*****************************************************************/
@@ -278,7 +428,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
)); \
@@ -290,7 +441,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
)); \
@@ -332,6 +484,131 @@ NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u64, s64)
#if NPY_SIMD_F64
NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f64, s64)
#endif
+
+// 128-bit/64-bit stride
+#define NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+ pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(u64, s64)
+#if NPY_SIMD_F64
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(f64, s64)
+#endif
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_NEON_MEM_INTERLEAVE(SFX, T_PTR) \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr \
+ ) { \
+ return vld2q_##SFX((const T_PTR*)ptr); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+ ) { \
+ vst2q_##SFX((T_PTR*)ptr, v); \
+ }
+
+NPYV_IMPL_NEON_MEM_INTERLEAVE(u8, uint8_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(s8, int8_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(u16, uint16_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(s16, int16_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(u32, uint32_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(s32, int32_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(f32, float)
+
+#if NPY_SIMD_F64
+ NPYV_IMPL_NEON_MEM_INTERLEAVE(f64, double)
+ NPYV_IMPL_NEON_MEM_INTERLEAVE(u64, uint64_t)
+ NPYV_IMPL_NEON_MEM_INTERLEAVE(s64, int64_t)
+#else
+ #define NPYV_IMPL_NEON_MEM_INTERLEAVE_64(SFX) \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr) \
+ { \
+ npyv_##SFX a = npyv_load_##SFX(ptr); \
+ npyv_##SFX b = npyv_load_##SFX(ptr + 2); \
+ npyv_##SFX##x2 r; \
+ r.val[0] = vcombine_##SFX(vget_low_##SFX(a), vget_low_##SFX(b)); \
+ r.val[1] = vcombine_##SFX(vget_high_##SFX(a), vget_high_##SFX(b)); \
+ return r; \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v) \
+ { \
+ npyv_store_##SFX(ptr, vcombine_##SFX( \
+ vget_low_##SFX(v.val[0]), vget_low_##SFX(v.val[1]))); \
+ npyv_store_##SFX(ptr + 2, vcombine_##SFX( \
+ vget_high_##SFX(v.val[0]), vget_high_##SFX(v.val[1]))); \
+ }
+ NPYV_IMPL_NEON_MEM_INTERLEAVE_64(u64)
+ NPYV_IMPL_NEON_MEM_INTERLEAVE_64(s64)
+#endif
/*********************************
* Lookup table
*********************************/
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
index b08071527..49c35c415 100644
--- a/numpy/core/src/common/simd/neon/neon.h
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -16,6 +16,7 @@
#define NPY_SIMD_FMA3 0 // HW emulated
#endif
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 1
typedef uint8x16_t npyv_u8;
typedef int8x16_t npyv_s8;
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
index 249621bd6..e18ea94b8 100644
--- a/numpy/core/src/common/simd/neon/operators.h
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -240,10 +240,35 @@
// check special cases
NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
-{ return vceqq_f32(a, a); }
+{
+#if defined(__clang__)
+/**
+ * To avoid signaling an invalid FP exception on quiet NaNs, work around a
+ * clang bug with symmetric comparison inputs; check https://github.com/numpy/numpy/issues/22933,
+ * for more clarification.
+ */
+ npyv_b32 ret;
+ #if NPY_SIMD_F64
+ __asm("fcmeq %0.4s, %1.4s, %1.4s" : "=w" (ret) : "w" (a));
+ #else
+ __asm("vceq.f32 %q0, %q1, %q1" : "=w" (ret) : "w" (a));
+ #endif
+ return ret;
+#else
+ return vceqq_f32(a, a);
+#endif
+}
#if NPY_SIMD_F64
NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
- { return vceqq_f64(a, a); }
+ {
+ #if defined(__clang__)
+ npyv_b64 ret;
+ __asm("fcmeq %0.2d, %1.2d, %1.2d" : "=w" (ret) : "w" (a));
+ return ret;
+ #else
+ return vceqq_f64(a, a);
+ #endif
+ }
#endif
// Test cross all vector lanes
diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h
index 50b06ed11..8bf68f5be 100644
--- a/numpy/core/src/common/simd/neon/reorder.h
+++ b/numpy/core/src/common/simd/neon/reorder.h
@@ -76,36 +76,45 @@ NPYV_IMPL_NEON_COMBINE(npyv_f32, f32)
NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#endif
-// interleave two vectors
-#define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
- NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
- { \
- T_VEC##x2 r; \
- r.val[0] = vzip1q_##SFX(a, b); \
- r.val[1] = vzip2q_##SFX(a, b); \
- return r; \
- }
-
+// interleave & deinterleave two vectors
#ifdef __aarch64__
- NPYV_IMPL_NEON_ZIP(npyv_u8, u8)
- NPYV_IMPL_NEON_ZIP(npyv_s8, s8)
- NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
- NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
- NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
- NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
- NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
- NPYV_IMPL_NEON_ZIP(npyv_f64, f64)
+ #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
+ NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC##x2 r; \
+ r.val[0] = vzip1q_##SFX(a, b); \
+ r.val[1] = vzip2q_##SFX(a, b); \
+ return r; \
+ } \
+ NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC##x2 r; \
+ r.val[0] = vuzp1q_##SFX(a, b); \
+ r.val[1] = vuzp2q_##SFX(a, b); \
+ return r; \
+ }
#else
- #define npyv_zip_u8 vzipq_u8
- #define npyv_zip_s8 vzipq_s8
- #define npyv_zip_u16 vzipq_u16
- #define npyv_zip_s16 vzipq_s16
- #define npyv_zip_u32 vzipq_u32
- #define npyv_zip_s32 vzipq_s32
- #define npyv_zip_f32 vzipq_f32
+ #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
+ NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
+ { return vzipq_##SFX(a, b); } \
+ NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \
+ { return vuzpq_##SFX(a, b); }
#endif
+
+NPYV_IMPL_NEON_ZIP(npyv_u8, u8)
+NPYV_IMPL_NEON_ZIP(npyv_s8, s8)
+NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
+NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
+NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
+NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
+NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
+
#define npyv_zip_u64 npyv_combine_u64
#define npyv_zip_s64 npyv_combine_s64
+#define npyv_zip_f64 npyv_combine_f64
+#define npyv_unzip_u64 npyv_combine_u64
+#define npyv_unzip_s64 npyv_combine_s64
+#define npyv_unzip_f64 npyv_combine_f64
// Reverse elements of each 64-bit lane
#define npyv_rev64_u8 vrev64q_u8
@@ -116,4 +125,65 @@ NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#define npyv_rev64_s32 vrev64q_s32
#define npyv_rev64_f32 vrev64q_f32
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#ifdef __clang__
+ #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ __builtin_shufflevector(A, A, E0, E1, E2, E3)
+#elif defined(__GNUC__)
+ #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ __builtin_shuffle(A, npyv_set_u32(E0, E1, E2, E3))
+#else
+ #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ npyv_set_u32( \
+ vgetq_lane_u32(A, E0), vgetq_lane_u32(A, E1), \
+ vgetq_lane_u32(A, E2), vgetq_lane_u32(A, E3) \
+ )
+ #define npyv_permi128_s32(A, E0, E1, E2, E3) \
+ npyv_set_s32( \
+ vgetq_lane_s32(A, E0), vgetq_lane_s32(A, E1), \
+ vgetq_lane_s32(A, E2), vgetq_lane_s32(A, E3) \
+ )
+ #define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ npyv_set_f32( \
+ vgetq_lane_f32(A, E0), vgetq_lane_f32(A, E1), \
+ vgetq_lane_f32(A, E2), vgetq_lane_f32(A, E3) \
+ )
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+ #define npyv_permi128_s32 npyv_permi128_u32
+ #define npyv_permi128_f32 npyv_permi128_u32
+#endif
+
+#ifdef __clang__
+ #define npyv_permi128_u64(A, E0, E1) \
+ __builtin_shufflevector(A, A, E0, E1)
+#elif defined(__GNUC__)
+ #define npyv_permi128_u64(A, E0, E1) \
+ __builtin_shuffle(A, npyv_set_u64(E0, E1))
+#else
+ #define npyv_permi128_u64(A, E0, E1) \
+ npyv_set_u64( \
+ vgetq_lane_u64(A, E0), vgetq_lane_u64(A, E1) \
+ )
+ #define npyv_permi128_s64(A, E0, E1) \
+ npyv_set_s64( \
+ vgetq_lane_s64(A, E0), vgetq_lane_s64(A, E1) \
+ )
+ #define npyv_permi128_f64(A, E0, E1) \
+ npyv_set_f64( \
+ vgetq_lane_f64(A, E0), vgetq_lane_f64(A, E1) \
+ )
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+ #define npyv_permi128_s64 npyv_permi128_u64
+ #define npyv_permi128_f64 npyv_permi128_u64
+#endif
+
+#if !NPY_SIMD_F64
+ #undef npyv_permi128_f64
+#endif
+
#endif // _NPY_SIMD_NEON_REORDER_H
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index 92a77ad80..8c9b14251 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -82,6 +82,9 @@ typedef double npyv_lanetype_f64;
#define NPY_SIMD_FMA3 0
/// 1 if the enabled SIMD extension is running on big-endian mode otherwise 0.
#define NPY_SIMD_BIGENDIAN 0
+ /// 1 if the supported comparison intrinsics (lt, le, gt, ge)
+ /// raise an FP invalid exception for quiet NaNs.
+ #define NPY_SIMD_CMPSIGNAL 0
#endif
// enable emulated mask operations for all SIMD extension except for AVX512
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index bced35108..72a87eac1 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -282,6 +282,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
// negate multiply and subtract, -(a*b) - c
#define npyv_nmulsub_f32 _mm_fnmsub_ps
#define npyv_nmulsub_f64 _mm_fnmsub_pd
+ // multiply, add for odd elements and subtract even elements.
+ // (a * b) -+ c
+ #define npyv_muladdsub_f32 _mm_fmaddsub_ps
+ #define npyv_muladdsub_f64 _mm_fmaddsub_pd
#elif defined(NPY_HAVE_FMA4)
// multiply and add, a*b + c
#define npyv_muladd_f32 _mm_macc_ps
@@ -292,6 +296,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
// negate multiply and add, -(a*b) + c
#define npyv_nmuladd_f32 _mm_nmacc_ps
#define npyv_nmuladd_f64 _mm_nmacc_pd
+ // multiply, add for odd elements and subtract even elements.
+ // (a * b) -+ c
+ #define npyv_muladdsub_f32 _mm_maddsub_ps
+ #define npyv_muladdsub_f64 _mm_maddsub_pd
#else
// multiply and add, a*b + c
NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
@@ -308,6 +316,28 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
{ return npyv_sub_f32(c, npyv_mul_f32(a, b)); }
NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
{ return npyv_sub_f64(c, npyv_mul_f64(a, b)); }
+ // multiply, add for odd elements and subtract even elements.
+ // (a * b) -+ c
+ NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ {
+ npyv_f32 m = npyv_mul_f32(a, b);
+ #if NPY_HAVE_SSE3
+ return _mm_addsub_ps(m, c);
+ #else
+ const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f);
+ return npyv_add_f32(m, npyv_xor_f32(msign, c));
+ #endif
+ }
+ NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ {
+ npyv_f64 m = npyv_mul_f64(a, b);
+ #if NPY_HAVE_SSE3
+ return _mm_addsub_pd(m, c);
+ #else
+ const npyv_f64 msign = npyv_set_f64(-0.0, 0.0);
+ return npyv_add_f64(m, npyv_xor_f64(msign, c));
+ #endif
+ }
#endif // NPY_HAVE_FMA3
#ifndef NPY_HAVE_FMA3 // for FMA4 and NON-FMA3
// negate multiply and subtract, -(a*b) - c
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
index 967240d09..b51c935af 100644
--- a/numpy/core/src/common/simd/sse/math.h
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -269,13 +269,23 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
#ifdef NPY_HAVE_SSE41
return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
#else
- const npyv_f32 szero = _mm_set1_ps(-0.0f);
- __m128i roundi = _mm_cvtps_epi32(a);
- __m128i overflow = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
- __m128 r = _mm_cvtepi32_ps(roundi);
- // respect sign of zero
- r = _mm_or_ps(r, _mm_and_ps(a, szero));
- return npyv_select_f32(overflow, a, r);
+ const __m128 szero = _mm_set1_ps(-0.0f);
+ const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+ __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+ nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+ nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+ // eliminate nans/inf to avoid invalid fp errors
+ __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+ __m128i roundi = _mm_cvtps_epi32(x);
+ __m128 round = _mm_cvtepi32_ps(roundi);
+ // respect signed zero
+ round = _mm_or_ps(round, _mm_and_ps(a, szero));
+ // if overflow return a
+ __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
+ // a if a overflow or nonfinite
+ return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, round);
#endif
}
@@ -285,16 +295,20 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
#ifdef NPY_HAVE_SSE41
return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT);
#else
- const npyv_f64 szero = _mm_set1_pd(-0.0);
- const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
- npyv_f64 abs_a = npyv_abs_f64(a);
+ const __m128d szero = _mm_set1_pd(-0.0);
+ const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+ __m128d nan_mask = _mm_cmpunord_pd(a, a);
+ // eliminate nans to avoid invalid fp errors within cmpge
+ __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a));
// round by add magic number 2^52
// assuming that MXCSR register is set to rounding
- npyv_f64 abs_round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_a), two_power_52);
+ __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
// copysign
- npyv_f64 round = _mm_or_pd(abs_round, _mm_and_pd(a, szero));
- // a if a >= 2^52
- return npyv_select_f64(npyv_cmpge_f64(abs_a, two_power_52), a, round);
+ round = _mm_or_pd(round, _mm_and_pd(a, szero));
+ // a if |a| >= 2^52 or a == NaN
+ __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+ mask = _mm_or_pd(mask, nan_mask);
+ return npyv_select_f64(_mm_castpd_si128(mask), a, round);
#endif
}
// ceil
@@ -304,24 +318,48 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
#else
NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
{
- const npyv_f32 szero = _mm_set1_ps(-0.0f);
- const npyv_f32 one = _mm_set1_ps(1.0f);
- npyv_s32 roundi = _mm_cvttps_epi32(a);
- npyv_f32 round = _mm_cvtepi32_ps(roundi);
- npyv_f32 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, a), one));
- // respect signed zero, e.g. -0.5 -> -0.0
- npyv_f32 rzero = _mm_or_ps(ceil, _mm_and_ps(a, szero));
+ const __m128 one = _mm_set1_ps(1.0f);
+ const __m128 szero = _mm_set1_ps(-0.0f);
+ const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+ __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+ nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+ nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+ // eliminate nans/inf to avoid invalid fp errors
+ __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+ __m128i roundi = _mm_cvtps_epi32(x);
+ __m128 round = _mm_cvtepi32_ps(roundi);
+ __m128 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, x), one));
+ // respect signed zero
+ ceil = _mm_or_ps(ceil, _mm_and_ps(a, szero));
// if overflow return a
- return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero);
+ __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
+ // a if a overflow or nonfinite
+ return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, ceil);
}
NPY_FINLINE npyv_f64 npyv_ceil_f64(npyv_f64 a)
{
- const npyv_f64 szero = _mm_set1_pd(-0.0);
- const npyv_f64 one = _mm_set1_pd(1.0);
- npyv_f64 round = npyv_rint_f64(a);
- npyv_f64 ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, a), one));
- // respect signed zero, e.g. -0.5 -> -0.0
- return _mm_or_pd(ceil, _mm_and_pd(a, szero));
+ const __m128d one = _mm_set1_pd(1.0);
+ const __m128d szero = _mm_set1_pd(-0.0);
+ const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+ __m128d nan_mask = _mm_cmpunord_pd(a, a);
+ // eliminate nans to avoid invalid fp errors within cmpge
+ __m128d x = _mm_xor_pd(nan_mask, a);
+ __m128d abs_x = npyv_abs_f64(x);
+ __m128d sign_x = _mm_and_pd(x, szero);
+ // round by add magic number 2^52
+ // assuming that MXCSR register is set to rounding
+ __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
+ // copysign
+ round = _mm_or_pd(round, sign_x);
+ __m128d ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, x), one));
+ // respects sign of 0.0
+ ceil = _mm_or_pd(ceil, sign_x);
+ // a if |a| >= 2^52 or a == NaN
+ __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+ mask = _mm_or_pd(mask, nan_mask);
+ return npyv_select_f64(_mm_castpd_si128(mask), a, ceil);
}
#endif
@@ -332,26 +370,43 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
#else
NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
{
- const npyv_f32 szero = _mm_set1_ps(-0.0f);
- npyv_s32 roundi = _mm_cvttps_epi32(a);
- npyv_f32 trunc = _mm_cvtepi32_ps(roundi);
+ const __m128 szero = _mm_set1_ps(-0.0f);
+ const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+ __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+ nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+ nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+ // eliminate nans/inf to avoid invalid fp errors
+ __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+ __m128i trunci = _mm_cvttps_epi32(x);
+ __m128 trunc = _mm_cvtepi32_ps(trunci);
// respect signed zero, e.g. -0.5 -> -0.0
- npyv_f32 rzero = _mm_or_ps(trunc, _mm_and_ps(a, szero));
+ trunc = _mm_or_ps(trunc, _mm_and_ps(a, szero));
// if overflow return a
- return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero);
+ __m128i overflow_mask = _mm_cmpeq_epi32(trunci, _mm_castps_si128(szero));
+ // a if a overflow or nonfinite
+ return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, trunc);
}
NPY_FINLINE npyv_f64 npyv_trunc_f64(npyv_f64 a)
{
- const npyv_f64 szero = _mm_set1_pd(-0.0);
- const npyv_f64 one = _mm_set1_pd(1.0);
- const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
- npyv_f64 abs_a = npyv_abs_f64(a);
+ const __m128d one = _mm_set1_pd(1.0);
+ const __m128d szero = _mm_set1_pd(-0.0);
+ const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+ __m128d nan_mask = _mm_cmpunord_pd(a, a);
+ // eliminate nans to avoid invalid fp errors within cmpge
+ __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a));
// round by add magic number 2^52
- npyv_f64 abs_round = _mm_sub_pd(_mm_add_pd(abs_a, two_power_52), two_power_52);
- npyv_f64 subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_a), one);
- npyv_f64 trunc = _mm_or_pd(_mm_sub_pd(abs_round, subtrahend), _mm_and_pd(a, szero));
- // a if a >= 2^52
- return npyv_select_f64(npyv_cmpge_f64(abs_a, two_power_52), a, trunc);
+ // assuming that MXCSR register is set to rounding
+ __m128d abs_round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
+ __m128d subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_x), one);
+ __m128d trunc = _mm_sub_pd(abs_round, subtrahend);
+ // copysign
+ trunc = _mm_or_pd(trunc, _mm_and_pd(a, szero));
+ // a if |a| >= 2^52 or a == NaN
+ __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+ mask = _mm_or_pd(mask, nan_mask);
+ return npyv_select_f64(_mm_castpd_si128(mask), a, trunc);
}
#endif
@@ -362,15 +417,46 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
#else
NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
{
- const npyv_f32 one = _mm_set1_ps(1.0f);
- npyv_f32 round = npyv_rint_f32(a);
- return _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one));
+ const __m128 one = _mm_set1_ps(1.0f);
+ const __m128 szero = _mm_set1_ps(-0.0f);
+ const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+ __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+ nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+ nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+ // eliminate nans/inf to avoid invalid fp errors
+ __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+ __m128i roundi = _mm_cvtps_epi32(x);
+ __m128 round = _mm_cvtepi32_ps(roundi);
+ __m128 floor = _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, x), one));
+ // respect signed zero
+ floor = _mm_or_ps(floor, _mm_and_ps(a, szero));
+ // if overflow return a
+ __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
+ // a if a overflow or nonfinite
+ return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, floor);
}
NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a)
{
- const npyv_f64 one = _mm_set1_pd(1.0);
- npyv_f64 round = npyv_rint_f64(a);
- return _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, a), one));
+ const __m128d one = _mm_set1_pd(1.0f);
+ const __m128d szero = _mm_set1_pd(-0.0f);
+ const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+ __m128d nan_mask = _mm_cmpunord_pd(a, a);
+ // eliminate nans to avoid invalid fp errors within cmpge
+ __m128d x = _mm_xor_pd(nan_mask, a);
+ __m128d abs_x = npyv_abs_f64(x);
+ __m128d sign_x = _mm_and_pd(x, szero);
+ // round by add magic number 2^52
+ // assuming that MXCSR register is set to rounding
+ __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
+ // copysign
+ round = _mm_or_pd(round, sign_x);
+ __m128d floor = _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, x), one));
+ // a if |a| >= 2^52 or a == NaN
+ __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+ mask = _mm_or_pd(mask, nan_mask);
+ return npyv_select_f64(_mm_castpd_si128(mask), a, floor);
}
#endif // NPY_HAVE_SSE41
diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h
index 3ff64848d..4c8e86a6f 100644
--- a/numpy/core/src/common/simd/sse/memory.h
+++ b/numpy/core/src/common/simd/sse/memory.h
@@ -103,6 +103,28 @@ NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{
+ __m128d r = _mm_loadh_pd(
+ npyv_loadl_f64((const double*)ptr), (const double*)(ptr + stride)
+ );
+ return _mm_castpd_ps(r);
+}
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return _mm_castps_si128(npyv_loadn2_f32((const float*)ptr, stride)); }
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return _mm_castps_si128(npyv_loadn2_f32((const float*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_f64(ptr); }
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_u64(ptr); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_s64(ptr); }
+
/***************************
* Non-contiguous Store
***************************/
@@ -135,6 +157,24 @@ NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
{ npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); }
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ _mm_storel_pd((double*)ptr, _mm_castsi128_pd(a));
+ _mm_storeh_pd((double*)(ptr + stride), _mm_castsi128_pd(a));
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm_castps_si128(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ (void)stride; npyv_store_u64(ptr, a); }
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ (void)stride; npyv_store_s64(ptr, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ (void)stride; npyv_store_f64(ptr, a); }
/*********************************
* Partial Load
*********************************/
@@ -204,13 +244,14 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
return _mm_cvtsi32_si128(*ptr);
case 2:
return _mm_loadl_epi64((const __m128i*)ptr);
- case 3:;
- npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
- #ifdef NPY_HAVE_SSE41
- return _mm_insert_epi32(a, ptr[2], 2);
- #else
- return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
- #endif
+ case 3: {
+ npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
+ #ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[2], 2);
+ #else
+ return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
+ #endif
+ }
default:
return npyv_load_s32(ptr);
}
@@ -246,6 +287,32 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
}
return npyv_load_s64(ptr);
}
+
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ const __m128i vfill = npyv_set_s32(fill_lo, fill_hi, fill_lo, fill_hi);
+ return _mm_castpd_si128(
+ _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+ );
+ }
+ return npyv_load_s32(ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ (void)nlane; return npyv_load_s64(ptr); }
+
/*********************************
* Non-contiguous partial load
*********************************/
@@ -305,23 +372,27 @@ npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
case 1:
return _mm_cvtsi32_si128(ptr[0]);
case 2:;
- npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
-#ifdef NPY_HAVE_SSE41
- return _mm_insert_epi32(a, ptr[stride], 1);
-#else
- return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
-#endif // NPY_HAVE_SSE41
- case 3:;
- a = _mm_cvtsi32_si128(ptr[0]);
-#ifdef NPY_HAVE_SSE41
- a = _mm_insert_epi32(a, ptr[stride], 1);
- a = _mm_insert_epi32(a, ptr[stride*2], 2);
- return a;
-#else
- a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
- a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
- return a;
-#endif // NPY_HAVE_SSE41
+ {
+ npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+ #ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[stride], 1);
+ #else
+ return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+ #endif // NPY_HAVE_SSE41
+ }
+ case 3:
+ {
+ npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+ #ifdef NPY_HAVE_SSE41
+ a = _mm_insert_epi32(a, ptr[stride], 1);
+ a = _mm_insert_epi32(a, ptr[stride*2], 2);
+ return a;
+ #else
+ a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+ a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
+ return a;
+ #endif // NPY_HAVE_SSE41
+ }
default:
return npyv_loadn_s32(ptr, stride);
}
@@ -358,6 +429,37 @@ NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride,
}
return npyv_loadn_s64(ptr, stride);
}
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ const __m128i vfill = npyv_set_s32(0, 0, fill_lo, fill_hi);
+ return _mm_castpd_si128(
+ _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+ );
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return _mm_loadl_epi64((const __m128i*)ptr);
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); }
+
/*********************************
* Partial store
*********************************/
@@ -394,6 +496,17 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
}
npyv_store_s64(ptr, a);
}
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{ npyv_store_till_s64((npy_int64*)ptr, nlane, a); }
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0); (void)nlane;
+ npyv_store_s64(ptr, a);
+}
+
/*********************************
* Non-contiguous partial store
*********************************/
@@ -401,25 +514,35 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
{
assert(nlane > 0);
+ ptr[stride*0] = _mm_cvtsi128_si32(a);
switch(nlane) {
+ case 1:
+ return;
#ifdef NPY_HAVE_SSE41
- default:
- ptr[stride*3] = _mm_extract_epi32(a, 3);
+ case 2:
+ ptr[stride*1] = _mm_extract_epi32(a, 1);
+ return;
case 3:
+ ptr[stride*1] = _mm_extract_epi32(a, 1);
ptr[stride*2] = _mm_extract_epi32(a, 2);
- case 2:
+ return;
+ default:
ptr[stride*1] = _mm_extract_epi32(a, 1);
+ ptr[stride*2] = _mm_extract_epi32(a, 2);
+ ptr[stride*3] = _mm_extract_epi32(a, 3);
#else
- default:
- ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
+ case 2:
+ ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
+ return;
case 3:
+ ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
- case 2:
+ return;
+ default:
ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
+ ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
+ ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
#endif
- case 1:
- ptr[stride*0] = _mm_cvtsi128_si32(a);
- break;
}
}
//// 64
@@ -432,6 +555,21 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
}
npyv_storen_s64(ptr, stride, a);
}
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ npyv_storel_s32(ptr, a);
+ if (nlane > 1) {
+ npyv_storeh_s32(ptr + stride, a);
+ }
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); }
+
/*****************************************************************
* Implement partial load/store for u32/f32/u64/f64... via casting
*****************************************************************/
@@ -442,7 +580,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
)); \
@@ -454,7 +593,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
)); \
@@ -495,6 +635,110 @@ NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f32, s32)
NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u64, s64)
NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f64, s64)
+// 128-bit/64-bit stride
+#define NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+ pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_SSE_MEM_INTERLEAVE(SFX, ZSFX) \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr \
+ ) { \
+ return npyv_unzip_##ZSFX( \
+ npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+ ) { \
+ npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \
+ npyv_store_##SFX(ptr, zip.val[0]); \
+ npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \
+ }
+
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u8, u8)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s8, u8)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u16, u16)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s16, u16)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u32, u32)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s32, u32)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u64, u64)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s64, u64)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(f32, f32)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(f64, f64)
+
/*********************************
* Lookup table
*********************************/
diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h
index d96ab9c56..9a57f6489 100644
--- a/numpy/core/src/common/simd/sse/reorder.h
+++ b/numpy/core/src/common/simd/sse/reorder.h
@@ -81,6 +81,75 @@ NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64)
NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps)
NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd)
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m128i abl = _mm_shuffle_epi8(ab0, idx);
+ __m128i abh = _mm_shuffle_epi8(ab1, idx);
+ return npyv_combine_u8(abl, abh);
+#else
+ __m128i ab_083b = _mm_unpacklo_epi8(ab0, ab1);
+ __m128i ab_4c6e = _mm_unpackhi_epi8(ab0, ab1);
+ __m128i ab_048c = _mm_unpacklo_epi8(ab_083b, ab_4c6e);
+ __m128i ab_36be = _mm_unpackhi_epi8(ab_083b, ab_4c6e);
+ __m128i ab_0346 = _mm_unpacklo_epi8(ab_048c, ab_36be);
+ __m128i ab_8bc8 = _mm_unpackhi_epi8(ab_048c, ab_36be);
+ npyv_u8x2 r;
+ r.val[0] = _mm_unpacklo_epi8(ab_0346, ab_8bc8);
+ r.val[1] = _mm_unpackhi_epi8(ab_0346, ab_8bc8);
+ return r;
+#endif
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+ );
+ __m128i abl = _mm_shuffle_epi8(ab0, idx);
+ __m128i abh = _mm_shuffle_epi8(ab1, idx);
+ return npyv_combine_u16(abl, abh);
+#else
+ __m128i ab_0415 = _mm_unpacklo_epi16(ab0, ab1);
+ __m128i ab_263f = _mm_unpackhi_epi16(ab0, ab1);
+ __m128i ab_0246 = _mm_unpacklo_epi16(ab_0415, ab_263f);
+ __m128i ab_135f = _mm_unpackhi_epi16(ab_0415, ab_263f);
+ npyv_u16x2 r;
+ r.val[0] = _mm_unpacklo_epi16(ab_0246, ab_135f);
+ r.val[1] = _mm_unpackhi_epi16(ab_0246, ab_135f);
+ return r;
+#endif
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ __m128i abl = _mm_shuffle_epi32(ab0, _MM_SHUFFLE(3, 1, 2, 0));
+ __m128i abh = _mm_shuffle_epi32(ab1, _MM_SHUFFLE(3, 1, 2, 0));
+ return npyv_combine_u32(abl, abh);
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{ return npyv_combine_u64(ab0, ab1); }
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+ npyv_f32x2 r;
+ r.val[0] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(2, 0, 2, 0));
+ r.val[1] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(3, 1, 3, 1));
+ return r;
+}
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{ return npyv_combine_f64(ab0, ab1); }
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
{
@@ -122,4 +191,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
}
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ _mm_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+ _mm_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ _mm_shuffle_ps(A, A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+ _mm_shuffle_pd(A, A, _MM_SHUFFLE2(E1, E0))
+
#endif // _NPY_SIMD_SSE_REORDER_H
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index c21bbfda7..0c6b8cdba 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -12,6 +12,7 @@
#define NPY_SIMD_FMA3 0 // fast emulated
#endif
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 1
typedef __m128i npyv_u8;
typedef __m128i npyv_s8;
diff --git a/numpy/core/src/common/simd/vec/arithmetic.h b/numpy/core/src/common/simd/vec/arithmetic.h
index a2e9d07eb..85f4d6b26 100644
--- a/numpy/core/src/common/simd/vec/arithmetic.h
+++ b/numpy/core/src/common/simd/vec/arithmetic.h
@@ -322,6 +322,20 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
{ return vec_neg(vec_madd(a, b, c)); }
#endif
+// multiply, add for odd elements and subtract even elements.
+// (a * b) -+ c
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+{
+ const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f);
+ return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c));
+}
+#endif
+NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+{
+ const npyv_f64 msign = npyv_set_f64(-0.0, 0.0);
+ return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c));
+}
/***************************
* Summation
***************************/
@@ -352,6 +366,7 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
return vec_extract(sum, 0) + vec_extract(sum, 1);
+ (void)sum;
}
#endif
@@ -372,6 +387,7 @@ NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
npyv_u32 four = vec_sum4s(a, zero);
npyv_s32 one = vec_sums((npyv_s32)four, (npyv_s32)zero);
return (npy_uint16)vec_extract(one, 3);
+ (void)one;
#endif
}
@@ -386,6 +402,7 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
npyv_u32 four = vec_add(eight.val[0], eight.val[1]);
npyv_s32 one = vec_sums((npyv_s32)four, zero);
return (npy_uint32)vec_extract(one, 3);
+ (void)one;
#endif
}
diff --git a/numpy/core/src/common/simd/vec/conversion.h b/numpy/core/src/common/simd/vec/conversion.h
index f0d625c55..922109f7b 100644
--- a/numpy/core/src/common/simd/vec/conversion.h
+++ b/numpy/core/src/common/simd/vec/conversion.h
@@ -96,6 +96,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 4);
#endif
+ // to suppress an ambiguous warning: variable `r` set but not used [-Wunused-but-set-variable]
+ (void)r;
}
NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
{
@@ -106,6 +108,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 8);
#endif
+ // to suppress an ambiguous warning: variable `r` set but not used [-Wunused-but-set-variable]
+ (void)r;
}
NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
{
@@ -120,6 +124,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 8);
#endif
+ // to suppress an ambiguous warning: variable `r` set but not used [-Wunused-but-set-variable]
+ (void)r;
}
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
{
@@ -134,6 +140,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 8);
#endif
+ // to suppress an ambiguous warning: variable `r` set but not used [-Wunused-but-set-variable]
+ (void)r;
}
#else
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
@@ -170,7 +178,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#ifdef NPY_HAVE_VXE2
return vec_signed(a);
#elif defined(NPY_HAVE_VXE)
- return vec_packs(vec_signed(npyv_doublee(a)), vec_signed(npyv_doublee(vec_mergel(a, a))));
+ return vec_packs(vec_signed(npyv_doublee(vec_mergeh(a,a))),
+ vec_signed(npyv_doublee(vec_mergel(a, a))));
// VSX
#elif defined(__IBMC__)
return vec_cts(a, 0);
diff --git a/numpy/core/src/common/simd/vec/math.h b/numpy/core/src/common/simd/vec/math.h
index 95b16fdf7..85690f76c 100644
--- a/numpy/core/src/common/simd/vec/math.h
+++ b/numpy/core/src/common/simd/vec/math.h
@@ -186,6 +186,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int32, s32)
{ \
npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
return (npy_##STYPE)vec_extract(r, 0); \
+ (void)r; \
}
NPY_IMPL_VEC_REDUCE_MINMAX(min, uint64, u64)
NPY_IMPL_VEC_REDUCE_MINMAX(max, uint64, u64)
@@ -225,6 +226,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64)
{ \
npyv_f64 r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
return vec_extract(r, 0); \
+ (void)r; \
} \
NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
{ \
diff --git a/numpy/core/src/common/simd/vec/memory.h b/numpy/core/src/common/simd/vec/memory.h
index e8f588ef2..1ad39cead 100644
--- a/numpy/core/src/common/simd/vec/memory.h
+++ b/numpy/core/src/common/simd/vec/memory.h
@@ -46,14 +46,8 @@
#endif
// avoid aliasing rules
-#ifdef __cplusplus
- template<typename T_PTR>
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(const T_PTR *ptr)
- { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#else
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
- { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#endif // __cplusplus
+NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
+{ npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
// load lower part
NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr)
@@ -136,6 +130,24 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
{ return npyv_set_s64(ptr[0], ptr[stride]); }
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
{ return npyv_set_f64(ptr[0], ptr[stride]); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return (npyv_u32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); }
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return (npyv_s32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); }
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{ return (npyv_f32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); }
+#endif
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_u64(ptr); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_s64(ptr); }
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_f64(ptr); }
+
/***************************
* Non-contiguous Store
***************************/
@@ -164,6 +176,26 @@ NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
{ npyv_storen_u64((npy_uint64*)ptr, stride, (npyv_u64)a); }
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ *(npy_uint64*)ptr = vec_extract((npyv_u64)a, 0);
+ *(npy_uint64*)(ptr + stride) = vec_extract((npyv_u64)a, 1);
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+#if NPY_SIMD_F32
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+#endif
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ (void)stride; npyv_store_u64(ptr, a); }
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ (void)stride; npyv_store_s64(ptr, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ (void)stride; npyv_store_f64(ptr, a); }
+
/*********************************
* Partial Load
*********************************/
@@ -173,9 +205,9 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
assert(nlane > 0);
npyv_s32 vfill = npyv_setall_s32(fill);
#ifdef NPY_HAVE_VX
- const unsigned blane = (unsigned short)nlane;
+ const unsigned blane = (nlane > 4) ? 4 : nlane;
const npyv_u32 steps = npyv_set_u32(0, 1, 2, 3);
- const npyv_u32 vlane = npyv_setall_u32((unsigned)blane);
+ const npyv_u32 vlane = npyv_setall_u32(blane);
const npyv_b32 mask = vec_cmpgt(vlane, steps);
npyv_s32 a = vec_load_len(ptr, blane*4-1);
return vec_sel(vfill, a, mask);
@@ -201,8 +233,8 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
{
#ifdef NPY_HAVE_VX
- unsigned blane = ((unsigned short)nlane)*4 - 1;
- return vec_load_len(ptr, blane);
+ unsigned blane = (nlane > 4) ? 4 : nlane;
+ return vec_load_len(ptr, blane*4-1);
#else
return npyv_load_till_s32(ptr, nlane, 0);
#endif
@@ -220,12 +252,33 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
#ifdef NPY_HAVE_VX
- unsigned blane = (unsigned short)nlane;
+ unsigned blane = (nlane > 2) ? 2 : nlane;
return vec_load_len((const signed long long*)ptr, blane*8-1);
#else
return npyv_load_till_s64(ptr, nlane, 0);
#endif
}
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi);
+ }
+ return npyv_load_s32(ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return (npyv_s32)npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ (void)nlane; return npyv_load_s64(ptr); }
/*********************************
* Non-contiguous partial load
*********************************/
@@ -265,6 +318,34 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi);
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return (npyv_s32)npyv_set_s64(*(npy_int64*)ptr, 0);
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); }
+
/*********************************
* Partial store
*********************************/
@@ -273,7 +354,7 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a
{
assert(nlane > 0);
#ifdef NPY_HAVE_VX
- unsigned blane = (unsigned short)nlane;
+ unsigned blane = (nlane > 4) ? 4 : nlane;
vec_store_len(a, ptr, blane*4-1);
#else
switch(nlane) {
@@ -297,7 +378,7 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
{
assert(nlane > 0);
#ifdef NPY_HAVE_VX
- unsigned blane = (unsigned short)nlane;
+ unsigned blane = (nlane > 2) ? 2 : nlane;
vec_store_len(a, (signed long long*)ptr, blane*8-1);
#else
if (nlane == 1) {
@@ -307,6 +388,18 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
npyv_store_s64(ptr, a);
#endif
}
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{ npyv_store_till_s64((npy_int64*)ptr, nlane, (npyv_s64)a); }
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0); (void)nlane;
+ npyv_store_s64(ptr, a);
+}
+
/*********************************
* Non-contiguous partial store
*********************************/
@@ -314,16 +407,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
{
assert(nlane > 0);
+ ptr[stride*0] = vec_extract(a, 0);
switch(nlane) {
- default:
- ptr[stride*3] = vec_extract(a, 3);
- case 3:
- ptr[stride*2] = vec_extract(a, 2);
+ case 1:
+ return;
case 2:
ptr[stride*1] = vec_extract(a, 1);
- case 1:
- ptr[stride*0] = vec_extract(a, 0);
- break;
+ return;
+ case 3:
+ ptr[stride*1] = vec_extract(a, 1);
+ ptr[stride*2] = vec_extract(a, 2);
+ return;
+ default:
+ ptr[stride*1] = vec_extract(a, 1);
+ ptr[stride*2] = vec_extract(a, 2);
+ ptr[stride*3] = vec_extract(a, 3);
}
}
//// 64
@@ -336,6 +434,21 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
}
npyv_storen_s64(ptr, stride, a);
}
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ npyv_storel_s32(ptr, a);
+ if (nlane > 1) {
+ npyv_storeh_s32(ptr + stride, a);
+ }
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); }
+
/*****************************************************************
* Implement partial load/store for u32/f32/u64/f64... via casting
*****************************************************************/
@@ -346,7 +459,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
)); \
@@ -358,7 +472,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
)); \
@@ -401,6 +516,114 @@ NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f32, s32)
NPYV_IMPL_VEC_REST_PARTIAL_TYPES(u64, s64)
NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f64, s64)
+// 128-bit/64-bit stride
+#define NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+ pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(u32, s32)
+#if NPY_SIMD_F32
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(f32, s32)
+#endif
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_VEC_MEM_INTERLEAVE(SFX) \
+ NPY_FINLINE npyv_##SFX##x2 npyv_zip_##SFX(npyv_##SFX, npyv_##SFX); \
+ NPY_FINLINE npyv_##SFX##x2 npyv_unzip_##SFX(npyv_##SFX, npyv_##SFX); \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr \
+ ) { \
+ return npyv_unzip_##SFX( \
+ npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+ ) { \
+ npyv_##SFX##x2 zip = npyv_zip_##SFX(v.val[0], v.val[1]); \
+ npyv_store_##SFX(ptr, zip.val[0]); \
+ npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \
+ }
+
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u8)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s8)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u16)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s16)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u32)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s32)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u64)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s64)
+#if NPY_SIMD_F32
+NPYV_IMPL_VEC_MEM_INTERLEAVE(f32)
+#endif
+NPYV_IMPL_VEC_MEM_INTERLEAVE(f64)
+
/*********************************
* Lookup table
*********************************/
diff --git a/numpy/core/src/common/simd/vec/misc.h b/numpy/core/src/common/simd/vec/misc.h
index 7ea0f21f6..79c194d90 100644
--- a/numpy/core/src/common/simd/vec/misc.h
+++ b/numpy/core/src/common/simd/vec/misc.h
@@ -26,28 +26,28 @@
#define NPYV_IMPL_VEC_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V})
#define NPYV_IMPL_VEC_SPLTD(T_VEC, V) ((T_VEC){V, V})
-#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)VAL)
-#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)VAL)
-#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)VAL)
-#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)VAL)
-#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)VAL)
-#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)VAL)
+#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)(VAL))
+#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)(VAL))
+#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)(VAL))
+#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)(VAL))
+#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)(VAL))
+#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)(VAL))
#if NPY_SIMD_F32
- #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, VAL)
+ #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, (VAL))
#endif
-#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)VAL)
-#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)VAL)
+#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)(VAL))
+#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)(VAL))
#define npyv_setall_f64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_f64, VAL)
// vector with specific values set to each lane and
// set a specific value to all remained lanes
-#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
+#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(unsigned char, FILL, __VA_ARGS__)})
+#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(signed char, FILL, __VA_ARGS__)})
+#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(unsigned short, FILL, __VA_ARGS__)})
#define npyv_setf_s16(FILL, ...) ((npyv_s16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
-#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
+#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(unsigned int, FILL, __VA_ARGS__)})
#define npyv_setf_s32(FILL, ...) ((npyv_s32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
-#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
+#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_uint64, FILL, __VA_ARGS__)})
#define npyv_setf_s64(FILL, ...) ((npyv_s64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
#if NPY_SIMD_F32
#define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)})
diff --git a/numpy/core/src/common/simd/vec/reorder.h b/numpy/core/src/common/simd/vec/reorder.h
index b60b9287d..3910980a2 100644
--- a/numpy/core/src/common/simd/vec/reorder.h
+++ b/numpy/core/src/common/simd/vec/reorder.h
@@ -68,6 +68,85 @@ NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s64, s64)
#endif
NPYV_IMPL_VEC_COMBINE_ZIP(npyv_f64, f64)
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+ const npyv_u8 idx_even = npyv_set_u8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ );
+ const npyv_u8 idx_odd = npyv_set_u8(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ );
+ npyv_u8x2 r;
+ r.val[0] = vec_perm(ab0, ab1, idx_even);
+ r.val[1] = vec_perm(ab0, ab1, idx_odd);
+ return r;
+}
+NPY_FINLINE npyv_s8x2 npyv_unzip_s8(npyv_s8 ab0, npyv_s8 ab1)
+{
+ npyv_u8x2 ru = npyv_unzip_u8((npyv_u8)ab0, (npyv_u8)ab1);
+ npyv_s8x2 r;
+ r.val[0] = (npyv_s8)ru.val[0];
+ r.val[1] = (npyv_s8)ru.val[1];
+ return r;
+}
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+ const npyv_u8 idx_even = npyv_set_u8(
+ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+ );
+ const npyv_u8 idx_odd = npyv_set_u8(
+ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
+ );
+ npyv_u16x2 r;
+ r.val[0] = vec_perm(ab0, ab1, idx_even);
+ r.val[1] = vec_perm(ab0, ab1, idx_odd);
+ return r;
+}
+NPY_FINLINE npyv_s16x2 npyv_unzip_s16(npyv_s16 ab0, npyv_s16 ab1)
+{
+ npyv_u16x2 ru = npyv_unzip_u16((npyv_u16)ab0, (npyv_u16)ab1);
+ npyv_s16x2 r;
+ r.val[0] = (npyv_s16)ru.val[0];
+ r.val[1] = (npyv_s16)ru.val[1];
+ return r;
+}
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ npyv_u32 m0 = vec_mergeh(ab0, ab1);
+ npyv_u32 m1 = vec_mergel(ab0, ab1);
+ npyv_u32 r0 = vec_mergeh(m0, m1);
+ npyv_u32 r1 = vec_mergel(m0, m1);
+ npyv_u32x2 r;
+ r.val[0] = r0;
+ r.val[1] = r1;
+ return r;
+}
+NPY_FINLINE npyv_s32x2 npyv_unzip_s32(npyv_s32 ab0, npyv_s32 ab1)
+{
+ npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1);
+ npyv_s32x2 r;
+ r.val[0] = (npyv_s32)ru.val[0];
+ r.val[1] = (npyv_s32)ru.val[1];
+ return r;
+}
+#if NPY_SIMD_F32
+ NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+ {
+ npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1);
+ npyv_f32x2 r;
+ r.val[0] = (npyv_f32)ru.val[0];
+ r.val[1] = (npyv_f32)ru.val[1];
+ return r;
+ }
+#endif
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{ return npyv_combine_u64(ab0, ab1); }
+NPY_FINLINE npyv_s64x2 npyv_unzip_s64(npyv_s64 ab0, npyv_s64 ab1)
+{ return npyv_combine_s64(ab0, ab1); }
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{ return npyv_combine_f64(ab0, ab1); }
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
{
@@ -111,4 +190,24 @@ NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a)
{ return (npyv_f32)npyv_rev64_u32((npyv_u32)a); }
#endif
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ vec_perm(A, A, npyv_set_u8( \
+ (E0<<2), (E0<<2)+1, (E0<<2)+2, (E0<<2)+3, \
+ (E1<<2), (E1<<2)+1, (E1<<2)+2, (E1<<2)+3, \
+ (E2<<2), (E2<<2)+1, (E2<<2)+2, (E2<<2)+3, \
+ (E3<<2), (E3<<2)+1, (E3<<2)+2, (E3<<2)+3 \
+ ))
+#define npyv_permi128_s32 npyv_permi128_u32
+#define npyv_permi128_f32 npyv_permi128_u32
+
+#if defined(__IBMC__) || defined(vec_permi)
+ #define npyv_permi128_u64(A, E0, E1) vec_permi(A, A, ((E0)<<1) | (E1))
+#else
+ #define npyv_permi128_u64(A, E0, E1) vec_xxpermdi(A, A, ((E0)<<1) | (E1))
+#endif
+#define npyv_permi128_s64 npyv_permi128_u64
+#define npyv_permi128_f64 npyv_permi128_u64
+
#endif // _NPY_SIMD_VEC_REORDER_H
diff --git a/numpy/core/src/common/simd/vec/vec.h b/numpy/core/src/common/simd/vec/vec.h
index abcd33ce1..1d4508669 100644
--- a/numpy/core/src/common/simd/vec/vec.h
+++ b/numpy/core/src/common/simd/vec/vec.h
@@ -39,8 +39,10 @@
#ifdef NPY_HAVE_VX
#define NPY_SIMD_BIGENDIAN 1
+ #define NPY_SIMD_CMPSIGNAL 0
#else
#define NPY_SIMD_BIGENDIAN 0
+ #define NPY_SIMD_CMPSIGNAL 1
#endif
typedef __vector unsigned char npyv_u8;
diff --git a/numpy/core/src/common/ucsnarrow.h b/numpy/core/src/common/ucsnarrow.h
index 6fe157199..4b17a2809 100644
--- a/numpy/core/src/common/ucsnarrow.h
+++ b/numpy/core/src/common/ucsnarrow.h
@@ -2,6 +2,6 @@
#define NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_
NPY_NO_EXPORT PyUnicodeObject *
-PyUnicode_FromUCS4(char *src, Py_ssize_t size, int swap, int align);
+PyUnicode_FromUCS4(char const *src, Py_ssize_t size, int swap, int align);
#endif /* NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_ */
diff --git a/numpy/core/src/common/utils.hpp b/numpy/core/src/common/utils.hpp
new file mode 100644
index 000000000..f847cab44
--- /dev/null
+++ b/numpy/core/src/common/utils.hpp
@@ -0,0 +1,51 @@
+#ifndef NUMPY_CORE_SRC_COMMON_UTILS_HPP
+#define NUMPY_CORE_SRC_COMMON_UTILS_HPP
+
+#include "npdef.hpp"
+
+#if NP_HAS_CPP20
+ #include <bit>
+#endif
+
+#include <type_traits>
+#include <string.h>
+
+namespace np {
+
+/** Create a value of type `To` from the bits of `from`.
+ *
+ * similar to `std::bit_cast` but compatible with C++17,
+ * should perform similar to `*reinterpret_cast<To*>(&from)`
+ * or through punning without expecting any undefined behaviors.
+ */
+template<typename To, typename From>
+#if NP_HAS_BUILTIN(__builtin_bit_cast) || NP_HAS_CPP20
+[[nodiscard]] constexpr
+#else
+inline
+#endif
+To BitCast(const From &from) noexcept
+{
+ static_assert(
+ sizeof(To) == sizeof(From),
+ "both data types must have the same size");
+
+ static_assert(
+ std::is_trivially_copyable_v<To> &&
+ std::is_trivially_copyable_v<From>,
+ "both data types must be trivially copyable");
+
+#if NP_HAS_CPP20
+ return std::bit_cast<To>(from);
+#elif NP_HAS_BUILTIN(__builtin_bit_cast)
+ return __builtin_bit_cast(To, from);
+#else
+ To to;
+ memcpy(&to, &from, sizeof(from));
+ return to;
+#endif
+}
+
+} // namespace np
+#endif // NUMPY_CORE_SRC_COMMON_UTILS_HPP
+
diff --git a/numpy/core/src/multiarray/abstractdtypes.h b/numpy/core/src/multiarray/abstractdtypes.h
index b0850bd35..a3f6ceb05 100644
--- a/numpy/core/src/multiarray/abstractdtypes.h
+++ b/numpy/core/src/multiarray/abstractdtypes.h
@@ -58,7 +58,8 @@ npy_mark_tmp_array_if_pyscalar(
}
return 1;
}
- else if (PyComplex_Check(obj) && PyArray_TYPE(arr) == NPY_CDOUBLE) {
+ else if (PyComplex_Check(obj) && !PyArray_IsScalar(obj, CDouble)
+ && PyArray_TYPE(arr) == NPY_CDOUBLE) {
((PyArrayObject_fields *)arr)->flags |= NPY_ARRAY_WAS_PYTHON_COMPLEX;
if (dtype != NULL) {
Py_INCREF(&PyArray_PyComplexAbstractDType);
diff --git a/numpy/core/src/multiarray/argfunc.dispatch.c.src b/numpy/core/src/multiarray/argfunc.dispatch.c.src
index 1d7753275..d79be1df5 100644
--- a/numpy/core/src/multiarray/argfunc.dispatch.c.src
+++ b/numpy/core/src/multiarray/argfunc.dispatch.c.src
@@ -194,7 +194,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len)
npyv_@bsfx@ nnan_ab = npyv_and_@bsfx@(nnan_a, nnan_b);
npyv_@bsfx@ nnan_cd = npyv_and_@bsfx@(nnan_c, nnan_d);
npy_uint64 nnan = npyv_tobits_@bsfx@(npyv_and_@bsfx@(nnan_ab, nnan_cd));
- if (nnan != ((1LL << vstep) - 1)) {
+ if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
npy_uint64 nnan_4[4];
nnan_4[0] = npyv_tobits_@bsfx@(nnan_a);
nnan_4[1] = npyv_tobits_@bsfx@(nnan_b);
@@ -219,7 +219,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len)
#if @is_fp@
npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a);
npy_uint64 nnan = npyv_tobits_@bsfx@(nnan_a);
- if (nnan != ((1LL << vstep) - 1)) {
+ if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
for (int vi = 0; vi < vstep; ++vi) {
if (!((nnan >> vi) & 1)) {
return i + vi;
diff --git a/numpy/core/src/multiarray/array_assign_array.c b/numpy/core/src/multiarray/array_assign_array.c
index 9d5bf6875..7aee7d6e9 100644
--- a/numpy/core/src/multiarray/array_assign_array.c
+++ b/numpy/core/src/multiarray/array_assign_array.c
@@ -130,7 +130,7 @@ raw_array_assign_array(int ndim, npy_intp const *shape,
}
if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
- npy_clear_floatstatus_barrier(src_data);
+ npy_clear_floatstatus_barrier((char*)&src_data);
}
if (!(flags & NPY_METH_REQUIRES_PYAPI)) {
NPY_BEGIN_THREADS;
@@ -153,7 +153,7 @@ raw_array_assign_array(int ndim, npy_intp const *shape,
NPY_cast_info_xfree(&cast_info);
if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
- int fpes = npy_get_floatstatus_barrier(src_data);
+ int fpes = npy_get_floatstatus_barrier((char*)&src_data);
if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) {
return -1;
}
diff --git a/numpy/core/src/multiarray/array_coercion.c b/numpy/core/src/multiarray/array_coercion.c
index 13d6682f7..ba9118052 100644
--- a/numpy/core/src/multiarray/array_coercion.c
+++ b/numpy/core/src/multiarray/array_coercion.c
@@ -16,6 +16,7 @@
#include "common_dtype.h"
#include "dtypemeta.h"
+#include "npy_argparse.h"
#include "abstractdtypes.h"
#include "array_coercion.h"
#include "ctors.h"
@@ -873,42 +874,62 @@ find_descriptor_from_array(
/**
* Given a dtype or DType object, find the correct descriptor to cast the
- * array to.
+ * array to. In some places, this function is used with dtype=NULL which
+ * means that legacy behavior is used: The dtype instances "S0", "U0", and
+ * "V0" are converted to mean the DType classes instead.
+ * When dtype != NULL, this path is ignored, and the function does nothing
+ * unless descr == NULL. If both descr and dtype are null, it returns the
+ * descriptor for the array.
*
* This function is identical to normal casting using only the dtype, however,
* it supports inspecting the elements when the array has object dtype
* (and the given datatype describes a parametric DType class).
*
* @param arr
- * @param dtype A dtype instance or class.
+ * @param dtype NULL or a dtype class
+ * @param descr A dtype instance, if the dtype is NULL the dtype class is
+ * found and e.g. "S0" is converted to denote only String.
* @return A concrete dtype instance or NULL
*/
NPY_NO_EXPORT PyArray_Descr *
-PyArray_AdaptDescriptorToArray(PyArrayObject *arr, PyObject *dtype)
+PyArray_AdaptDescriptorToArray(
+ PyArrayObject *arr, PyArray_DTypeMeta *dtype, PyArray_Descr *descr)
{
/* If the requested dtype is flexible, adapt it */
- PyArray_Descr *new_dtype;
- PyArray_DTypeMeta *new_DType;
+ PyArray_Descr *new_descr;
int res;
- res = PyArray_ExtractDTypeAndDescriptor((PyObject *)dtype,
- &new_dtype, &new_DType);
- if (res < 0) {
- return NULL;
+ if (dtype != NULL && descr != NULL) {
+ /* descr was given and no special logic, return (call not necessary) */
+ Py_INCREF(descr);
+ return descr;
}
- if (new_dtype == NULL) {
- res = find_descriptor_from_array(arr, new_DType, &new_dtype);
+ if (dtype == NULL) {
+ res = PyArray_ExtractDTypeAndDescriptor(descr, &new_descr, &dtype);
if (res < 0) {
- Py_DECREF(new_DType);
return NULL;
}
- if (new_dtype == NULL) {
- /* This is an object array but contained no elements, use default */
- new_dtype = NPY_DT_CALL_default_descr(new_DType);
+ if (new_descr != NULL) {
+ Py_DECREF(dtype);
+ return new_descr;
}
}
- Py_DECREF(new_DType);
- return new_dtype;
+ else {
+ assert(descr == NULL); /* guaranteed above */
+ Py_INCREF(dtype);
+ }
+
+ res = find_descriptor_from_array(arr, dtype, &new_descr);
+ if (res < 0) {
+ Py_DECREF(dtype);
+ return NULL;
+ }
+ if (new_descr == NULL) {
+ /* This is an object array but contained no elements, use default */
+ new_descr = NPY_DT_CALL_default_descr(dtype);
+ }
+ Py_XDECREF(dtype);
+ return new_descr;
}
@@ -1004,53 +1025,6 @@ PyArray_DiscoverDTypeAndShape_Recursive(
Py_DECREF(arr);
arr = NULL;
}
- else if (curr_dims > 0 && curr_dims != max_dims) {
- /*
- * Deprecated 2020-12-09, NumPy 1.20
- *
- * See https://github.com/numpy/numpy/issues/17965
- * Shapely had objects which are not sequences but did export
- * the array-interface (and so are arguably array-like).
- * Previously numpy would not use array-like information during
- * shape discovery, so that it ended up acting as if this was
- * an (unknown) scalar but with the specified dtype.
- * Thus we ignore "scalars" here, as the value stored in the
- * array should be acceptable.
- */
- if (PyArray_NDIM(arr) > 0 && NPY_UNLIKELY(!PySequence_Check(obj))) {
- if (PyErr_WarnFormat(PyExc_FutureWarning, 1,
- "The input object of type '%s' is an array-like "
- "implementing one of the corresponding protocols "
- "(`__array__`, `__array_interface__` or "
- "`__array_struct__`); but not a sequence (or 0-D). "
- "In the future, this object will be coerced as if it "
- "was first converted using `np.array(obj)`. "
- "To retain the old behaviour, you have to either "
- "modify the type '%s', or assign to an empty array "
- "created with `np.empty(correct_shape, dtype=object)`.",
- Py_TYPE(obj)->tp_name, Py_TYPE(obj)->tp_name) < 0) {
- Py_DECREF(arr);
- return -1;
- }
- /*
- * Strangely enough, even though we threw away the result here,
- * we did use it during descriptor discovery, so promote it:
- */
- if (update_shape(curr_dims, &max_dims, out_shape,
- 0, NULL, NPY_FALSE, flags) < 0) {
- *flags |= FOUND_RAGGED_ARRAY;
- Py_DECREF(arr);
- return max_dims;
- }
- if (!(*flags & DESCRIPTOR_WAS_SET) && handle_promotion(
- out_descr, PyArray_DESCR(arr), fixed_DType, flags) < 0) {
- Py_DECREF(arr);
- return -1;
- }
- Py_DECREF(arr);
- return max_dims;
- }
- }
}
if (arr != NULL) {
/*
@@ -1260,7 +1234,9 @@ PyArray_DiscoverDTypeAndShape(
}
if (requested_descr != NULL) {
- assert(fixed_DType == NPY_DTYPE(requested_descr));
+ if (fixed_DType != NULL) {
+ assert(fixed_DType == NPY_DTYPE(requested_descr));
+ }
/* The output descriptor must be the input. */
Py_INCREF(requested_descr);
*out_descr = requested_descr;
@@ -1380,105 +1356,25 @@ PyArray_DiscoverDTypeAndShape(
}
-
-/**
- * Check the descriptor is a legacy "flexible" DType instance, this is
- * an instance which is (normally) not attached to an array, such as a string
- * of length 0 or a datetime with no unit.
- * These should be largely deprecated, and represent only the DType class
- * for most `dtype` parameters.
- *
- * TODO: This function should eventually receive a deprecation warning and
- * be removed.
- *
- * @param descr
- * @return 1 if this is not a concrete dtype instance 0 otherwise
- */
-static int
-descr_is_legacy_parametric_instance(PyArray_Descr *descr)
-{
- if (PyDataType_ISUNSIZED(descr)) {
- return 1;
- }
- /* Flexible descr with generic time unit (which can be adapted) */
- if (PyDataType_ISDATETIME(descr)) {
- PyArray_DatetimeMetaData *meta;
- meta = get_datetime_metadata_from_dtype(descr);
- if (meta->base == NPY_FR_GENERIC) {
- return 1;
- }
- }
- return 0;
-}
-
-
-/**
- * Given either a DType instance or class, (or legacy flexible instance),
- * ands sets output dtype instance and DType class. Both results may be
- * NULL, but if `out_descr` is set `out_DType` will always be the
- * corresponding class.
- *
- * @param dtype
- * @param out_descr
- * @param out_DType
- * @return 0 on success -1 on failure
- */
-NPY_NO_EXPORT int
-PyArray_ExtractDTypeAndDescriptor(PyObject *dtype,
- PyArray_Descr **out_descr, PyArray_DTypeMeta **out_DType)
-{
- *out_DType = NULL;
- *out_descr = NULL;
-
- if (dtype != NULL) {
- if (PyObject_TypeCheck(dtype, (PyTypeObject *)&PyArrayDTypeMeta_Type)) {
- assert(dtype != (PyObject * )&PyArrayDescr_Type); /* not np.dtype */
- *out_DType = (PyArray_DTypeMeta *)dtype;
- Py_INCREF(*out_DType);
- }
- else if (PyObject_TypeCheck((PyObject *)Py_TYPE(dtype),
- (PyTypeObject *)&PyArrayDTypeMeta_Type)) {
- *out_DType = NPY_DTYPE(dtype);
- Py_INCREF(*out_DType);
- if (!descr_is_legacy_parametric_instance((PyArray_Descr *)dtype)) {
- *out_descr = (PyArray_Descr *)dtype;
- Py_INCREF(*out_descr);
- }
- }
- else {
- PyErr_SetString(PyExc_TypeError,
- "dtype parameter must be a DType instance or class.");
- return -1;
- }
- }
- return 0;
-}
-
-
/*
* Python API function to expose the dtype+shape discovery functionality
* directly.
*/
NPY_NO_EXPORT PyObject *
_discover_array_parameters(PyObject *NPY_UNUSED(self),
- PyObject *args, PyObject *kwargs)
+ PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
{
- static char *kwlist[] = {"obj", "dtype", NULL};
-
PyObject *obj;
- PyObject *dtype = NULL;
- PyArray_Descr *fixed_descriptor = NULL;
- PyArray_DTypeMeta *fixed_DType = NULL;
+ npy_dtype_info dt_info = {NULL, NULL};
npy_intp shape[NPY_MAXDIMS];
- if (!PyArg_ParseTupleAndKeywords(
- args, kwargs, "O|O:_discover_array_parameters", kwlist,
- &obj, &dtype)) {
- return NULL;
- }
-
- if (PyArray_ExtractDTypeAndDescriptor(dtype,
- &fixed_descriptor, &fixed_DType) < 0) {
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments(
+ "_discover_array_parameters", args, len_args, kwnames,
+ "", NULL, &obj,
+ "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info,
+ NULL, NULL, NULL) < 0) {
+ /* fixed is last to parse, so never necessary to clean up */
return NULL;
}
@@ -1487,9 +1383,9 @@ _discover_array_parameters(PyObject *NPY_UNUSED(self),
int ndim = PyArray_DiscoverDTypeAndShape(
obj, NPY_MAXDIMS, shape,
&coercion_cache,
- fixed_DType, fixed_descriptor, (PyArray_Descr **)&out_dtype, 0);
- Py_XDECREF(fixed_DType);
- Py_XDECREF(fixed_descriptor);
+ dt_info.dtype, dt_info.descr, (PyArray_Descr **)&out_dtype, 0);
+ Py_XDECREF(dt_info.dtype);
+ Py_XDECREF(dt_info.descr);
if (ndim < 0) {
return NULL;
}
diff --git a/numpy/core/src/multiarray/array_coercion.h b/numpy/core/src/multiarray/array_coercion.h
index 63d543cf7..0757c1cb8 100644
--- a/numpy/core/src/multiarray/array_coercion.h
+++ b/numpy/core/src/multiarray/array_coercion.h
@@ -26,7 +26,8 @@ NPY_NO_EXPORT int
PyArray_Pack(PyArray_Descr *descr, char *item, PyObject *value);
NPY_NO_EXPORT PyArray_Descr *
-PyArray_AdaptDescriptorToArray(PyArrayObject *arr, PyObject *dtype);
+PyArray_AdaptDescriptorToArray(
+ PyArrayObject *arr, PyArray_DTypeMeta *dtype, PyArray_Descr *descr);
NPY_NO_EXPORT int
PyArray_DiscoverDTypeAndShape(
@@ -36,14 +37,9 @@ PyArray_DiscoverDTypeAndShape(
PyArray_DTypeMeta *fixed_DType, PyArray_Descr *requested_descr,
PyArray_Descr **out_descr, int never_copy);
-NPY_NO_EXPORT int
-PyArray_ExtractDTypeAndDescriptor(PyObject *dtype,
- PyArray_Descr **out_descr, PyArray_DTypeMeta **out_DType);
-
NPY_NO_EXPORT PyObject *
_discover_array_parameters(PyObject *NPY_UNUSED(self),
- PyObject *args, PyObject *kwargs);
-
+ PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames);
/* Would make sense to inline the freeing functions everywhere */
/* Frees the coercion cache object recursively. */
diff --git a/numpy/core/src/multiarray/array_method.c b/numpy/core/src/multiarray/array_method.c
index 79c588f25..87515bb03 100644
--- a/numpy/core/src/multiarray/array_method.c
+++ b/numpy/core/src/multiarray/array_method.c
@@ -27,15 +27,18 @@
* weak reference to the input DTypes.
*/
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _UMATHMODULE
#define _MULTIARRAYMODULE
#include <npy_pycompat.h>
#include "arrayobject.h"
+#include "array_coercion.h"
#include "array_method.h"
#include "dtypemeta.h"
#include "common_dtype.h"
#include "convert_datatype.h"
#include "common.h"
+#include "numpy/ufuncobject.h"
/*
@@ -248,6 +251,7 @@ fill_arraymethod_from_slots(
/* Set the defaults */
meth->get_strided_loop = &npy_default_get_strided_loop;
meth->resolve_descriptors = &default_resolve_descriptors;
+ meth->get_reduction_initial = NULL; /* no initial/identity by default */
/* Fill in the slots passed by the user */
/*
@@ -260,7 +264,7 @@ fill_arraymethod_from_slots(
case NPY_METH_resolve_descriptors:
meth->resolve_descriptors = slot->pfunc;
continue;
- case NPY_METH_get_loop:
+ case _NPY_METH_get_loop:
/*
* NOTE: get_loop is considered "unstable" in the public API,
* I do not like the signature, and the `move_references`
@@ -284,6 +288,12 @@ fill_arraymethod_from_slots(
case NPY_METH_unaligned_contiguous_loop:
meth->unaligned_contiguous_loop = slot->pfunc;
continue;
+ case NPY_METH_get_reduction_initial:
+ meth->get_reduction_initial = slot->pfunc;
+ continue;
+ case NPY_METH_contiguous_indexed_loop:
+ meth->contiguous_indexed_loop = slot->pfunc;
+ continue;
default:
break;
}
@@ -334,27 +344,39 @@ fill_arraymethod_from_slots(
}
/* Check whether the provided loops make sense. */
+ if (meth->flags & NPY_METH_SUPPORTS_UNALIGNED) {
+ if (meth->unaligned_strided_loop == NULL) {
+ PyErr_Format(PyExc_TypeError,
+ "Must provide unaligned strided inner loop when using "
+ "NPY_METH_SUPPORTS_UNALIGNED flag (in method: %s)",
+ spec->name);
+ return -1;
+ }
+ }
+ else {
+ if (meth->unaligned_strided_loop != NULL) {
+ PyErr_Format(PyExc_TypeError,
+ "Must not provide unaligned strided inner loop when not "
+ "using NPY_METH_SUPPORTS_UNALIGNED flag (in method: %s)",
+ spec->name);
+ return -1;
+ }
+ }
+ /* Fill in the blanks: */
+ if (meth->unaligned_contiguous_loop == NULL) {
+ meth->unaligned_contiguous_loop = meth->unaligned_strided_loop;
+ }
if (meth->strided_loop == NULL) {
- PyErr_Format(PyExc_TypeError,
- "Must provide a strided inner loop function. (method: %s)",
- spec->name);
- return -1;
+ meth->strided_loop = meth->unaligned_strided_loop;
}
if (meth->contiguous_loop == NULL) {
meth->contiguous_loop = meth->strided_loop;
}
- if (meth->unaligned_contiguous_loop != NULL &&
- meth->unaligned_strided_loop == NULL) {
- PyErr_Format(PyExc_TypeError,
- "Must provide unaligned strided inner loop when providing "
- "a contiguous version. (method: %s)", spec->name);
- return -1;
- }
- if ((meth->unaligned_strided_loop == NULL) !=
- !(meth->flags & NPY_METH_SUPPORTS_UNALIGNED)) {
+
+ if (meth->strided_loop == NULL) {
PyErr_Format(PyExc_TypeError,
- "Must provide unaligned strided inner loop when providing "
- "a contiguous version. (method: %s)", spec->name);
+ "Must provide a strided inner loop function. (method: %s)",
+ spec->name);
return -1;
}
@@ -883,7 +905,7 @@ generic_masked_strided_loop(PyArrayMethod_Context *context,
/*
* Fetches a strided-loop function that supports a boolean mask as additional
* (last) operand to the strided-loop. It is otherwise largely identical to
- * the `get_loop` method which it wraps.
+ * the `get_strided_loop` method which it wraps.
* This is the core implementation for the ufunc `where=...` keyword argument.
*
* NOTE: This function does not support `move_references` or inner dimensions.
diff --git a/numpy/core/src/multiarray/array_method.h b/numpy/core/src/multiarray/array_method.h
index c9ec8903d..c82a968cd 100644
--- a/numpy/core/src/multiarray/array_method.h
+++ b/numpy/core/src/multiarray/array_method.h
@@ -11,40 +11,7 @@
extern "C" {
#endif
-typedef enum {
- /* Flag for whether the GIL is required */
- NPY_METH_REQUIRES_PYAPI = 1 << 1,
- /*
- * Some functions cannot set floating point error flags, this flag
- * gives us the option (not requirement) to skip floating point error
- * setup/check. No function should set error flags and ignore them
- * since it would interfere with chaining operations (e.g. casting).
- */
- /*
- * TODO: Change this into a positive flag? That would make "combing"
- * multiple methods easier. OTOH, if we add more flags, the default
- * would be 0 just like it is here.
- */
- NPY_METH_NO_FLOATINGPOINT_ERRORS = 1 << 2,
- /* Whether the method supports unaligned access (not runtime) */
- NPY_METH_SUPPORTS_UNALIGNED = 1 << 3,
- /*
- * Private flag for now for *logic* functions. The logical functions
- * `logical_or` and `logical_and` can always cast the inputs to booleans
- * "safely" (because that is how the cast to bool is defined).
- * @seberg: I am not sure this is the best way to handle this, so its
- * private for now (also it is very limited anyway).
- * There is one "exception". NA aware dtypes cannot cast to bool
- * (hopefully), so the `??->?` loop should error even with this flag.
- * But a second NA fallback loop will be necessary.
- */
- _NPY_METH_FORCE_CAST_INPUTS = 1 << 17,
-
- /* All flags which can change at runtime */
- NPY_METH_RUNTIME_FLAGS = (
- NPY_METH_REQUIRES_PYAPI |
- NPY_METH_NO_FLOATINGPOINT_ERRORS),
-} NPY_ARRAYMETHOD_FLAGS;
+#include "numpy/_dtype_api.h"
/*
@@ -60,119 +27,6 @@ typedef enum {
((flags1 | flags2) & ~PyArrayMethod_MINIMAL_FLAGS) \
| (flags1 & flags2)))
-
-struct PyArrayMethodObject_tag;
-
-/*
- * This struct is specific to an individual (possibly repeated) call of
- * the ArrayMethods strided operator, and as such is passed into the various
- * methods of the ArrayMethod object (the resolve_descriptors function,
- * the get_loop function and the individual lowlevel strided operator calls).
- * It thus has to be persistent for one end-user call, and then be discarded.
- *
- * TODO: Before making this public, we should review which information should
- * be stored on the Context/BoundArrayMethod vs. the ArrayMethod.
- */
-typedef struct {
- PyObject *caller; /* E.g. the original ufunc, may be NULL */
- struct PyArrayMethodObject_tag *method;
-
- /* Operand descriptors, filled in by resolve_descriptors */
- PyArray_Descr **descriptors;
-} PyArrayMethod_Context;
-
-
-typedef int (PyArrayMethod_StridedLoop)(PyArrayMethod_Context *context,
- char *const *data, const npy_intp *dimensions, const npy_intp *strides,
- NpyAuxData *transferdata);
-
-
-typedef NPY_CASTING (resolve_descriptors_function)(
- struct PyArrayMethodObject_tag *method,
- PyArray_DTypeMeta **dtypes,
- PyArray_Descr **given_descrs,
- PyArray_Descr **loop_descrs,
- npy_intp *view_offset);
-
-
-typedef int (get_loop_function)(
- PyArrayMethod_Context *context,
- int aligned, int move_references,
- const npy_intp *strides,
- PyArrayMethod_StridedLoop **out_loop,
- NpyAuxData **out_transferdata,
- NPY_ARRAYMETHOD_FLAGS *flags);
-
-
-/*
- * The following functions are only used be the wrapping array method defined
- * in umath/wrapping_array_method.c
- */
-
-/*
- * The function to convert the given descriptors (passed in to
- * `resolve_descriptors`) and translates them for the wrapped loop.
- * The new descriptors MUST be viewable with the old ones, `NULL` must be
- * supported (for outputs) and should normally be forwarded.
- *
- * The function must clean up on error.
- *
- * NOTE: We currently assume that this translation gives "viewable" results.
- * I.e. there is no additional casting related to the wrapping process.
- * In principle that could be supported, but not sure it is useful.
- * This currently also means that e.g. alignment must apply identically
- * to the new dtypes.
- *
- * TODO: Due to the fact that `resolve_descriptors` is also used for `can_cast`
- * there is no way to "pass out" the result of this function. This means
- * it will be called twice for every ufunc call.
- * (I am considering including `auxdata` as an "optional" parameter to
- * `resolve_descriptors`, so that it can be filled there if not NULL.)
- */
-typedef int translate_given_descrs_func(int nin, int nout,
- PyArray_DTypeMeta *wrapped_dtypes[],
- PyArray_Descr *given_descrs[], PyArray_Descr *new_descrs[]);
-
-/**
- * The function to convert the actual loop descriptors (as returned by the
- * original `resolve_descriptors` function) to the ones the output array
- * should use.
- * This function must return "viewable" types, it must not mutate them in any
- * form that would break the inner-loop logic. Does not need to support NULL.
- *
- * The function must clean up on error.
- *
- * @param nargs Number of arguments
- * @param new_dtypes The DTypes of the output (usually probably not needed)
- * @param given_descrs Original given_descrs to the resolver, necessary to
- * fetch any information related to the new dtypes from the original.
- * @param original_descrs The `loop_descrs` returned by the wrapped loop.
- * @param loop_descrs The output descriptors, compatible to `original_descrs`.
- *
- * @returns 0 on success, -1 on failure.
- */
-typedef int translate_loop_descrs_func(int nin, int nout,
- PyArray_DTypeMeta *new_dtypes[], PyArray_Descr *given_descrs[],
- PyArray_Descr *original_descrs[], PyArray_Descr *loop_descrs[]);
-
-
-/*
- * This struct will be public and necessary for creating a new ArrayMethod
- * object (casting and ufuncs).
- * We could version the struct, although since we allow passing arbitrary
- * data using the slots, and have flags, that may be enough?
- * (See also PyBoundArrayMethodObject.)
- */
-typedef struct {
- const char *name;
- int nin, nout;
- NPY_CASTING casting;
- NPY_ARRAYMETHOD_FLAGS flags;
- PyArray_DTypeMeta **dtypes;
- PyType_Slot *slots;
-} PyArrayMethod_Spec;
-
-
/*
* Structure of the ArrayMethod. This structure should probably not be made
* public. If necessary, we can make certain operations on it public
@@ -193,16 +47,20 @@ typedef struct PyArrayMethodObject_tag {
NPY_ARRAYMETHOD_FLAGS flags;
resolve_descriptors_function *resolve_descriptors;
get_loop_function *get_strided_loop;
+ get_reduction_initial_function *get_reduction_initial;
/* Typical loop functions (contiguous ones are used in current casts) */
PyArrayMethod_StridedLoop *strided_loop;
PyArrayMethod_StridedLoop *contiguous_loop;
PyArrayMethod_StridedLoop *unaligned_strided_loop;
PyArrayMethod_StridedLoop *unaligned_contiguous_loop;
+ PyArrayMethod_StridedLoop *contiguous_indexed_loop;
/* Chunk only used for wrapping array method defined in umath */
struct PyArrayMethodObject_tag *wrapped_meth;
PyArray_DTypeMeta **wrapped_dtypes;
translate_given_descrs_func *translate_given_descrs;
translate_loop_descrs_func *translate_loop_descrs;
+ /* Chunk reserved for use by the legacy fallback arraymethod */
+ char legacy_initial[sizeof(npy_clongdouble)]; /* initial value storage */
} PyArrayMethodObject;
@@ -226,18 +84,6 @@ typedef struct {
extern NPY_NO_EXPORT PyTypeObject PyArrayMethod_Type;
extern NPY_NO_EXPORT PyTypeObject PyBoundArrayMethod_Type;
-/*
- * SLOTS IDs For the ArrayMethod creation, one public, the IDs are fixed.
- * TODO: Before making it public, consider adding a large constant to private
- * slots.
- */
-#define NPY_METH_resolve_descriptors 1
-#define NPY_METH_get_loop 2
-#define NPY_METH_strided_loop 3
-#define NPY_METH_contiguous_loop 4
-#define NPY_METH_unaligned_strided_loop 5
-#define NPY_METH_unaligned_contiguous_loop 6
-
/*
* Used internally (initially) for real to complex loops only
diff --git a/numpy/core/src/multiarray/arrayfunction_override.c b/numpy/core/src/multiarray/arrayfunction_override.c
index 2bb3fbe28..3c55e2164 100644
--- a/numpy/core/src/multiarray/arrayfunction_override.c
+++ b/numpy/core/src/multiarray/arrayfunction_override.c
@@ -1,11 +1,15 @@
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
+#include <Python.h>
+#include "structmember.h"
+
#include "npy_pycompat.h"
#include "get_attr_string.h"
#include "npy_import.h"
#include "multiarraymodule.h"
+#include "arrayfunction_override.h"
/* Return the ndarray.__array_function__ method. */
static PyObject *
@@ -200,183 +204,67 @@ call_array_function(PyObject* argument, PyObject* method,
}
-/**
- * Internal handler for the array-function dispatching. The helper returns
- * either the result, or NotImplemented (as a borrowed reference).
- *
- * @param public_api The public API symbol used for dispatching
- * @param relevant_args Arguments which may implement __array_function__
- * @param args Original arguments
- * @param kwargs Original keyword arguments
- *
- * @returns The result of the dispatched version, or a borrowed reference
- * to NotImplemented to indicate the default implementation should
- * be used.
+
+/*
+ * Helper to convert from vectorcall convention, since the protocol requires
+ * args and kwargs to be passed as tuple and dict explicitly.
+ * We always pass a dict, so always returns it.
*/
-static PyObject *
-array_implement_array_function_internal(
- PyObject *public_api, PyObject *relevant_args,
- PyObject *args, PyObject *kwargs)
+static int
+get_args_and_kwargs(
+ PyObject *const *fast_args, Py_ssize_t len_args, PyObject *kwnames,
+ PyObject **out_args, PyObject **out_kwargs)
{
- PyObject *implementing_args[NPY_MAXARGS];
- PyObject *array_function_methods[NPY_MAXARGS];
- PyObject *types = NULL;
+ len_args = PyVectorcall_NARGS(len_args);
+ PyObject *args = PyTuple_New(len_args);
+ PyObject *kwargs = NULL;
- PyObject *result = NULL;
-
- static PyObject *errmsg_formatter = NULL;
-
- relevant_args = PySequence_Fast(
- relevant_args,
- "dispatcher for __array_function__ did not return an iterable");
- if (relevant_args == NULL) {
- return NULL;
+ if (args == NULL) {
+ return -1;
}
-
- /* Collect __array_function__ implementations */
- int num_implementing_args = get_implementing_args_and_methods(
- relevant_args, implementing_args, array_function_methods);
- if (num_implementing_args == -1) {
- goto cleanup;
+ for (Py_ssize_t i = 0; i < len_args; i++) {
+ Py_INCREF(fast_args[i]);
+ PyTuple_SET_ITEM(args, i, fast_args[i]);
}
-
- /*
- * Handle the typical case of no overrides. This is merely an optimization
- * if some arguments are ndarray objects, but is also necessary if no
- * arguments implement __array_function__ at all (e.g., if they are all
- * built-in types).
- */
- int any_overrides = 0;
- for (int j = 0; j < num_implementing_args; j++) {
- if (!is_default_array_function(array_function_methods[j])) {
- any_overrides = 1;
- break;
- }
+ kwargs = PyDict_New();
+ if (kwargs == NULL) {
+ Py_DECREF(args);
+ return -1;
}
- if (!any_overrides) {
- /*
- * When the default implementation should be called, return
- * `Py_NotImplemented` to indicate this.
- */
- result = Py_NotImplemented;
- goto cleanup;
- }
-
- /*
- * Create a Python object for types.
- * We use a tuple, because it's the fastest Python collection to create
- * and has the bonus of being immutable.
- */
- types = PyTuple_New(num_implementing_args);
- if (types == NULL) {
- goto cleanup;
- }
- for (int j = 0; j < num_implementing_args; j++) {
- PyObject *arg_type = (PyObject *)Py_TYPE(implementing_args[j]);
- Py_INCREF(arg_type);
- PyTuple_SET_ITEM(types, j, arg_type);
- }
-
- /* Call __array_function__ methods */
- for (int j = 0; j < num_implementing_args; j++) {
- PyObject *argument = implementing_args[j];
- PyObject *method = array_function_methods[j];
-
- /*
- * We use `public_api` instead of `implementation` here so
- * __array_function__ implementations can do equality/identity
- * comparisons.
- */
- result = call_array_function(
- argument, method, public_api, types, args, kwargs);
-
- if (result == Py_NotImplemented) {
- /* Try the next one */
- Py_DECREF(result);
- result = NULL;
- }
- else {
- /* Either a good result, or an exception was raised. */
- goto cleanup;
+ if (kwnames != NULL) {
+ Py_ssize_t nkwargs = PyTuple_GET_SIZE(kwnames);
+ for (Py_ssize_t i = 0; i < nkwargs; i++) {
+ PyObject *key = PyTuple_GET_ITEM(kwnames, i);
+ PyObject *value = fast_args[i+len_args];
+ if (PyDict_SetItem(kwargs, key, value) < 0) {
+ Py_DECREF(args);
+ Py_DECREF(kwargs);
+ return -1;
+ }
}
}
+ *out_args = args;
+ *out_kwargs = kwargs;
+ return 0;
+}
+
+static void
+set_no_matching_types_error(PyObject *public_api, PyObject *types)
+{
+ static PyObject *errmsg_formatter = NULL;
/* No acceptable override found, raise TypeError. */
npy_cache_import("numpy.core._internal",
"array_function_errmsg_formatter",
&errmsg_formatter);
if (errmsg_formatter != NULL) {
PyObject *errmsg = PyObject_CallFunctionObjArgs(
- errmsg_formatter, public_api, types, NULL);
+ errmsg_formatter, public_api, types, NULL);
if (errmsg != NULL) {
PyErr_SetObject(PyExc_TypeError, errmsg);
Py_DECREF(errmsg);
}
}
-
-cleanup:
- for (int j = 0; j < num_implementing_args; j++) {
- Py_DECREF(implementing_args[j]);
- Py_DECREF(array_function_methods[j]);
- }
- Py_XDECREF(types);
- Py_DECREF(relevant_args);
- return result;
-}
-
-
-/*
- * Implements the __array_function__ protocol for a Python function, as described in
- * in NEP-18. See numpy.core.overrides for a full docstring.
- */
-NPY_NO_EXPORT PyObject *
-array_implement_array_function(
- PyObject *NPY_UNUSED(dummy), PyObject *positional_args)
-{
- PyObject *res, *implementation, *public_api, *relevant_args, *args, *kwargs;
-
- if (!PyArg_UnpackTuple(
- positional_args, "implement_array_function", 5, 5,
- &implementation, &public_api, &relevant_args, &args, &kwargs)) {
- return NULL;
- }
-
- /*
- * Remove `like=` kwarg, which is NumPy-exclusive and thus not present
- * in downstream libraries. If `like=` is specified but doesn't
- * implement `__array_function__`, raise a `TypeError`.
- */
- if (kwargs != NULL && PyDict_Contains(kwargs, npy_ma_str_like)) {
- PyObject *like_arg = PyDict_GetItem(kwargs, npy_ma_str_like);
- if (like_arg != NULL) {
- PyObject *tmp_has_override = get_array_function(like_arg);
- if (tmp_has_override == NULL) {
- return PyErr_Format(PyExc_TypeError,
- "The `like` argument must be an array-like that "
- "implements the `__array_function__` protocol.");
- }
- Py_DECREF(tmp_has_override);
- PyDict_DelItem(kwargs, npy_ma_str_like);
-
- /*
- * If `like=` kwarg was removed, `implementation` points to the NumPy
- * public API, as `public_api` is in that case the wrapper dispatcher
- * function. For example, in the `np.full` case, `implementation` is
- * `np.full`, whereas `public_api` is `_full_with_like`. This is done
- * to ensure `__array_function__` implementations can do
- * equality/identity comparisons when `like=` is present.
- */
- public_api = implementation;
- }
- }
-
- res = array_implement_array_function_internal(
- public_api, relevant_args, args, kwargs);
-
- if (res == Py_NotImplemented) {
- return PyObject_Call(implementation, args, kwargs);
- }
- return res;
}
/*
@@ -392,64 +280,52 @@ array_implement_c_array_function_creation(
PyObject *args, PyObject *kwargs,
PyObject *const *fast_args, Py_ssize_t len_args, PyObject *kwnames)
{
- PyObject *relevant_args = NULL;
+ PyObject *dispatch_types = NULL;
PyObject *numpy_module = NULL;
PyObject *public_api = NULL;
PyObject *result = NULL;
/* If `like` doesn't implement `__array_function__`, raise a `TypeError` */
- PyObject *tmp_has_override = get_array_function(like);
- if (tmp_has_override == NULL) {
+ PyObject *method = get_array_function(like);
+ if (method == NULL) {
return PyErr_Format(PyExc_TypeError,
"The `like` argument must be an array-like that "
"implements the `__array_function__` protocol.");
}
- Py_DECREF(tmp_has_override);
-
- if (fast_args != NULL) {
+ if (is_default_array_function(method)) {
/*
- * Convert from vectorcall convention, since the protocol requires
- * the normal convention. We have to do this late to ensure the
- * normal path where NotImplemented is returned is fast.
+ * Return a borrowed reference of Py_NotImplemented to defer back to
+ * the original function.
*/
+ Py_DECREF(method);
+ return Py_NotImplemented;
+ }
+
+ /* We need args and kwargs for __array_function__ (when not using it). */
+ if (fast_args != NULL) {
assert(args == NULL);
assert(kwargs == NULL);
- args = PyTuple_New(len_args);
- if (args == NULL) {
- return NULL;
- }
- for (Py_ssize_t i = 0; i < len_args; i++) {
- Py_INCREF(fast_args[i]);
- PyTuple_SET_ITEM(args, i, fast_args[i]);
- }
- if (kwnames != NULL) {
- kwargs = PyDict_New();
- if (kwargs == NULL) {
- Py_DECREF(args);
- return NULL;
- }
- Py_ssize_t nkwargs = PyTuple_GET_SIZE(kwnames);
- for (Py_ssize_t i = 0; i < nkwargs; i++) {
- PyObject *key = PyTuple_GET_ITEM(kwnames, i);
- PyObject *value = fast_args[i+len_args];
- if (PyDict_SetItem(kwargs, key, value) < 0) {
- Py_DECREF(args);
- Py_DECREF(kwargs);
- return NULL;
- }
- }
+ if (get_args_and_kwargs(
+ fast_args, len_args, kwnames, &args, &kwargs) < 0) {
+ goto finish;
}
}
+ else {
+ Py_INCREF(args);
+ Py_INCREF(kwargs);
+ }
- relevant_args = PyTuple_Pack(1, like);
- if (relevant_args == NULL) {
+ dispatch_types = PyTuple_Pack(1, Py_TYPE(like));
+ if (dispatch_types == NULL) {
goto finish;
}
+
/* The like argument must be present in the keyword arguments, remove it */
if (PyDict_DelItem(kwargs, npy_ma_str_like) < 0) {
goto finish;
}
+ /* Fetch the actual symbol (the long way right now) */
numpy_module = PyImport_Import(npy_ma_str_numpy);
if (numpy_module == NULL) {
goto finish;
@@ -466,16 +342,21 @@ array_implement_c_array_function_creation(
goto finish;
}
- result = array_implement_array_function_internal(
- public_api, relevant_args, args, kwargs);
+ result = call_array_function(like, method,
+ public_api, dispatch_types, args, kwargs);
- finish:
- if (kwnames != NULL) {
- /* args and kwargs were converted from vectorcall convention */
- Py_XDECREF(args);
- Py_XDECREF(kwargs);
+ if (result == Py_NotImplemented) {
+ /* This shouldn't really happen as there is only one type, but... */
+ Py_DECREF(result);
+ result = NULL;
+ set_no_matching_types_error(public_api, dispatch_types);
}
- Py_XDECREF(relevant_args);
+
+ finish:
+ Py_DECREF(method);
+ Py_XDECREF(args);
+ Py_XDECREF(kwargs);
+ Py_XDECREF(dispatch_types);
Py_XDECREF(public_api);
return result;
}
@@ -530,3 +411,373 @@ cleanup:
Py_DECREF(relevant_args);
return result;
}
+
+
+typedef struct {
+ PyObject_HEAD
+ vectorcallfunc vectorcall;
+ PyObject *dict;
+ PyObject *relevant_arg_func;
+ PyObject *default_impl;
+ /* The following fields are used to clean up TypeError messages only: */
+ PyObject *dispatcher_name;
+ PyObject *public_name;
+} PyArray_ArrayFunctionDispatcherObject;
+
+
+static void
+dispatcher_dealloc(PyArray_ArrayFunctionDispatcherObject *self)
+{
+ Py_CLEAR(self->relevant_arg_func);
+ Py_CLEAR(self->default_impl);
+ Py_CLEAR(self->dict);
+ Py_CLEAR(self->dispatcher_name);
+ Py_CLEAR(self->public_name);
+ PyObject_FREE(self);
+}
+
+
+static void
+fix_name_if_typeerror(PyArray_ArrayFunctionDispatcherObject *self)
+{
+ if (!PyErr_ExceptionMatches(PyExc_TypeError)) {
+ return;
+ }
+
+ PyObject *exc, *val, *tb, *message;
+ PyErr_Fetch(&exc, &val, &tb);
+
+ if (!PyUnicode_CheckExact(val)) {
+ /*
+ * We expect the error to be unnormalized, but this may not always be
+ * the case, so normalize and fetch args[0] if it isn't a string.
+ */
+ PyErr_NormalizeException(&exc, &val, &tb);
+
+ PyObject *args = PyObject_GetAttrString(val, "args");
+ if (args == NULL || !PyTuple_CheckExact(args)
+ || PyTuple_GET_SIZE(args) != 1) {
+ Py_XDECREF(args);
+ goto restore_error;
+ }
+ message = PyTuple_GET_ITEM(args, 0);
+ Py_INCREF(message);
+ Py_DECREF(args);
+ if (!PyUnicode_CheckExact(message)) {
+ Py_DECREF(message);
+ goto restore_error;
+ }
+ }
+ else {
+ Py_INCREF(val);
+ message = val;
+ }
+
+ Py_ssize_t cmp = PyUnicode_Tailmatch(
+ message, self->dispatcher_name, 0, -1, -1);
+ if (cmp <= 0) {
+ Py_DECREF(message);
+ goto restore_error;
+ }
+ Py_SETREF(message, PyUnicode_Replace(
+ message, self->dispatcher_name, self->public_name, 1));
+ if (message == NULL) {
+ goto restore_error;
+ }
+ PyErr_SetObject(PyExc_TypeError, message);
+ Py_DECREF(exc);
+ Py_XDECREF(val);
+ Py_XDECREF(tb);
+ Py_DECREF(message);
+ return;
+
+ restore_error:
+ /* replacement not successful, so restore original error */
+ PyErr_Restore(exc, val, tb);
+}
+
+
+static PyObject *
+dispatcher_vectorcall(PyArray_ArrayFunctionDispatcherObject *self,
+ PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+ PyObject *result = NULL;
+ PyObject *types = NULL;
+ PyObject *relevant_args = NULL;
+
+ PyObject *public_api;
+
+ /* __array_function__ passes args, kwargs. These may be filled: */
+ PyObject *packed_args = NULL;
+ PyObject *packed_kwargs = NULL;
+
+ PyObject *implementing_args[NPY_MAXARGS];
+ PyObject *array_function_methods[NPY_MAXARGS];
+
+ int num_implementing_args;
+
+ if (self->relevant_arg_func != NULL) {
+ public_api = (PyObject *)self;
+
+ /* Typical path, need to call the relevant_arg_func and unpack them */
+ relevant_args = PyObject_Vectorcall(
+ self->relevant_arg_func, args, len_args, kwnames);
+ if (relevant_args == NULL) {
+ fix_name_if_typeerror(self);
+ return NULL;
+ }
+ Py_SETREF(relevant_args, PySequence_Fast(relevant_args,
+ "dispatcher for __array_function__ did not return an iterable"));
+ if (relevant_args == NULL) {
+ return NULL;
+ }
+
+ num_implementing_args = get_implementing_args_and_methods(
+ relevant_args, implementing_args, array_function_methods);
+ if (num_implementing_args < 0) {
+ Py_DECREF(relevant_args);
+ return NULL;
+ }
+ }
+ else {
+ /* For like= dispatching from Python, the public_symbol is the impl */
+ public_api = self->default_impl;
+
+ /*
+ * We are dealing with `like=` from Python. For simplicity, the
+ * Python code passes it on as the first argument.
+ */
+ if (PyVectorcall_NARGS(len_args) == 0) {
+ PyErr_Format(PyExc_TypeError,
+ "`like` argument dispatching, but first argument is not "
+ "positional in call to %S.", self->default_impl);
+ return NULL;
+ }
+
+ array_function_methods[0] = get_array_function(args[0]);
+ if (array_function_methods[0] == NULL) {
+ return PyErr_Format(PyExc_TypeError,
+ "The `like` argument must be an array-like that "
+ "implements the `__array_function__` protocol.");
+ }
+ num_implementing_args = 1;
+ implementing_args[0] = args[0];
+ Py_INCREF(implementing_args[0]);
+
+ /* do not pass the like argument */
+ len_args = PyVectorcall_NARGS(len_args) - 1;
+ len_args |= PY_VECTORCALL_ARGUMENTS_OFFSET;
+ args++;
+ }
+
+ /*
+ * Handle the typical case of no overrides. This is merely an optimization
+ * if some arguments are ndarray objects, but is also necessary if no
+ * arguments implement __array_function__ at all (e.g., if they are all
+ * built-in types).
+ */
+ int any_overrides = 0;
+ for (int j = 0; j < num_implementing_args; j++) {
+ if (!is_default_array_function(array_function_methods[j])) {
+ any_overrides = 1;
+ break;
+ }
+ }
+ if (!any_overrides) {
+ /* Directly call the actual implementation. */
+ result = PyObject_Vectorcall(self->default_impl, args, len_args, kwnames);
+ goto cleanup;
+ }
+
+ /* Find args and kwargs as tuple and dict, as we pass them out: */
+ if (get_args_and_kwargs(
+ args, len_args, kwnames, &packed_args, &packed_kwargs) < 0) {
+ goto cleanup;
+ }
+
+ /*
+ * Create a Python object for types.
+ * We use a tuple, because it's the fastest Python collection to create
+ * and has the bonus of being immutable.
+ */
+ types = PyTuple_New(num_implementing_args);
+ if (types == NULL) {
+ goto cleanup;
+ }
+ for (int j = 0; j < num_implementing_args; j++) {
+ PyObject *arg_type = (PyObject *)Py_TYPE(implementing_args[j]);
+ Py_INCREF(arg_type);
+ PyTuple_SET_ITEM(types, j, arg_type);
+ }
+
+ /* Call __array_function__ methods */
+ for (int j = 0; j < num_implementing_args; j++) {
+ PyObject *argument = implementing_args[j];
+ PyObject *method = array_function_methods[j];
+
+ result = call_array_function(
+ argument, method, public_api, types,
+ packed_args, packed_kwargs);
+
+ if (result == Py_NotImplemented) {
+ /* Try the next one */
+ Py_DECREF(result);
+ result = NULL;
+ }
+ else {
+ /* Either a good result, or an exception was raised. */
+ goto cleanup;
+ }
+ }
+
+ set_no_matching_types_error(public_api, types);
+
+cleanup:
+ for (int j = 0; j < num_implementing_args; j++) {
+ Py_DECREF(implementing_args[j]);
+ Py_DECREF(array_function_methods[j]);
+ }
+ Py_XDECREF(packed_args);
+ Py_XDECREF(packed_kwargs);
+ Py_XDECREF(types);
+ Py_XDECREF(relevant_args);
+ return result;
+}
+
+
+static PyObject *
+dispatcher_new(PyTypeObject *NPY_UNUSED(cls), PyObject *args, PyObject *kwargs)
+{
+ PyArray_ArrayFunctionDispatcherObject *self;
+
+ self = PyObject_New(
+ PyArray_ArrayFunctionDispatcherObject,
+ &PyArrayFunctionDispatcher_Type);
+ if (self == NULL) {
+ return PyErr_NoMemory();
+ }
+
+ char *kwlist[] = {"", "", NULL};
+ if (!PyArg_ParseTupleAndKeywords(
+ args, kwargs, "OO:_ArrayFunctionDispatcher", kwlist,
+ &self->relevant_arg_func, &self->default_impl)) {
+ Py_DECREF(self);
+ return NULL;
+ }
+
+ self->vectorcall = (vectorcallfunc)dispatcher_vectorcall;
+ Py_INCREF(self->default_impl);
+ self->dict = NULL;
+ self->dispatcher_name = NULL;
+ self->public_name = NULL;
+
+ if (self->relevant_arg_func == Py_None) {
+ /* NULL in the relevant arg function means we use `like=` */
+ Py_CLEAR(self->relevant_arg_func);
+ }
+ else {
+ /* Fetch names to clean up TypeErrors (show actual name) */
+ Py_INCREF(self->relevant_arg_func);
+ self->dispatcher_name = PyObject_GetAttrString(
+ self->relevant_arg_func, "__qualname__");
+ if (self->dispatcher_name == NULL) {
+ Py_DECREF(self);
+ return NULL;
+ }
+ self->public_name = PyObject_GetAttrString(
+ self->default_impl, "__qualname__");
+ if (self->public_name == NULL) {
+ Py_DECREF(self);
+ return NULL;
+ }
+ }
+
+ /* Need to be like a Python function that has arbitrary attributes */
+ self->dict = PyDict_New();
+ if (self->dict == NULL) {
+ Py_DECREF(self);
+ return NULL;
+ }
+ return (PyObject *)self;
+}
+
+
+static PyObject *
+dispatcher_str(PyArray_ArrayFunctionDispatcherObject *self)
+{
+ return PyObject_Str(self->default_impl);
+}
+
+
+static PyObject *
+dispatcher_repr(PyObject *self)
+{
+ PyObject *name = PyObject_GetAttrString(self, "__name__");
+ if (name == NULL) {
+ return NULL;
+ }
+ /* Print like a normal function */
+ return PyUnicode_FromFormat("<function %S at %p>", name, self);
+}
+
+
+static PyObject *
+func_dispatcher___get__(PyObject *self, PyObject *obj, PyObject *cls)
+{
+ if (obj == NULL) {
+ /* Act like a static method, no need to bind */
+ Py_INCREF(self);
+ return self;
+ }
+ return PyMethod_New(self, obj);
+}
+
+
+static PyObject *
+dispatcher_get_implementation(
+ PyArray_ArrayFunctionDispatcherObject *self, void *NPY_UNUSED(closure))
+{
+ Py_INCREF(self->default_impl);
+ return self->default_impl;
+}
+
+
+static PyObject *
+dispatcher_reduce(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+ return PyObject_GetAttrString(self, "__qualname__");
+}
+
+
+static struct PyMethodDef func_dispatcher_methods[] = {
+ {"__reduce__",
+ (PyCFunction)dispatcher_reduce, METH_NOARGS, NULL},
+ {NULL, NULL, 0, NULL}
+};
+
+
+static struct PyGetSetDef func_dispatcher_getset[] = {
+ {"__dict__", &PyObject_GenericGetDict, 0, NULL, 0},
+ {"_implementation", (getter)&dispatcher_get_implementation, 0, NULL, 0},
+ {0, 0, 0, 0, 0}
+};
+
+
+NPY_NO_EXPORT PyTypeObject PyArrayFunctionDispatcher_Type = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ .tp_name = "numpy._ArrayFunctionDispatcher",
+ .tp_basicsize = sizeof(PyArray_ArrayFunctionDispatcherObject),
+ /* We have a dict, so in theory could traverse, but in practice... */
+ .tp_dictoffset = offsetof(PyArray_ArrayFunctionDispatcherObject, dict),
+ .tp_dealloc = (destructor)dispatcher_dealloc,
+ .tp_new = (newfunc)dispatcher_new,
+ .tp_str = (reprfunc)dispatcher_str,
+ .tp_repr = (reprfunc)dispatcher_repr,
+ .tp_flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_VECTORCALL
+ | Py_TPFLAGS_METHOD_DESCRIPTOR),
+ .tp_methods = func_dispatcher_methods,
+ .tp_getset = func_dispatcher_getset,
+ .tp_descr_get = func_dispatcher___get__,
+ .tp_call = &PyVectorcall_Call,
+ .tp_vectorcall_offset = offsetof(PyArray_ArrayFunctionDispatcherObject, vectorcall),
+};
diff --git a/numpy/core/src/multiarray/arrayfunction_override.h b/numpy/core/src/multiarray/arrayfunction_override.h
index 09f7ee548..3b8b88bac 100644
--- a/numpy/core/src/multiarray/arrayfunction_override.h
+++ b/numpy/core/src/multiarray/arrayfunction_override.h
@@ -1,9 +1,7 @@
#ifndef NUMPY_CORE_SRC_MULTIARRAY_ARRAYFUNCTION_OVERRIDE_H_
#define NUMPY_CORE_SRC_MULTIARRAY_ARRAYFUNCTION_OVERRIDE_H_
-NPY_NO_EXPORT PyObject *
-array_implement_array_function(
- PyObject *NPY_UNUSED(dummy), PyObject *positional_args);
+extern NPY_NO_EXPORT PyTypeObject PyArrayFunctionDispatcher_Type;
NPY_NO_EXPORT PyObject *
array__get_implementing_args(
diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c
index 08e2cc683..46c4d90c9 100644
--- a/numpy/core/src/multiarray/arrayobject.c
+++ b/numpy/core/src/multiarray/arrayobject.c
@@ -56,6 +56,7 @@ maintainer email: oliphant.travis@ieee.org
#include "alloc.h"
#include "mem_overlap.h"
#include "numpyos.h"
+#include "refcount.h"
#include "strfuncs.h"
#include "binop_override.h"
@@ -452,9 +453,11 @@ array_dealloc(PyArrayObject *self)
}
if ((fa->flags & NPY_ARRAY_OWNDATA) && fa->data) {
- /* Free internal references if an Object array */
- if (PyDataType_FLAGCHK(fa->descr, NPY_ITEM_REFCOUNT)) {
- PyArray_XDECREF(self);
+ /* Free any internal references */
+ if (PyDataType_REFCHK(fa->descr)) {
+ if (PyArray_ClearArray(self) < 0) {
+ PyErr_WriteUnraisable(NULL);
+ }
}
if (fa->mem_handler == NULL) {
char *env = getenv("NUMPY_WARN_IF_NO_MEM_POLICY");
@@ -994,7 +997,7 @@ array_richcompare(PyArrayObject *self, PyObject *other, int cmp_op)
* TODO: If/once we correctly push structured comparisons into the ufunc
* we could consider pushing this path into the ufunc itself as a
* fallback loop (which ignores the input arrays).
- * This would have the advantage that subclasses implemementing
+ * This would have the advantage that subclasses implementing
* `__array_ufunc__` do not explicitly need `__eq__` and `__ne__`.
*/
if (result == NULL
@@ -1220,7 +1223,7 @@ array_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds)
(int)dims.len,
dims.ptr,
strides.ptr, NULL, is_f_order, NULL, NULL,
- 0, 1);
+ _NPY_ARRAY_ALLOW_EMPTY_STRING);
if (ret == NULL) {
descr = NULL;
goto fail;
@@ -1257,7 +1260,7 @@ array_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds)
subtype, descr,
dims.len, dims.ptr, strides.ptr, offset + (char *)buffer.ptr,
buffer.flags, NULL, buffer.base,
- 0, 1);
+ _NPY_ARRAY_ALLOW_EMPTY_STRING);
if (ret == NULL) {
descr = NULL;
goto fail;
diff --git a/numpy/core/src/multiarray/arrayobject.h b/numpy/core/src/multiarray/arrayobject.h
index f7d0734db..a6b3a32bd 100644
--- a/numpy/core/src/multiarray/arrayobject.h
+++ b/numpy/core/src/multiarray/arrayobject.h
@@ -40,6 +40,13 @@ static const int NPY_ARRAY_WARN_ON_WRITE = (1 << 31);
static const int NPY_ARRAY_WAS_PYTHON_INT = (1 << 30);
static const int NPY_ARRAY_WAS_PYTHON_FLOAT = (1 << 29);
static const int NPY_ARRAY_WAS_PYTHON_COMPLEX = (1 << 28);
+/*
+ * Mark that this was a huge int which was turned into an object array (or
+ * unsigned/non-default integer array), but then replaced by a temporary
+ * array for further processing. This flag is only used in the ufunc machinery
+ * where it is tricky to correctly cover all type resolution paths.
+ */
+static const int NPY_ARRAY_WAS_INT_AND_REPLACED = (1 << 27);
static const int NPY_ARRAY_WAS_PYTHON_LITERAL = (1 << 30 | 1 << 29 | 1 << 28);
#endif /* NUMPY_CORE_SRC_MULTIARRAY_ARRAYOBJECT_H_ */
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 3149ff36f..b84625c77 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -224,8 +224,6 @@ MyPyLong_AsUnsigned@Type@(PyObject *obj)
** GETITEM AND SETITEM **
*****************************************************************************
*/
-
-#define _ALIGN(type) offsetof(struct {char c; type v;}, v)
/*
* Disable harmless compiler warning "4116: unnamed type definition in
* parentheses" which is caused by the _ALIGN macro.
@@ -293,7 +291,7 @@ static int
"Python integers to integer arrays. The conversion "
"of %.100R to %S will fail in the future.\n"
"For the old behavior, usually:\n"
- " np.array(value).astype(dtype)`\n"
+ " np.array(value).astype(dtype)\n"
"will give the desired result (the cast overflows).",
obj, descr) < 0) {
Py_DECREF(descr);
@@ -4648,12 +4646,30 @@ set_typeinfo(PyObject *dict)
* should be defined on the class and inherited to the scalar.
* (NPY_HALF is the largest builtin one.)
*/
- for (i = 0; i <= NPY_HALF; i++) {
- if (dtypemeta_wrap_legacy_descriptor(_builtin_descrs[i]) < 0) {
- return -1;
- }
+ /**begin repeat
+ *
+ * #NAME = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * OBJECT, STRING, UNICODE, VOID,
+ * DATETIME, TIMEDELTA#
+ */
+ if (dtypemeta_wrap_legacy_descriptor(
+ _builtin_descrs[NPY_@NAME@],
+ "numpy.dtypes." NPY_@NAME@_Name "DType",
+#ifdef NPY_@NAME@_alias
+ "numpy.dtypes." NPY_@NAME@_Alias "DType"
+#else
+ NULL
+#endif
+ ) < 0) {
+ return -1;
}
+ /**end repeat**/
+
/*
* Add cast functions for the new types
*/
diff --git a/numpy/core/src/multiarray/arraytypes.h.src b/numpy/core/src/multiarray/arraytypes.h.src
index aad464ccf..90a075dad 100644
--- a/numpy/core/src/multiarray/arraytypes.h.src
+++ b/numpy/core/src/multiarray/arraytypes.h.src
@@ -66,4 +66,102 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int @TYPE@_@func@,
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int BOOL_argmax,
(npy_bool *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+/*
+ * Define DType and scalar type names and aliases as used in Python.
+ */
+
+/**begin repeat
+ * #NAME = BOOL,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * STRING, UNICODE, VOID, OBJECT,
+ * DATETIME, TIMEDELTA#
+ * #name = bool_,
+ * float16, float32, float64, longdouble,
+ * complex64, complex128, clongdouble,
+ * bytes_, str_, void, object_,
+ * datetime64, timedelta64#
+ * #Name = Bool,
+ * Float16, Float32, Float64, LongDouble,
+ * Complex64, Complex128, CLongDouble,
+ * Bytes, Str, Void, Object,
+ * DateTime64, TimeDelta64#
+ */
+#define NPY_@NAME@_name "@name@"
+#define NPY_@NAME@_Name "@Name@"
+
+/**end repeat**/
+
+
+/*
+ * Give integers different names when they are the same size (gh-9799).
+ * `intX` always refers to the first int of that size in the sequence
+ * `['LONG', 'LONGLONG', 'INT', 'SHORT', 'BYTE']`.
+ * Unfortunately, since the bitsize names are not strictly fixed, we add
+ * the C name for all integer types (as aliases).
+ *
+ * Right now, we do not define the C aliases for floats (which are always
+ * the same).
+ */
+
+#if NPY_SIZEOF_BYTE == NPY_SIZEOF_SHORT
+ #define BYTE_not_size_named
+#endif
+#if NPY_SIZEOF_SHORT == NPY_SIZEOF_INT
+ #define SHORT_not_size_named
+#endif
+#if NPY_SIZEOF_INT == NPY_SIZEOF_LONG
+ #define INT_not_size_named
+#endif
+#if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_LONG
+ #define LONGLONG_not_size_named
+#endif
+
+
+/**begin repeat
+ * #NAME = BYTE, SHORT, INT, LONG, LONGLONG,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ * #CNAME = (BYTE, SHORT, INT, LONG, LONGLONG)*2#
+ * #cname = byte, short, intc, int_, longlong,
+ * ubyte, ushort, uintc, uint, ulonglong#
+ * #CName = Byte, Short, Int, Long, LongLong,
+ * UByte, UShort, UInt, ULong, ULongLong#
+ * #bitname = int*5, uint*5#
+ * #BitName = Int*5, UInt*5#
+ */
+
+#ifdef @CNAME@_not_size_named
+ #define NPY_@NAME@_name "@cname@"
+ #define NPY_@NAME@_Name "@CName@"
+#else
+ /* The C-name is considered just an alias for these: */
+ #define NPY_@NAME@_alias "@cname@"
+ #define NPY_@NAME@_Alias "@CName@"
+
+ /* The bitsof macro includes math, so cannot be stringified */
+ #if NPY_BITSOF_@CNAME@ == 8
+ #define NPY_@NAME@_name "@bitname@8"
+ #define NPY_@NAME@_Name "@BitName@8"
+ #elif NPY_BITSOF_@CNAME@ == 16
+ #define NPY_@NAME@_name "@bitname@16"
+ #define NPY_@NAME@_Name "@BitName@16"
+ #elif NPY_BITSOF_@CNAME@ == 32
+ #define NPY_@NAME@_name "@bitname@32"
+ #define NPY_@NAME@_Name "@BitName@32"
+ #elif NPY_BITSOF_@CNAME@ == 64
+ #define NPY_@NAME@_name "@bitname@64"
+ #define NPY_@NAME@_Name "@BitName@64"
+ #else
+ #error "need to fix integer bit-length name code"
+ #endif
+#endif
+
+/**end repeat**/
+
+#undef BYTE_not_size_named
+#undef SHORT_not_size_named
+#undef INT_not_size_named
+#undef LONGLONG_not_size_named
+
#endif /* NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ */
diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c
index 76652550a..c9d881e16 100644
--- a/numpy/core/src/multiarray/buffer.c
+++ b/numpy/core/src/multiarray/buffer.c
@@ -17,6 +17,7 @@
#include "numpyos.h"
#include "arrayobject.h"
#include "scalartypes.h"
+#include "dtypemeta.h"
/*************************************************************************
**************** Implement Buffer Protocol ****************************
@@ -417,9 +418,16 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str,
break;
}
default:
- PyErr_Format(PyExc_ValueError,
- "cannot include dtype '%c' in a buffer",
- descr->type);
+ if (NPY_DT_is_legacy(NPY_DTYPE(descr))) {
+ PyErr_Format(PyExc_ValueError,
+ "cannot include dtype '%c' in a buffer",
+ descr->type);
+ }
+ else {
+ PyErr_Format(PyExc_ValueError,
+ "cannot include dtype '%s' in a buffer",
+ ((PyTypeObject*)NPY_DTYPE(descr))->tp_name);
+ }
return -1;
}
}
diff --git a/numpy/core/src/multiarray/calculation.c b/numpy/core/src/multiarray/calculation.c
index a985a2308..62a994c64 100644
--- a/numpy/core/src/multiarray/calculation.c
+++ b/numpy/core/src/multiarray/calculation.c
@@ -7,6 +7,7 @@
#include "numpy/arrayobject.h"
#include "lowlevel_strided_loops.h"
+#include "dtypemeta.h"
#include "npy_config.h"
@@ -50,7 +51,7 @@ _PyArray_ArgMinMaxCommon(PyArrayObject *op,
int axis_copy = axis;
npy_intp _shape_buf[NPY_MAXDIMS];
npy_intp *out_shape;
- // Keep the number of dimensions and shape of
+ // Keep the number of dimensions and shape of
// original array. Helps when `keepdims` is True.
npy_intp* original_op_shape = PyArray_DIMS(op);
int out_ndim = PyArray_NDIM(op);
@@ -87,9 +88,13 @@ _PyArray_ArgMinMaxCommon(PyArrayObject *op,
op = ap;
}
- /* Will get native-byte order contiguous copy. */
- ap = (PyArrayObject *)PyArray_ContiguousFromAny((PyObject *)op,
- PyArray_DESCR(op)->type_num, 1, 0);
+ // Will get native-byte order contiguous copy.
+ PyArray_Descr *descr = NPY_DT_CALL_ensure_canonical(PyArray_DESCR(op));
+ if (descr == NULL) {
+ return NULL;
+ }
+ ap = (PyArrayObject *)PyArray_FromArray(op, descr, NPY_ARRAY_DEFAULT);
+
Py_DECREF(op);
if (ap == NULL) {
return NULL;
@@ -106,9 +111,9 @@ _PyArray_ArgMinMaxCommon(PyArrayObject *op,
for (int i = 0; i < out_ndim; i++) {
out_shape[i] = 1;
}
- }
+ }
else {
- /*
+ /*
* While `ap` may be transposed, we can ignore this for `out` because the
* transpose only reorders the size 1 `axis` (not changing memory layout).
*/
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index da8d23a26..573d0d606 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -128,24 +128,6 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims, PyArray_Descr **out_dtype)
}
-NPY_NO_EXPORT int
-_zerofill(PyArrayObject *ret)
-{
- if (PyDataType_REFCHK(PyArray_DESCR(ret))) {
- PyObject *zero = PyLong_FromLong(0);
- PyArray_FillObjectArray(ret, zero);
- Py_DECREF(zero);
- if (PyErr_Occurred()) {
- return -1;
- }
- }
- else {
- npy_intp n = PyArray_NBYTES(ret);
- memset(PyArray_DATA(ret), 0, n);
- }
- return 0;
-}
-
NPY_NO_EXPORT npy_bool
_IsWriteable(PyArrayObject *ap)
{
@@ -459,3 +441,32 @@ new_array_for_sum(PyArrayObject *ap1, PyArrayObject *ap2, PyArrayObject* out,
}
}
+NPY_NO_EXPORT int
+check_is_convertible_to_scalar(PyArrayObject *v)
+{
+ if (PyArray_NDIM(v) == 0) {
+ return 0;
+ }
+
+ /* Remove this if-else block when the deprecation expires */
+ if (PyArray_SIZE(v) == 1) {
+ /* Numpy 1.25.0, 2023-01-02 */
+ if (DEPRECATE(
+ "Conversion of an array with ndim > 0 to a scalar "
+ "is deprecated, and will error in future. "
+ "Ensure you extract a single element from your array "
+ "before performing this operation. "
+ "(Deprecated NumPy 1.25.)") < 0) {
+ return -1;
+ }
+ return 0;
+ } else {
+ PyErr_SetString(PyExc_TypeError,
+ "only length-1 arrays can be converted to Python scalars");
+ return -1;
+ }
+
+ PyErr_SetString(PyExc_TypeError,
+ "only 0-dimensional arrays can be converted to Python scalars");
+ return -1;
+}
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 06322e902..cb9fadc4e 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -55,9 +55,6 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims,
NPY_NO_EXPORT PyArray_Descr *
_array_find_python_scalar_type(PyObject *op);
-NPY_NO_EXPORT int
-_zerofill(PyArrayObject *ret);
-
NPY_NO_EXPORT npy_bool
_IsWriteable(PyArrayObject *ap);
@@ -178,7 +175,19 @@ check_and_adjust_axis(int *axis, int ndim)
}
/* used for some alignment checks */
-#define _ALIGN(type) offsetof(struct {char c; type v;}, v)
+/*
+ * GCC releases before GCC 4.9 had a bug in _Alignof. See GCC bug 52023
+ * <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52023>.
+ * clang versions < 8.0.0 have the same bug.
+ */
+#if (!defined __STDC_VERSION__ || __STDC_VERSION__ < 201112 \
+ || (defined __GNUC__ && __GNUC__ < 4 + (__GNUC_MINOR__ < 9) \
+ && !defined __clang__) \
+ || (defined __clang__ && __clang_major__ < 8))
+# define _ALIGN(type) offsetof(struct {char c; type v;}, v)
+#else
+# define _ALIGN(type) _Alignof(type)
+#endif
#define _UINT_ALIGN(type) npy_uint_alignment(sizeof(type))
/*
* Disable harmless compiler warning "4116: unnamed type definition in
@@ -323,6 +332,13 @@ PyArray_TupleFromItems(int n, PyObject *const *items, int make_null_none)
return tuple;
}
+/*
+ * Returns 0 if the array can be converted to a scalar (rank 0, or size 1
+ * with a deprecation warning), -1 otherwise.
+ */
+NPY_NO_EXPORT int
+check_is_convertible_to_scalar(PyArrayObject *v);
+
#include "ucsnarrow.h"
diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c
index 2ae36c13a..22f2547ad 100644
--- a/numpy/core/src/multiarray/compiled_base.c
+++ b/numpy/core/src/multiarray/compiled_base.c
@@ -8,6 +8,7 @@
#include "numpy/arrayobject.h"
#include "numpy/npy_3kcompat.h"
#include "numpy/npy_math.h"
+#include "npy_argparse.h"
#include "npy_config.h"
#include "templ_common.h" /* for npy_mul_sizes_with_overflow */
#include "lowlevel_strided_loops.h" /* for npy_bswap8 */
@@ -109,7 +110,8 @@ minmax(const npy_intp *data, npy_intp data_len, npy_intp *mn, npy_intp *mx)
* output array.
*/
NPY_NO_EXPORT PyObject *
-arr_bincount(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
+arr_bincount(PyObject *NPY_UNUSED(self), PyObject *const *args,
+ Py_ssize_t len_args, PyObject *kwnames)
{
PyObject *list = NULL, *weight = Py_None, *mlength = NULL;
PyArrayObject *lst = NULL, *ans = NULL, *wts = NULL;
@@ -117,11 +119,14 @@ arr_bincount(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
npy_intp minlength = 0;
npy_intp i;
double *weights , *dans;
- static char *kwlist[] = {"list", "weights", "minlength", NULL};
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO:bincount",
- kwlist, &list, &weight, &mlength)) {
- goto fail;
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("bincount", args, len_args, kwnames,
+ "list", NULL, &list,
+ "|weights", NULL, &weight,
+ "|minlength", NULL, &mlength,
+ NULL, NULL, NULL) < 0) {
+ return NULL;
}
lst = (PyArrayObject *)PyArray_ContiguousFromAny(list, NPY_INTP, 1, 1);
@@ -265,7 +270,7 @@ arr__monotonicity(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
* indicated by the mask
*/
NPY_NO_EXPORT PyObject *
-arr_insert(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
+arr_place(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
{
char *src, *dest;
npy_bool *mask_data;
@@ -489,7 +494,8 @@ binary_search_with_guess(const npy_double key, const npy_double *arr,
#undef LIKELY_IN_CACHE_SIZE
NPY_NO_EXPORT PyObject *
-arr_interp(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
+arr_interp(PyObject *NPY_UNUSED(self), PyObject *const *args, Py_ssize_t len_args,
+ PyObject *kwnames)
{
PyObject *fp, *xp, *x;
@@ -500,12 +506,16 @@ arr_interp(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
const npy_double *dy, *dx, *dz;
npy_double *dres, *slopes = NULL;
- static char *kwlist[] = {"x", "xp", "fp", "left", "right", NULL};
-
NPY_BEGIN_THREADS_DEF;
- if (!PyArg_ParseTupleAndKeywords(args, kwdict, "OOO|OO:interp", kwlist,
- &x, &xp, &fp, &left, &right)) {
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("interp", args, len_args, kwnames,
+ "x", NULL, &x,
+ "xp", NULL, &xp,
+ "fp", NULL, &fp,
+ "|left", NULL, &left,
+ "|right", NULL, &right,
+ NULL, NULL, NULL) < 0) {
return NULL;
}
@@ -654,7 +664,8 @@ fail:
/* As for arr_interp but for complex fp values */
NPY_NO_EXPORT PyObject *
-arr_interp_complex(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
+arr_interp_complex(PyObject *NPY_UNUSED(self), PyObject *const *args, Py_ssize_t len_args,
+ PyObject *kwnames)
{
PyObject *fp, *xp, *x;
@@ -667,12 +678,16 @@ arr_interp_complex(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
npy_cdouble lval, rval;
npy_cdouble *dres, *slopes = NULL;
- static char *kwlist[] = {"x", "xp", "fp", "left", "right", NULL};
-
NPY_BEGIN_THREADS_DEF;
- if (!PyArg_ParseTupleAndKeywords(args, kwdict, "OOO|OO:interp_complex",
- kwlist, &x, &xp, &fp, &left, &right)) {
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("interp_complex", args, len_args, kwnames,
+ "x", NULL, &x,
+ "xp", NULL, &xp,
+ "fp", NULL, &fp,
+ "|left", NULL, &left,
+ "|right", NULL, &right,
+ NULL, NULL, NULL) < 0) {
return NULL;
}
@@ -1389,7 +1404,7 @@ fail:
/* Can only be called if doc is currently NULL */
NPY_NO_EXPORT PyObject *
-arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args)
+arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *const *args, Py_ssize_t len_args)
{
PyObject *obj;
PyObject *str;
@@ -1401,7 +1416,16 @@ arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args)
Py_RETURN_NONE;
}
- if (!PyArg_ParseTuple(args, "OO!:add_docstring", &obj, &PyUnicode_Type, &str)) {
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("add_docstring", args, len_args, NULL,
+ "", NULL, &obj,
+ "", NULL, &str,
+ NULL, NULL, NULL) < 0) {
+ return NULL;
+ }
+ if (!PyUnicode_Check(str)) {
+ PyErr_SetString(PyExc_TypeError,
+ "argument docstring of add_docstring should be a str");
return NULL;
}
diff --git a/numpy/core/src/multiarray/compiled_base.h b/numpy/core/src/multiarray/compiled_base.h
index d3bc08cb2..e0e73ac79 100644
--- a/numpy/core/src/multiarray/compiled_base.h
+++ b/numpy/core/src/multiarray/compiled_base.h
@@ -4,21 +4,21 @@
#include "numpy/ndarraytypes.h"
NPY_NO_EXPORT PyObject *
-arr_insert(PyObject *, PyObject *, PyObject *);
+arr_place(PyObject *, PyObject *, PyObject *);
NPY_NO_EXPORT PyObject *
-arr_bincount(PyObject *, PyObject *, PyObject *);
+arr_bincount(PyObject *, PyObject *const *, Py_ssize_t, PyObject *);
NPY_NO_EXPORT PyObject *
arr__monotonicity(PyObject *, PyObject *, PyObject *kwds);
NPY_NO_EXPORT PyObject *
-arr_interp(PyObject *, PyObject *, PyObject *);
+arr_interp(PyObject *, PyObject *const *, Py_ssize_t, PyObject *, PyObject *);
NPY_NO_EXPORT PyObject *
-arr_interp_complex(PyObject *, PyObject *, PyObject *);
+arr_interp_complex(PyObject *, PyObject *const *, Py_ssize_t, PyObject *, PyObject *);
NPY_NO_EXPORT PyObject *
arr_ravel_multi_index(PyObject *, PyObject *, PyObject *);
NPY_NO_EXPORT PyObject *
arr_unravel_index(PyObject *, PyObject *, PyObject *);
NPY_NO_EXPORT PyObject *
-arr_add_docstring(PyObject *, PyObject *);
+arr_add_docstring(PyObject *, PyObject *const *, Py_ssize_t);
NPY_NO_EXPORT PyObject *
io_pack(PyObject *, PyObject *, PyObject *);
NPY_NO_EXPORT PyObject *
diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c
index 092a17c89..aef78ff5e 100644
--- a/numpy/core/src/multiarray/convert.c
+++ b/numpy/core/src/multiarray/convert.c
@@ -21,6 +21,7 @@
#include "convert.h"
#include "array_coercion.h"
+#include "refcount.h"
int
fallocate(int fd, int mode, off_t offset, off_t len);
@@ -156,11 +157,35 @@ PyArray_ToFile(PyArrayObject *self, FILE *fp, char *sep, char *format)
size = PyArray_SIZE(self);
NPY_BEGIN_ALLOW_THREADS;
-#if defined (_MSC_VER) && defined(_WIN64)
- /* Workaround Win64 fwrite() bug. Ticket #1660 */
+#if defined(NPY_OS_WIN64)
+ /*
+ * Workaround Win64 fwrite() bug. Issue gh-2256
+ * The native Win64 runtime has this issue; the check above also
+ * matches UCRT (which does not), so it could be more precise.
+ *
+ * If you touch this code, please run this test which is so slow
+ * it was removed from the test suite. Note that the original
+ * failure mode involves an infinite loop during tofile()
+ *
+ * import tempfile, numpy as np
+ * from numpy.testing import (assert_)
+ * fourgbplus = 2**32 + 2**16
+ * testbytes = np.arange(8, dtype=np.int8)
+ * n = len(testbytes)
+ * flike = tempfile.NamedTemporaryFile()
+ * f = flike.file
+ * np.tile(testbytes, fourgbplus // testbytes.nbytes).tofile(f)
+ * flike.seek(0)
+ * a = np.fromfile(f, dtype=np.int8)
+ * flike.close()
+ * assert_(len(a) == fourgbplus)
+ * # check only start and end for speed:
+ * assert_((a[:n] == testbytes).all())
+ * assert_((a[-n:] == testbytes).all())
+ */
{
- npy_intp maxsize = 2147483648 / PyArray_DESCR(self)->elsize;
- npy_intp chunksize;
+ size_t maxsize = 2147483648 / (size_t)PyArray_DESCR(self)->elsize;
+ size_t chunksize;
n = 0;
while (size > 0) {
@@ -168,7 +193,7 @@ PyArray_ToFile(PyArrayObject *self, FILE *fp, char *sep, char *format)
n2 = fwrite((const void *)
((char *)PyArray_DATA(self) + (n * PyArray_DESCR(self)->elsize)),
(size_t) PyArray_DESCR(self)->elsize,
- (size_t) chunksize, fp);
+ chunksize, fp);
if (n2 < chunksize) {
break;
}
@@ -359,6 +384,11 @@ PyArray_ToString(PyArrayObject *self, NPY_ORDER order)
NPY_NO_EXPORT int
PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj)
{
+
+ if (PyArray_FailUnlessWriteable(arr, "assignment destination") < 0) {
+ return -1;
+ }
+
/*
* If we knew that the output array has at least one element, we would
* not actually need a helping buffer, we always null it, just in case.
@@ -370,7 +400,7 @@ PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj)
char *value = (char *)value_buffer_stack;
PyArray_Descr *descr = PyArray_DESCR(arr);
- if (descr->elsize > sizeof(value_buffer_stack)) {
+ if ((size_t)descr->elsize > sizeof(value_buffer_stack)) {
/* We need a large temporary buffer... */
value_buffer_heap = PyObject_Calloc(1, descr->elsize);
if (value_buffer_heap == NULL) {
@@ -393,12 +423,17 @@ PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj)
PyArray_BYTES(arr), PyArray_STRIDES(arr),
descr, value);
+ if (PyDataType_REFCHK(descr)) {
+ PyArray_ClearBuffer(descr, value, 0, 1, 1);
+ }
PyMem_FREE(value_buffer_heap);
return retcode;
}
/*
- * Fills an array with zeros.
+ * Internal function to fill an array with zeros.
+ * Used in einsum and dot, which ensure the dtype is, in some sense,
+ * numerical and not a str or struct.
*
* dst: The destination array.
* wheremask: If non-NULL, a boolean mask specifying where to set the values.
@@ -409,21 +444,26 @@ NPY_NO_EXPORT int
PyArray_AssignZero(PyArrayObject *dst,
PyArrayObject *wheremask)
{
- npy_bool value;
- PyArray_Descr *bool_dtype;
- int retcode;
-
- /* Create a raw bool scalar with the value False */
- bool_dtype = PyArray_DescrFromType(NPY_BOOL);
- if (bool_dtype == NULL) {
- return -1;
+ int retcode = 0;
+ if (PyArray_ISOBJECT(dst)) {
+ PyObject * pZero = PyLong_FromLong(0);
+ retcode = PyArray_AssignRawScalar(dst, PyArray_DESCR(dst),
+ (char *)&pZero, wheremask, NPY_SAFE_CASTING);
+ Py_DECREF(pZero);
}
- value = 0;
+ else {
+ /* Create a raw bool scalar with the value False */
+ PyArray_Descr *bool_dtype = PyArray_DescrFromType(NPY_BOOL);
+ if (bool_dtype == NULL) {
+ return -1;
+ }
+ npy_bool value = 0;
- retcode = PyArray_AssignRawScalar(dst, bool_dtype, (char *)&value,
- wheremask, NPY_SAFE_CASTING);
+ retcode = PyArray_AssignRawScalar(dst, bool_dtype, (char *)&value,
+ wheremask, NPY_SAFE_CASTING);
- Py_DECREF(bool_dtype);
+ Py_DECREF(bool_dtype);
+ }
return retcode;
}
@@ -483,7 +523,7 @@ PyArray_View(PyArrayObject *self, PyArray_Descr *type, PyTypeObject *pytype)
PyArray_NDIM(self), PyArray_DIMS(self), PyArray_STRIDES(self),
PyArray_DATA(self),
flags, (PyObject *)self, (PyObject *)self,
- 0, 1);
+ _NPY_ARRAY_ENSURE_DTYPE_IDENTITY);
if (ret == NULL) {
Py_XDECREF(type);
return NULL;
diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c
index 3973fc795..de1cd075d 100644
--- a/numpy/core/src/multiarray/convert_datatype.c
+++ b/numpy/core/src/multiarray/convert_datatype.c
@@ -18,6 +18,7 @@
#include "can_cast_table.h"
#include "common.h"
#include "ctors.h"
+#include "descriptor.h"
#include "dtypemeta.h"
#include "common_dtype.h"
#include "scalartypes.h"
@@ -31,6 +32,7 @@
#include "array_method.h"
#include "usertypes.h"
#include "dtype_transfer.h"
+#include "dtype_traversal.h"
#include "arrayobject.h"
@@ -152,7 +154,7 @@ PyArray_GetCastingImpl(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to)
{
PyObject *res;
if (from == to) {
- res = NPY_DT_SLOTS(from)->within_dtype_castingimpl;
+ res = (PyObject *)NPY_DT_SLOTS(from)->within_dtype_castingimpl;
}
else {
res = PyDict_GetItemWithError(NPY_DT_SLOTS(from)->castingimpls, (PyObject *)to);
@@ -329,7 +331,7 @@ PyArray_CastToType(PyArrayObject *arr, PyArray_Descr *dtype, int is_f_order)
return NULL;
}
- Py_SETREF(dtype, PyArray_AdaptDescriptorToArray(arr, (PyObject *)dtype));
+ Py_SETREF(dtype, PyArray_AdaptDescriptorToArray(arr, NULL, dtype));
if (dtype == NULL) {
return NULL;
}
@@ -1148,7 +1150,7 @@ PyArray_CastDescrToDType(PyArray_Descr *descr, PyArray_DTypeMeta *given_DType)
*/
NPY_NO_EXPORT PyArray_Descr *
PyArray_FindConcatenationDescriptor(
- npy_intp n, PyArrayObject **arrays, PyObject *requested_dtype)
+ npy_intp n, PyArrayObject **arrays, PyArray_Descr *requested_dtype)
{
if (requested_dtype == NULL) {
return PyArray_ResultType(n, arrays, 0, NULL);
@@ -2414,8 +2416,7 @@ PyArray_AddCastingImplementation(PyBoundArrayMethodObject *meth)
return -1;
}
Py_INCREF(meth->method);
- NPY_DT_SLOTS(meth->dtypes[0])->within_dtype_castingimpl = (
- (PyObject *)meth->method);
+ NPY_DT_SLOTS(meth->dtypes[0])->within_dtype_castingimpl = meth->method;
return 0;
}
@@ -2668,7 +2669,7 @@ add_numeric_cast(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to)
* consider moving this warning into the inner-loop at some point
* for simplicity (this requires ensuring it is only emitted once).
*/
- slots[5].slot = NPY_METH_get_loop;
+ slots[5].slot = _NPY_METH_get_loop;
slots[5].pfunc = &complex_to_noncomplex_get_loop;
slots[6].slot = 0;
slots[6].pfunc = NULL;
@@ -2689,7 +2690,7 @@ add_numeric_cast(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to)
/* When there is no casting (equivalent C-types) use byteswap loops */
slots[0].slot = NPY_METH_resolve_descriptors;
slots[0].pfunc = &legacy_same_dtype_resolve_descriptors;
- slots[1].slot = NPY_METH_get_loop;
+ slots[1].slot = _NPY_METH_get_loop;
slots[1].pfunc = &get_byteswap_loop;
slots[2].slot = 0;
slots[2].pfunc = NULL;
@@ -2873,14 +2874,14 @@ add_other_to_and_from_string_cast(
*/
PyArray_DTypeMeta *dtypes[2] = {other, string};
PyType_Slot slots[] = {
- {NPY_METH_get_loop, &legacy_cast_get_strided_loop},
+ {_NPY_METH_get_loop, &legacy_cast_get_strided_loop},
{NPY_METH_resolve_descriptors, &cast_to_string_resolve_descriptors},
{0, NULL}};
PyArrayMethod_Spec spec = {
.name = "legacy_cast_to_string",
.nin = 1,
.nout = 1,
- .flags = NPY_METH_REQUIRES_PYAPI,
+ .flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_NO_FLOATINGPOINT_ERRORS,
.dtypes = dtypes,
.slots = slots,
};
@@ -3012,7 +3013,7 @@ PyArray_InitializeStringCasts(void)
/* string<->string and unicode<->unicode have their own specialized casts */
PyArray_DTypeMeta *dtypes[2];
PyType_Slot slots[] = {
- {NPY_METH_get_loop, &string_to_string_get_loop},
+ {_NPY_METH_get_loop, &string_to_string_get_loop},
{NPY_METH_resolve_descriptors, &string_to_string_resolve_descriptors},
{0, NULL}};
PyArrayMethod_Spec spec = {
@@ -3711,7 +3712,7 @@ PyArray_InitializeVoidToVoidCast(void)
PyArray_DTypeMeta *Void = PyArray_DTypeFromTypeNum(NPY_VOID);
PyArray_DTypeMeta *dtypes[2] = {Void, Void};
PyType_Slot slots[] = {
- {NPY_METH_get_loop, &void_to_void_get_loop},
+ {_NPY_METH_get_loop, &void_to_void_get_loop},
{NPY_METH_resolve_descriptors, &void_to_void_resolve_descriptors},
{0, NULL}};
PyArrayMethod_Spec spec = {
@@ -3894,7 +3895,7 @@ PyArray_InitializeObjectToObjectCast(void)
PyArray_DTypeMeta *Object = PyArray_DTypeFromTypeNum(NPY_OBJECT);
PyArray_DTypeMeta *dtypes[2] = {Object, Object};
PyType_Slot slots[] = {
- {NPY_METH_get_loop, &object_to_object_get_loop},
+ {_NPY_METH_get_loop, &object_to_object_get_loop},
{0, NULL}};
PyArrayMethod_Spec spec = {
.name = "object_to_object_cast",
diff --git a/numpy/core/src/multiarray/convert_datatype.h b/numpy/core/src/multiarray/convert_datatype.h
index 1a23965f8..a68c669ee 100644
--- a/numpy/core/src/multiarray/convert_datatype.h
+++ b/numpy/core/src/multiarray/convert_datatype.h
@@ -82,7 +82,7 @@ PyArray_CastDescrToDType(PyArray_Descr *descr, PyArray_DTypeMeta *given_DType);
NPY_NO_EXPORT PyArray_Descr *
PyArray_FindConcatenationDescriptor(
- npy_intp n, PyArrayObject **arrays, PyObject *requested_dtype);
+ npy_intp n, PyArrayObject **arrays, PyArray_Descr *requested_dtype);
NPY_NO_EXPORT int
PyArray_AddCastingImplementation(PyBoundArrayMethodObject *meth);
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index 4f0bc7337..3c5b047c3 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -20,6 +20,8 @@
#include "common.h"
#include "ctors.h"
#include "convert_datatype.h"
+#include "descriptor.h"
+#include "dtypemeta.h"
#include "shape.h"
#include "npy_buffer.h"
#include "lowlevel_strided_loops.h"
@@ -626,8 +628,7 @@ NPY_NO_EXPORT PyObject *
PyArray_NewFromDescr_int(
PyTypeObject *subtype, PyArray_Descr *descr, int nd,
npy_intp const *dims, npy_intp const *strides, void *data,
- int flags, PyObject *obj, PyObject *base, int zeroed,
- int allow_emptystring)
+ int flags, PyObject *obj, PyObject *base, _NPY_CREATION_FLAGS cflags)
{
PyArrayObject_fields *fa;
npy_intp nbytes;
@@ -642,44 +643,54 @@ PyArray_NewFromDescr_int(
return NULL;
}
- if (descr->subarray) {
- PyObject *ret;
- npy_intp newdims[2*NPY_MAXDIMS];
- npy_intp *newstrides = NULL;
- memcpy(newdims, dims, nd*sizeof(npy_intp));
- if (strides) {
- newstrides = newdims + NPY_MAXDIMS;
- memcpy(newstrides, strides, nd*sizeof(npy_intp));
- }
- nd =_update_descr_and_dimensions(&descr, newdims,
- newstrides, nd);
- ret = PyArray_NewFromDescr_int(
- subtype, descr,
- nd, newdims, newstrides, data,
- flags, obj, base,
- zeroed, allow_emptystring);
- return ret;
- }
-
- /* Check datatype element size */
nbytes = descr->elsize;
- if (PyDataType_ISUNSIZED(descr)) {
- if (!PyDataType_ISFLEXIBLE(descr)) {
- PyErr_SetString(PyExc_TypeError, "Empty data-type");
- Py_DECREF(descr);
- return NULL;
- }
- else if (PyDataType_ISSTRING(descr) && !allow_emptystring &&
- data == NULL) {
- PyArray_DESCR_REPLACE(descr);
- if (descr == NULL) {
+ /*
+ * Unless explicitly asked not to, we do replace dtypes in some cases.
+ * This mainly means that we never create arrays with a subarray dtype
+ * (unless for internal use when requested). And neither do we create
+ * S0/U0 arrays in most cases (unless data == NULL so this is probably
+ * a view where growing the dtype would be presumably wrong).
+ */
+ if (!(cflags & _NPY_ARRAY_ENSURE_DTYPE_IDENTITY)) {
+ if (descr->subarray) {
+ PyObject *ret;
+ npy_intp newdims[2*NPY_MAXDIMS];
+ npy_intp *newstrides = NULL;
+ memcpy(newdims, dims, nd*sizeof(npy_intp));
+ if (strides) {
+ newstrides = newdims + NPY_MAXDIMS;
+ memcpy(newstrides, strides, nd*sizeof(npy_intp));
+ }
+ nd =_update_descr_and_dimensions(&descr, newdims,
+ newstrides, nd);
+ ret = PyArray_NewFromDescr_int(
+ subtype, descr,
+ nd, newdims, newstrides, data,
+ flags, obj, base, cflags);
+ return ret;
+ }
+
+ /* Check datatype element size */
+ if (PyDataType_ISUNSIZED(descr)) {
+ if (!PyDataType_ISFLEXIBLE(descr) &&
+ NPY_DT_is_legacy(NPY_DTYPE(descr))) {
+ PyErr_SetString(PyExc_TypeError, "Empty data-type");
+ Py_DECREF(descr);
return NULL;
}
- if (descr->type_num == NPY_STRING) {
- nbytes = descr->elsize = 1;
- }
- else {
- nbytes = descr->elsize = sizeof(npy_ucs4);
+ else if (PyDataType_ISSTRING(descr)
+ && !(cflags & _NPY_ARRAY_ALLOW_EMPTY_STRING)
+ && data == NULL) {
+ PyArray_DESCR_REPLACE(descr);
+ if (descr == NULL) {
+ return NULL;
+ }
+ if (descr->type_num == NPY_STRING) {
+ nbytes = descr->elsize = 1;
+ }
+ else {
+ nbytes = descr->elsize = sizeof(npy_ucs4);
+ }
}
}
}
@@ -712,6 +723,11 @@ PyArray_NewFromDescr_int(
fa->base = (PyObject *)NULL;
fa->weakreflist = (PyObject *)NULL;
+ /* needed for zero-filling logic below, defined and initialized up here
+ so cleanup logic can go in the fail block */
+ NPY_traverse_info fill_zero_info;
+ NPY_traverse_info_init(&fill_zero_info);
+
if (nd > 0) {
fa->dimensions = npy_alloc_cache_dim(2 * nd);
if (fa->dimensions == NULL) {
@@ -781,6 +797,32 @@ PyArray_NewFromDescr_int(
if (data == NULL) {
+ /* float errors do not matter and we do not release GIL */
+ NPY_ARRAYMETHOD_FLAGS zero_flags;
+ get_traverse_loop_function *get_fill_zero_loop =
+ NPY_DT_SLOTS(NPY_DTYPE(descr))->get_fill_zero_loop;
+ if (get_fill_zero_loop != NULL) {
+ if (get_fill_zero_loop(
+ NULL, descr, 1, descr->elsize, &(fill_zero_info.func),
+ &(fill_zero_info.auxdata), &zero_flags) < 0) {
+ goto fail;
+ }
+ }
+
+ /*
+ * We always want a zero-filled array allocated with calloc if
+ * NPY_NEEDS_INIT is set on the dtype, for safety. We also want a
+ * zero-filled array if zeroed is set and the zero-filling loop isn't
+ * defined, for better performance.
+ *
+ * If the zero-filling loop is defined and zeroed is set, allocate
+ * with malloc and let the zero-filling loop fill the array buffer
+ * with valid zero values for the dtype.
+ */
+ int use_calloc = (
+ PyDataType_FLAGCHK(descr, NPY_NEEDS_INIT) ||
+ ((cflags & _NPY_ARRAY_ZEROED) && (fill_zero_info.func == NULL)));
+
/* Store the handler in case the default is modified */
fa->mem_handler = PyDataMem_GetHandler();
if (fa->mem_handler == NULL) {
@@ -798,11 +840,8 @@ PyArray_NewFromDescr_int(
fa->strides[i] = 0;
}
}
- /*
- * It is bad to have uninitialized OBJECT pointers
- * which could also be sub-fields of a VOID array
- */
- if (zeroed || PyDataType_FLAGCHK(descr, NPY_NEEDS_INIT)) {
+
+ if (use_calloc) {
data = PyDataMem_UserNEW_ZEROED(nbytes, 1, fa->mem_handler);
}
else {
@@ -813,6 +852,19 @@ PyArray_NewFromDescr_int(
goto fail;
}
+ /*
+ * If the array needs special dtype-specific zero-filling logic, do that
+ */
+ if (NPY_UNLIKELY((cflags & _NPY_ARRAY_ZEROED)
+ && (fill_zero_info.func != NULL))) {
+ npy_intp size = PyArray_MultiplyList(fa->dimensions, fa->nd);
+ if (fill_zero_info.func(
+ NULL, descr, data, size, descr->elsize,
+ fill_zero_info.auxdata) < 0) {
+ goto fail;
+ }
+ }
+
fa->flags |= NPY_ARRAY_OWNDATA;
}
else {
@@ -907,9 +959,11 @@ PyArray_NewFromDescr_int(
}
}
}
+ NPY_traverse_info_xfree(&fill_zero_info);
return (PyObject *)fa;
fail:
+ NPY_traverse_info_xfree(&fill_zero_info);
Py_XDECREF(fa->mem_handler);
Py_DECREF(fa);
return NULL;
@@ -957,7 +1011,7 @@ PyArray_NewFromDescrAndBase(
{
return PyArray_NewFromDescr_int(subtype, descr, nd,
dims, strides, data,
- flags, obj, base, 0, 0);
+ flags, obj, base, 0);
}
/*
@@ -1440,167 +1494,44 @@ PyArray_GetArrayParamsFromObject(PyObject *NPY_UNUSED(op),
}
-/*
- * This function is a legacy implementation to retain subarray dtype
- * behaviour in array coercion. The behaviour here makes sense if tuples
- * of matching dimensionality are being coerced. Due to the difficulty
- * that the result is ill-defined for lists of array-likes, this is deprecated.
- *
- * WARNING: Do not use this function, it exists purely to support a deprecated
- * code path.
+/*NUMPY_API
+ * Does not check for NPY_ARRAY_ENSURECOPY and NPY_ARRAY_NOTSWAPPED in flags
+ * Steals a reference to newtype --- which can be NULL
*/
-static int
-setArrayFromSequence(PyArrayObject *a, PyObject *s,
- int dim, PyArrayObject * dst)
+NPY_NO_EXPORT PyObject *
+PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
+ int max_depth, int flags, PyObject *context)
{
- Py_ssize_t i, slen;
- int res = -1;
-
- /* first recursion, view equal destination */
- if (dst == NULL)
- dst = a;
+ npy_dtype_info dt_info = {NULL, NULL};
- /*
- * This code is to ensure that the sequence access below will
- * return a lower-dimensional sequence.
- */
+ int res = PyArray_ExtractDTypeAndDescriptor(
+ newtype, &dt_info.descr, &dt_info.dtype);
- /* INCREF on entry DECREF on exit */
- Py_INCREF(s);
-
- PyObject *seq = NULL;
-
- if (PyArray_Check(s)) {
- if (!(PyArray_CheckExact(s))) {
- /*
- * make sure a base-class array is used so that the dimensionality
- * reduction assumption is correct.
- */
- /* This will DECREF(s) if replaced */
- s = PyArray_EnsureArray(s);
- if (s == NULL) {
- goto fail;
- }
- }
-
- /* dst points to correct array subsection */
- if (PyArray_CopyInto(dst, (PyArrayObject *)s) < 0) {
- goto fail;
- }
-
- Py_DECREF(s);
- return 0;
- }
-
- if (dim > PyArray_NDIM(a)) {
- PyErr_Format(PyExc_ValueError,
- "setArrayFromSequence: sequence/array dimensions mismatch.");
- goto fail;
- }
-
- /* Try __array__ before using s as a sequence */
- PyObject *tmp = _array_from_array_like(s, NULL, 0, NULL, 0);
- if (tmp == NULL) {
- goto fail;
- }
- else if (tmp == Py_NotImplemented) {
- Py_DECREF(tmp);
- }
- else {
- int r = PyArray_CopyInto(dst, (PyArrayObject *)tmp);
- Py_DECREF(tmp);
- if (r < 0) {
- goto fail;
- }
- Py_DECREF(s);
- return 0;
- }
-
- seq = PySequence_Fast(s, "Could not convert object to sequence");
- if (seq == NULL) {
- goto fail;
- }
- slen = PySequence_Fast_GET_SIZE(seq);
-
- /*
- * Either the dimensions match, or the sequence has length 1 and can
- * be broadcast to the destination.
- */
- if (slen != PyArray_DIMS(a)[dim] && slen != 1) {
- PyErr_Format(PyExc_ValueError,
- "cannot copy sequence with size %zd to array axis "
- "with dimension %" NPY_INTP_FMT, slen, PyArray_DIMS(a)[dim]);
- goto fail;
- }
-
- /* Broadcast the one element from the sequence to all the outputs */
- if (slen == 1) {
- PyObject *o = PySequence_Fast_GET_ITEM(seq, 0);
- npy_intp alen = PyArray_DIM(a, dim);
-
- for (i = 0; i < alen; i++) {
- if ((PyArray_NDIM(a) - dim) > 1) {
- PyArrayObject * tmp =
- (PyArrayObject *)array_item_asarray(dst, i);
- if (tmp == NULL) {
- goto fail;
- }
-
- res = setArrayFromSequence(a, o, dim+1, tmp);
- Py_DECREF(tmp);
- }
- else {
- char * b = (PyArray_BYTES(dst) + i * PyArray_STRIDES(dst)[0]);
- res = PyArray_SETITEM(dst, b, o);
- }
- if (res < 0) {
- goto fail;
- }
- }
- }
- /* Copy element by element */
- else {
- for (i = 0; i < slen; i++) {
- PyObject * o = PySequence_Fast_GET_ITEM(seq, i);
- if ((PyArray_NDIM(a) - dim) > 1) {
- PyArrayObject * tmp =
- (PyArrayObject *)array_item_asarray(dst, i);
- if (tmp == NULL) {
- goto fail;
- }
+ Py_XDECREF(newtype);
- res = setArrayFromSequence(a, o, dim+1, tmp);
- Py_DECREF(tmp);
- }
- else {
- char * b = (PyArray_BYTES(dst) + i * PyArray_STRIDES(dst)[0]);
- res = PyArray_SETITEM(dst, b, o);
- }
- if (res < 0) {
- goto fail;
- }
- }
+ if (res < 0) {
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
+ return NULL;
}
- Py_DECREF(seq);
- Py_DECREF(s);
- return 0;
+ PyObject* ret = PyArray_FromAny_int(op, dt_info.descr, dt_info.dtype,
+ min_depth, max_depth, flags, context);
- fail:
- Py_XDECREF(seq);
- Py_DECREF(s);
- return res;
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
+ return ret;
}
-
-
-/*NUMPY_API
- * Does not check for NPY_ARRAY_ENSURECOPY and NPY_ARRAY_NOTSWAPPED in flags
- * Steals a reference to newtype --- which can be NULL
+/*
+ * Internal version of PyArray_FromAny that accepts a dtypemeta. Borrows
+ * references to the descriptor and dtype.
*/
+
NPY_NO_EXPORT PyObject *
-PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
- int max_depth, int flags, PyObject *context)
+PyArray_FromAny_int(PyObject *op, PyArray_Descr *in_descr,
+ PyArray_DTypeMeta *in_DType, int min_depth, int max_depth,
+ int flags, PyObject *context)
{
/*
* This is the main code to make a NumPy array from a Python
@@ -1617,89 +1548,14 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
return NULL;
}
- PyArray_Descr *fixed_descriptor;
- PyArray_DTypeMeta *fixed_DType;
- if (PyArray_ExtractDTypeAndDescriptor((PyObject *)newtype,
- &fixed_descriptor, &fixed_DType) < 0) {
- Py_XDECREF(newtype);
- return NULL;
- }
- Py_XDECREF(newtype);
-
ndim = PyArray_DiscoverDTypeAndShape(op,
- NPY_MAXDIMS, dims, &cache, fixed_DType, fixed_descriptor, &dtype,
+ NPY_MAXDIMS, dims, &cache, in_DType, in_descr, &dtype,
flags & NPY_ARRAY_ENSURENOCOPY);
- Py_XDECREF(fixed_descriptor);
- Py_XDECREF(fixed_DType);
if (ndim < 0) {
return NULL;
}
- if (NPY_UNLIKELY(fixed_descriptor != NULL && PyDataType_HASSUBARRAY(dtype))) {
- /*
- * When a subarray dtype was passed in, its dimensions are appended
- * to the array dimension (causing a dimension mismatch).
- * There is a problem with that, because if we coerce from non-arrays
- * we do this correctly by element (as defined by tuples), but for
- * arrays we first append the dimensions and then assign to the base
- * dtype and then assign which causes the problem.
- *
- * Thus, we check if there is an array included, in that case we
- * give a FutureWarning.
- * When the warning is removed, PyArray_Pack will have to ensure
- * that it does not append the dimensions when creating the subarrays
- * to assign `arr[0] = obj[0]`.
- */
- int includes_array = 0;
- if (cache != NULL) {
- /* This is not ideal, but it is a pretty special case */
- coercion_cache_obj *next = cache;
- while (next != NULL) {
- if (!next->sequence) {
- includes_array = 1;
- break;
- }
- next = next->next;
- }
- }
- if (includes_array) {
- npy_free_coercion_cache(cache);
-
- ret = (PyArrayObject *) PyArray_NewFromDescr(
- &PyArray_Type, dtype, ndim, dims, NULL, NULL,
- flags & NPY_ARRAY_F_CONTIGUOUS, NULL);
- if (ret == NULL) {
- return NULL;
- }
- assert(PyArray_NDIM(ret) != ndim);
-
- /* NumPy 1.20, 2020-10-01 */
- if (DEPRECATE_FUTUREWARNING(
- "creating an array with a subarray dtype will behave "
- "differently when the `np.array()` (or `asarray`, etc.) "
- "call includes an array or array object.\n"
- "If you are converting a single array or a list of arrays,"
- "you can opt-in to the future behaviour using:\n"
- " np.array(arr, dtype=np.dtype(['f', dtype]))['f']\n"
- " np.array([arr1, arr2], dtype=np.dtype(['f', dtype]))['f']\n"
- "\n"
- "By including a new field and indexing it after the "
- "conversion.\n"
- "This may lead to a different result or to current failures "
- "succeeding. (FutureWarning since NumPy 1.20)") < 0) {
- Py_DECREF(ret);
- return NULL;
- }
-
- if (setArrayFromSequence(ret, op, 0, NULL) < 0) {
- Py_DECREF(ret);
- return NULL;
- }
- return (PyObject *)ret;
- }
- }
-
if (dtype == NULL) {
dtype = PyArray_DescrFromType(NPY_DEFAULT_TYPE);
}
@@ -1734,7 +1590,7 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
}
else if (cache == NULL && PyArray_IsScalar(op, Void) &&
!(((PyVoidScalarObject *)op)->flags & NPY_ARRAY_OWNDATA) &&
- newtype == NULL) {
+ ((in_descr == NULL) && (in_DType == NULL))) {
/*
* Special case, we return a *view* into void scalars, mainly to
* allow things similar to the "reversed" assignment:
@@ -1765,7 +1621,7 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
return NULL;
}
- if (cache == NULL && newtype != NULL &&
+ if (cache == NULL && in_descr != NULL &&
PyDataType_ISSIGNED(dtype) &&
PyArray_IsScalar(op, Generic)) {
assert(ndim == 0);
@@ -1896,7 +1752,6 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
* NPY_ARRAY_FORCECAST will cause a cast to occur regardless of whether or not
* it is safe.
*
- * context is passed through to PyArray_GetArrayParamsFromObject
*/
/*NUMPY_API
@@ -1906,24 +1761,57 @@ NPY_NO_EXPORT PyObject *
PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int min_depth,
int max_depth, int requires, PyObject *context)
{
+ npy_dtype_info dt_info = {NULL, NULL};
+
+ int res = PyArray_ExtractDTypeAndDescriptor(
+ descr, &dt_info.descr, &dt_info.dtype);
+
+ Py_XDECREF(descr);
+
+ if (res < 0) {
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
+ return NULL;
+ }
+
+ PyObject* ret = PyArray_CheckFromAny_int(
+ op, dt_info.descr, dt_info.dtype, min_depth, max_depth, requires,
+ context);
+
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
+ return ret;
+}
+
+/*
+ * Internal version of PyArray_CheckFromAny that accepts a dtypemeta. Borrows
+ * references to the descriptor and dtype.
+ */
+
+NPY_NO_EXPORT PyObject *
+PyArray_CheckFromAny_int(PyObject *op, PyArray_Descr *in_descr,
+ PyArray_DTypeMeta *in_DType, int min_depth,
+ int max_depth, int requires, PyObject *context)
+{
PyObject *obj;
if (requires & NPY_ARRAY_NOTSWAPPED) {
- if (!descr && PyArray_Check(op) &&
+ if (!in_descr && PyArray_Check(op) &&
PyArray_ISBYTESWAPPED((PyArrayObject* )op)) {
- descr = PyArray_DescrNew(PyArray_DESCR((PyArrayObject *)op));
- if (descr == NULL) {
+ in_descr = PyArray_DescrNew(PyArray_DESCR((PyArrayObject *)op));
+ if (in_descr == NULL) {
return NULL;
}
}
- else if (descr && !PyArray_ISNBO(descr->byteorder)) {
- PyArray_DESCR_REPLACE(descr);
+ else if (in_descr && !PyArray_ISNBO(in_descr->byteorder)) {
+ PyArray_DESCR_REPLACE(in_descr);
}
- if (descr && descr->byteorder != NPY_IGNORE) {
- descr->byteorder = NPY_NATIVE;
+ if (in_descr && in_descr->byteorder != NPY_IGNORE) {
+ in_descr->byteorder = NPY_NATIVE;
}
}
- obj = PyArray_FromAny(op, descr, min_depth, max_depth, requires, context);
+ obj = PyArray_FromAny_int(op, in_descr, in_DType, min_depth,
+ max_depth, requires, context);
if (obj == NULL) {
return NULL;
}
@@ -2031,13 +1919,26 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags)
if ((flags & NPY_ARRAY_ENSUREARRAY)) {
subok = 0;
}
+ Py_INCREF(newtype);
ret = (PyArrayObject *)PyArray_NewLikeArray(arr, order,
newtype, subok);
if (ret == NULL) {
+ Py_DECREF(newtype);
return NULL;
}
- if (PyArray_CopyInto(ret, arr) < 0) {
+ int actual_ndim = PyArray_NDIM(ret);
+ PyArray_Descr *actual_dtype = PyArray_DESCR(ret);
+ ((PyArrayObject_fields *)ret)->nd = PyArray_NDIM(arr);
+ ((PyArrayObject_fields *)ret)->descr = newtype;
+
+ int success = PyArray_CopyInto(ret, arr);
+
+ Py_DECREF(newtype);
+ ((PyArrayObject_fields *)ret)->nd = actual_ndim;
+ ((PyArrayObject_fields *)ret)->descr = actual_dtype;
+
+ if (success < 0) {
Py_DECREF(ret);
return NULL;
}
@@ -2832,7 +2733,7 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
}
if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
- int fpes = npy_get_floatstatus_barrier((char *)src_iter);
+ int fpes = npy_get_floatstatus_barrier((char *)&src_iter);
if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) {
return -1;
}
@@ -2956,23 +2857,13 @@ PyArray_Zeros(int nd, npy_intp const *dims, PyArray_Descr *type, int is_f_order)
&PyArray_Type, type,
nd, dims, NULL, NULL,
is_f_order, NULL, NULL,
- 1, 0);
+ _NPY_ARRAY_ZEROED);
if (ret == NULL) {
return NULL;
}
- /* handle objects */
- if (PyDataType_REFCHK(PyArray_DESCR(ret))) {
- if (_zerofill(ret) < 0) {
- Py_DECREF(ret);
- return NULL;
- }
- }
-
-
return (PyObject *)ret;
-
}
/*NUMPY_API
@@ -3620,7 +3511,7 @@ PyArray_FromFile(FILE *fp, PyArray_Descr *dtype, npy_intp num, char *sep)
&PyArray_Type, dtype,
1, &num, NULL, NULL,
0, NULL, NULL,
- 0, 1);
+ _NPY_ARRAY_ALLOW_EMPTY_STRING);
}
if ((sep == NULL) || (strlen(sep) == 0)) {
ret = array_fromfile_binary(fp, dtype, num, &nread);
diff --git a/numpy/core/src/multiarray/ctors.h b/numpy/core/src/multiarray/ctors.h
index 98160b1cc..a876ab2c0 100644
--- a/numpy/core/src/multiarray/ctors.h
+++ b/numpy/core/src/multiarray/ctors.h
@@ -13,12 +13,29 @@ PyArray_NewFromDescrAndBase(
npy_intp const *dims, npy_intp const *strides, void *data,
int flags, PyObject *obj, PyObject *base);
+
+/* Private options for NewFromDescriptor */
+typedef enum {
+ /*
+     * Indicate that the array should be zeroed; we may use calloc to do so
+     * (otherwise the allocation is left uninitialized).
+ */
+ _NPY_ARRAY_ZEROED = 1 << 0,
+ /* Whether to allow empty strings (implied by ensure dtype identity) */
+ _NPY_ARRAY_ALLOW_EMPTY_STRING = 1 << 1,
+ /*
+ * If we take a view into an existing array and use its dtype, then that
+ * dtype must be preserved (for example for subarray and S0, but also
+ * possibly for future dtypes that store more metadata).
+ */
+ _NPY_ARRAY_ENSURE_DTYPE_IDENTITY = 1 << 2,
+} _NPY_CREATION_FLAGS;
+
NPY_NO_EXPORT PyObject *
PyArray_NewFromDescr_int(
PyTypeObject *subtype, PyArray_Descr *descr, int nd,
npy_intp const *dims, npy_intp const *strides, void *data,
- int flags, PyObject *obj, PyObject *base, int zeroed,
- int allow_emptystring);
+ int flags, PyObject *obj, PyObject *base, _NPY_CREATION_FLAGS cflags);
NPY_NO_EXPORT PyObject *
PyArray_NewLikeArrayWithShape(
@@ -36,10 +53,20 @@ _array_from_array_like(PyObject *op,
int never_copy);
NPY_NO_EXPORT PyObject *
+PyArray_FromAny_int(PyObject *op, PyArray_Descr *in_descr,
+ PyArray_DTypeMeta *in_DType, int min_depth, int max_depth,
+ int flags, PyObject *context);
+
+NPY_NO_EXPORT PyObject *
PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
int max_depth, int flags, PyObject *context);
NPY_NO_EXPORT PyObject *
+PyArray_CheckFromAny_int(PyObject *op, PyArray_Descr *in_descr,
+ PyArray_DTypeMeta *in_DType, int min_depth,
+ int max_depth, int requires, PyObject *context);
+
+NPY_NO_EXPORT PyObject *
PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int min_depth,
int max_depth, int requires, PyObject *context);
diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c
index 695b696c2..820f40fd8 100644
--- a/numpy/core/src/multiarray/datetime.c
+++ b/numpy/core/src/multiarray/datetime.c
@@ -4100,7 +4100,7 @@ PyArray_InitializeDatetimeCasts()
};
slots[0].slot = NPY_METH_resolve_descriptors;
slots[0].pfunc = &time_to_time_resolve_descriptors;
- slots[1].slot = NPY_METH_get_loop;
+ slots[1].slot = _NPY_METH_get_loop;
slots[1].pfunc = &time_to_time_get_loop;
slots[2].slot = 0;
slots[2].pfunc = NULL;
@@ -4130,7 +4130,7 @@ PyArray_InitializeDatetimeCasts()
slots[0].slot = NPY_METH_resolve_descriptors;
slots[0].pfunc = &datetime_to_timedelta_resolve_descriptors;
- slots[1].slot = NPY_METH_get_loop;
+ slots[1].slot = _NPY_METH_get_loop;
slots[1].pfunc = &legacy_cast_get_strided_loop;
slots[2].slot = 0;
slots[2].pfunc = NULL;
@@ -4203,7 +4203,7 @@ PyArray_InitializeDatetimeCasts()
slots[0].slot = NPY_METH_resolve_descriptors;
slots[0].pfunc = &time_to_string_resolve_descriptors;
/* Strided loop differs for the two */
- slots[1].slot = NPY_METH_get_loop;
+ slots[1].slot = _NPY_METH_get_loop;
slots[2].slot = 0;
slots[2].pfunc = NULL;
@@ -4252,7 +4252,7 @@ PyArray_InitializeDatetimeCasts()
/* The default type resolution should work fine. */
slots[0].slot = NPY_METH_resolve_descriptors;
slots[0].pfunc = &string_to_datetime_cast_resolve_descriptors;
- slots[1].slot = NPY_METH_get_loop;
+ slots[1].slot = _NPY_METH_get_loop;
slots[1].pfunc = &string_to_datetime_cast_get_loop;
slots[2].slot = 0;
slots[2].pfunc = NULL;
diff --git a/numpy/core/src/multiarray/datetime_busday.c b/numpy/core/src/multiarray/datetime_busday.c
index d3e9e1451..93ed0972e 100644
--- a/numpy/core/src/multiarray/datetime_busday.c
+++ b/numpy/core/src/multiarray/datetime_busday.c
@@ -365,6 +365,7 @@ apply_business_day_count(npy_datetime date_begin, npy_datetime date_end,
npy_datetime *holidays_begin, npy_datetime *holidays_end)
{
npy_int64 count, whole_weeks;
+
int day_of_week = 0;
int swapped = 0;
@@ -386,6 +387,10 @@ apply_business_day_count(npy_datetime date_begin, npy_datetime date_end,
date_begin = date_end;
date_end = tmp;
swapped = 1;
+ // we swapped date_begin and date_end, so we need to correct for the
+ // original date_end that should not be included. gh-23197
+ date_begin++;
+ date_end++;
}
/* Remove any earlier holidays */
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index d1973442a..68d398d64 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -74,9 +74,6 @@ _try_convert_from_ctypes_type(PyTypeObject *type)
return (PyArray_Descr *)res;
}
-static PyArray_Descr *
-_convert_from_any(PyObject *obj, int align);
-
/*
* This function creates a dtype object when the object has a "dtype" attribute,
* and it can be converted to a dtype object.
@@ -1393,6 +1390,141 @@ PyArray_DescrConverter2(PyObject *obj, PyArray_Descr **at)
}
}
+
+/**
+ * Check whether the descriptor is a legacy "flexible" DType instance; this is
+ * an instance which is (normally) not attached to an array, such as a string
+ * of length 0 or a datetime with no unit.
+ * These should be largely deprecated, and represent only the DType class
+ * for most `dtype` parameters.
+ *
+ * TODO: This function should eventually receive a deprecation warning and
+ * be removed.
+ *
+ * @param descr
+ * @return 1 if this is not a concrete dtype instance, 0 otherwise
+ */
+static int
+descr_is_legacy_parametric_instance(PyArray_Descr *descr,
+ PyArray_DTypeMeta *DType)
+{
+ if (!NPY_DT_is_legacy(DType)) {
+ return 0;
+ }
+
+ if (PyDataType_ISUNSIZED(descr)) {
+ return 1;
+ }
+ /* Flexible descr with generic time unit (which can be adapted) */
+ if (PyDataType_ISDATETIME(descr)) {
+ PyArray_DatetimeMetaData *meta;
+ meta = get_datetime_metadata_from_dtype(descr);
+ if (meta->base == NPY_FR_GENERIC) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+
+/**
+ * Given a descriptor (dtype instance), handles conversion of legacy flexible
+ * "unsized" descriptors to their DType. It returns the DType and descriptor
+ * both results can be NULL (if the input is). But it always sets the DType
+ * when a descriptor is set.
+ *
+ * @param dtype
+ * @param out_descr
+ * @param out_DType
+ * @return 0 on success -1 on failure
+ */
+NPY_NO_EXPORT int
+PyArray_ExtractDTypeAndDescriptor(PyArray_Descr *dtype,
+ PyArray_Descr **out_descr, PyArray_DTypeMeta **out_DType)
+{
+ *out_DType = NULL;
+ *out_descr = NULL;
+
+ if (dtype != NULL) {
+ *out_DType = NPY_DTYPE(dtype);
+ Py_INCREF(*out_DType);
+ if (!descr_is_legacy_parametric_instance((PyArray_Descr *)dtype,
+ *out_DType)) {
+ *out_descr = (PyArray_Descr *)dtype;
+ Py_INCREF(*out_descr);
+ }
+ }
+ return 0;
+}
+
+
+/**
+ * Converter function filling in an npy_dtype_info struct on success.
+ *
+ * @param obj representing a dtype instance (descriptor) or DType class.
+ * @param[out] npy_dtype_info filled with the DType class and dtype/descriptor
+ * instance. The class is always set while the instance may be NULL.
+ * On error, both will be NULL.
+ * @return 0 on failure and 1 on success (as a converter)
+ */
+NPY_NO_EXPORT int
+PyArray_DTypeOrDescrConverterRequired(PyObject *obj, npy_dtype_info *dt_info)
+{
+ /*
+ * Allow dtype classes to pass; this could also be generalized to at least
+ * some scalar types (right now most of these give instances or)
+ */
+ dt_info->dtype = NULL;
+ dt_info->descr = NULL;
+
+ if (PyObject_TypeCheck(obj, &PyArrayDTypeMeta_Type)) {
+ Py_INCREF(obj);
+ dt_info->dtype = (PyArray_DTypeMeta *)obj;
+ dt_info->descr = NULL;
+ return NPY_SUCCEED;
+ }
+ PyArray_Descr *descr;
+ if (PyArray_DescrConverter(obj, &descr) != NPY_SUCCEED) {
+ return NPY_FAIL;
+ }
+ /*
+ * The above converts e.g. "S" or "S0" to the prototype instance, we make
+ * it behave the same as the DType. This is not fully correct, "S0" should
+ * be considered an instance with actual 0 length.
+ * TODO: It would be nice to fix that eventually.
+ */
+ int res = PyArray_ExtractDTypeAndDescriptor(
+ descr, &dt_info->descr, &dt_info->dtype);
+ Py_DECREF(descr);
+ if (res < 0) {
+ return NPY_FAIL;
+ }
+ return NPY_SUCCEED;
+}
+
+
+/**
+ * Converter function filling in an npy_dtype_info struct on success. It
+ * accepts `None` and does nothing in that case (user must initialize to
+ * NULL anyway).
+ *
+ * @param obj None or obj representing a dtype instance (descr) or DType class.
+ * @param[out] npy_dtype_info filled with the DType class and dtype/descriptor
+ * instance. If `obj` is None, is not modified. Otherwise the class
+ * is always set while the instance may be NULL.
+ * On error, both will be NULL.
+ * @return 0 on failure and 1 on success (as a converter)
+ */
+NPY_NO_EXPORT int
+PyArray_DTypeOrDescrConverterOptional(PyObject *obj, npy_dtype_info *dt_info)
+{
+ if (obj == Py_None) {
+ /* caller must have initialized for the optional version */
+ return NPY_SUCCEED;
+ }
+ return PyArray_DTypeOrDescrConverterRequired(obj, dt_info);
+}
+
/**
* Get a dtype instance from a python type
*/
@@ -2315,7 +2447,6 @@ arraydescr_new(PyTypeObject *subtype,
PyErr_NoMemory();
return NULL;
}
- PyObject_Init((PyObject *)descr, subtype);
descr->f = &NPY_DT_SLOTS(DType)->f;
Py_XINCREF(DType->scalar_type);
descr->typeobj = DType->scalar_type;
@@ -3141,25 +3272,15 @@ arraydescr_newbyteorder(PyArray_Descr *self, PyObject *args)
static PyObject *
arraydescr_class_getitem(PyObject *cls, PyObject *args)
{
- PyObject *generic_alias;
+ const Py_ssize_t args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1;
-#ifdef Py_GENERICALIASOBJECT_H
- Py_ssize_t args_len;
-
- args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1;
if (args_len != 1) {
return PyErr_Format(PyExc_TypeError,
"Too %s arguments for %s",
args_len > 1 ? "many" : "few",
((PyTypeObject *)cls)->tp_name);
}
- generic_alias = Py_GenericAlias(cls, args);
-#else
- PyErr_SetString(PyExc_TypeError,
- "Type subscription requires python >= 3.9");
- generic_alias = NULL;
-#endif
- return generic_alias;
+ return Py_GenericAlias(cls, args);
}
static PyMethodDef arraydescr_methods[] = {
diff --git a/numpy/core/src/multiarray/descriptor.h b/numpy/core/src/multiarray/descriptor.h
index 7e6f212f2..b14edc3aa 100644
--- a/numpy/core/src/multiarray/descriptor.h
+++ b/numpy/core/src/multiarray/descriptor.h
@@ -1,6 +1,29 @@
#ifndef NUMPY_CORE_SRC_MULTIARRAY_DESCRIPTOR_H_
#define NUMPY_CORE_SRC_MULTIARRAY_DESCRIPTOR_H_
+
+/*
+ * In some API calls we wish to allow users to pass a DType class or a
+ * dtype instances with different meanings.
+ * This struct is mainly used for the argument parsing in
+ * `PyArray_DTypeOrDescrConverter`.
+ */
+typedef struct {
+ PyArray_DTypeMeta *dtype;
+ PyArray_Descr *descr;
+} npy_dtype_info;
+
+
+NPY_NO_EXPORT int
+PyArray_DTypeOrDescrConverterOptional(PyObject *, npy_dtype_info *dt_info);
+
+NPY_NO_EXPORT int
+PyArray_DTypeOrDescrConverterRequired(PyObject *, npy_dtype_info *dt_info);
+
+NPY_NO_EXPORT int
+PyArray_ExtractDTypeAndDescriptor(PyArray_Descr *dtype,
+ PyArray_Descr **out_descr, PyArray_DTypeMeta **out_DType);
+
NPY_NO_EXPORT PyObject *arraydescr_protocol_typestr_get(
PyArray_Descr *, void *);
NPY_NO_EXPORT PyObject *arraydescr_protocol_descr_get(
diff --git a/numpy/core/src/multiarray/dlpack.c b/numpy/core/src/multiarray/dlpack.c
index 1c6eb58f2..d26701df8 100644
--- a/numpy/core/src/multiarray/dlpack.c
+++ b/numpy/core/src/multiarray/dlpack.c
@@ -171,7 +171,10 @@ array_dlpack(PyArrayObject *self,
managed_dtype.bits = 8 * itemsize;
managed_dtype.lanes = 1;
- if (PyDataType_ISSIGNED(dtype)) {
+ if (PyDataType_ISBOOL(dtype)) {
+ managed_dtype.code = kDLBool;
+ }
+ else if (PyDataType_ISSIGNED(dtype)) {
managed_dtype.code = kDLInt;
}
else if (PyDataType_ISUNSIGNED(dtype)) {
@@ -331,6 +334,11 @@ from_dlpack(PyObject *NPY_UNUSED(self), PyObject *obj) {
const uint8_t bits = managed->dl_tensor.dtype.bits;
const npy_intp itemsize = bits / 8;
switch (managed->dl_tensor.dtype.code) {
+ case kDLBool:
+ if (bits == 8) {
+ typenum = NPY_BOOL;
+ }
+ break;
case kDLInt:
switch (bits)
{
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
index 549300f73..f79662040 100644
--- a/numpy/core/src/multiarray/dtype_transfer.c
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -32,6 +32,7 @@
#include "shape.h"
#include "dtype_transfer.h"
+#include "dtype_traversal.h"
#include "alloc.h"
#include "dtypemeta.h"
#include "array_method.h"
@@ -73,18 +74,6 @@ _safe_print(PyObject *obj)
}
#endif
-/*
- * Returns a transfer function which DECREFs any references in src_type.
- *
- * Returns NPY_SUCCEED or NPY_FAIL.
- */
-static int
-get_decref_transfer_function(int aligned,
- npy_intp src_stride,
- PyArray_Descr *src_dtype,
- NPY_cast_info *cast_info,
- int *out_needs_api);
-
/*************************** COPY REFERENCES *******************************/
@@ -159,7 +148,7 @@ typedef struct {
NpyAuxData base;
PyArray_GetItemFunc *getitem;
PyArrayObject_fields arr_fields;
- NPY_cast_info decref_src;
+ NPY_traverse_info decref_src;
} _any_to_object_auxdata;
@@ -169,7 +158,7 @@ _any_to_object_auxdata_free(NpyAuxData *auxdata)
_any_to_object_auxdata *data = (_any_to_object_auxdata *)auxdata;
Py_DECREF(data->arr_fields.descr);
- NPY_cast_info_xfree(&data->decref_src);
+ NPY_traverse_info_xfree(&data->decref_src);
PyMem_Free(data);
}
@@ -187,7 +176,7 @@ _any_to_object_auxdata_clone(NpyAuxData *auxdata)
Py_INCREF(res->arr_fields.descr);
if (data->decref_src.func != NULL) {
- if (NPY_cast_info_copy(&res->decref_src, &data->decref_src) < 0) {
+ if (NPY_traverse_info_copy(&res->decref_src, &data->decref_src) < 0) {
NPY_AUXDATA_FREE((NpyAuxData *)res);
return NULL;
}
@@ -228,8 +217,8 @@ _strided_to_strided_any_to_object(
}
if (data->decref_src.func != NULL) {
/* If necessary, clear the input buffer (`move_references`) */
- if (data->decref_src.func(&data->decref_src.context,
- &orig_src, &N, &src_stride, data->decref_src.auxdata) < 0) {
+ if (data->decref_src.func(NULL, data->decref_src.descr,
+ orig_src, N, src_stride, data->decref_src.auxdata) < 0) {
return -1;
}
}
@@ -265,18 +254,18 @@ any_to_object_get_loop(
data->arr_fields.nd = 0;
data->getitem = context->descriptors[0]->f->getitem;
- NPY_cast_info_init(&data->decref_src);
+ NPY_traverse_info_init(&data->decref_src);
if (move_references && PyDataType_REFCHK(context->descriptors[0])) {
- int needs_api;
- if (get_decref_transfer_function(
+ NPY_ARRAYMETHOD_FLAGS clear_flags;
+ if (PyArray_GetClearFunction(
aligned, strides[0], context->descriptors[0],
- &data->decref_src,
- &needs_api) == NPY_FAIL) {
+ &data->decref_src, &clear_flags) < 0) {
NPY_AUXDATA_FREE(*out_transferdata);
*out_transferdata = NULL;
return -1;
}
+ *flags = PyArrayMethod_COMBINED_FLAGS(*flags, clear_flags);
}
return 0;
}
@@ -578,7 +567,7 @@ wrap_copy_swap_function(
&PyArray_Type, dtype,
1, &shape, NULL, NULL,
0, NULL, NULL,
- 0, 1);
+ _NPY_ARRAY_ENSURE_DTYPE_IDENTITY);
if (data->arr == NULL) {
PyMem_Free(data);
return NPY_FAIL;
@@ -1327,7 +1316,7 @@ get_legacy_dtype_cast_function(
&PyArray_Type, tmp_dtype,
1, &shape, NULL, NULL,
0, NULL, NULL,
- 0, 1);
+ _NPY_ARRAY_ENSURE_DTYPE_IDENTITY);
if (data->aip == NULL) {
PyMem_Free(data);
return NPY_FAIL;
@@ -1354,7 +1343,7 @@ get_legacy_dtype_cast_function(
&PyArray_Type, tmp_dtype,
1, &shape, NULL, NULL,
0, NULL, NULL,
- 0, 1);
+ _NPY_ARRAY_ENSURE_DTYPE_IDENTITY);
if (data->aop == NULL) {
Py_DECREF(data->aip);
PyMem_Free(data);
@@ -1393,7 +1382,7 @@ typedef struct {
npy_intp N;
NPY_cast_info wrapped;
/* If finish->func is non-NULL the source needs a decref */
- NPY_cast_info decref_src;
+ NPY_traverse_info decref_src;
} _one_to_n_data;
/* transfer data free function */
@@ -1401,7 +1390,7 @@ static void _one_to_n_data_free(NpyAuxData *data)
{
_one_to_n_data *d = (_one_to_n_data *)data;
NPY_cast_info_xfree(&d->wrapped);
- NPY_cast_info_xfree(&d->decref_src);
+ NPY_traverse_info_xfree(&d->decref_src);
PyMem_Free(data);
}
@@ -1420,7 +1409,7 @@ static NpyAuxData *_one_to_n_data_clone(NpyAuxData *data)
newdata->base.clone = &_one_to_n_data_clone;
newdata->N = d->N;
/* Initialize in case of error, or if it is unused */
- NPY_cast_info_init(&newdata->decref_src);
+ NPY_traverse_info_init(&newdata->decref_src);
if (NPY_cast_info_copy(&newdata->wrapped, &d->wrapped) < 0) {
_one_to_n_data_free((NpyAuxData *)newdata);
@@ -1430,7 +1419,7 @@ static NpyAuxData *_one_to_n_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
- if (NPY_cast_info_copy(&newdata->decref_src, &d->decref_src) < 0) {
+ if (NPY_traverse_info_copy(&newdata->decref_src, &d->decref_src) < 0) {
_one_to_n_data_free((NpyAuxData *)newdata);
return NULL;
}
@@ -1490,8 +1479,8 @@ _strided_to_strided_one_to_n_with_finish(
return -1;
}
- if (d->decref_src.func(&d->decref_src.context,
- &src, &one_item, &zero_stride, d->decref_src.auxdata) < 0) {
+ if (d->decref_src.func(NULL, d->decref_src.descr,
+ src, one_item, zero_stride, d->decref_src.auxdata) < 0) {
return -1;
}
@@ -1522,7 +1511,7 @@ get_one_to_n_transfer_function(int aligned,
data->base.free = &_one_to_n_data_free;
data->base.clone = &_one_to_n_data_clone;
data->N = N;
- NPY_cast_info_init(&data->decref_src); /* In case of error */
+ NPY_traverse_info_init(&data->decref_src); /* In case of error */
/*
* move_references is set to 0, handled in the wrapping transfer fn,
@@ -1542,15 +1531,14 @@ get_one_to_n_transfer_function(int aligned,
/* If the src object will need a DECREF, set src_dtype */
if (move_references && PyDataType_REFCHK(src_dtype)) {
- *out_flags |= NPY_METH_REQUIRES_PYAPI;
- if (get_decref_transfer_function(aligned,
- src_stride,
- src_dtype,
- &data->decref_src,
- NULL) != NPY_SUCCEED) {
+ NPY_ARRAYMETHOD_FLAGS clear_flags;
+ if (PyArray_GetClearFunction(
+ aligned, src_stride, src_dtype,
+ &data->decref_src, &clear_flags) < 0) {
NPY_AUXDATA_FREE((NpyAuxData *)data);
return NPY_FAIL;
}
+ *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, clear_flags);
}
if (data->decref_src.func == NULL) {
@@ -1741,8 +1729,8 @@ typedef struct {
typedef struct {
NpyAuxData base;
NPY_cast_info wrapped;
- NPY_cast_info decref_src;
- NPY_cast_info decref_dst; /* The use-case should probably be deprecated */
+ NPY_traverse_info decref_src;
+ NPY_traverse_info decref_dst; /* The use-case should probably be deprecated */
npy_intp src_N, dst_N;
/* This gets a run-length encoded representation of the transfer */
npy_intp run_count;
@@ -1755,8 +1743,8 @@ static void _subarray_broadcast_data_free(NpyAuxData *data)
{
_subarray_broadcast_data *d = (_subarray_broadcast_data *)data;
NPY_cast_info_xfree(&d->wrapped);
- NPY_cast_info_xfree(&d->decref_src);
- NPY_cast_info_xfree(&d->decref_dst);
+ NPY_traverse_info_xfree(&d->decref_src);
+ NPY_traverse_info_xfree(&d->decref_dst);
PyMem_Free(data);
}
@@ -1780,21 +1768,21 @@ static NpyAuxData *_subarray_broadcast_data_clone(NpyAuxData *data)
newdata->run_count = d->run_count;
memcpy(newdata->offsetruns, d->offsetruns, offsetruns_size);
- NPY_cast_info_init(&newdata->decref_src);
- NPY_cast_info_init(&newdata->decref_dst);
+ NPY_traverse_info_init(&newdata->decref_src);
+ NPY_traverse_info_init(&newdata->decref_dst);
if (NPY_cast_info_copy(&newdata->wrapped, &d->wrapped) < 0) {
_subarray_broadcast_data_free((NpyAuxData *)newdata);
return NULL;
}
if (d->decref_src.func != NULL) {
- if (NPY_cast_info_copy(&newdata->decref_src, &d->decref_src) < 0) {
+ if (NPY_traverse_info_copy(&newdata->decref_src, &d->decref_src) < 0) {
_subarray_broadcast_data_free((NpyAuxData *) newdata);
return NULL;
}
}
if (d->decref_dst.func != NULL) {
- if (NPY_cast_info_copy(&newdata->decref_dst, &d->decref_dst) < 0) {
+ if (NPY_traverse_info_copy(&newdata->decref_dst, &d->decref_dst) < 0) {
_subarray_broadcast_data_free((NpyAuxData *) newdata);
return NULL;
}
@@ -1883,8 +1871,8 @@ _strided_to_strided_subarray_broadcast_withrefs(
}
else {
if (d->decref_dst.func != NULL) {
- if (d->decref_dst.func(&d->decref_dst.context,
- &dst_ptr, &count, &dst_subitemsize,
+ if (d->decref_dst.func(NULL, d->decref_dst.descr,
+ dst_ptr, count, dst_subitemsize,
d->decref_dst.auxdata) < 0) {
return -1;
}
@@ -1895,8 +1883,8 @@ _strided_to_strided_subarray_broadcast_withrefs(
}
if (d->decref_src.func != NULL) {
- if (d->decref_src.func(&d->decref_src.context,
- &src, &d->src_N, &src_subitemsize,
+ if (d->decref_src.func(NULL, d->decref_src.descr,
+ src, d->src_N, src_subitemsize,
d->decref_src.auxdata) < 0) {
return -1;
}
@@ -1939,8 +1927,8 @@ get_subarray_broadcast_transfer_function(int aligned,
data->src_N = src_size;
data->dst_N = dst_size;
- NPY_cast_info_init(&data->decref_src);
- NPY_cast_info_init(&data->decref_dst);
+ NPY_traverse_info_init(&data->decref_src);
+ NPY_traverse_info_init(&data->decref_dst);
/*
* move_references is set to 0, handled in the wrapping transfer fn,
@@ -1959,12 +1947,9 @@ get_subarray_broadcast_transfer_function(int aligned,
/* If the src object will need a DECREF */
if (move_references && PyDataType_REFCHK(src_dtype)) {
- if (PyArray_GetDTypeTransferFunction(aligned,
- src_dtype->elsize, 0,
- src_dtype, NULL,
- 1,
- &data->decref_src,
- out_flags) != NPY_SUCCEED) {
+ if (PyArray_GetClearFunction(aligned,
+ src_dtype->elsize, src_dtype,
+ &data->decref_src, out_flags) < 0) {
NPY_AUXDATA_FREE((NpyAuxData *)data);
return NPY_FAIL;
}
@@ -1972,12 +1957,9 @@ get_subarray_broadcast_transfer_function(int aligned,
/* If the dst object needs a DECREF to set it to NULL */
if (PyDataType_REFCHK(dst_dtype)) {
- if (PyArray_GetDTypeTransferFunction(aligned,
- dst_dtype->elsize, 0,
- dst_dtype, NULL,
- 1,
- &data->decref_dst,
- out_flags) != NPY_SUCCEED) {
+ if (PyArray_GetClearFunction(aligned,
+ dst_dtype->elsize, dst_dtype,
+ &data->decref_dst, out_flags) < 0) {
NPY_AUXDATA_FREE((NpyAuxData *)data);
return NPY_FAIL;
}
@@ -2182,6 +2164,7 @@ typedef struct {
typedef struct {
NpyAuxData base;
npy_intp field_count;
+ NPY_traverse_info decref_src;
_single_field_transfer fields[];
} _field_transfer_data;
@@ -2190,6 +2173,7 @@ typedef struct {
static void _field_transfer_data_free(NpyAuxData *data)
{
_field_transfer_data *d = (_field_transfer_data *)data;
+ NPY_traverse_info_xfree(&d->decref_src);
for (npy_intp i = 0; i < d->field_count; ++i) {
NPY_cast_info_xfree(&d->fields[i].info);
@@ -2213,6 +2197,10 @@ static NpyAuxData *_field_transfer_data_clone(NpyAuxData *data)
}
newdata->base = d->base;
newdata->field_count = 0;
+ if (NPY_traverse_info_copy(&newdata->decref_src, &d->decref_src) < 0) {
+ PyMem_Free(newdata);
+ return NULL;
+ }
/* Copy all the fields transfer data */
for (npy_intp i = 0; i < field_count; ++i) {
@@ -2254,6 +2242,11 @@ _strided_to_strided_field_transfer(
return -1;
}
}
+ if (d->decref_src.func != NULL && d->decref_src.func(
+ NULL, d->decref_src.descr, src, blocksize, src_stride,
+ d->decref_src.auxdata) < 0) {
+ return -1;
+ }
N -= NPY_LOWLEVEL_BUFFER_BLOCKSIZE;
src += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*src_stride;
dst += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*dst_stride;
@@ -2267,6 +2260,11 @@ _strided_to_strided_field_transfer(
return -1;
}
}
+ if (d->decref_src.func != NULL && d->decref_src.func(
+ NULL, d->decref_src.descr, src, blocksize, src_stride,
+ d->decref_src.auxdata) < 0) {
+ return -1;
+ }
return 0;
}
}
@@ -2313,6 +2311,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
data->base.free = &_field_transfer_data_free;
data->base.clone = &_field_transfer_data_clone;
data->field_count = 0;
+ NPY_traverse_info_init(&data->decref_src);
*out_flags = PyArrayMethod_MINIMAL_FLAGS;
for (i = 0; i < field_count; ++i) {
@@ -2340,23 +2339,17 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
}
/*
- * If references should be decrefd in src, add another transfer
- * function to do that. Since a decref function only uses a single
- * input, the second one (normally output) just does not matter here.
+ * If references should be decrefd in src, add a clear function.
*/
if (move_references && PyDataType_REFCHK(src_dtype)) {
- *out_flags |= NPY_METH_REQUIRES_PYAPI;
- if (get_decref_transfer_function(0,
- src_stride,
- src_dtype,
- &data->fields[field_count].info,
- NULL) != NPY_SUCCEED) {
+ NPY_ARRAYMETHOD_FLAGS clear_flags;
+ if (PyArray_GetClearFunction(
+ 0, src_stride, src_dtype, &data->decref_src,
+ &clear_flags) < 0) {
NPY_AUXDATA_FREE((NpyAuxData *)data);
return NPY_FAIL;
}
- data->fields[field_count].src_offset = 0;
- data->fields[field_count].dst_offset = 0;
- data->field_count = field_count;
+ *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, clear_flags);
}
*out_stransfer = &_strided_to_strided_field_transfer;
@@ -2384,6 +2377,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
}
data->base.free = &_field_transfer_data_free;
data->base.clone = &_field_transfer_data_clone;
+ NPY_traverse_info_init(&data->decref_src);
key = PyTuple_GET_ITEM(src_dtype->names, 0);
tup = PyDict_GetItem(src_dtype->fields, key);
@@ -2432,6 +2426,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
data->base.free = &_field_transfer_data_free;
data->base.clone = &_field_transfer_data_clone;
data->field_count = 0;
+ NPY_traverse_info_init(&data->decref_src);
*out_flags = PyArrayMethod_MINIMAL_FLAGS;
/* set up the transfer function for each field */
@@ -2473,69 +2468,6 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
return NPY_SUCCEED;
}
-static int
-get_decref_fields_transfer_function(int NPY_UNUSED(aligned),
- npy_intp src_stride,
- PyArray_Descr *src_dtype,
- PyArrayMethod_StridedLoop **out_stransfer,
- NpyAuxData **out_transferdata,
- int *out_needs_api)
-{
- PyObject *names, *key, *tup, *title;
- PyArray_Descr *src_fld_dtype;
- npy_int i, structsize;
- Py_ssize_t field_count;
- int src_offset;
-
- names = src_dtype->names;
- field_count = PyTuple_GET_SIZE(src_dtype->names);
-
- /* Over-allocating here: less fields may be used */
- structsize = sizeof(_field_transfer_data) +
- field_count * sizeof(_single_field_transfer);
- /* Allocate the data and populate it */
- _field_transfer_data *data = PyMem_Malloc(structsize);
- if (data == NULL) {
- PyErr_NoMemory();
- return NPY_FAIL;
- }
- data->base.free = &_field_transfer_data_free;
- data->base.clone = &_field_transfer_data_clone;
- data->field_count = 0;
-
- _single_field_transfer *field = data->fields;
- for (i = 0; i < field_count; ++i) {
- key = PyTuple_GET_ITEM(names, i);
- tup = PyDict_GetItem(src_dtype->fields, key);
- if (!PyArg_ParseTuple(tup, "Oi|O", &src_fld_dtype,
- &src_offset, &title)) {
- NPY_AUXDATA_FREE((NpyAuxData *)data);
- return NPY_FAIL;
- }
- if (PyDataType_REFCHK(src_fld_dtype)) {
- if (out_needs_api) {
- *out_needs_api = 1;
- }
- if (get_decref_transfer_function(0,
- src_stride,
- src_fld_dtype,
- &field->info,
- out_needs_api) != NPY_SUCCEED) {
- NPY_AUXDATA_FREE((NpyAuxData *)data);
- return NPY_FAIL;
- }
- field->src_offset = src_offset;
- data->field_count++;
- field++;
- }
- }
-
- *out_stransfer = &_strided_to_strided_field_transfer;
- *out_transferdata = (NpyAuxData *)data;
-
- return NPY_SUCCEED;
-}
-
/************************* MASKED TRANSFER WRAPPER *************************/
@@ -2544,7 +2476,7 @@ typedef struct {
/* The transfer function being wrapped (could likely be stored directly) */
NPY_cast_info wrapped;
/* The src decref function if necessary */
- NPY_cast_info decref_src;
+ NPY_traverse_info decref_src;
} _masked_wrapper_transfer_data;
/* transfer data free function */
@@ -2553,7 +2485,7 @@ _masked_wrapper_transfer_data_free(NpyAuxData *data)
{
_masked_wrapper_transfer_data *d = (_masked_wrapper_transfer_data *)data;
NPY_cast_info_xfree(&d->wrapped);
- NPY_cast_info_xfree(&d->decref_src);
+ NPY_traverse_info_xfree(&d->decref_src);
PyMem_Free(data);
}
@@ -2576,7 +2508,7 @@ _masked_wrapper_transfer_data_clone(NpyAuxData *data)
return NULL;
}
if (d->decref_src.func != NULL) {
- if (NPY_cast_info_copy(&newdata->decref_src, &d->decref_src) < 0) {
+ if (NPY_traverse_info_copy(&newdata->decref_src, &d->decref_src) < 0) {
NPY_AUXDATA_FREE((NpyAuxData *)newdata);
return NULL;
}
@@ -2586,7 +2518,7 @@ _masked_wrapper_transfer_data_clone(NpyAuxData *data)
}
static int
-_strided_masked_wrapper_decref_transfer_function(
+_strided_masked_wrapper_clear_function(
PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
const npy_intp *dimensions, const npy_intp *strides,
npy_bool *mask, npy_intp mask_stride,
@@ -2603,8 +2535,8 @@ _strided_masked_wrapper_decref_transfer_function(
/* Skip masked values, still calling decref for move_references */
mask = (npy_bool*)npy_memchr((char *)mask, 0, mask_stride, N,
&subloopsize, 1);
- if (d->decref_src.func(&d->decref_src.context,
- &src, &subloopsize, &src_stride, d->decref_src.auxdata) < 0) {
+ if (d->decref_src.func(NULL, d->decref_src.descr,
+ src, subloopsize, src_stride, d->decref_src.auxdata) < 0) {
return -1;
}
dst += subloopsize * dst_stride;
@@ -2670,139 +2602,18 @@ _strided_masked_wrapper_transfer_function(
}
-/*************************** CLEAR SRC *******************************/
+/* A no-op function (currently only used for cleanup purposes really) */
static int
-_dec_src_ref_nop(
+_cast_no_op(
PyArrayMethod_Context *NPY_UNUSED(context),
char *const *NPY_UNUSED(args), const npy_intp *NPY_UNUSED(dimensions),
const npy_intp *NPY_UNUSED(strides), NpyAuxData *NPY_UNUSED(auxdata))
{
- /* NOP */
+ /* Do nothing */
return 0;
}
-static int
-_strided_to_null_dec_src_ref_reference(
- PyArrayMethod_Context *NPY_UNUSED(context),
- char *const *args, const npy_intp *dimensions,
- const npy_intp *strides, NpyAuxData *NPY_UNUSED(auxdata))
-{
- char *src = args[0];
- npy_intp N = dimensions[0];
- npy_intp stride = strides[0];
-
- PyObject *src_ref = NULL;
- while (N > 0) {
- /* Release the reference in src and set it to NULL */
- NPY_DT_DBG_REFTRACE("dec src ref (null dst)", src_ref);
- memcpy(&src_ref, src, sizeof(src_ref));
- Py_XDECREF(src_ref);
- memset(src, 0, sizeof(PyObject *));
-
- src += stride;
- --N;
- }
- return 0;
-}
-
-
-/*
- * Get a function to decref. Currently, this uses a cast info slot, which
- * means that the second (destination) descriptor is always set to NULL
- * and generally does not have to be passed.
- * Since we do not currently have an `ArrayMethod` representing this, the
- * method is also set to NULL.
- *
- * TODO: this function should probably be moved onto the DType eventually,
- * which would allow for user DTypes to include dynamic allocated
- * memory or Python objects.
- */
-static int
-get_decref_transfer_function(int aligned,
- npy_intp src_stride,
- PyArray_Descr *src_dtype,
- NPY_cast_info *cast_info,
- int *out_needs_api)
-{
- NPY_cast_info_init(cast_info);
-
- /* If there are no references, it's a nop */
- if (!PyDataType_REFCHK(src_dtype)) {
- cast_info->func = &_dec_src_ref_nop;
- cast_info->auxdata = NULL;
- goto finalize;
- }
- /* If it's a single reference, it's one decref */
- else if (src_dtype->type_num == NPY_OBJECT) {
- if (out_needs_api) {
- *out_needs_api = 1;
- }
-
- cast_info->func = &_strided_to_null_dec_src_ref_reference;
- cast_info->auxdata = NULL;
- goto finalize;
- }
- /* If there are subarrays, need to wrap it */
- else if (PyDataType_HASSUBARRAY(src_dtype)) {
- PyArray_Dims src_shape = {NULL, -1};
- npy_intp src_size;
-
- if (out_needs_api) {
- *out_needs_api = 1;
- }
-
- if (!(PyArray_IntpConverter(src_dtype->subarray->shape,
- &src_shape))) {
- PyErr_SetString(PyExc_ValueError,
- "invalid subarray shape");
- return NPY_FAIL;
- }
- src_size = PyArray_MultiplyList(src_shape.ptr, src_shape.len);
- npy_free_cache_dim_obj(src_shape);
-
- NPY_ARRAYMETHOD_FLAGS ignored_flags;
- if (get_n_to_n_transfer_function(aligned,
- src_stride, 0,
- src_dtype->subarray->base, NULL, 1, src_size,
- &cast_info->func, &cast_info->auxdata,
- &ignored_flags) != NPY_SUCCEED) {
- return NPY_FAIL;
- }
-
- goto finalize;
- }
- /* If there are fields, need to do each field */
- else if (PyDataType_HASFIELDS(src_dtype)) {
- if (out_needs_api) {
- *out_needs_api = 1;
- }
-
- if (get_decref_fields_transfer_function(aligned,
- src_stride, src_dtype,
- &cast_info->func, &cast_info->auxdata,
- out_needs_api) < 0) {
- return NPY_FAIL;
- }
- goto finalize;
- }
- else {
- PyErr_Format(PyExc_RuntimeError,
- "Internal error, tried to fetch decref function for the "
- "unsupported DType '%S'.", src_dtype);
- return NPY_FAIL;
- }
-
- finalize:
- /* Make sure all important fields are either set or cleared */
- Py_INCREF(src_dtype);
- cast_info->descriptors[0] = src_dtype;
- cast_info->descriptors[1] = NULL;
- cast_info->context.method = NULL;
- cast_info->context.caller = NULL;
- return NPY_SUCCEED;
-}
-
/*
* ********************* Generalized Multistep Cast ************************
@@ -3083,14 +2894,14 @@ _clear_cast_info_after_get_loop_failure(NPY_cast_info *cast_info)
/* As public API we could choose to clear auxdata != NULL */
assert(cast_info->auxdata == NULL);
/* Set func to be non-null so that `NPY_cats_info_xfree` does not skip */
- cast_info->func = &_dec_src_ref_nop;
+ cast_info->func = &_cast_no_op;
NPY_cast_info_xfree(cast_info);
}
/*
* Helper for PyArray_GetDTypeTransferFunction, which fetches a single
- * transfer function from the each casting implementation (ArrayMethod).
+ * transfer function from each casting implementation (ArrayMethod).
* May set the transfer function to NULL when the cast can be achieved using
* a view.
* TODO: Expand the view functionality for general offsets, not just 0:
@@ -3120,6 +2931,8 @@ define_cast_for_descrs(
int move_references,
NPY_cast_info *cast_info, NPY_ARRAYMETHOD_FLAGS *out_flags)
{
+ assert(dst_dtype != NULL); /* Was previously used for decref */
+
/* Storage for all cast info in case multi-step casting is necessary */
_multistep_castdata castdata;
/* Initialize funcs to NULL to simplify cleanup on error. */
@@ -3164,7 +2977,7 @@ define_cast_for_descrs(
/* Prepare the actual cast (if necessary): */
if (from_view_offset == 0 && !must_wrap) {
/* This step is not necessary and can be skipped */
- castdata.from.func = &_dec_src_ref_nop; /* avoid NULL */
+ castdata.from.func = &_cast_no_op; /* avoid NULL */
NPY_cast_info_xfree(&castdata.from);
}
else {
@@ -3203,7 +3016,7 @@ define_cast_for_descrs(
/* Prepare the actual cast (if necessary): */
if (to_view_offset == 0 && !must_wrap) {
/* This step is not necessary and can be skipped. */
- castdata.to.func = &_dec_src_ref_nop; /* avoid NULL */
+ castdata.to.func = &_cast_no_op; /* avoid NULL */
NPY_cast_info_xfree(&castdata.to);
}
else {
@@ -3279,33 +3092,6 @@ PyArray_GetDTypeTransferFunction(int aligned,
NPY_cast_info *cast_info,
NPY_ARRAYMETHOD_FLAGS *out_flags)
{
- assert(src_dtype != NULL);
-
- /*
- * If one of the dtypes is NULL, we give back either a src decref
- * function or a dst setzero function
- *
- * TODO: Eventually, we may wish to support user dtype with references
- * (including and beyond bare `PyObject *` this may require extending
- * the ArrayMethod API and those paths should likely be split out
- * from this function.)
- */
- if (dst_dtype == NULL) {
- assert(move_references);
- int needs_api = 0;
- int res = get_decref_transfer_function(aligned,
- src_dtype->elsize,
- src_dtype,
- cast_info,
- &needs_api);
- /* decref'ing never creates floating point errors, so just ignore it */
- *out_flags = PyArrayMethod_MINIMAL_FLAGS;
- if (needs_api) {
- *out_flags |= NPY_METH_REQUIRES_PYAPI;
- }
- return res;
- }
-
if (define_cast_for_descrs(aligned,
src_stride, dst_stride,
src_dtype, dst_dtype, move_references,
@@ -3563,20 +3349,19 @@ PyArray_GetMaskedDTypeTransferFunction(int aligned,
/* If the src object will need a DECREF, get a function to handle that */
if (move_references && PyDataType_REFCHK(src_dtype)) {
- *out_flags |= NPY_METH_REQUIRES_PYAPI;
- if (get_decref_transfer_function(aligned,
- src_stride,
- src_dtype,
- &data->decref_src,
- NULL) != NPY_SUCCEED) {
+ NPY_ARRAYMETHOD_FLAGS clear_flags;
+ if (PyArray_GetClearFunction(
+ aligned, src_stride, src_dtype,
+ &data->decref_src, &clear_flags) < 0) {
NPY_AUXDATA_FREE((NpyAuxData *)data);
return NPY_FAIL;
}
+ *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, clear_flags);
cast_info->func = (PyArrayMethod_StridedLoop *)
- &_strided_masked_wrapper_decref_transfer_function;
+ &_strided_masked_wrapper_clear_function;
}
else {
- NPY_cast_info_init(&data->decref_src);
+ NPY_traverse_info_init(&data->decref_src);
cast_info->func = (PyArrayMethod_StridedLoop *)
&_strided_masked_wrapper_transfer_function;
}
diff --git a/numpy/core/src/multiarray/dtype_transfer.h b/numpy/core/src/multiarray/dtype_transfer.h
index 6f7aa2837..04df5cb64 100644
--- a/numpy/core/src/multiarray/dtype_transfer.h
+++ b/numpy/core/src/multiarray/dtype_transfer.h
@@ -146,7 +146,6 @@ object_to_any_get_loop(
NpyAuxData **out_transferdata,
NPY_ARRAYMETHOD_FLAGS *flags);
-
NPY_NO_EXPORT int
wrap_aligned_transferfunction(
int aligned, int must_wrap,
diff --git a/numpy/core/src/multiarray/dtype_traversal.c b/numpy/core/src/multiarray/dtype_traversal.c
new file mode 100644
index 000000000..f76119f94
--- /dev/null
+++ b/numpy/core/src/multiarray/dtype_traversal.c
@@ -0,0 +1,651 @@
+/*
+ * This file is similar to the low-level loops for data type transfer
+ * in `dtype_transfer.c` but for those which only require visiting
+ * a single array (and mutating it in-place).
+ *
+ * As of writing, it is only used for CLEARing, which means mainly
+ * Python object DECREF/dealloc followed by NULL'ing the data
+ * (to support double clearing and ensure data is again in a usable state).
+ * However, memory initialization and traverse follows similar
+ * protocols (although traversal needs additional arguments).
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#define _UMATHMODULE
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <structmember.h>
+
+#include "numpy/ndarraytypes.h"
+#include "numpy/arrayobject.h"
+
+#include "alloc.h"
+#include "array_method.h"
+#include "dtypemeta.h"
+#include "dtype_traversal.h"
+
+
+/* Buffer size with the same use case as the one in dtype_transfer.c */
+#define NPY_LOWLEVEL_BUFFER_BLOCKSIZE 128
+
+
+typedef int get_traverse_func_function(
+ void *traverse_context, PyArray_Descr *dtype, int aligned,
+ npy_intp stride, NPY_traverse_info *clear_info,
+ NPY_ARRAYMETHOD_FLAGS *flags);
+
+/*
+ * Generic Clear function helpers:
+ */
+
+static int
+get_clear_function(
+ void *traverse_context, PyArray_Descr *dtype, int aligned,
+ npy_intp stride, NPY_traverse_info *clear_info,
+ NPY_ARRAYMETHOD_FLAGS *flags)
+{
+ NPY_traverse_info_init(clear_info);
+ /* not that cleanup code bothers to check e.g. for floating point flags */
+ *flags = PyArrayMethod_MINIMAL_FLAGS;
+
+ get_traverse_loop_function *get_clear = NPY_DT_SLOTS(NPY_DTYPE(dtype))->get_clear_loop;
+ if (get_clear == NULL) {
+ PyErr_Format(PyExc_RuntimeError,
+ "Internal error, `get_clear_loop` not set for the DType '%S'",
+ dtype);
+ return -1;
+ }
+
+ if (get_clear(traverse_context, dtype, aligned, stride,
+ &clear_info->func, &clear_info->auxdata, flags) < 0) {
+ /* callee should clean up, but make sure outside debug mode */
+ assert(clear_info->func == NULL);
+ clear_info->func = NULL;
+ return -1;
+ }
+ Py_INCREF(dtype);
+ clear_info->descr = dtype;
+
+ return 0;
+}
+
+/*
+ * Helper to set up a strided loop used for clearing. Clearing means
+ * deallocating any references (e.g. via Py_DECREF) and resetting the data
+ * back into a usable/initialized state (e.g. by NULLing any references).
+ *
+ * The function will error when called on a dtype which does not have
+ * references (and thus has the get_clear_loop slot set to NULL).
+ * Note that old-style user-dtypes use the "void" version.
+ *
+ * NOTE: This function may have a use for a `traverse_context` at some point
+ * but right now, it is always NULL and only exists to allow adding it
+ * in the future without changing the strided-loop signature.
+ */
+NPY_NO_EXPORT int
+PyArray_GetClearFunction(
+ int aligned, npy_intp stride, PyArray_Descr *dtype,
+ NPY_traverse_info *clear_info, NPY_ARRAYMETHOD_FLAGS *flags)
+{
+ return get_clear_function(NULL, dtype, aligned, stride, clear_info, flags);
+}
+
+
+/*
+ * Generic zerofill/fill function helper:
+ */
+
+static int
+get_zerofill_function(
+ void *traverse_context, PyArray_Descr *dtype, int aligned,
+ npy_intp stride, NPY_traverse_info *zerofill_info,
+ NPY_ARRAYMETHOD_FLAGS *flags)
+{
+ NPY_traverse_info_init(zerofill_info);
+ /* not that filling code bothers to check e.g. for floating point flags */
+ *flags = PyArrayMethod_MINIMAL_FLAGS;
+
+ get_traverse_loop_function *get_zerofill = NPY_DT_SLOTS(NPY_DTYPE(dtype))->get_fill_zero_loop;
+ if (get_zerofill == NULL) {
+ /* Allowed to be NULL (and accept it here) */
+ return 0;
+ }
+
+ if (get_zerofill(traverse_context, dtype, aligned, stride,
+ &zerofill_info->func, &zerofill_info->auxdata, flags) < 0) {
+ /* callee should clean up, but make sure outside debug mode */
+ assert(zerofill_info->func == NULL);
+ zerofill_info->func = NULL;
+ return -1;
+ }
+ if (zerofill_info->func == NULL) {
+ /* Zerofill also may return func=NULL without an error. */
+ return 0;
+ }
+
+ Py_INCREF(dtype);
+ zerofill_info->descr = dtype;
+
+ return 0;
+}
+
+
+/****************** Python Object clear ***********************/
+
+static int
+clear_object_strided_loop(
+ void *NPY_UNUSED(traverse_context), PyArray_Descr *NPY_UNUSED(descr),
+ char *data, npy_intp size, npy_intp stride,
+ NpyAuxData *NPY_UNUSED(auxdata))
+{
+ PyObject *aligned_copy = NULL;
+ while (size > 0) {
+ /* Release the reference in src and set it to NULL */
+ memcpy(&aligned_copy, data, sizeof(PyObject *));
+ Py_XDECREF(aligned_copy);
+ memset(data, 0, sizeof(PyObject *));
+
+ data += stride;
+ --size;
+ }
+ return 0;
+}
+
+
+NPY_NO_EXPORT int
+npy_get_clear_object_strided_loop(
+ void *NPY_UNUSED(traverse_context), PyArray_Descr *NPY_UNUSED(descr),
+ int NPY_UNUSED(aligned), npy_intp NPY_UNUSED(fixed_stride),
+ traverse_loop_function **out_loop, NpyAuxData **out_auxdata,
+ NPY_ARRAYMETHOD_FLAGS *flags)
+{
+ *flags = NPY_METH_REQUIRES_PYAPI|NPY_METH_NO_FLOATINGPOINT_ERRORS;
+ *out_loop = &clear_object_strided_loop;
+ return 0;
+}
+
+
+/**************** Python Object zero fill *********************/
+
+static int
+fill_zero_object_strided_loop(
+ void *NPY_UNUSED(traverse_context), PyArray_Descr *NPY_UNUSED(descr),
+ char *data, npy_intp size, npy_intp stride,
+ NpyAuxData *NPY_UNUSED(auxdata))
+{
+ PyObject *zero = PyLong_FromLong(0);
+ while (size--) {
+ Py_INCREF(zero);
+ // assumes `data` doesn't have a pre-existing object inside it
+ memcpy(data, &zero, sizeof(zero));
+ data += stride;
+ }
+ Py_DECREF(zero);
+ return 0;
+}
+
+NPY_NO_EXPORT int
+npy_object_get_fill_zero_loop(void *NPY_UNUSED(traverse_context),
+ PyArray_Descr *NPY_UNUSED(descr),
+ int NPY_UNUSED(aligned),
+ npy_intp NPY_UNUSED(fixed_stride),
+ traverse_loop_function **out_loop,
+ NpyAuxData **NPY_UNUSED(out_auxdata),
+ NPY_ARRAYMETHOD_FLAGS *flags)
+{
+ *flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_NO_FLOATINGPOINT_ERRORS;
+ *out_loop = &fill_zero_object_strided_loop;
+ return 0;
+}
+
+/**************** Structured DType generic functionality ***************/
+
+/*
+ * Note that legacy user dtypes also make use of this. Someone managed to
+ * hack objects into them by adding a field that contains objects and this
+ * remains (somewhat) valid.
+ * (Unlike our voids, those fields must be hardcoded probably, but...)
+ *
+ * The below functionality mirrors the casting functionality relatively
+ * closely.
+ */
+
+typedef struct {
+ npy_intp src_offset;
+ NPY_traverse_info info;
+} single_field_traverse_data;
+
+typedef struct {
+ NpyAuxData base;
+ npy_intp field_count;
+ single_field_traverse_data fields[];
+} fields_traverse_data;
+
+
+/* traverse data free function */
+static void
+fields_traverse_data_free(NpyAuxData *data)
+{
+ fields_traverse_data *d = (fields_traverse_data *)data;
+
+ for (npy_intp i = 0; i < d->field_count; ++i) {
+ NPY_traverse_info_xfree(&d->fields[i].info);
+ }
+ PyMem_Free(d);
+}
+
+
+/* traverse data copy function (untested due to no direct use currently) */
+static NpyAuxData *
+fields_traverse_data_clone(NpyAuxData *data)
+{
+ fields_traverse_data *d = (fields_traverse_data *)data;
+
+ npy_intp field_count = d->field_count;
+ npy_intp structsize = sizeof(fields_traverse_data) +
+ field_count * sizeof(single_field_traverse_data);
+
+ /* Allocate the data and populate it */
+ fields_traverse_data *newdata = PyMem_Malloc(structsize);
+ if (newdata == NULL) {
+ return NULL;
+ }
+ newdata->base = d->base;
+ newdata->field_count = 0;
+
+ /* Copy all the fields transfer data */
+ single_field_traverse_data *in_field = d->fields;
+ single_field_traverse_data *new_field = newdata->fields;
+
+ for (; newdata->field_count < field_count;
+ newdata->field_count++, in_field++, new_field++) {
+ new_field->src_offset = in_field->src_offset;
+
+ if (NPY_traverse_info_copy(&new_field->info, &in_field->info) < 0) {
+ fields_traverse_data_free((NpyAuxData *)newdata);
+ return NULL;
+ }
+ }
+
+ return (NpyAuxData *)newdata;
+}
+
+
+static int
+traverse_fields_function(
+ void *traverse_context, PyArray_Descr *NPY_UNUSED(descr),
+ char *data, npy_intp N, npy_intp stride,
+ NpyAuxData *auxdata)
+{
+ fields_traverse_data *d = (fields_traverse_data *)auxdata;
+ npy_intp i, field_count = d->field_count;
+
+ /* Do the traversing a block at a time for better memory caching */
+ const npy_intp blocksize = NPY_LOWLEVEL_BUFFER_BLOCKSIZE;
+
+ for (;;) {
+ if (N > blocksize) {
+ for (i = 0; i < field_count; ++i) {
+ single_field_traverse_data field = d->fields[i];
+ if (field.info.func(traverse_context,
+ field.info.descr, data + field.src_offset,
+ blocksize, stride, field.info.auxdata) < 0) {
+ return -1;
+ }
+ }
+ N -= blocksize;
+ data += blocksize * stride;
+ }
+ else {
+ for (i = 0; i < field_count; ++i) {
+ single_field_traverse_data field = d->fields[i];
+ if (field.info.func(traverse_context,
+ field.info.descr, data + field.src_offset,
+ N, stride, field.info.auxdata) < 0) {
+ return -1;
+ }
+ }
+ return 0;
+ }
+ }
+}
+
+
+static int
+get_fields_traverse_function(
+ void *traverse_context, PyArray_Descr *dtype, int NPY_UNUSED(aligned),
+ npy_intp stride, traverse_loop_function **out_func,
+ NpyAuxData **out_auxdata, NPY_ARRAYMETHOD_FLAGS *flags,
+ get_traverse_func_function *get_traverse_func)
+{
+ PyObject *names, *key, *tup, *title;
+ PyArray_Descr *fld_dtype;
+ npy_int i, structsize;
+ Py_ssize_t field_count;
+
+ names = dtype->names;
+ field_count = PyTuple_GET_SIZE(dtype->names);
+
+ /* Over-allocating here: less fields may be used */
+ structsize = (sizeof(fields_traverse_data) +
+ field_count * sizeof(single_field_traverse_data));
+ /* Allocate the data and populate it */
+ fields_traverse_data *data = PyMem_Malloc(structsize);
+ if (data == NULL) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ data->base.free = &fields_traverse_data_free;
+ data->base.clone = &fields_traverse_data_clone;
+ data->field_count = 0;
+
+ single_field_traverse_data *field = data->fields;
+ for (i = 0; i < field_count; ++i) {
+ int offset;
+
+ key = PyTuple_GET_ITEM(names, i);
+ tup = PyDict_GetItem(dtype->fields, key);
+ if (!PyArg_ParseTuple(tup, "Oi|O", &fld_dtype, &offset, &title)) {
+ NPY_AUXDATA_FREE((NpyAuxData *)data);
+ return -1;
+ }
+ if (get_traverse_func == &get_clear_function
+ && !PyDataType_REFCHK(fld_dtype)) {
+ /* No need to do clearing (could change to use NULL return) */
+ continue;
+ }
+ NPY_ARRAYMETHOD_FLAGS clear_flags;
+ if (get_traverse_func(
+ traverse_context, fld_dtype, 0,
+ stride, &field->info, &clear_flags) < 0) {
+ NPY_AUXDATA_FREE((NpyAuxData *)data);
+ return -1;
+ }
+ if (field->info.func == NULL) {
+ /* zerofill allows NULL func as "default" memset to zero */
+ continue;
+ }
+ *flags = PyArrayMethod_COMBINED_FLAGS(*flags, clear_flags);
+ field->src_offset = offset;
+ data->field_count++;
+ field++;
+ }
+
+ *out_func = &traverse_fields_function;
+ *out_auxdata = (NpyAuxData *)data;
+
+ return 0;
+}
+
+
+typedef struct {
+ NpyAuxData base;
+ npy_intp count;
+ NPY_traverse_info info;
+} subarray_traverse_data;
+
+
+/* traverse data free function */
+static void
+subarray_traverse_data_free(NpyAuxData *data)
+{
+ subarray_traverse_data *d = (subarray_traverse_data *)data;
+
+ NPY_traverse_info_xfree(&d->info);
+ PyMem_Free(d);
+}
+
+
+/*
+ * We seem to be neither using nor exposing this right now, so leave it NULL.
+ * (The implementation below should be functional.)
+ */
+#define subarray_traverse_data_clone NULL
+
+#ifndef subarray_traverse_data_clone
+/* traverse data copy function */
+static NpyAuxData *
+subarray_traverse_data_clone(NpyAuxData *data)
+{
+ subarray_traverse_data *d = (subarray_traverse_data *)data;
+
+ /* Allocate the data and populate it */
+ subarray_traverse_data *newdata = PyMem_Malloc(sizeof(subarray_traverse_data));
+ if (newdata == NULL) {
+ return NULL;
+ }
+ newdata->base = d->base;
+ newdata->count = d->count;
+
+ if (NPY_traverse_info_copy(&newdata->info, &d->info) < 0) {
+ PyMem_Free(newdata);
+ return NULL;
+ }
+
+ return (NpyAuxData *)newdata;
+}
+#endif
+
+
+static int
+traverse_subarray_func(
+ void *traverse_context, PyArray_Descr *NPY_UNUSED(descr),
+ char *data, npy_intp N, npy_intp stride,
+ NpyAuxData *auxdata)
+{
+ subarray_traverse_data *subarr_data = (subarray_traverse_data *)auxdata;
+
+ traverse_loop_function *func = subarr_data->info.func;
+ PyArray_Descr *sub_descr = subarr_data->info.descr;
+ npy_intp sub_N = subarr_data->count;
+ NpyAuxData *sub_auxdata = subarr_data->info.auxdata;
+ npy_intp sub_stride = sub_descr->elsize;
+
+ while (N--) {
+ if (func(traverse_context, sub_descr, data,
+ sub_N, sub_stride, sub_auxdata) < 0) {
+ return -1;
+ }
+ data += stride;
+ }
+ return 0;
+}
+
+
+static int
+get_subarray_traverse_func(
+ void *traverse_context, PyArray_Descr *dtype, int aligned,
+ npy_intp size, npy_intp stride, traverse_loop_function **out_func,
+ NpyAuxData **out_auxdata, NPY_ARRAYMETHOD_FLAGS *flags,
+ get_traverse_func_function *get_traverse_func)
+{
+ subarray_traverse_data *auxdata = PyMem_Malloc(sizeof(subarray_traverse_data));
+ if (auxdata == NULL) {
+ PyErr_NoMemory();
+ return -1;
+ }
+
+ auxdata->count = size;
+ auxdata->base.free = &subarray_traverse_data_free;
+ auxdata->base.clone = subarray_traverse_data_clone;
+
+ if (get_traverse_func(
+ traverse_context, dtype, aligned,
+ dtype->elsize, &auxdata->info, flags) < 0) {
+ PyMem_Free(auxdata);
+ return -1;
+ }
+ if (auxdata->info.func == NULL) {
+        /* zerofill allows func to be NULL, in which case we need not do anything */
+ PyMem_Free(auxdata);
+ *out_func = NULL;
+ *out_auxdata = NULL;
+ return 0;
+ }
+ *out_func = &traverse_subarray_func;
+ *out_auxdata = (NpyAuxData *)auxdata;
+
+ return 0;
+}
+
+
+static int
+clear_no_op(
+ void *NPY_UNUSED(traverse_context), PyArray_Descr *NPY_UNUSED(descr),
+ char *NPY_UNUSED(data), npy_intp NPY_UNUSED(size),
+ npy_intp NPY_UNUSED(stride), NpyAuxData *NPY_UNUSED(auxdata))
+{
+ return 0;
+}
+
+NPY_NO_EXPORT int
+npy_get_clear_void_and_legacy_user_dtype_loop(
+ void *traverse_context, PyArray_Descr *dtype, int aligned,
+ npy_intp stride, traverse_loop_function **out_func,
+ NpyAuxData **out_auxdata, NPY_ARRAYMETHOD_FLAGS *flags)
+{
+ /*
+ * If there are no references, it's a nop. This path should not be hit
+ * but structured dtypes are tricky when a dtype which included references
+ * was sliced to not include any.
+ */
+ if (!PyDataType_REFCHK(dtype)) {
+ *out_func = &clear_no_op;
+ return 0;
+ }
+
+ if (PyDataType_HASSUBARRAY(dtype)) {
+ PyArray_Dims shape = {NULL, -1};
+ npy_intp size;
+
+ if (!(PyArray_IntpConverter(dtype->subarray->shape, &shape))) {
+ PyErr_SetString(PyExc_ValueError,
+ "invalid subarray shape");
+ return -1;
+ }
+ size = PyArray_MultiplyList(shape.ptr, shape.len);
+ npy_free_cache_dim_obj(shape);
+
+ if (get_subarray_traverse_func(
+ traverse_context, dtype->subarray->base, aligned, size, stride,
+ out_func, out_auxdata, flags, &get_clear_function) < 0) {
+ return -1;
+ }
+
+ return 0;
+ }
+ /* If there are fields, need to do each field */
+ else if (PyDataType_HASFIELDS(dtype)) {
+ if (get_fields_traverse_function(
+ traverse_context, dtype, aligned, stride,
+ out_func, out_auxdata, flags, &get_clear_function) < 0) {
+ return -1;
+ }
+ return 0;
+ }
+ else if (dtype->type_num == NPY_VOID) {
+ /*
+ * Void dtypes can have "ghosts" of objects marking the dtype because
+ * holes (or the raw bytes if fields are gone) may include objects.
+ * Paths that need those flags should probably be considered incorrect.
+ * But as long as this can happen (a V8 that indicates references)
+ * we need to make it a no-op here.
+ */
+ *out_func = &clear_no_op;
+ return 0;
+ }
+
+ PyErr_Format(PyExc_RuntimeError,
+ "Internal error, tried to fetch clear function for the "
+ "user dtype '%S' without fields or subarray (legacy support).",
+ dtype);
+ return -1;
+}
+
+/**************** Structured DType zero fill ***************/
+
+
+static int
+zerofill_fields_function(
+ void *traverse_context, PyArray_Descr *descr,
+ char *data, npy_intp N, npy_intp stride,
+ NpyAuxData *auxdata)
+{
+ npy_intp itemsize = descr->elsize;
+
+ /*
+ * TODO: We could optimize this by chunking, but since we currently memset
+ * each element always, just loop manually.
+ */
+ while (N--) {
+ memset(data, 0, itemsize);
+ if (traverse_fields_function(
+ traverse_context, descr, data, 1, stride, auxdata) < 0) {
+ return -1;
+ }
+ data +=stride;
+ }
+ return 0;
+}
+
+/*
+ * Similar to other (e.g. clear) traversal loop getter, but unlike it, we
+ * do need to take care of zeroing out everything (in principle not gaps).
+ * So we add a memset before calling the actual traverse function for the
+ * structured path.
+ */
+NPY_NO_EXPORT int
+npy_get_zerofill_void_and_legacy_user_dtype_loop(
+ void *traverse_context, PyArray_Descr *dtype, int aligned,
+ npy_intp stride, traverse_loop_function **out_func,
+ NpyAuxData **out_auxdata, NPY_ARRAYMETHOD_FLAGS *flags)
+{
+ if (PyDataType_HASSUBARRAY(dtype)) {
+ PyArray_Dims shape = {NULL, -1};
+ npy_intp size;
+
+ if (!(PyArray_IntpConverter(dtype->subarray->shape, &shape))) {
+ PyErr_SetString(PyExc_ValueError,
+ "invalid subarray shape");
+ return -1;
+ }
+ size = PyArray_MultiplyList(shape.ptr, shape.len);
+ npy_free_cache_dim_obj(shape);
+
+ if (get_subarray_traverse_func(
+ traverse_context, dtype->subarray->base, aligned, size, stride,
+ out_func, out_auxdata, flags, &get_zerofill_function) < 0) {
+ return -1;
+ }
+
+ return 0;
+ }
+ /* If there are fields, need to do each field */
+ else if (PyDataType_HASFIELDS(dtype)) {
+ if (get_fields_traverse_function(
+ traverse_context, dtype, aligned, stride,
+ out_func, out_auxdata, flags, &get_zerofill_function) < 0) {
+ return -1;
+ }
+ if (((fields_traverse_data *)*out_auxdata)->field_count == 0) {
+ /* If there are no fields, just return NULL for zerofill */
+ NPY_AUXDATA_FREE(*out_auxdata);
+ *out_auxdata = NULL;
+ *out_func = NULL;
+ return 0;
+ }
+ /*
+ * Traversal skips fields that have no custom zeroing, so we need
+ * to take care of it.
+ */
+ *out_func = &zerofill_fields_function;
+ return 0;
+ }
+
+ /* Otherwise, assume there is nothing to do (user dtypes reach here) */
+ *out_auxdata = NULL;
+ *out_func = NULL;
+ return 0;
+}
diff --git a/numpy/core/src/multiarray/dtype_traversal.h b/numpy/core/src/multiarray/dtype_traversal.h
new file mode 100644
index 000000000..fc12c0f7b
--- /dev/null
+++ b/numpy/core/src/multiarray/dtype_traversal.h
@@ -0,0 +1,98 @@
+#ifndef NUMPY_CORE_SRC_MULTIARRAY_DTYPE_TRAVERSAL_H_
+#define NUMPY_CORE_SRC_MULTIARRAY_DTYPE_TRAVERSAL_H_
+
+#include "array_method.h"
+
+/* NumPy DType clear (object DECREF + NULLing) implementations */
+
+NPY_NO_EXPORT int
+npy_get_clear_object_strided_loop(
+ void *traverse_context, PyArray_Descr *descr, int aligned,
+ npy_intp fixed_stride,
+ traverse_loop_function **out_loop, NpyAuxData **out_traversedata,
+ NPY_ARRAYMETHOD_FLAGS *flags);
+
+NPY_NO_EXPORT int
+npy_get_clear_void_and_legacy_user_dtype_loop(
+ void *traverse_context, PyArray_Descr *descr, int aligned,
+ npy_intp fixed_stride,
+ traverse_loop_function **out_loop, NpyAuxData **out_traversedata,
+ NPY_ARRAYMETHOD_FLAGS *flags);
+
+/* NumPy DType zero-filling implementations */
+
+NPY_NO_EXPORT int
+npy_object_get_fill_zero_loop(
+ void *NPY_UNUSED(traverse_context), PyArray_Descr *NPY_UNUSED(descr),
+ int NPY_UNUSED(aligned), npy_intp NPY_UNUSED(fixed_stride),
+ traverse_loop_function **out_loop, NpyAuxData **NPY_UNUSED(out_auxdata),
+ NPY_ARRAYMETHOD_FLAGS *flags);
+
+NPY_NO_EXPORT int
+npy_get_zerofill_void_and_legacy_user_dtype_loop(
+ void *traverse_context, PyArray_Descr *dtype, int aligned,
+ npy_intp stride, traverse_loop_function **out_func,
+ NpyAuxData **out_auxdata, NPY_ARRAYMETHOD_FLAGS *flags);
+
+
+/* Helper to deal with calling or nesting simple strided loops */
+
+typedef struct {
+ traverse_loop_function *func;
+ NpyAuxData *auxdata;
+ PyArray_Descr *descr;
+} NPY_traverse_info;
+
+
+static inline void
+NPY_traverse_info_init(NPY_traverse_info *cast_info)
+{
+ cast_info->func = NULL; /* mark as uninitialized. */
+ cast_info->auxdata = NULL; /* allow leaving auxdata untouched */
+ cast_info->descr = NULL; /* mark as uninitialized. */
+}
+
+
+static inline void
+NPY_traverse_info_xfree(NPY_traverse_info *traverse_info)
+{
+ if (traverse_info->func == NULL) {
+ return;
+ }
+ traverse_info->func = NULL;
+ NPY_AUXDATA_FREE(traverse_info->auxdata);
+ Py_XDECREF(traverse_info->descr);
+}
+
+
+static inline int
+NPY_traverse_info_copy(
+ NPY_traverse_info *traverse_info, NPY_traverse_info *original)
+{
+ traverse_info->func = NULL;
+ if (original->func == NULL) {
+ /* Allow copying also of unused clear info */
+ return 0;
+ }
+ traverse_info->auxdata = NULL;
+ if (original->auxdata != NULL) {
+ traverse_info->auxdata = NPY_AUXDATA_CLONE(original->auxdata);
+ if (traverse_info->auxdata == NULL) {
+ return -1;
+ }
+ }
+ Py_INCREF(original->descr);
+ traverse_info->descr = original->descr;
+ traverse_info->func = original->func;
+
+ return 0;
+}
+
+
+NPY_NO_EXPORT int
+PyArray_GetClearFunction(
+ int aligned, npy_intp stride, PyArray_Descr *dtype,
+ NPY_traverse_info *clear_info, NPY_ARRAYMETHOD_FLAGS *flags);
+
+
+#endif /* NUMPY_CORE_SRC_MULTIARRAY_DTYPE_TRAVERSAL_H_ */
diff --git a/numpy/core/src/multiarray/dtypemeta.c b/numpy/core/src/multiarray/dtypemeta.c
index edc07bc92..9c1f384d8 100644
--- a/numpy/core/src/multiarray/dtypemeta.c
+++ b/numpy/core/src/multiarray/dtypemeta.c
@@ -9,7 +9,9 @@
#include <numpy/ndarraytypes.h>
#include <numpy/arrayscalars.h>
#include "npy_pycompat.h"
+#include "npy_import.h"
+#include "arraytypes.h"
#include "common.h"
#include "dtypemeta.h"
#include "descriptor.h"
@@ -18,6 +20,10 @@
#include "scalartypes.h"
#include "convert_datatype.h"
#include "usertypes.h"
+#include "conversion_utils.h"
+#include "templ_common.h"
+#include "refcount.h"
+#include "dtype_traversal.h"
#include <assert.h>
@@ -122,6 +128,50 @@ legacy_dtype_default_new(PyArray_DTypeMeta *self,
return (PyObject *)self->singleton;
}
+static PyObject *
+string_unicode_new(PyArray_DTypeMeta *self, PyObject *args, PyObject *kwargs)
+{
+ npy_intp size;
+
+ static char *kwlist[] = {"", NULL};
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O&", kwlist,
+ PyArray_IntpFromPyIntConverter, &size)) {
+ return NULL;
+ }
+
+ if (size < 0) {
+ PyErr_Format(PyExc_ValueError,
+ "Strings cannot have a negative size but a size of "
+ "%"NPY_INTP_FMT" was given", size);
+ return NULL;
+ }
+
+ PyArray_Descr *res = PyArray_DescrNewFromType(self->type_num);
+
+ if (res == NULL) {
+ return NULL;
+ }
+
+ if (self->type_num == NPY_UNICODE) {
+ // unicode strings are 4 bytes per character
+ if (npy_mul_sizes_with_overflow(&size, size, 4)) {
+ PyErr_SetString(
+ PyExc_TypeError,
+ "Strings too large to store inside array.");
+ return NULL;
+ }
+ }
+
+ if (size > NPY_MAX_INT) {
+ PyErr_SetString(PyExc_TypeError,
+ "Strings too large to store inside array.");
+ return NULL;
+ }
+
+ res->elsize = (int)size;
+ return (PyObject *)res;
+}
static PyArray_Descr *
nonparametric_discover_descr_from_pyobject(
@@ -151,7 +201,7 @@ string_discover_descr_from_pyobject(
}
if (itemsize > NPY_MAX_INT) {
PyErr_SetString(PyExc_TypeError,
- "string to large to store inside array.");
+ "string too large to store inside array.");
}
PyArray_Descr *res = PyArray_DescrNewFromType(cls->type_num);
if (res == NULL) {
@@ -478,6 +528,7 @@ void_common_instance(PyArray_Descr *descr1, PyArray_Descr *descr2)
return NULL;
}
+
NPY_NO_EXPORT int
python_builtins_are_known_scalar_types(
PyArray_DTypeMeta *NPY_UNUSED(cls), PyTypeObject *pytype)
@@ -618,7 +669,7 @@ static PyArray_DTypeMeta *
datetime_common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other)
{
/*
- * Timedelta/datetime shouldn't actuall promote at all. That they
+ * Timedelta/datetime shouldn't actually promote at all. That they
* currently do means that we need additional hacks in the comparison
* type resolver. For comparisons we have to make sure we reject it
* nicely in order to return an array of True/False values.
@@ -677,12 +728,17 @@ object_common_dtype(
* be a HeapType and its instances should be exact PyArray_Descr structs.
*
* @param descr The descriptor that should be wrapped.
- * @param name The name for the DType, if NULL the type character is used.
+ * @param name The name for the DType.
+ * @param alias A second name which is also set to the new class for builtins
+ * (i.e. `np.types.LongDType` for `np.types.Int64DType`).
+ *              Some may have more aliases (e.g. `intp` is not its own thing);
+ *              as of writing this, those are not added here.
*
* @returns 0 on success, -1 on failure.
*/
NPY_NO_EXPORT int
-dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
+dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr,
+ const char *name, const char *alias)
{
int has_type_set = Py_TYPE(descr) == &PyArrayDescr_Type;
@@ -709,47 +765,14 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
return -1;
}
- /*
- * Note: we have no intention of freeing the memory again since this
- * behaves identically to static type definition (see comment above).
- * This is seems cleaner for the legacy API, in the new API both static
- * and heap types are possible (some difficulty arises from the fact that
- * these are instances of DTypeMeta and not type).
- * In particular our own DTypes can be true static declarations.
- * However, this function remains necessary for legacy user dtypes.
- */
-
- const char *scalar_name = descr->typeobj->tp_name;
- /*
- * We have to take only the name, and ignore the module to get
- * a reasonable __name__, since static types are limited in this regard
- * (this is not ideal, but not a big issue in practice).
- * This is what Python does to print __name__ for static types.
- */
- const char *dot = strrchr(scalar_name, '.');
- if (dot) {
- scalar_name = dot + 1;
- }
- Py_ssize_t name_length = strlen(scalar_name) + 14;
-
- char *tp_name = PyMem_Malloc(name_length);
- if (tp_name == NULL) {
- PyErr_NoMemory();
- return -1;
- }
-
- snprintf(tp_name, name_length, "numpy.dtype[%s]", scalar_name);
-
NPY_DType_Slots *dt_slots = PyMem_Malloc(sizeof(NPY_DType_Slots));
if (dt_slots == NULL) {
- PyMem_Free(tp_name);
return -1;
}
memset(dt_slots, '\0', sizeof(NPY_DType_Slots));
PyArray_DTypeMeta *dtype_class = PyMem_Malloc(sizeof(PyArray_DTypeMeta));
if (dtype_class == NULL) {
- PyMem_Free(tp_name);
PyMem_Free(dt_slots);
return -1;
}
@@ -771,13 +794,19 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
.tp_flags = Py_TPFLAGS_DEFAULT,
.tp_base = &PyArrayDescr_Type,
.tp_new = (newfunc)legacy_dtype_default_new,
+ .tp_doc = (
+ "DType class corresponding to the scalar type and dtype of "
+ "the same name.\n\n"
+ "Please see `numpy.dtype` for the typical way to create\n"
+ "dtype instances and :ref:`arrays.dtypes` for additional\n"
+ "information."),
},},
.flags = NPY_DT_LEGACY,
/* Further fields are not common between DTypes */
};
memcpy(dtype_class, &prototype, sizeof(PyArray_DTypeMeta));
/* Fix name of the Type*/
- ((PyTypeObject *)dtype_class)->tp_name = tp_name;
+ ((PyTypeObject *)dtype_class)->tp_name = name;
dtype_class->dt_slots = dt_slots;
/* Let python finish the initialization (probably unnecessary) */
@@ -809,6 +838,7 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
dt_slots->common_dtype = default_builtin_common_dtype;
dt_slots->common_instance = NULL;
dt_slots->ensure_canonical = ensure_native_byteorder;
+ dt_slots->get_fill_zero_loop = NULL;
if (PyTypeNum_ISSIGNED(dtype_class->type_num)) {
/* Convert our scalars (raise on too large unsigned and NaN, etc.) */
@@ -820,6 +850,8 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
}
else if (descr->type_num == NPY_OBJECT) {
dt_slots->common_dtype = object_common_dtype;
+ dt_slots->get_fill_zero_loop = npy_object_get_fill_zero_loop;
+ dt_slots->get_clear_loop = npy_get_clear_object_strided_loop;
}
else if (PyTypeNum_ISDATETIME(descr->type_num)) {
/* Datetimes are flexible, but were not considered previously */
@@ -841,6 +873,10 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
void_discover_descr_from_pyobject);
dt_slots->common_instance = void_common_instance;
dt_slots->ensure_canonical = void_ensure_canonical;
+ dt_slots->get_fill_zero_loop =
+ npy_get_zerofill_void_and_legacy_user_dtype_loop;
+ dt_slots->get_clear_loop =
+ npy_get_clear_void_and_legacy_user_dtype_loop;
}
else {
dt_slots->default_descr = string_and_unicode_default_descr;
@@ -849,9 +885,14 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
string_discover_descr_from_pyobject);
dt_slots->common_dtype = string_unicode_common_dtype;
dt_slots->common_instance = string_unicode_common_instance;
+ ((PyTypeObject*)dtype_class)->tp_new = (newfunc)string_unicode_new;
}
}
+ if (PyTypeNum_ISNUMBER(descr->type_num)) {
+ dtype_class->flags |= NPY_DT_NUMERIC;
+ }
+
if (_PyArray_MapPyTypeToDType(dtype_class, descr->typeobj,
PyTypeNum_ISUSERDEF(dtype_class->type_num)) < 0) {
Py_DECREF(dtype_class);
@@ -861,6 +902,21 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
/* Finally, replace the current class of the descr */
Py_SET_TYPE(descr, (PyTypeObject *)dtype_class);
+    /* Add it to the types submodule if it is a builtin dtype */
+ if (!PyTypeNum_ISUSERDEF(descr->type_num)) {
+ static PyObject *add_dtype_helper = NULL;
+ npy_cache_import("numpy.dtypes", "_add_dtype_helper", &add_dtype_helper);
+ if (add_dtype_helper == NULL) {
+ return -1;
+ }
+
+ if (PyObject_CallFunction(
+ add_dtype_helper,
+ "Os", (PyObject *)dtype_class, alias) == NULL) {
+ return -1;
+ }
+ }
+
return 0;
}
@@ -871,22 +927,35 @@ dtypemeta_get_abstract(PyArray_DTypeMeta *self) {
}
static PyObject *
+dtypemeta_get_legacy(PyArray_DTypeMeta *self) {
+ return PyBool_FromLong(NPY_DT_is_legacy(self));
+}
+
+static PyObject *
dtypemeta_get_parametric(PyArray_DTypeMeta *self) {
return PyBool_FromLong(NPY_DT_is_parametric(self));
}
+static PyObject *
+dtypemeta_get_is_numeric(PyArray_DTypeMeta *self) {
+ return PyBool_FromLong(NPY_DT_is_numeric(self));
+}
+
/*
* Simple exposed information, defined for each DType (class).
*/
static PyGetSetDef dtypemeta_getset[] = {
{"_abstract", (getter)dtypemeta_get_abstract, NULL, NULL, NULL},
+ {"_legacy", (getter)dtypemeta_get_legacy, NULL, NULL, NULL},
{"_parametric", (getter)dtypemeta_get_parametric, NULL, NULL, NULL},
+ {"_is_numeric", (getter)dtypemeta_get_is_numeric, NULL, NULL, NULL},
{NULL, NULL, NULL, NULL, NULL}
};
static PyMemberDef dtypemeta_members[] = {
{"type",
- T_OBJECT, offsetof(PyArray_DTypeMeta, scalar_type), READONLY, NULL},
+ T_OBJECT, offsetof(PyArray_DTypeMeta, scalar_type), READONLY,
+ "scalar type corresponding to the DType."},
{NULL, 0, 0, 0, NULL},
};
diff --git a/numpy/core/src/multiarray/dtypemeta.h b/numpy/core/src/multiarray/dtypemeta.h
index 5df333584..32212228c 100644
--- a/numpy/core/src/multiarray/dtypemeta.h
+++ b/numpy/core/src/multiarray/dtypemeta.h
@@ -1,44 +1,18 @@
#ifndef NUMPY_CORE_SRC_MULTIARRAY_DTYPEMETA_H_
#define NUMPY_CORE_SRC_MULTIARRAY_DTYPEMETA_H_
+#include "array_method.h"
+#include "dtype_traversal.h"
+
#ifdef __cplusplus
extern "C" {
#endif
-/* DType flags, currently private, since we may just expose functions */
-#define NPY_DT_LEGACY 1 << 0
-#define NPY_DT_ABSTRACT 1 << 1
-#define NPY_DT_PARAMETRIC 1 << 2
-
-
-typedef PyArray_Descr *(discover_descr_from_pyobject_function)(
- PyArray_DTypeMeta *cls, PyObject *obj);
-
-/*
- * Before making this public, we should decide whether it should pass
- * the type, or allow looking at the object. A possible use-case:
- * `np.array(np.array([0]), dtype=np.ndarray)`
- * Could consider arrays that are not `dtype=ndarray` "scalars".
- */
-typedef int (is_known_scalar_type_function)(
- PyArray_DTypeMeta *cls, PyTypeObject *obj);
-
-typedef PyArray_Descr *(default_descr_function)(PyArray_DTypeMeta *cls);
-typedef PyArray_DTypeMeta *(common_dtype_function)(
- PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2);
-typedef PyArray_Descr *(common_instance_function)(
- PyArray_Descr *dtype1, PyArray_Descr *dtype2);
-typedef PyArray_Descr *(ensure_canonical_function)(PyArray_Descr *dtype);
+#include "numpy/_dtype_api.h"
-/*
- * TODO: These two functions are currently only used for experimental DType
- * API support. Their relation should be "reversed": NumPy should
- * always use them internally.
- * There are open points about "casting safety" though, e.g. setting
- * elements is currently always unsafe.
- */
-typedef int(setitemfunction)(PyArray_Descr *, PyObject *, char *);
-typedef PyObject *(getitemfunction)(PyArray_Descr *, char *);
+/* DType flags, currently private, since we may just expose functions
+ Other publicly visible flags are in _dtype_api.h */
+#define NPY_DT_LEGACY 1 << 0
typedef struct {
@@ -55,10 +29,40 @@ typedef struct {
setitemfunction *setitem;
getitemfunction *getitem;
/*
+ * Either NULL or fetches a clearing function. Clearing means deallocating
+ * any referenced data and setting it to a safe state. For Python objects
+ * this means using `Py_CLEAR` which is equivalent to `Py_DECREF` and
+ * setting the `PyObject *` to NULL.
+ * After the clear, the data must be fillable via cast/copy and calling
+ * clear a second time must be safe.
+ * If the DType class does not implement `get_clear_loop` setting
+ * NPY_ITEM_REFCOUNT on its dtype instances is invalid. Note that it is
+ * acceptable for NPY_ITEM_REFCOUNT to inidicate references that are not
+ * Python objects.
+ */
+ get_traverse_loop_function *get_clear_loop;
+ /*
+ Either NULL or a function that sets a function pointer to a traversal
+ loop that fills an array with zero values appropriate for the dtype. If
+ get_fill_zero_loop is undefined or the function pointer set by it is
+ NULL, the array buffer is allocated with calloc. If this function is
+ defined and it sets a non-NULL function pointer, the array buffer is
+ allocated with malloc and the zero-filling loop function pointer is
+ called to fill the buffer. For the best performance, avoid using this
+ function if a zero-filled array buffer allocated with calloc makes sense
+ for the dtype.
+
+ Note that this is currently used only for zero-filling a newly allocated
+ array. While it can be used to zero-fill an already-filled buffer, that
+ will not work correctly for arrays holding references. If you need to do
+ that, clear the array first.
+ */
+ get_traverse_loop_function *get_fill_zero_loop;
+ /*
* The casting implementation (ArrayMethod) to convert between two
* instances of this DType, stored explicitly for fast access:
*/
- PyObject *within_dtype_castingimpl;
+ PyArrayMethodObject *within_dtype_castingimpl;
/*
* Dictionary of ArrayMethods representing most possible casts
* (structured and object are exceptions).
@@ -74,6 +78,13 @@ typedef struct {
PyArray_ArrFuncs f;
} NPY_DType_Slots;
+// This must be updated if new slots before within_dtype_castingimpl
+// are added
+#define NPY_NUM_DTYPE_SLOTS 10
+#define NPY_NUM_DTYPE_PYARRAY_ARRFUNCS_SLOTS 22
+#define NPY_DT_MAX_ARRFUNCS_SLOT \
+ NPY_NUM_DTYPE_PYARRAY_ARRFUNCS_SLOTS + _NPY_DT_ARRFUNCS_OFFSET
+
#define NPY_DTYPE(descr) ((PyArray_DTypeMeta *)Py_TYPE(descr))
#define NPY_DT_SLOTS(dtype) ((NPY_DType_Slots *)(dtype)->dt_slots)
@@ -81,6 +92,8 @@ typedef struct {
#define NPY_DT_is_legacy(dtype) (((dtype)->flags & NPY_DT_LEGACY) != 0)
#define NPY_DT_is_abstract(dtype) (((dtype)->flags & NPY_DT_ABSTRACT) != 0)
#define NPY_DT_is_parametric(dtype) (((dtype)->flags & NPY_DT_PARAMETRIC) != 0)
+#define NPY_DT_is_numeric(dtype) (((dtype)->flags & NPY_DT_NUMERIC) != 0)
+#define NPY_DT_is_user_defined(dtype) (((dtype)->type_num == -1))
/*
* Macros for convenient classmethod calls, since these require
@@ -127,7 +140,8 @@ python_builtins_are_known_scalar_types(
PyArray_DTypeMeta *cls, PyTypeObject *pytype);
NPY_NO_EXPORT int
-dtypemeta_wrap_legacy_descriptor(PyArray_Descr *dtypem);
+dtypemeta_wrap_legacy_descriptor(
+ PyArray_Descr *dtypem, const char *name, const char *alias);
#ifdef __cplusplus
}
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index cd1a58982..0b3b3fd8c 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -17,6 +17,7 @@
#include <numpy/npy_common.h>
#include <numpy/arrayobject.h>
#include <npy_pycompat.h>
+#include <array_assign.h> //PyArray_AssignRawScalar
#include <ctype.h>
@@ -321,8 +322,7 @@ get_single_op_view(PyArrayObject *op, char *labels,
Py_TYPE(op), PyArray_DESCR(op),
ndim_output, new_dims, new_strides, PyArray_DATA(op),
PyArray_ISWRITEABLE(op) ? NPY_ARRAY_WRITEABLE : 0,
- (PyObject *)op, (PyObject *)op,
- 0, 0);
+ (PyObject *)op, (PyObject *)op, 0);
if (*ret == NULL) {
return -1;
@@ -532,10 +532,17 @@ unbuffered_loop_nop1_ndim2(NpyIter *iter)
* Since the iterator wasn't tracking coordinates, the
* loop provided by the iterator is in Fortran-order.
*/
- NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]);
+ int needs_api = NpyIter_IterationNeedsAPI(iter);
+ if (!needs_api) {
+ NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]);
+ }
for (coord = shape[1]; coord > 0; --coord) {
sop(1, ptrs[0], strides[0], shape[0]);
+ if (needs_api && PyErr_Occurred()){
+ return -1;
+ }
+
ptr = ptrs[1][0] + strides[1][0];
ptrs[0][0] = ptrs[1][0] = ptr;
ptr = ptrs[1][1] + strides[1][1];
@@ -586,11 +593,18 @@ unbuffered_loop_nop1_ndim3(NpyIter *iter)
* Since the iterator wasn't tracking coordinates, the
* loop provided by the iterator is in Fortran-order.
*/
- NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]);
+ int needs_api = NpyIter_IterationNeedsAPI(iter);
+ if (!needs_api) {
+ NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]);
+ }
for (coords[1] = shape[2]; coords[1] > 0; --coords[1]) {
for (coords[0] = shape[1]; coords[0] > 0; --coords[0]) {
sop(1, ptrs[0], strides[0], shape[0]);
+ if (needs_api && PyErr_Occurred()){
+ return -1;
+ }
+
ptr = ptrs[1][0] + strides[1][0];
ptrs[0][0] = ptrs[1][0] = ptr;
ptr = ptrs[1][1] + strides[1][1];
@@ -643,10 +657,17 @@ unbuffered_loop_nop2_ndim2(NpyIter *iter)
* Since the iterator wasn't tracking coordinates, the
* loop provided by the iterator is in Fortran-order.
*/
- NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]);
+ int needs_api = NpyIter_IterationNeedsAPI(iter);
+ if (!needs_api) {
+ NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]);
+ }
for (coord = shape[1]; coord > 0; --coord) {
sop(2, ptrs[0], strides[0], shape[0]);
+ if(needs_api && PyErr_Occurred()){
+ return -1;
+ }
+
ptr = ptrs[1][0] + strides[1][0];
ptrs[0][0] = ptrs[1][0] = ptr;
ptr = ptrs[1][1] + strides[1][1];
@@ -699,11 +720,18 @@ unbuffered_loop_nop2_ndim3(NpyIter *iter)
* Since the iterator wasn't tracking coordinates, the
* loop provided by the iterator is in Fortran-order.
*/
- NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]);
+ int needs_api = NpyIter_IterationNeedsAPI(iter);
+ if (!needs_api) {
+ NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]);
+ }
for (coords[1] = shape[2]; coords[1] > 0; --coords[1]) {
for (coords[0] = shape[1]; coords[0] > 0; --coords[0]) {
sop(2, ptrs[0], strides[0], shape[0]);
+ if(needs_api && PyErr_Occurred()){
+ return -1;
+ }
+
ptr = ptrs[1][0] + strides[1][0];
ptrs[0][0] = ptrs[1][0] = ptr;
ptr = ptrs[1][1] + strides[1][1];
@@ -1025,7 +1053,7 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
goto fail;
}
- /* Initialize the output to all zeros */
+ /* Initialize the output to all zeros or None */
ret = NpyIter_GetOperandArray(iter)[nop];
if (PyArray_AssignZero(ret, NULL) < 0) {
goto fail;
diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index e7b2f2c2c..5f2ccf2dc 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -1026,6 +1026,57 @@ bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
/**end repeat**/
+/**begin repeat
+ * #fn_name =
+ * object_sum_of_products_any,
+ * object_sum_of_products_one,
+ * object_sum_of_products_two,
+ * object_sum_of_products_three,
+ * object_sum_of_products_contig_any,
+ * object_sum_of_products_contig_one,
+ * object_sum_of_products_contig_two,
+ * object_sum_of_products_contig_three,
+ * object_sum_of_products_outstride0_any,
+ * object_sum_of_products_outstride0_one,
+ * object_sum_of_products_outstride0_two,
+ * object_sum_of_products_outstride0_three#
+ */
+static void
+@fn_name@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+ while(count--){
+ PyObject *prod = *(PyObject **)dataptr[0];
+ if (!prod) {
+ prod = Py_None; // convention is to treat nulls as None
+ }
+ Py_INCREF(prod);
+ for (int i = 1; i < nop; ++i){
+ PyObject *curr = *(PyObject **)dataptr[i];
+ if (!curr) {
+ curr = Py_None; // convention is to treat nulls as None
+ }
+ Py_SETREF(prod, PyNumber_Multiply(prod, curr));
+ if (!prod) {
+ return;
+ }
+ }
+
+ PyObject *sum = PyNumber_Add(*(PyObject **)dataptr[nop], prod);
+ Py_DECREF(prod);
+ if (!sum) {
+ return;
+ }
+
+ Py_XDECREF(*(PyObject **)dataptr[nop]);
+ *(PyObject **)dataptr[nop] = sum;
+ for (int i = 0; i <= nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+ }
+}
+/**end repeat**/
+
/* These tables need to match up with the type enum */
static sum_of_products_fn
_contig_outstride0_unary_specialization_table[NPY_NTYPES] = {
@@ -1116,7 +1167,7 @@ static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = {
* 1, 1,
* 1, 1, 1,
* 1, 1, 1,
- * 0, 0, 0, 0,
+ * 1, 0, 0, 0,
* 0, 0, 1#
*/
#if @use@
@@ -1152,7 +1203,7 @@ static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = {
* 1, 1,
* 1, 1, 1,
* 1, 1, 1,
- * 0, 0, 0, 0,
+ * 1, 0, 0, 0,
* 0, 0, 1#
*/
#if @use@
@@ -1188,7 +1239,7 @@ static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = {
* 1, 1,
* 1, 1, 1,
* 1, 1, 1,
- * 0, 0, 0, 0,
+ * 1, 0, 0, 0,
* 0, 0, 1#
*/
#if @use@
diff --git a/numpy/core/src/multiarray/experimental_public_dtype_api.c b/numpy/core/src/multiarray/experimental_public_dtype_api.c
index 84507b481..c0da0a7b7 100644
--- a/numpy/core/src/multiarray/experimental_public_dtype_api.c
+++ b/numpy/core/src/multiarray/experimental_public_dtype_api.c
@@ -16,19 +16,6 @@
#include "common_dtype.h"
-#define EXPERIMENTAL_DTYPE_API_VERSION 5
-
-
-typedef struct{
- PyTypeObject *typeobj; /* type of python scalar or NULL */
- int flags; /* flags, including parametric and abstract */
- /* NULL terminated cast definitions. Use NULL for the newly created DType */
- PyArrayMethod_Spec **casts;
- PyType_Slot *slots;
-} PyArrayDTypeMeta_Spec;
-
-
-
static PyArray_DTypeMeta *
dtype_does_not_promote(
PyArray_DTypeMeta *NPY_UNUSED(self), PyArray_DTypeMeta *NPY_UNUSED(other))
@@ -116,10 +103,6 @@ PyArray_ArrFuncs default_funcs = {
};
-/* other slots are in order, so keep only last around: */
-#define NUM_DTYPE_SLOTS 8
-
-
int
PyArrayInitDTypeMeta_FromSpec(
PyArray_DTypeMeta *DType, PyArrayDTypeMeta_Spec *spec)
@@ -155,10 +138,12 @@ PyArrayInitDTypeMeta_FromSpec(
}
/* Check and handle flags: */
- if (spec->flags & ~(NPY_DT_PARAMETRIC|NPY_DT_ABSTRACT)) {
+ int allowed_flags = NPY_DT_PARAMETRIC | NPY_DT_ABSTRACT | NPY_DT_NUMERIC;
+ if (spec->flags & ~(allowed_flags)) {
PyErr_SetString(PyExc_RuntimeError,
- "invalid DType flags specified, only parametric and abstract "
- "are valid flags for user DTypes.");
+ "invalid DType flags specified, only NPY_DT_PARAMETRIC, "
+ "NPY_DT_ABSTRACT, and NPY_DT_NUMERIC are valid flags for "
+ "user DTypes.");
return -1;
}
@@ -178,6 +163,8 @@ PyArrayInitDTypeMeta_FromSpec(
NPY_DT_SLOTS(DType)->common_instance = NULL;
NPY_DT_SLOTS(DType)->setitem = NULL;
NPY_DT_SLOTS(DType)->getitem = NULL;
+ NPY_DT_SLOTS(DType)->get_clear_loop = NULL;
+ NPY_DT_SLOTS(DType)->f = default_funcs;
PyType_Slot *spec_slot = spec->slots;
while (1) {
@@ -187,20 +174,100 @@ PyArrayInitDTypeMeta_FromSpec(
if (slot == 0) {
break;
}
- if (slot > NUM_DTYPE_SLOTS || slot < 0) {
+ if ((slot < 0) ||
+ ((slot > NPY_NUM_DTYPE_SLOTS) &&
+ (slot <= _NPY_DT_ARRFUNCS_OFFSET)) ||
+ (slot > NPY_DT_MAX_ARRFUNCS_SLOT)) {
PyErr_Format(PyExc_RuntimeError,
"Invalid slot with value %d passed in.", slot);
return -1;
}
/*
- * It is up to the user to get this right, and slots are sorted
- * exactly like they are stored right now:
+ * It is up to the user to get this right, the slots in the public API
+ * are sorted exactly like they are stored in the NPY_DT_Slots struct
+ * right now:
*/
- void **current = (void **)(&(
- NPY_DT_SLOTS(DType)->discover_descr_from_pyobject));
- current += slot - 1;
- *current = pfunc;
+ if (slot <= NPY_NUM_DTYPE_SLOTS) {
+ // slot > NPY_NUM_DTYPE_SLOTS are PyArray_ArrFuncs
+ void **current = (void **)(&(
+ NPY_DT_SLOTS(DType)->discover_descr_from_pyobject));
+ current += slot - 1;
+ *current = pfunc;
+ }
+ else {
+ int f_slot = slot - _NPY_DT_ARRFUNCS_OFFSET;
+ if (1 <= f_slot && f_slot <= NPY_NUM_DTYPE_PYARRAY_ARRFUNCS_SLOTS) {
+ switch (f_slot) {
+ case 1:
+ NPY_DT_SLOTS(DType)->f.getitem = pfunc;
+ break;
+ case 2:
+ NPY_DT_SLOTS(DType)->f.setitem = pfunc;
+ break;
+ case 3:
+ NPY_DT_SLOTS(DType)->f.copyswapn = pfunc;
+ break;
+ case 4:
+ NPY_DT_SLOTS(DType)->f.copyswap = pfunc;
+ break;
+ case 5:
+ NPY_DT_SLOTS(DType)->f.compare = pfunc;
+ break;
+ case 6:
+ NPY_DT_SLOTS(DType)->f.argmax = pfunc;
+ break;
+ case 7:
+ NPY_DT_SLOTS(DType)->f.dotfunc = pfunc;
+ break;
+ case 8:
+ NPY_DT_SLOTS(DType)->f.scanfunc = pfunc;
+ break;
+ case 9:
+ NPY_DT_SLOTS(DType)->f.fromstr = pfunc;
+ break;
+ case 10:
+ NPY_DT_SLOTS(DType)->f.nonzero = pfunc;
+ break;
+ case 11:
+ NPY_DT_SLOTS(DType)->f.fill = pfunc;
+ break;
+ case 12:
+ NPY_DT_SLOTS(DType)->f.fillwithscalar = pfunc;
+ break;
+ case 13:
+ *NPY_DT_SLOTS(DType)->f.sort = pfunc;
+ break;
+ case 14:
+ *NPY_DT_SLOTS(DType)->f.argsort = pfunc;
+ break;
+ case 15:
+ case 16:
+ case 17:
+ case 18:
+ case 19:
+ case 20:
+ case 21:
+ PyErr_Format(
+ PyExc_RuntimeError,
+ "PyArray_ArrFunc casting slot with value %d is disabled.",
+ f_slot
+ );
+ return -1;
+ case 22:
+ NPY_DT_SLOTS(DType)->f.argmin = pfunc;
+ break;
+ }
+ } else {
+ PyErr_Format(
+ PyExc_RuntimeError,
+ "Invalid PyArray_ArrFunc slot with value %d passed in.",
+ f_slot
+ );
+ return -1;
+ }
+ }
}
+
if (NPY_DT_SLOTS(DType)->setitem == NULL
|| NPY_DT_SLOTS(DType)->getitem == NULL) {
PyErr_SetString(PyExc_RuntimeError,
@@ -229,7 +296,7 @@ PyArrayInitDTypeMeta_FromSpec(
return -1;
}
}
- NPY_DT_SLOTS(DType)->f = default_funcs;
+
/* invalid type num. Ideally, we get away with it! */
DType->type_num = -1;
@@ -443,12 +510,12 @@ _get_experimental_dtype_api(PyObject *NPY_UNUSED(mod), PyObject *arg)
if (error_converting(version)) {
return NULL;
}
- if (version != EXPERIMENTAL_DTYPE_API_VERSION) {
+ if (version != __EXPERIMENTAL_DTYPE_API_VERSION) {
PyErr_Format(PyExc_RuntimeError,
"Experimental DType API version %d requested, but NumPy "
"is exporting version %d. Recompile your DType and/or upgrade "
"NumPy to match.",
- version, EXPERIMENTAL_DTYPE_API_VERSION);
+ version, __EXPERIMENTAL_DTYPE_API_VERSION);
return NULL;
}
diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c
index ab35548ed..7022eb231 100644
--- a/numpy/core/src/multiarray/getset.c
+++ b/numpy/core/src/multiarray/getset.c
@@ -797,20 +797,17 @@ array_imag_get(PyArrayObject *self, void *NPY_UNUSED(ignored))
}
else {
Py_INCREF(PyArray_DESCR(self));
- ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(self),
- PyArray_DESCR(self),
- PyArray_NDIM(self),
- PyArray_DIMS(self),
- NULL, NULL,
- PyArray_ISFORTRAN(self),
- (PyObject *)self);
+ ret = (PyArrayObject *)PyArray_NewFromDescr_int(
+ Py_TYPE(self),
+ PyArray_DESCR(self),
+ PyArray_NDIM(self),
+ PyArray_DIMS(self),
+ NULL, NULL,
+ PyArray_ISFORTRAN(self),
+ (PyObject *)self, NULL, _NPY_ARRAY_ZEROED);
if (ret == NULL) {
return NULL;
}
- if (_zerofill(ret) < 0) {
- Py_DECREF(ret);
- return NULL;
- }
PyArray_CLEARFLAGS(ret, NPY_ARRAY_WRITEABLE);
}
return (PyObject *) ret;
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index e5331b2b4..f42ae7c2d 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -17,10 +17,12 @@
#include "multiarraymodule.h"
#include "common.h"
+#include "dtype_transfer.h"
#include "arrayobject.h"
#include "ctors.h"
#include "lowlevel_strided_loops.h"
#include "array_assign.h"
+#include "refcount.h"
#include "npy_sort.h"
#include "npy_partition.h"
@@ -39,7 +41,26 @@ npy_fasttake_impl(
PyArray_Descr *dtype, int axis)
{
NPY_BEGIN_THREADS_DEF;
- NPY_BEGIN_THREADS_DESCR(dtype);
+
+ NPY_cast_info cast_info;
+ NPY_ARRAYMETHOD_FLAGS flags;
+ NPY_cast_info_init(&cast_info);
+
+ if (!needs_refcounting) {
+ /* if "refcounting" is not needed, memcpy is safe for a simple copy */
+ NPY_BEGIN_THREADS;
+ }
+ else {
+ if (PyArray_GetDTypeTransferFunction(
+ 1, itemsize, itemsize, dtype, dtype, 0,
+ &cast_info, &flags) < 0) {
+ return -1;
+ }
+ if (!(flags & NPY_METH_REQUIRES_PYAPI)) {
+ NPY_BEGIN_THREADS;
+ }
+ }
+
switch (clipmode) {
case NPY_RAISE:
for (npy_intp i = 0; i < n; i++) {
@@ -47,22 +68,23 @@ npy_fasttake_impl(
npy_intp tmp = indices[j];
if (check_and_adjust_index(&tmp, max_item, axis,
_save) < 0) {
- return -1;
+ goto fail;
}
char *tmp_src = src + tmp * chunk;
if (needs_refcounting) {
- for (npy_intp k = 0; k < nelem; k++) {
- PyArray_Item_INCREF(tmp_src, dtype);
- PyArray_Item_XDECREF(dest, dtype);
- memmove(dest, tmp_src, itemsize);
- dest += itemsize;
- tmp_src += itemsize;
+ char *data[2] = {tmp_src, dest};
+ npy_intp strides[2] = {itemsize, itemsize};
+ if (cast_info.func(
+ &cast_info.context, data, &nelem, strides,
+ cast_info.auxdata) < 0) {
+ NPY_END_THREADS;
+ goto fail;
}
}
else {
- memmove(dest, tmp_src, chunk);
- dest += chunk;
+ memcpy(dest, tmp_src, chunk);
}
+ dest += chunk;
}
src += chunk*max_item;
}
@@ -83,18 +105,19 @@ npy_fasttake_impl(
}
char *tmp_src = src + tmp * chunk;
if (needs_refcounting) {
- for (npy_intp k = 0; k < nelem; k++) {
- PyArray_Item_INCREF(tmp_src, dtype);
- PyArray_Item_XDECREF(dest, dtype);
- memmove(dest, tmp_src, itemsize);
- dest += itemsize;
- tmp_src += itemsize;
+ char *data[2] = {tmp_src, dest};
+ npy_intp strides[2] = {itemsize, itemsize};
+ if (cast_info.func(
+ &cast_info.context, data, &nelem, strides,
+ cast_info.auxdata) < 0) {
+ NPY_END_THREADS;
+ goto fail;
}
}
else {
- memmove(dest, tmp_src, chunk);
- dest += chunk;
+ memcpy(dest, tmp_src, chunk);
}
+ dest += chunk;
}
src += chunk*max_item;
}
@@ -111,18 +134,19 @@ npy_fasttake_impl(
}
char *tmp_src = src + tmp * chunk;
if (needs_refcounting) {
- for (npy_intp k = 0; k < nelem; k++) {
- PyArray_Item_INCREF(tmp_src, dtype);
- PyArray_Item_XDECREF(dest, dtype);
- memmove(dest, tmp_src, itemsize);
- dest += itemsize;
- tmp_src += itemsize;
+ char *data[2] = {tmp_src, dest};
+ npy_intp strides[2] = {itemsize, itemsize};
+ if (cast_info.func(
+ &cast_info.context, data, &nelem, strides,
+ cast_info.auxdata) < 0) {
+ NPY_END_THREADS;
+ goto fail;
}
}
else {
- memmove(dest, tmp_src, chunk);
- dest += chunk;
+ memcpy(dest, tmp_src, chunk);
}
+ dest += chunk;
}
src += chunk*max_item;
}
@@ -130,7 +154,13 @@ npy_fasttake_impl(
}
NPY_END_THREADS;
+ NPY_cast_info_xfree(&cast_info);
return 0;
+
+ fail:
+ /* NPY_END_THREADS already ensured. */
+ NPY_cast_info_xfree(&cast_info);
+ return -1;
}
@@ -322,11 +352,17 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
NPY_CLIPMODE clipmode)
{
PyArrayObject *indices, *values;
- npy_intp i, chunk, ni, max_item, nv, tmp;
+ npy_intp i, itemsize, ni, max_item, nv, tmp;
char *src, *dest;
int copied = 0;
int overlap = 0;
+ NPY_BEGIN_THREADS_DEF;
+ NPY_cast_info cast_info;
+ NPY_ARRAYMETHOD_FLAGS flags;
+
+ NPY_cast_info_init(&cast_info);
+
indices = NULL;
values = NULL;
if (!PyArray_Check(self)) {
@@ -372,25 +408,51 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
}
max_item = PyArray_SIZE(self);
dest = PyArray_DATA(self);
- chunk = PyArray_DESCR(self)->elsize;
+ itemsize = PyArray_DESCR(self)->elsize;
+
+ int has_references = PyDataType_REFCHK(PyArray_DESCR(self));
+
+ if (!has_references) {
+ /* if there are no references, memcpy is safe for a simple copy */
+ NPY_BEGIN_THREADS_THRESHOLDED(ni);
+ }
+ else {
+ PyArray_Descr *dtype = PyArray_DESCR(self);
+ if (PyArray_GetDTypeTransferFunction(
+ PyArray_ISALIGNED(self), itemsize, itemsize, dtype, dtype, 0,
+ &cast_info, &flags) < 0) {
+ goto fail;
+ }
+ if (!(flags & NPY_METH_REQUIRES_PYAPI)) {
+ NPY_BEGIN_THREADS_THRESHOLDED(ni);
+ }
+ }
+
+
+ if (has_references) {
+ const npy_intp one = 1;
+ const npy_intp strides[2] = {itemsize, itemsize};
- if (PyDataType_REFCHK(PyArray_DESCR(self))) {
switch(clipmode) {
case NPY_RAISE:
for (i = 0; i < ni; i++) {
- src = PyArray_BYTES(values) + chunk*(i % nv);
+ src = PyArray_BYTES(values) + itemsize*(i % nv);
tmp = ((npy_intp *)(PyArray_DATA(indices)))[i];
- if (check_and_adjust_index(&tmp, max_item, 0, NULL) < 0) {
+ if (check_and_adjust_index(&tmp, max_item, 0, _save) < 0) {
+ goto fail;
+ }
+ char *data[2] = {src, dest + tmp*itemsize};
+ if (cast_info.func(
+ &cast_info.context, data, &one, strides,
+ cast_info.auxdata) < 0) {
+ NPY_END_THREADS;
goto fail;
}
- PyArray_Item_INCREF(src, PyArray_DESCR(self));
- PyArray_Item_XDECREF(dest+tmp*chunk, PyArray_DESCR(self));
- memmove(dest + tmp*chunk, src, chunk);
}
break;
case NPY_WRAP:
for (i = 0; i < ni; i++) {
- src = PyArray_BYTES(values) + chunk * (i % nv);
+ src = PyArray_BYTES(values) + itemsize * (i % nv);
tmp = ((npy_intp *)(PyArray_DATA(indices)))[i];
if (tmp < 0) {
while (tmp < 0) {
@@ -402,14 +464,18 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
tmp -= max_item;
}
}
- PyArray_Item_INCREF(src, PyArray_DESCR(self));
- PyArray_Item_XDECREF(dest+tmp*chunk, PyArray_DESCR(self));
- memmove(dest + tmp * chunk, src, chunk);
+ char *data[2] = {src, dest + tmp*itemsize};
+ if (cast_info.func(
+ &cast_info.context, data, &one, strides,
+ cast_info.auxdata) < 0) {
+ NPY_END_THREADS;
+ goto fail;
+ }
}
break;
case NPY_CLIP:
for (i = 0; i < ni; i++) {
- src = PyArray_BYTES(values) + chunk * (i % nv);
+ src = PyArray_BYTES(values) + itemsize * (i % nv);
tmp = ((npy_intp *)(PyArray_DATA(indices)))[i];
if (tmp < 0) {
tmp = 0;
@@ -417,30 +483,32 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
else if (tmp >= max_item) {
tmp = max_item - 1;
}
- PyArray_Item_INCREF(src, PyArray_DESCR(self));
- PyArray_Item_XDECREF(dest+tmp*chunk, PyArray_DESCR(self));
- memmove(dest + tmp * chunk, src, chunk);
+ char *data[2] = {src, dest + tmp*itemsize};
+ if (cast_info.func(
+ &cast_info.context, data, &one, strides,
+ cast_info.auxdata) < 0) {
+ NPY_END_THREADS;
+ goto fail;
+ }
}
break;
}
}
else {
- NPY_BEGIN_THREADS_DEF;
- NPY_BEGIN_THREADS_THRESHOLDED(ni);
switch(clipmode) {
case NPY_RAISE:
for (i = 0; i < ni; i++) {
- src = PyArray_BYTES(values) + chunk * (i % nv);
+ src = PyArray_BYTES(values) + itemsize * (i % nv);
tmp = ((npy_intp *)(PyArray_DATA(indices)))[i];
if (check_and_adjust_index(&tmp, max_item, 0, _save) < 0) {
goto fail;
}
- memmove(dest + tmp * chunk, src, chunk);
+ memmove(dest + tmp * itemsize, src, itemsize);
}
break;
case NPY_WRAP:
for (i = 0; i < ni; i++) {
- src = PyArray_BYTES(values) + chunk * (i % nv);
+ src = PyArray_BYTES(values) + itemsize * (i % nv);
tmp = ((npy_intp *)(PyArray_DATA(indices)))[i];
if (tmp < 0) {
while (tmp < 0) {
@@ -452,12 +520,12 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
tmp -= max_item;
}
}
- memmove(dest + tmp * chunk, src, chunk);
+ memmove(dest + tmp * itemsize, src, itemsize);
}
break;
case NPY_CLIP:
for (i = 0; i < ni; i++) {
- src = PyArray_BYTES(values) + chunk * (i % nv);
+ src = PyArray_BYTES(values) + itemsize * (i % nv);
tmp = ((npy_intp *)(PyArray_DATA(indices)))[i];
if (tmp < 0) {
tmp = 0;
@@ -465,14 +533,16 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
else if (tmp >= max_item) {
tmp = max_item - 1;
}
- memmove(dest + tmp * chunk, src, chunk);
+ memmove(dest + tmp * itemsize, src, itemsize);
}
break;
}
- NPY_END_THREADS;
}
+ NPY_END_THREADS;
finish:
+ NPY_cast_info_xfree(&cast_info);
+
Py_XDECREF(values);
Py_XDECREF(indices);
if (copied) {
@@ -482,6 +552,8 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
Py_RETURN_NONE;
fail:
+ NPY_cast_info_xfree(&cast_info);
+
Py_XDECREF(indices);
Py_XDECREF(values);
if (copied) {
@@ -562,11 +634,12 @@ PyArray_PutMask(PyArrayObject *self, PyObject* values0, PyObject* mask0)
{
PyArrayObject *mask, *values;
PyArray_Descr *dtype;
- npy_intp chunk, ni, nv;
+ npy_intp itemsize, ni, nv;
char *src, *dest;
npy_bool *mask_data;
int copied = 0;
int overlap = 0;
+ NPY_BEGIN_THREADS_DEF;
mask = NULL;
values = NULL;
@@ -627,31 +700,49 @@ PyArray_PutMask(PyArrayObject *self, PyObject* values0, PyObject* mask0)
self = obj;
}
- chunk = PyArray_DESCR(self)->elsize;
+ itemsize = PyArray_DESCR(self)->elsize;
dest = PyArray_DATA(self);
if (PyDataType_REFCHK(PyArray_DESCR(self))) {
+ NPY_cast_info cast_info;
+ NPY_ARRAYMETHOD_FLAGS flags;
+ const npy_intp one = 1;
+ const npy_intp strides[2] = {itemsize, itemsize};
+
+ NPY_cast_info_init(&cast_info);
+ if (PyArray_GetDTypeTransferFunction(
+ PyArray_ISALIGNED(self), itemsize, itemsize, dtype, dtype, 0,
+ &cast_info, &flags) < 0) {
+ goto fail;
+ }
+ if (!(flags & NPY_METH_REQUIRES_PYAPI)) {
+ NPY_BEGIN_THREADS;
+ }
+
for (npy_intp i = 0, j = 0; i < ni; i++, j++) {
if (j >= nv) {
j = 0;
}
if (mask_data[i]) {
- char *src_ptr = src + j*chunk;
- char *dest_ptr = dest + i*chunk;
-
- PyArray_Item_INCREF(src_ptr, PyArray_DESCR(self));
- PyArray_Item_XDECREF(dest_ptr, PyArray_DESCR(self));
- memmove(dest_ptr, src_ptr, chunk);
+ char *data[2] = {src + j*itemsize, dest + i*itemsize};
+ if (cast_info.func(
+ &cast_info.context, data, &one, strides,
+ cast_info.auxdata) < 0) {
+ NPY_END_THREADS;
+ NPY_cast_info_xfree(&cast_info);
+ goto fail;
+ }
}
}
+ NPY_cast_info_xfree(&cast_info);
}
else {
- NPY_BEGIN_THREADS_DEF;
- NPY_BEGIN_THREADS_DESCR(PyArray_DESCR(self));
- npy_fastputmask(dest, src, mask_data, ni, nv, chunk);
- NPY_END_THREADS;
+ NPY_BEGIN_THREADS;
+ npy_fastputmask(dest, src, mask_data, ni, nv, itemsize);
}
+ NPY_END_THREADS;
+
Py_XDECREF(values);
Py_XDECREF(mask);
if (copied) {
@@ -670,6 +761,81 @@ PyArray_PutMask(PyArrayObject *self, PyObject* values0, PyObject* mask0)
return NULL;
}
+static NPY_GCC_OPT_3 inline int
+npy_fastrepeat_impl(
+ npy_intp n_outer, npy_intp n, npy_intp nel, npy_intp chunk,
+ npy_bool broadcast, npy_intp* counts, char* new_data, char* old_data,
+ npy_intp elsize, NPY_cast_info cast_info, int needs_refcounting)
+{
+ npy_intp i, j, k;
+ for (i = 0; i < n_outer; i++) {
+ for (j = 0; j < n; j++) {
+ npy_intp tmp = broadcast ? counts[0] : counts[j];
+ for (k = 0; k < tmp; k++) {
+ if (!needs_refcounting) {
+ memcpy(new_data, old_data, chunk);
+ }
+ else {
+ char *data[2] = {old_data, new_data};
+ npy_intp strides[2] = {elsize, elsize};
+ if (cast_info.func(&cast_info.context, data, &nel,
+ strides, cast_info.auxdata) < 0) {
+ return -1;
+ }
+ }
+ new_data += chunk;
+ }
+ old_data += chunk;
+ }
+ }
+ return 0;
+}
+
+static NPY_GCC_OPT_3 int
+npy_fastrepeat(
+ npy_intp n_outer, npy_intp n, npy_intp nel, npy_intp chunk,
+ npy_bool broadcast, npy_intp* counts, char* new_data, char* old_data,
+ npy_intp elsize, NPY_cast_info cast_info, int needs_refcounting)
+{
+ if (!needs_refcounting) {
+ if (chunk == 1) {
+ return npy_fastrepeat_impl(
+ n_outer, n, nel, chunk, broadcast, counts, new_data, old_data,
+ elsize, cast_info, needs_refcounting);
+ }
+ if (chunk == 2) {
+ return npy_fastrepeat_impl(
+ n_outer, n, nel, chunk, broadcast, counts, new_data, old_data,
+ elsize, cast_info, needs_refcounting);
+ }
+ if (chunk == 4) {
+ return npy_fastrepeat_impl(
+ n_outer, n, nel, chunk, broadcast, counts, new_data, old_data,
+ elsize, cast_info, needs_refcounting);
+ }
+ if (chunk == 8) {
+ return npy_fastrepeat_impl(
+ n_outer, n, nel, chunk, broadcast, counts, new_data, old_data,
+ elsize, cast_info, needs_refcounting);
+ }
+ if (chunk == 16) {
+ return npy_fastrepeat_impl(
+ n_outer, n, nel, chunk, broadcast, counts, new_data, old_data,
+ elsize, cast_info, needs_refcounting);
+ }
+ if (chunk == 32) {
+ return npy_fastrepeat_impl(
+ n_outer, n, nel, chunk, broadcast, counts, new_data, old_data,
+ elsize, cast_info, needs_refcounting);
+ }
+ }
+
+ return npy_fastrepeat_impl(
+ n_outer, n, nel, chunk, broadcast, counts, new_data, old_data, elsize,
+ cast_info, needs_refcounting);
+}
+
+
/*NUMPY_API
* Repeat the array.
*/
@@ -677,13 +843,16 @@ NPY_NO_EXPORT PyObject *
PyArray_Repeat(PyArrayObject *aop, PyObject *op, int axis)
{
npy_intp *counts;
- npy_intp n, n_outer, i, j, k, chunk;
+ npy_intp i, j, n, n_outer, chunk, elsize, nel;
npy_intp total = 0;
npy_bool broadcast = NPY_FALSE;
PyArrayObject *repeats = NULL;
PyObject *ap = NULL;
PyArrayObject *ret = NULL;
char *new_data, *old_data;
+ NPY_cast_info cast_info;
+ NPY_ARRAYMETHOD_FLAGS flags;
+ int needs_refcounting;
repeats = (PyArrayObject *)PyArray_ContiguousFromAny(op, NPY_INTP, 0, 1);
if (repeats == NULL) {
@@ -707,6 +876,8 @@ PyArray_Repeat(PyArrayObject *aop, PyObject *op, int axis)
aop = (PyArrayObject *)ap;
n = PyArray_DIM(aop, axis);
+ NPY_cast_info_init(&cast_info);
+ needs_refcounting = PyDataType_REFCHK(PyArray_DESCR(aop));
if (!broadcast && PyArray_SIZE(repeats) != n) {
PyErr_Format(PyExc_ValueError,
@@ -744,35 +915,41 @@ PyArray_Repeat(PyArrayObject *aop, PyObject *op, int axis)
new_data = PyArray_DATA(ret);
old_data = PyArray_DATA(aop);
- chunk = PyArray_DESCR(aop)->elsize;
+ nel = 1;
+ elsize = PyArray_DESCR(aop)->elsize;
for(i = axis + 1; i < PyArray_NDIM(aop); i++) {
- chunk *= PyArray_DIMS(aop)[i];
+ nel *= PyArray_DIMS(aop)[i];
}
+ chunk = nel*elsize;
n_outer = 1;
for (i = 0; i < axis; i++) {
n_outer *= PyArray_DIMS(aop)[i];
}
- for (i = 0; i < n_outer; i++) {
- for (j = 0; j < n; j++) {
- npy_intp tmp = broadcast ? counts[0] : counts[j];
- for (k = 0; k < tmp; k++) {
- memcpy(new_data, old_data, chunk);
- new_data += chunk;
- }
- old_data += chunk;
+
+ if (needs_refcounting) {
+ if (PyArray_GetDTypeTransferFunction(
+ 1, elsize, elsize, PyArray_DESCR(aop), PyArray_DESCR(aop), 0,
+ &cast_info, &flags) < 0) {
+ goto fail;
}
}
+ if (npy_fastrepeat(n_outer, n, nel, chunk, broadcast, counts, new_data,
+ old_data, elsize, cast_info, needs_refcounting) < 0) {
+ goto fail;
+ }
+
Py_DECREF(repeats);
- PyArray_INCREF(ret);
Py_XDECREF(aop);
+ NPY_cast_info_xfree(&cast_info);
return (PyObject *)ret;
fail:
Py_DECREF(repeats);
Py_XDECREF(aop);
Py_XDECREF(ret);
+ NPY_cast_info_xfree(&cast_info);
return NULL;
}
@@ -945,10 +1122,10 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
npy_intp elsize = (npy_intp)PyArray_ITEMSIZE(op);
npy_intp astride = PyArray_STRIDE(op, axis);
int swap = PyArray_ISBYTESWAPPED(op);
- int needcopy = !IsAligned(op) || swap || astride != elsize;
- int hasrefs = PyDataType_REFCHK(PyArray_DESCR(op));
+ int is_aligned = IsAligned(op);
+ int needcopy = !is_aligned || swap || astride != elsize;
+ int needs_api = PyDataType_FLAGCHK(PyArray_DESCR(op), NPY_NEEDS_PYAPI);
- PyArray_CopySwapNFunc *copyswapn = PyArray_DESCR(op)->f->copyswapn;
char *buffer = NULL;
PyArrayIterObject *it;
@@ -956,6 +1133,12 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
int ret = 0;
+ PyArray_Descr *descr = PyArray_DESCR(op);
+ PyArray_Descr *odescr = NULL;
+
+ NPY_cast_info to_cast_info = {.func = NULL};
+ NPY_cast_info from_cast_info = {.func = NULL};
+
NPY_BEGIN_THREADS_DEF;
/* Check if there is any sorting to do */
@@ -980,32 +1163,48 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
ret = -1;
goto fail;
}
+ if (PyDataType_FLAGCHK(descr, NPY_NEEDS_INIT)) {
+ memset(buffer, 0, N * elsize);
+ }
+
+ if (swap) {
+ odescr = PyArray_DescrNewByteorder(descr, NPY_SWAP);
+ }
+ else {
+ odescr = descr;
+ Py_INCREF(odescr);
+ }
+
+ NPY_ARRAYMETHOD_FLAGS to_transfer_flags;
+
+ if (PyArray_GetDTypeTransferFunction(
+ is_aligned, astride, elsize, descr, odescr, 0, &to_cast_info,
+ &to_transfer_flags) != NPY_SUCCEED) {
+ goto fail;
+ }
+
+ NPY_ARRAYMETHOD_FLAGS from_transfer_flags;
+
+ if (PyArray_GetDTypeTransferFunction(
+ is_aligned, elsize, astride, odescr, descr, 0, &from_cast_info,
+ &from_transfer_flags) != NPY_SUCCEED) {
+ goto fail;
+ }
}
- NPY_BEGIN_THREADS_DESCR(PyArray_DESCR(op));
+ NPY_BEGIN_THREADS_DESCR(descr);
while (size--) {
char *bufptr = it->dataptr;
if (needcopy) {
- if (hasrefs) {
- /*
- * For dtype's with objects, copyswapn Py_XINCREF's src
- * and Py_XDECREF's dst. This would crash if called on
- * an uninitialized buffer, or leak a reference to each
- * object if initialized.
- *
- * So, first do the copy with no refcounting...
- */
- _unaligned_strided_byte_copy(buffer, elsize,
- it->dataptr, astride, N, elsize);
- /* ...then swap in-place if needed */
- if (swap) {
- copyswapn(buffer, elsize, NULL, 0, N, swap, op);
- }
- }
- else {
- copyswapn(buffer, elsize, it->dataptr, astride, N, swap, op);
+ char *args[2] = {it->dataptr, buffer};
+ npy_intp strides[2] = {astride, elsize};
+
+ if (NPY_UNLIKELY(to_cast_info.func(
+ &to_cast_info.context, args, &N, strides,
+ to_cast_info.auxdata) < 0)) {
+ goto fail;
}
bufptr = buffer;
}
@@ -1019,7 +1218,7 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
if (part == NULL) {
ret = sort(bufptr, N, op);
- if (hasrefs && PyErr_Occurred()) {
+ if (needs_api && PyErr_Occurred()) {
ret = -1;
}
if (ret < 0) {
@@ -1032,7 +1231,7 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
npy_intp i;
for (i = 0; i < nkth; ++i) {
ret = part(bufptr, N, kth[i], pivots, &npiv, op);
- if (hasrefs && PyErr_Occurred()) {
+ if (needs_api && PyErr_Occurred()) {
ret = -1;
}
if (ret < 0) {
@@ -1042,15 +1241,13 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
}
if (needcopy) {
- if (hasrefs) {
- if (swap) {
- copyswapn(buffer, elsize, NULL, 0, N, swap, op);
- }
- _unaligned_strided_byte_copy(it->dataptr, astride,
- buffer, elsize, N, elsize);
- }
- else {
- copyswapn(it->dataptr, astride, buffer, elsize, N, swap, op);
+ char *args[2] = {buffer, it->dataptr};
+ npy_intp strides[2] = {elsize, astride};
+
+ if (NPY_UNLIKELY(from_cast_info.func(
+ &from_cast_info.context, args, &N, strides,
+ from_cast_info.auxdata) < 0)) {
+ goto fail;
}
}
@@ -1058,15 +1255,21 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
}
fail:
- NPY_END_THREADS_DESCR(PyArray_DESCR(op));
+ NPY_END_THREADS_DESCR(descr);
/* cleanup internal buffer */
- PyDataMem_UserFREE(buffer, N * elsize, mem_handler);
+ if (needcopy) {
+ PyArray_ClearBuffer(odescr, buffer, elsize, N, 1);
+ PyDataMem_UserFREE(buffer, N * elsize, mem_handler);
+ Py_DECREF(odescr);
+ }
if (ret < 0 && !PyErr_Occurred()) {
/* Out of memory during sorting or buffer creation */
PyErr_NoMemory();
}
Py_DECREF(it);
Py_DECREF(mem_handler);
+ NPY_cast_info_xfree(&to_cast_info);
+ NPY_cast_info_xfree(&from_cast_info);
return ret;
}
@@ -1080,11 +1283,11 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
npy_intp elsize = (npy_intp)PyArray_ITEMSIZE(op);
npy_intp astride = PyArray_STRIDE(op, axis);
int swap = PyArray_ISBYTESWAPPED(op);
- int needcopy = !IsAligned(op) || swap || astride != elsize;
- int hasrefs = PyDataType_REFCHK(PyArray_DESCR(op));
+ int is_aligned = IsAligned(op);
+ int needcopy = !is_aligned || swap || astride != elsize;
+ int needs_api = PyDataType_FLAGCHK(PyArray_DESCR(op), NPY_NEEDS_PYAPI);
int needidxbuffer;
- PyArray_CopySwapNFunc *copyswapn = PyArray_DESCR(op)->f->copyswapn;
char *valbuffer = NULL;
npy_intp *idxbuffer = NULL;
@@ -1096,6 +1299,12 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
int ret = 0;
+ PyArray_Descr *descr = PyArray_DESCR(op);
+ PyArray_Descr *odescr = NULL;
+
+ NPY_ARRAYMETHOD_FLAGS transfer_flags;
+ NPY_cast_info cast_info = {.func = NULL};
+
NPY_BEGIN_THREADS_DEF;
PyObject *mem_handler = PyDataMem_GetHandler();
@@ -1134,6 +1343,23 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
ret = -1;
goto fail;
}
+ if (PyDataType_FLAGCHK(descr, NPY_NEEDS_INIT)) {
+ memset(valbuffer, 0, N * elsize);
+ }
+
+ if (swap) {
+ odescr = PyArray_DescrNewByteorder(descr, NPY_SWAP);
+ }
+ else {
+ odescr = descr;
+ Py_INCREF(odescr);
+ }
+
+ if (PyArray_GetDTypeTransferFunction(
+ is_aligned, astride, elsize, descr, odescr, 0, &cast_info,
+ &transfer_flags) != NPY_SUCCEED) {
+ goto fail;
+ }
}
if (needidxbuffer) {
@@ -1145,7 +1371,7 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
}
}
- NPY_BEGIN_THREADS_DESCR(PyArray_DESCR(op));
+ NPY_BEGIN_THREADS_DESCR(descr);
while (size--) {
char *valptr = it->dataptr;
@@ -1153,25 +1379,13 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
npy_intp *iptr, i;
if (needcopy) {
- if (hasrefs) {
- /*
- * For dtype's with objects, copyswapn Py_XINCREF's src
- * and Py_XDECREF's dst. This would crash if called on
- * an uninitialized valbuffer, or leak a reference to
- * each object item if initialized.
- *
- * So, first do the copy with no refcounting...
- */
- _unaligned_strided_byte_copy(valbuffer, elsize,
- it->dataptr, astride, N, elsize);
- /* ...then swap in-place if needed */
- if (swap) {
- copyswapn(valbuffer, elsize, NULL, 0, N, swap, op);
- }
- }
- else {
- copyswapn(valbuffer, elsize,
- it->dataptr, astride, N, swap, op);
+ char *args[2] = {it->dataptr, valbuffer};
+ npy_intp strides[2] = {astride, elsize};
+
+ if (NPY_UNLIKELY(cast_info.func(
+ &cast_info.context, args, &N, strides,
+ cast_info.auxdata) < 0)) {
+ goto fail;
}
valptr = valbuffer;
}
@@ -1188,7 +1402,7 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
if (argpart == NULL) {
ret = argsort(valptr, idxptr, N, op);
/* Object comparisons may raise an exception in Python 3 */
- if (hasrefs && PyErr_Occurred()) {
+ if (needs_api && PyErr_Occurred()) {
ret = -1;
}
if (ret < 0) {
@@ -1202,7 +1416,7 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
for (i = 0; i < nkth; ++i) {
ret = argpart(valptr, idxptr, N, kth[i], pivots, &npiv, op);
/* Object comparisons may raise an exception in Python 3 */
- if (hasrefs && PyErr_Occurred()) {
+ if (needs_api && PyErr_Occurred()) {
ret = -1;
}
if (ret < 0) {
@@ -1226,9 +1440,13 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
}
fail:
- NPY_END_THREADS_DESCR(PyArray_DESCR(op));
+ NPY_END_THREADS_DESCR(descr);
/* cleanup internal buffers */
- PyDataMem_UserFREE(valbuffer, N * elsize, mem_handler);
+ if (needcopy) {
+ PyArray_ClearBuffer(odescr, valbuffer, elsize, N, 1);
+ PyDataMem_UserFREE(valbuffer, N * elsize, mem_handler);
+ Py_DECREF(odescr);
+ }
PyDataMem_UserFREE(idxbuffer, N * sizeof(npy_intp), mem_handler);
if (ret < 0) {
if (!PyErr_Occurred()) {
@@ -1241,6 +1459,7 @@ fail:
Py_XDECREF(it);
Py_XDECREF(rit);
Py_DECREF(mem_handler);
+ NPY_cast_info_xfree(&cast_info);
return (PyObject *)rop;
}
@@ -2642,7 +2861,7 @@ PyArray_Nonzero(PyArrayObject *self)
}
}
/*
- * Fallback to a branchless strategy to avoid branch misprediction
+ * Fallback to a branchless strategy to avoid branch misprediction
* stalls that are very expensive on most modern processors.
*/
else {
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 3dd488220..b7fcf2ee2 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -889,13 +889,13 @@ get_view_from_index(PyArrayObject *self, PyArrayObject **view,
/* Create the new view and set the base array */
Py_INCREF(PyArray_DESCR(self));
- *view = (PyArrayObject *)PyArray_NewFromDescrAndBase(
+ *view = (PyArrayObject *)PyArray_NewFromDescr_int(
ensure_array ? &PyArray_Type : Py_TYPE(self),
PyArray_DESCR(self),
new_dim, new_shape, new_strides, data_ptr,
PyArray_FLAGS(self),
ensure_array ? NULL : (PyObject *)self,
- (PyObject *)self);
+ (PyObject *)self, _NPY_ARRAY_ENSURE_DTYPE_IDENTITY);
if (*view == NULL) {
return -1;
}
@@ -1361,7 +1361,8 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
PyArray_BYTES(arr) + offset,
PyArray_FLAGS(arr),
(PyObject *)arr, (PyObject *)arr,
- 0, 1);
+ /* We do not preserve the dtype for a subarray one, only str */
+ _NPY_ARRAY_ALLOW_EMPTY_STRING);
if (*view == NULL) {
return 0;
}
@@ -1415,7 +1416,8 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
PyArray_DATA(arr),
PyArray_FLAGS(arr),
(PyObject *)arr, (PyObject *)arr,
- 0, 1);
+ /* We do not preserve the dtype for a subarray one, only str */
+ _NPY_ARRAY_ALLOW_EMPTY_STRING);
if (*view == NULL) {
return 0;
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index fff772769..bc3efc295 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -21,12 +21,14 @@
#include "ctors.h"
#include "calculation.h"
#include "convert_datatype.h"
+#include "descriptor.h"
#include "item_selection.h"
#include "conversion_utils.h"
#include "shape.h"
#include "strfuncs.h"
#include "array_assign.h"
#include "npy_dlpack.h"
+#include "multiarraymodule.h"
#include "methods.h"
#include "alloc.h"
@@ -435,7 +437,7 @@ PyArray_GetField(PyArrayObject *self, PyArray_Descr *typed, int offset)
PyArray_BYTES(self) + offset,
PyArray_FLAGS(self) & ~NPY_ARRAY_F_CONTIGUOUS,
(PyObject *)self, (PyObject *)self,
- 0, 1);
+ _NPY_ARRAY_ALLOW_EMPTY_STRING);
return ret;
}
@@ -851,29 +853,35 @@ static PyObject *
array_astype(PyArrayObject *self,
PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
{
- PyArray_Descr *dtype = NULL;
/*
* TODO: UNSAFE default for compatibility, I think
* switching to SAME_KIND by default would be good.
*/
+ npy_dtype_info dt_info;
NPY_CASTING casting = NPY_UNSAFE_CASTING;
NPY_ORDER order = NPY_KEEPORDER;
_PyArray_CopyMode forcecopy = 1;
int subok = 1;
+
NPY_PREPARE_ARGPARSER;
if (npy_parse_arguments("astype", args, len_args, kwnames,
- "dtype", &PyArray_DescrConverter, &dtype,
+ "dtype", &PyArray_DTypeOrDescrConverterRequired, &dt_info,
"|order", &PyArray_OrderConverter, &order,
"|casting", &PyArray_CastingConverter, &casting,
"|subok", &PyArray_PythonPyIntFromInt, &subok,
"|copy", &PyArray_CopyConverter, &forcecopy,
NULL, NULL, NULL) < 0) {
- Py_XDECREF(dtype);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return NULL;
}
/* If it is not a concrete dtype instance find the best one for the array */
- Py_SETREF(dtype, PyArray_AdaptDescriptorToArray(self, (PyObject *)dtype));
+ PyArray_Descr *dtype;
+
+ dtype = PyArray_AdaptDescriptorToArray(self, dt_info.dtype, dt_info.descr);
+ Py_XDECREF(dt_info.descr);
+ Py_DECREF(dt_info.dtype);
if (dtype == NULL) {
return NULL;
}
@@ -916,29 +924,28 @@ array_astype(PyArrayObject *self,
PyArrayObject *ret;
- /* This steals the reference to dtype, so no DECREF of dtype */
+ /* This steals the reference to dtype */
+ Py_INCREF(dtype);
ret = (PyArrayObject *)PyArray_NewLikeArray(
self, order, dtype, subok);
if (ret == NULL) {
+ Py_DECREF(dtype);
return NULL;
}
- /* NumPy 1.20, 2020-10-01 */
- if ((PyArray_NDIM(self) != PyArray_NDIM(ret)) &&
- DEPRECATE_FUTUREWARNING(
- "casting an array to a subarray dtype "
- "will not use broadcasting in the future, but cast each "
- "element to the new dtype and then append the dtype's shape "
- "to the new array. You can opt-in to the new behaviour, by "
- "additional field to the cast: "
- "`arr.astype(np.dtype([('f', dtype)]))['f']`.\n"
- "This may lead to a different result or to current failures "
- "succeeding. "
- "(FutureWarning since NumPy 1.20)") < 0) {
- Py_DECREF(ret);
- return NULL;
- }
- if (PyArray_CopyInto(ret, self) < 0) {
+ /* Decrease the number of dimensions removing subarray ones again */
+ int out_ndim = PyArray_NDIM(ret);
+ PyArray_Descr *out_descr = PyArray_DESCR(ret);
+ ((PyArrayObject_fields *)ret)->nd = PyArray_NDIM(self);
+ ((PyArrayObject_fields *)ret)->descr = dtype;
+
+ int success = PyArray_CopyInto(ret, self);
+
+ Py_DECREF(dtype);
+ ((PyArrayObject_fields *)ret)->nd = out_ndim;
+ ((PyArrayObject_fields *)ret)->descr = out_descr;
+
+ if (success < 0) {
Py_DECREF(ret);
return NULL;
}
@@ -1095,7 +1102,7 @@ any_array_ufunc_overrides(PyObject *args, PyObject *kwds)
int nin, nout;
PyObject *out_kwd_obj;
PyObject *fast;
- PyObject **in_objs, **out_objs;
+ PyObject **in_objs, **out_objs, *where_obj;
/* check inputs */
nin = PyTuple_Size(args);
@@ -1126,6 +1133,17 @@ any_array_ufunc_overrides(PyObject *args, PyObject *kwds)
}
}
Py_DECREF(out_kwd_obj);
+ /* check where if it exists */
+ where_obj = PyDict_GetItemWithError(kwds, npy_ma_str_where);
+ if (where_obj == NULL) {
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+ } else {
+ if (PyUFunc_HasOverride(where_obj)){
+ return 1;
+ }
+ }
return 0;
}
@@ -2785,9 +2803,7 @@ array_complex(PyArrayObject *self, PyObject *NPY_UNUSED(args))
PyArray_Descr *dtype;
PyObject *c;
- if (PyArray_SIZE(self) != 1) {
- PyErr_SetString(PyExc_TypeError,
- "only length-1 arrays can be converted to Python scalars");
+ if (check_is_convertible_to_scalar(self) < 0) {
return NULL;
}
@@ -2832,25 +2848,15 @@ array_complex(PyArrayObject *self, PyObject *NPY_UNUSED(args))
static PyObject *
array_class_getitem(PyObject *cls, PyObject *args)
{
- PyObject *generic_alias;
-
-#ifdef Py_GENERICALIASOBJECT_H
- Py_ssize_t args_len;
+ const Py_ssize_t args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1;
- args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1;
if ((args_len > 2) || (args_len == 0)) {
return PyErr_Format(PyExc_TypeError,
"Too %s arguments for %s",
args_len > 2 ? "many" : "few",
((PyTypeObject *)cls)->tp_name);
}
- generic_alias = Py_GenericAlias(cls, args);
-#else
- PyErr_SetString(PyExc_TypeError,
- "Type subscription requires python >= 3.9");
- generic_alias = NULL;
-#endif
- return generic_alias;
+ return Py_GenericAlias(cls, args);
}
NPY_NO_EXPORT PyMethodDef array_methods[] = {
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 5da3d66df..d7d19493b 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -66,6 +66,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
#include "compiled_base.h"
#include "mem_overlap.h"
#include "typeinfo.h"
+#include "convert.h" /* for PyArray_AssignZero */
#include "get_attr_string.h"
#include "experimental_public_dtype_api.h" /* _get_experimental_dtype_api */
@@ -98,6 +99,12 @@ _umath_strings_richcompare(
* future.
*/
int npy_legacy_print_mode = INT_MAX;
+/*
+ * Global variable to check whether NumPy 2.0 behavior is opted in.
+ * This flag is considered a runtime constant.
+ */
+int npy_numpy2_behavior = NPY_FALSE;
+
static PyObject *
set_legacy_print_mode(PyObject *NPY_UNUSED(self), PyObject *args)
@@ -467,7 +474,7 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
/* Get the priority subtype for the array */
PyTypeObject *subtype = PyArray_GetSubType(narrays, arrays);
PyArray_Descr *descr = PyArray_FindConcatenationDescriptor(
- narrays, arrays, (PyObject *)dtype);
+ narrays, arrays, dtype);
if (descr == NULL) {
return NULL;
}
@@ -488,7 +495,7 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
/* Allocate the array for the result. This steals the 'dtype' reference. */
ret = (PyArrayObject *)PyArray_NewFromDescr_int(
subtype, descr, ndim, shape, strides, NULL, 0, NULL,
- NULL, 0, 1);
+ NULL, _NPY_ARRAY_ALLOW_EMPTY_STRING);
if (ret == NULL) {
return NULL;
}
@@ -583,7 +590,7 @@ PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays,
PyTypeObject *subtype = PyArray_GetSubType(narrays, arrays);
PyArray_Descr *descr = PyArray_FindConcatenationDescriptor(
- narrays, arrays, (PyObject *)dtype);
+ narrays, arrays, dtype);
if (descr == NULL) {
return NULL;
}
@@ -593,7 +600,7 @@ PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays,
/* Allocate the array for the result. This steals the 'dtype' reference. */
ret = (PyArrayObject *)PyArray_NewFromDescr_int(
subtype, descr, 1, &shape, &stride, NULL, 0, NULL,
- NULL, 0, 1);
+ NULL, _NPY_ARRAY_ALLOW_EMPTY_STRING);
if (ret == NULL) {
return NULL;
}
@@ -1036,12 +1043,11 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
#endif
if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
- result = (PyArray_NDIM(ap1) == 0 ? ap1 : ap2);
- result = (PyArrayObject *)Py_TYPE(result)->tp_as_number->nb_multiply(
- (PyObject *)ap1, (PyObject *)ap2);
+ PyObject *mul_res = PyObject_CallFunctionObjArgs(
+ n_ops.multiply, ap1, ap2, out, NULL);
Py_DECREF(ap1);
Py_DECREF(ap2);
- return (PyObject *)result;
+ return mul_res;
}
l = PyArray_DIMS(ap1)[PyArray_NDIM(ap1) - 1];
if (PyArray_NDIM(ap2) > 1) {
@@ -1079,7 +1085,9 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
}
/* Ensure that multiarray.dot(<Nx0>,<0xM>) -> zeros((N,M)) */
if (PyArray_SIZE(ap1) == 0 && PyArray_SIZE(ap2) == 0) {
- memset(PyArray_DATA(out_buf), 0, PyArray_NBYTES(out_buf));
+ if (PyArray_AssignZero(out_buf, NULL) < 0) {
+ goto fail;
+ }
}
dot = PyArray_DESCR(out_buf)->f->dotfunc;
@@ -1492,19 +1500,26 @@ fail:
return NULL;
}
-
static PyObject *
-array_putmask(PyObject *NPY_UNUSED(module), PyObject *args, PyObject *kwds)
+array_putmask(PyObject *NPY_UNUSED(module), PyObject *const *args,
+ Py_ssize_t len_args, PyObject *kwnames )
{
PyObject *mask, *values;
PyObject *array;
- static char *kwlist[] = {"arr", "mask", "values", NULL};
-
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!OO:putmask", kwlist,
- &PyArray_Type, &array, &mask, &values)) {
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("putmask", args, len_args, kwnames,
+ "", NULL, &array,
+ "mask", NULL, &mask,
+ "values", NULL, &values,
+ NULL, NULL, NULL) < 0) {
return NULL;
}
+ if (!PyArray_Check(array)) {
+ PyErr_SetString(PyExc_TypeError,
+ "argument a of putmask must be a numpy array");
+ }
+
return PyArray_PutMask((PyArrayObject *)array, values, mask);
}
@@ -1622,25 +1637,35 @@ _prepend_ones(PyArrayObject *arr, int nd, int ndmin, NPY_ORDER order)
((order) == NPY_CORDER && PyArray_IS_C_CONTIGUOUS(op)) || \
((order) == NPY_FORTRANORDER && PyArray_IS_F_CONTIGUOUS(op)))
+
static inline PyObject *
_array_fromobject_generic(
- PyObject *op, PyArray_Descr *type, _PyArray_CopyMode copy, NPY_ORDER order,
- npy_bool subok, int ndmin)
+ PyObject *op, PyArray_Descr *in_descr, PyArray_DTypeMeta *in_DType,
+ _PyArray_CopyMode copy, NPY_ORDER order, npy_bool subok, int ndmin)
{
PyArrayObject *oparr = NULL, *ret = NULL;
PyArray_Descr *oldtype = NULL;
int nd, flags = 0;
+ /* Hold on to `in_descr` as `dtype`, since we may also set it below. */
+ Py_XINCREF(in_descr);
+ PyArray_Descr *dtype = in_descr;
+
if (ndmin > NPY_MAXDIMS) {
PyErr_Format(PyExc_ValueError,
"ndmin bigger than allowable number of dimensions "
"NPY_MAXDIMS (=%d)", NPY_MAXDIMS);
- return NULL;
+ goto finish;
}
/* fast exit if simple call */
if (PyArray_CheckExact(op) || (subok && PyArray_Check(op))) {
oparr = (PyArrayObject *)op;
- if (type == NULL) {
+
+ if (dtype == NULL && in_DType == NULL) {
+ /*
+ * User did not ask for a specific dtype instance or class. So
+ * we can return either self or a copy.
+ */
if (copy != NPY_COPY_ALWAYS && STRIDING_OK(oparr, order)) {
ret = oparr;
Py_INCREF(ret);
@@ -1650,17 +1675,30 @@ _array_fromobject_generic(
if (copy == NPY_COPY_NEVER) {
PyErr_SetString(PyExc_ValueError,
"Unable to avoid copy while creating a new array.");
- return NULL;
+ goto finish;
}
ret = (PyArrayObject *)PyArray_NewCopy(oparr, order);
goto finish;
}
}
+ else if (dtype == NULL) {
+ /*
+ * If the user passed a DType class but not a dtype instance,
+ * we must use `PyArray_AdaptDescriptorToArray` to find the
+ * correct dtype instance.
+ * Even if the fast-path doesn't work we will use this.
+ */
+ dtype = PyArray_AdaptDescriptorToArray(oparr, in_DType, NULL);
+ if (dtype == NULL) {
+ goto finish;
+ }
+ }
+
/* One more chance for faster exit if user specified the dtype. */
oldtype = PyArray_DESCR(oparr);
- if (PyArray_EquivTypes(oldtype, type)) {
+ if (PyArray_EquivTypes(oldtype, dtype)) {
if (copy != NPY_COPY_ALWAYS && STRIDING_OK(oparr, order)) {
- if (oldtype == type) {
+ if (oldtype == dtype) {
Py_INCREF(op);
ret = oparr;
}
@@ -1668,10 +1706,10 @@ _array_fromobject_generic(
/* Create a new PyArrayObject from the caller's
* PyArray_Descr. Use the reference `op` as the base
* object. */
- Py_INCREF(type);
+ Py_INCREF(dtype);
ret = (PyArrayObject *)PyArray_NewFromDescrAndBase(
Py_TYPE(op),
- type,
+ dtype,
PyArray_NDIM(oparr),
PyArray_DIMS(oparr),
PyArray_STRIDES(oparr),
@@ -1687,10 +1725,10 @@ _array_fromobject_generic(
if (copy == NPY_COPY_NEVER) {
PyErr_SetString(PyExc_ValueError,
"Unable to avoid copy while creating a new array.");
- return NULL;
+ goto finish;
}
ret = (PyArrayObject *)PyArray_NewCopy(oparr, order);
- if (oldtype == type || ret == NULL) {
+ if (oldtype == dtype || ret == NULL) {
goto finish;
}
Py_INCREF(oldtype);
@@ -1721,11 +1759,13 @@ _array_fromobject_generic(
}
flags |= NPY_ARRAY_FORCECAST;
- Py_XINCREF(type);
- ret = (PyArrayObject *)PyArray_CheckFromAny(op, type,
- 0, 0, flags, NULL);
+
+ ret = (PyArrayObject *)PyArray_CheckFromAny_int(
+ op, dtype, in_DType, 0, 0, flags, NULL);
finish:
+ Py_XDECREF(dtype);
+
if (ret == NULL) {
return NULL;
}
@@ -1752,7 +1792,7 @@ array_array(PyObject *NPY_UNUSED(ignored),
npy_bool subok = NPY_FALSE;
_PyArray_CopyMode copy = NPY_COPY_ALWAYS;
int ndmin = 0;
- PyArray_Descr *type = NULL;
+ npy_dtype_info dt_info = {NULL, NULL};
NPY_ORDER order = NPY_KEEPORDER;
PyObject *like = Py_None;
NPY_PREPARE_ARGPARSER;
@@ -1760,21 +1800,23 @@ array_array(PyObject *NPY_UNUSED(ignored),
if (len_args != 1 || (kwnames != NULL)) {
if (npy_parse_arguments("array", args, len_args, kwnames,
"object", NULL, &op,
- "|dtype", &PyArray_DescrConverter2, &type,
+ "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info,
"$copy", &PyArray_CopyConverter, &copy,
"$order", &PyArray_OrderConverter, &order,
"$subok", &PyArray_BoolConverter, &subok,
"$ndmin", &PyArray_PythonPyIntFromInt, &ndmin,
"$like", NULL, &like,
NULL, NULL, NULL) < 0) {
- Py_XDECREF(type);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return NULL;
}
if (like != Py_None) {
PyObject *deferred = array_implement_c_array_function_creation(
"array", like, NULL, NULL, args, len_args, kwnames);
if (deferred != Py_NotImplemented) {
- Py_XDECREF(type);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return deferred;
}
}
@@ -1785,8 +1827,9 @@ array_array(PyObject *NPY_UNUSED(ignored),
}
PyObject *res = _array_fromobject_generic(
- op, type, copy, order, subok, ndmin);
- Py_XDECREF(type);
+ op, dt_info.descr, dt_info.dtype, copy, order, subok, ndmin);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return res;
}
@@ -1795,7 +1838,7 @@ array_asarray(PyObject *NPY_UNUSED(ignored),
PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
{
PyObject *op;
- PyArray_Descr *type = NULL;
+ npy_dtype_info dt_info = {NULL, NULL};
NPY_ORDER order = NPY_KEEPORDER;
PyObject *like = Py_None;
NPY_PREPARE_ARGPARSER;
@@ -1803,18 +1846,20 @@ array_asarray(PyObject *NPY_UNUSED(ignored),
if (len_args != 1 || (kwnames != NULL)) {
if (npy_parse_arguments("asarray", args, len_args, kwnames,
"a", NULL, &op,
- "|dtype", &PyArray_DescrConverter2, &type,
+ "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info,
"|order", &PyArray_OrderConverter, &order,
"$like", NULL, &like,
NULL, NULL, NULL) < 0) {
- Py_XDECREF(type);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return NULL;
}
if (like != Py_None) {
PyObject *deferred = array_implement_c_array_function_creation(
"asarray", like, NULL, NULL, args, len_args, kwnames);
if (deferred != Py_NotImplemented) {
- Py_XDECREF(type);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return deferred;
}
}
@@ -1824,8 +1869,9 @@ array_asarray(PyObject *NPY_UNUSED(ignored),
}
PyObject *res = _array_fromobject_generic(
- op, type, NPY_FALSE, order, NPY_FALSE, 0);
- Py_XDECREF(type);
+ op, dt_info.descr, dt_info.dtype, NPY_FALSE, order, NPY_FALSE, 0);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return res;
}
@@ -1834,7 +1880,7 @@ array_asanyarray(PyObject *NPY_UNUSED(ignored),
PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
{
PyObject *op;
- PyArray_Descr *type = NULL;
+ npy_dtype_info dt_info = {NULL, NULL};
NPY_ORDER order = NPY_KEEPORDER;
PyObject *like = Py_None;
NPY_PREPARE_ARGPARSER;
@@ -1842,18 +1888,20 @@ array_asanyarray(PyObject *NPY_UNUSED(ignored),
if (len_args != 1 || (kwnames != NULL)) {
if (npy_parse_arguments("asanyarray", args, len_args, kwnames,
"a", NULL, &op,
- "|dtype", &PyArray_DescrConverter2, &type,
+ "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info,
"|order", &PyArray_OrderConverter, &order,
"$like", NULL, &like,
NULL, NULL, NULL) < 0) {
- Py_XDECREF(type);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return NULL;
}
if (like != Py_None) {
PyObject *deferred = array_implement_c_array_function_creation(
"asanyarray", like, NULL, NULL, args, len_args, kwnames);
if (deferred != Py_NotImplemented) {
- Py_XDECREF(type);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return deferred;
}
}
@@ -1863,8 +1911,9 @@ array_asanyarray(PyObject *NPY_UNUSED(ignored),
}
PyObject *res = _array_fromobject_generic(
- op, type, NPY_FALSE, order, NPY_TRUE, 0);
- Py_XDECREF(type);
+ op, dt_info.descr, dt_info.dtype, NPY_FALSE, order, NPY_TRUE, 0);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return res;
}
@@ -1874,24 +1923,26 @@ array_ascontiguousarray(PyObject *NPY_UNUSED(ignored),
PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
{
PyObject *op;
- PyArray_Descr *type = NULL;
+ npy_dtype_info dt_info = {NULL, NULL};
PyObject *like = Py_None;
NPY_PREPARE_ARGPARSER;
if (len_args != 1 || (kwnames != NULL)) {
if (npy_parse_arguments("ascontiguousarray", args, len_args, kwnames,
"a", NULL, &op,
- "|dtype", &PyArray_DescrConverter2, &type,
+ "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info,
"$like", NULL, &like,
NULL, NULL, NULL) < 0) {
- Py_XDECREF(type);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return NULL;
}
if (like != Py_None) {
PyObject *deferred = array_implement_c_array_function_creation(
"ascontiguousarray", like, NULL, NULL, args, len_args, kwnames);
if (deferred != Py_NotImplemented) {
- Py_XDECREF(type);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return deferred;
}
}
@@ -1901,8 +1952,10 @@ array_ascontiguousarray(PyObject *NPY_UNUSED(ignored),
}
PyObject *res = _array_fromobject_generic(
- op, type, NPY_FALSE, NPY_CORDER, NPY_FALSE, 1);
- Py_XDECREF(type);
+ op, dt_info.descr, dt_info.dtype, NPY_FALSE, NPY_CORDER, NPY_FALSE,
+ 1);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return res;
}
@@ -1912,24 +1965,26 @@ array_asfortranarray(PyObject *NPY_UNUSED(ignored),
PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
{
PyObject *op;
- PyArray_Descr *type = NULL;
+ npy_dtype_info dt_info = {NULL, NULL};
PyObject *like = Py_None;
NPY_PREPARE_ARGPARSER;
if (len_args != 1 || (kwnames != NULL)) {
if (npy_parse_arguments("asfortranarray", args, len_args, kwnames,
"a", NULL, &op,
- "|dtype", &PyArray_DescrConverter2, &type,
+ "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info,
"$like", NULL, &like,
NULL, NULL, NULL) < 0) {
- Py_XDECREF(type);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return NULL;
}
if (like != Py_None) {
PyObject *deferred = array_implement_c_array_function_creation(
"asfortranarray", like, NULL, NULL, args, len_args, kwnames);
if (deferred != Py_NotImplemented) {
- Py_XDECREF(type);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return deferred;
}
}
@@ -1939,8 +1994,10 @@ array_asfortranarray(PyObject *NPY_UNUSED(ignored),
}
PyObject *res = _array_fromobject_generic(
- op, type, NPY_FALSE, NPY_FORTRANORDER, NPY_FALSE, 1);
- Py_XDECREF(type);
+ op, dt_info.descr, dt_info.dtype, NPY_FALSE, NPY_FORTRANORDER,
+ NPY_FALSE, 1);
+ Py_XDECREF(dt_info.descr);
+ Py_XDECREF(dt_info.dtype);
return res;
}
@@ -2046,10 +2103,9 @@ fail:
}
static PyObject *
-array_empty_like(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
+array_empty_like(PyObject *NPY_UNUSED(ignored),
+ PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
{
-
- static char *kwlist[] = {"prototype", "dtype", "order", "subok", "shape", NULL};
PyArrayObject *prototype = NULL;
PyArray_Descr *dtype = NULL;
NPY_ORDER order = NPY_KEEPORDER;
@@ -2058,12 +2114,15 @@ array_empty_like(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
/* -1 is a special value meaning "not specified" */
PyArray_Dims shape = {NULL, -1};
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&iO&:empty_like", kwlist,
- &PyArray_Converter, &prototype,
- &PyArray_DescrConverter2, &dtype,
- &PyArray_OrderConverter, &order,
- &subok,
- &PyArray_OptionalIntpConverter, &shape)) {
+ NPY_PREPARE_ARGPARSER;
+
+ if (npy_parse_arguments("empty_like", args, len_args, kwnames,
+ "prototype", &PyArray_Converter, &prototype,
+ "|dtype", &PyArray_DescrConverter2, &dtype,
+ "|order", &PyArray_OrderConverter, &order,
+ "|subok", &PyArray_PythonPyIntFromInt, &subok,
+ "|shape", &PyArray_OptionalIntpConverter, &shape,
+ NULL, NULL, NULL) < 0) {
goto fail;
}
/* steals the reference to dtype if it's not NULL */
@@ -2249,12 +2308,15 @@ fail:
}
static PyObject *
-array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
+array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *const *args, Py_ssize_t len_args)
{
PyArrayObject *array;
npy_intp count;
- if (!PyArg_ParseTuple(args, "O&:count_nonzero", PyArray_Converter, &array)) {
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("count_nonzero", args, len_args, NULL,
+ "", PyArray_Converter, &array,
+ NULL, NULL, NULL) < 0) {
return NULL;
}
@@ -2460,7 +2522,8 @@ array_frombuffer(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds
}
static PyObject *
-array_concatenate(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
+array_concatenate(PyObject *NPY_UNUSED(dummy),
+ PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
{
PyObject *a0;
PyObject *out = NULL;
@@ -2469,10 +2532,15 @@ array_concatenate(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
PyObject *casting_obj = NULL;
PyObject *res;
int axis = 0;
- static char *kwlist[] = {"seq", "axis", "out", "dtype", "casting", NULL};
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&O$O&O:concatenate", kwlist,
- &a0, PyArray_AxisConverter, &axis, &out,
- PyArray_DescrConverter2, &dtype, &casting_obj)) {
+
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("concatenate", args, len_args, kwnames,
+ "seq", NULL, &a0,
+ "|axis", &PyArray_AxisConverter, &axis,
+ "|out", NULL, &out,
+ "$dtype", &PyArray_DescrConverter2, &dtype,
+ "$casting", NULL, &casting_obj,
+ NULL, NULL, NULL) < 0) {
return NULL;
}
int casting_not_passed = 0;
@@ -2504,25 +2572,34 @@ array_concatenate(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
}
static PyObject *
-array_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
+array_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *const *args, Py_ssize_t len_args)
{
PyObject *b0, *a0;
- if (!PyArg_ParseTuple(args, "OO:innerproduct", &a0, &b0)) {
- return NULL;
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("innerproduct", args, len_args, NULL,
+ "", NULL, &a0,
+ "", NULL, &b0,
+ NULL, NULL, NULL) < 0) {
+ return NULL;
}
+
return PyArray_Return((PyArrayObject *)PyArray_InnerProduct(a0, b0));
}
static PyObject *
-array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds)
+array_matrixproduct(PyObject *NPY_UNUSED(dummy),
+ PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
{
PyObject *v, *a, *o = NULL;
PyArrayObject *ret;
- static char* kwlist[] = {"a", "b", "out", NULL};
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|O:matrixproduct",
- kwlist, &a, &v, &o)) {
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("dot", args, len_args, kwnames,
+ "a", NULL, &a,
+ "b", NULL, &v,
+ "|out", NULL, &o,
+ NULL, NULL, NULL) < 0) {
return NULL;
}
if (o != NULL) {
@@ -2540,7 +2617,7 @@ array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds)
static PyObject *
-array_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args)
+array_vdot(PyObject *NPY_UNUSED(dummy), PyObject *const *args, Py_ssize_t len_args)
{
int typenum;
char *ip1, *ip2, *op;
@@ -2553,7 +2630,11 @@ array_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args)
PyArray_DotFunc *vdot;
NPY_BEGIN_THREADS_DEF;
- if (!PyArg_ParseTuple(args, "OO:vdot", &op1, &op2)) {
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("vdot", args, len_args, NULL,
+ "", NULL, &op1,
+ "", NULL, &op2,
+ NULL, NULL, NULL) < 0) {
return NULL;
}
@@ -3130,13 +3211,9 @@ PyArray_GetNDArrayCFeatureVersion(void)
}
static PyObject *
-array__get_ndarray_c_version(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
+array__get_ndarray_c_version(
+ PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(arg))
{
- static char *kwlist[] = {NULL};
-
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist )) {
- return NULL;
- }
return PyLong_FromLong( (long) PyArray_GetNDArrayCVersion() );
}
@@ -3431,32 +3508,42 @@ fail:
#undef INNER_WHERE_LOOP
static PyObject *
-array_where(PyObject *NPY_UNUSED(ignored), PyObject *args)
+array_where(PyObject *NPY_UNUSED(ignored), PyObject *const *args, Py_ssize_t len_args)
{
PyObject *obj = NULL, *x = NULL, *y = NULL;
- if (!PyArg_ParseTuple(args, "O|OO:where", &obj, &x, &y)) {
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("where", args, len_args, NULL,
+ "", NULL, &obj,
+ "|x", NULL, &x,
+ "|y", NULL, &y,
+ NULL, NULL, NULL) < 0) {
return NULL;
}
+
return PyArray_Where(obj, x, y);
}
static PyObject *
-array_lexsort(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
+array_lexsort(PyObject *NPY_UNUSED(ignored), PyObject *const *args, Py_ssize_t len_args,
+ PyObject *kwnames)
{
int axis = -1;
PyObject *obj;
- static char *kwlist[] = {"keys", "axis", NULL};
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|i:lexsort", kwlist, &obj, &axis)) {
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("lexsort", args, len_args, kwnames,
+ "keys", NULL, &obj,
+ "|axis", PyArray_PythonPyIntFromInt, &axis,
+ NULL, NULL, NULL) < 0) {
return NULL;
}
return PyArray_Return((PyArrayObject *)PyArray_LexSort(obj, axis));
}
static PyObject *
-array_can_cast_safely(PyObject *NPY_UNUSED(self), PyObject *args,
- PyObject *kwds)
+array_can_cast_safely(PyObject *NPY_UNUSED(self),
+ PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
{
PyObject *from_obj = NULL;
PyArray_Descr *d1 = NULL;
@@ -3464,12 +3551,13 @@ array_can_cast_safely(PyObject *NPY_UNUSED(self), PyObject *args,
int ret;
PyObject *retobj = NULL;
NPY_CASTING casting = NPY_SAFE_CASTING;
- static char *kwlist[] = {"from_", "to", "casting", NULL};
- if(!PyArg_ParseTupleAndKeywords(args, kwds, "OO&|O&:can_cast", kwlist,
- &from_obj,
- PyArray_DescrConverter2, &d2,
- PyArray_CastingConverter, &casting)) {
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("can_cast", args, len_args, kwnames,
+ "from_", NULL, &from_obj,
+ "to", &PyArray_DescrConverter2, &d2,
+ "|casting", &PyArray_CastingConverter, &casting,
+ NULL, NULL, NULL) < 0) {
goto finish;
}
if (d2 == NULL) {
@@ -3512,13 +3600,17 @@ array_can_cast_safely(PyObject *NPY_UNUSED(self), PyObject *args,
}
static PyObject *
-array_promote_types(PyObject *NPY_UNUSED(dummy), PyObject *args)
+array_promote_types(PyObject *NPY_UNUSED(dummy), PyObject *const *args, Py_ssize_t len_args)
{
PyArray_Descr *d1 = NULL;
PyArray_Descr *d2 = NULL;
PyObject *ret = NULL;
- if (!PyArg_ParseTuple(args, "O&O&:promote_types",
- PyArray_DescrConverter2, &d1, PyArray_DescrConverter2, &d2)) {
+
+ NPY_PREPARE_ARGPARSER;
+ if (npy_parse_arguments("promote_types", args, len_args, NULL,
+ "", PyArray_DescrConverter2, &d1,
+ "", PyArray_DescrConverter2, &d2,
+ NULL, NULL, NULL) < 0) {
goto finish;
}
@@ -3558,14 +3650,13 @@ array_min_scalar_type(PyObject *NPY_UNUSED(dummy), PyObject *args)
}
static PyObject *
-array_result_type(PyObject *NPY_UNUSED(dummy), PyObject *args)
+array_result_type(PyObject *NPY_UNUSED(dummy), PyObject *const *args, Py_ssize_t len)
{
- npy_intp i, len, narr = 0, ndtypes = 0;
+ npy_intp i, narr = 0, ndtypes = 0;
PyArrayObject **arr = NULL;
PyArray_Descr **dtypes = NULL;
PyObject *ret = NULL;
- len = PyTuple_GET_SIZE(args);
if (len == 0) {
PyErr_SetString(PyExc_ValueError,
"at least one array or dtype is required");
@@ -3579,7 +3670,7 @@ array_result_type(PyObject *NPY_UNUSED(dummy), PyObject *args)
dtypes = (PyArray_Descr**)&arr[len];
for (i = 0; i < len; ++i) {
- PyObject *obj = PyTuple_GET_ITEM(args, i);
+ PyObject *obj = args[i];
if (PyArray_Check(obj)) {
Py_INCREF(obj);
arr[narr] = (PyArrayObject *)obj;
@@ -3785,6 +3876,34 @@ format_longfloat(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
TrimMode_LeaveOneZero, -1, -1);
}
+/*
+ * returns 1 if array is a user-defined string dtype, sets an error and
+ * returns 0 otherwise
+ */
+static int _is_user_defined_string_array(PyArrayObject* array)
+{
+ if (NPY_DT_is_user_defined(PyArray_DESCR(array))) {
+ PyTypeObject* scalar_type = NPY_DTYPE(PyArray_DESCR(array))->scalar_type;
+ if (PyType_IsSubtype(scalar_type, &PyBytes_Type) ||
+ PyType_IsSubtype(scalar_type, &PyUnicode_Type)) {
+ return 1;
+ }
+ else {
+ PyErr_SetString(
+ PyExc_TypeError,
+ "string comparisons are only allowed for dtypes with a "
+ "scalar type that is a subtype of str or bytes.");
+ return 0;
+ }
+ }
+ else {
+ PyErr_SetString(
+ PyExc_TypeError,
+ "string operation on non-string array");
+ return 0;
+ }
+}
+
/*
* The only purpose of this function is that it allows the "rstrip".
@@ -3861,6 +3980,9 @@ compare_chararrays(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
else {
PyErr_SetString(PyExc_TypeError,
"comparison of non-string arrays");
+ Py_DECREF(newarr);
+ Py_DECREF(newoth);
+ return NULL;
}
Py_DECREF(newarr);
Py_DECREF(newoth);
@@ -4061,10 +4183,15 @@ _vec_string(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUSED(kw
method = PyObject_GetAttr((PyObject *)&PyUnicode_Type, method_name);
}
else {
- PyErr_SetString(PyExc_TypeError,
- "string operation on non-string array");
- Py_DECREF(type);
- goto err;
+ if (_is_user_defined_string_array(char_array)) {
+ PyTypeObject* scalar_type =
+ NPY_DTYPE(PyArray_DESCR(char_array))->scalar_type;
+ method = PyObject_GetAttr((PyObject*)scalar_type, method_name);
+ }
+ else {
+ Py_DECREF(type);
+ goto err;
+ }
}
if (method == NULL) {
Py_DECREF(type);
@@ -4339,13 +4466,22 @@ _reload_guard(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args)) {
Py_RETURN_NONE;
}
+
+static PyObject *
+_using_numpy2_behavior(
+ PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
+{
+ return PyBool_FromLong(npy_numpy2_behavior);
+}
+
+
static struct PyMethodDef array_module_methods[] = {
{"_get_implementing_args",
(PyCFunction)array__get_implementing_args,
METH_VARARGS, NULL},
{"_get_ndarray_c_version",
(PyCFunction)array__get_ndarray_c_version,
- METH_VARARGS|METH_KEYWORDS, NULL},
+ METH_NOARGS, NULL},
{"_reconstruct",
(PyCFunction)array__reconstruct,
METH_VARARGS, NULL},
@@ -4390,25 +4526,25 @@ static struct PyMethodDef array_module_methods[] = {
METH_FASTCALL | METH_KEYWORDS, NULL},
{"count_nonzero",
(PyCFunction)array_count_nonzero,
- METH_VARARGS|METH_KEYWORDS, NULL},
+ METH_FASTCALL, NULL},
{"empty",
(PyCFunction)array_empty,
METH_FASTCALL | METH_KEYWORDS, NULL},
{"empty_like",
(PyCFunction)array_empty_like,
- METH_VARARGS|METH_KEYWORDS, NULL},
+ METH_FASTCALL|METH_KEYWORDS, NULL},
{"scalar",
(PyCFunction)array_scalar,
METH_VARARGS|METH_KEYWORDS, NULL},
{"where",
(PyCFunction)array_where,
- METH_VARARGS, NULL},
+ METH_FASTCALL, NULL},
{"lexsort",
(PyCFunction)array_lexsort,
- METH_VARARGS | METH_KEYWORDS, NULL},
+ METH_FASTCALL | METH_KEYWORDS, NULL},
{"putmask",
(PyCFunction)array_putmask,
- METH_VARARGS | METH_KEYWORDS, NULL},
+ METH_FASTCALL | METH_KEYWORDS, NULL},
{"fromstring",
(PyCFunction)array_fromstring,
METH_VARARGS|METH_KEYWORDS, NULL},
@@ -4417,16 +4553,16 @@ static struct PyMethodDef array_module_methods[] = {
METH_VARARGS|METH_KEYWORDS, NULL},
{"concatenate",
(PyCFunction)array_concatenate,
- METH_VARARGS|METH_KEYWORDS, NULL},
+ METH_FASTCALL|METH_KEYWORDS, NULL},
{"inner",
(PyCFunction)array_innerproduct,
- METH_VARARGS, NULL},
+ METH_FASTCALL, NULL},
{"dot",
(PyCFunction)array_matrixproduct,
- METH_VARARGS | METH_KEYWORDS, NULL},
+ METH_FASTCALL | METH_KEYWORDS, NULL},
{"vdot",
(PyCFunction)array_vdot,
- METH_VARARGS | METH_KEYWORDS, NULL},
+ METH_FASTCALL, NULL},
{"c_einsum",
(PyCFunction)array_einsum,
METH_VARARGS|METH_KEYWORDS, NULL},
@@ -4447,16 +4583,16 @@ static struct PyMethodDef array_module_methods[] = {
METH_VARARGS | METH_KEYWORDS, NULL},
{"can_cast",
(PyCFunction)array_can_cast_safely,
- METH_VARARGS | METH_KEYWORDS, NULL},
+ METH_FASTCALL | METH_KEYWORDS, NULL},
{"promote_types",
(PyCFunction)array_promote_types,
- METH_VARARGS, NULL},
+ METH_FASTCALL, NULL},
{"min_scalar_type",
(PyCFunction)array_min_scalar_type,
METH_VARARGS, NULL},
{"result_type",
(PyCFunction)array_result_type,
- METH_VARARGS, NULL},
+ METH_FASTCALL, NULL},
{"shares_memory",
(PyCFunction)array_shares_memory,
METH_VARARGS | METH_KEYWORDS, NULL},
@@ -4495,27 +4631,24 @@ static struct PyMethodDef array_module_methods[] = {
{"_vec_string",
(PyCFunction)_vec_string,
METH_VARARGS | METH_KEYWORDS, NULL},
- {"_insert", (PyCFunction)arr_insert,
+ {"_place", (PyCFunction)arr_place,
METH_VARARGS | METH_KEYWORDS,
"Insert vals sequentially into equivalent 1-d positions "
"indicated by mask."},
{"bincount", (PyCFunction)arr_bincount,
- METH_VARARGS | METH_KEYWORDS, NULL},
+ METH_FASTCALL | METH_KEYWORDS, NULL},
{"_monotonicity", (PyCFunction)arr__monotonicity,
METH_VARARGS | METH_KEYWORDS, NULL},
- {"implement_array_function",
- (PyCFunction)array_implement_array_function,
- METH_VARARGS, NULL},
{"interp", (PyCFunction)arr_interp,
- METH_VARARGS | METH_KEYWORDS, NULL},
+ METH_FASTCALL | METH_KEYWORDS, NULL},
{"interp_complex", (PyCFunction)arr_interp_complex,
- METH_VARARGS | METH_KEYWORDS, NULL},
+ METH_FASTCALL | METH_KEYWORDS, NULL},
{"ravel_multi_index", (PyCFunction)arr_ravel_multi_index,
METH_VARARGS | METH_KEYWORDS, NULL},
{"unravel_index", (PyCFunction)arr_unravel_index,
METH_VARARGS | METH_KEYWORDS, NULL},
{"add_docstring", (PyCFunction)arr_add_docstring,
- METH_VARARGS, NULL},
+ METH_FASTCALL, NULL},
{"packbits", (PyCFunction)io_pack,
METH_VARARGS | METH_KEYWORDS, NULL},
{"unpackbits", (PyCFunction)io_unpack,
@@ -4525,7 +4658,7 @@ static struct PyMethodDef array_module_methods[] = {
{"set_legacy_print_mode", (PyCFunction)set_legacy_print_mode,
METH_VARARGS, NULL},
{"_discover_array_parameters", (PyCFunction)_discover_array_parameters,
- METH_VARARGS | METH_KEYWORDS, NULL},
+ METH_FASTCALL | METH_KEYWORDS, NULL},
{"_get_castingimpl", (PyCFunction)_get_castingimpl,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_get_experimental_dtype_api", (PyCFunction)_get_experimental_dtype_api,
@@ -4538,10 +4671,10 @@ static struct PyMethodDef array_module_methods[] = {
METH_VARARGS | METH_KEYWORDS, NULL},
{"seterrobj",
(PyCFunction) ufunc_seterr,
- METH_VARARGS, NULL},
+ METH_O, NULL},
{"geterrobj",
(PyCFunction) ufunc_geterr,
- METH_VARARGS, NULL},
+ METH_NOARGS, NULL},
{"get_handler_name",
(PyCFunction) get_handler_name,
METH_VARARGS, NULL},
@@ -4567,6 +4700,8 @@ static struct PyMethodDef array_module_methods[] = {
{"_reload_guard", (PyCFunction)_reload_guard,
METH_NOARGS,
"Give a warning on reload and big warning in sub-interpreters."},
+ {"_using_numpy2_behavior", (PyCFunction)_using_numpy2_behavior,
+ METH_NOARGS, NULL},
{"from_dlpack", (PyCFunction)from_dlpack,
METH_O, NULL},
{NULL, NULL, 0, NULL} /* sentinel */
@@ -4752,6 +4887,7 @@ NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_axis1 = NULL;
NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_axis2 = NULL;
NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_like = NULL;
NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_numpy = NULL;
+NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_where = NULL;
static int
intern_strings(void)
@@ -4808,6 +4944,10 @@ intern_strings(void)
if (npy_ma_str_numpy == NULL) {
return -1;
}
+ npy_ma_str_where = PyUnicode_InternFromString("where");
+ if (npy_ma_str_where == NULL) {
+ return -1;
+ }
return 0;
}
@@ -4839,6 +4979,12 @@ initialize_static_globals(void)
return -1;
}
+ /* Initialize from certain environment variables: */
+ char *env = getenv("NPY_NUMPY_2_BEHAVIOR");
+ if (env != NULL && strcmp(env, "0") != 0) {
+ npy_numpy2_behavior = NPY_TRUE;
+ }
+
return 0;
}
@@ -5076,6 +5222,12 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
if (set_typeinfo(d) != 0) {
goto err;
}
+ if (PyType_Ready(&PyArrayFunctionDispatcher_Type) < 0) {
+ goto err;
+ }
+ PyDict_SetItemString(
+ d, "_ArrayFunctionDispatcher",
+ (PyObject *)&PyArrayFunctionDispatcher_Type);
if (PyType_Ready(&PyArrayMethod_Type) < 0) {
goto err;
}
diff --git a/numpy/core/src/multiarray/multiarraymodule.h b/numpy/core/src/multiarray/multiarraymodule.h
index 809736cd2..9ba2a1831 100644
--- a/numpy/core/src/multiarray/multiarraymodule.h
+++ b/numpy/core/src/multiarray/multiarraymodule.h
@@ -1,6 +1,8 @@
#ifndef NUMPY_CORE_SRC_MULTIARRAY_MULTIARRAYMODULE_H_
#define NUMPY_CORE_SRC_MULTIARRAY_MULTIARRAYMODULE_H_
+NPY_VISIBILITY_HIDDEN extern int npy_numpy2_behavior;
+
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_current_allocator;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_array;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_array_function;
@@ -14,5 +16,6 @@ NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_axis1;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_axis2;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_like;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_numpy;
+NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_where;
#endif /* NUMPY_CORE_SRC_MULTIARRAY_MULTIARRAYMODULE_H_ */
diff --git a/numpy/core/src/multiarray/nditer_api.c b/numpy/core/src/multiarray/nditer_api.c
index 37e8acd84..28b7bf6e6 100644
--- a/numpy/core/src/multiarray/nditer_api.c
+++ b/numpy/core/src/multiarray/nditer_api.c
@@ -17,6 +17,7 @@
#include "nditer_impl.h"
#include "templ_common.h"
#include "ctors.h"
+#include "refcount.h"
/* Internal helper functions private to this file */
static npy_intp
@@ -1945,8 +1946,7 @@ npyiter_copy_from_buffers(NpyIter *iter)
* only copy back when this flag is on.
*/
if ((transferinfo[iop].write.func != NULL) &&
- (op_itflags[iop]&(NPY_OP_ITFLAG_WRITE|NPY_OP_ITFLAG_USINGBUFFER))
- == (NPY_OP_ITFLAG_WRITE|NPY_OP_ITFLAG_USINGBUFFER)) {
+ (op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) {
npy_intp op_transfersize;
npy_intp src_stride, *dst_strides, *dst_coords, *dst_shape;
@@ -2059,27 +2059,18 @@ npyiter_copy_from_buffers(NpyIter *iter)
* The flag USINGBUFFER is set when the buffer was used, so
* only decrement refs when this flag is on.
*/
- else if (transferinfo[iop].write.func != NULL &&
- (op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER) != 0) {
- NPY_IT_DBG_PRINT1("Iterator: Freeing refs and zeroing buffer "
- "of operand %d\n", (int)iop);
- /* Decrement refs */
+ else if (transferinfo[iop].clear.func != NULL &&
+ (op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) {
+ NPY_IT_DBG_PRINT1(
+ "Iterator: clearing refs of operand %d\n", (int)iop);
npy_intp buf_stride = dtypes[iop]->elsize;
- if (transferinfo[iop].write.func(
- &transferinfo[iop].write.context,
- &buffer, &transfersize, &buf_stride,
- transferinfo[iop].write.auxdata) < 0) {
+ if (transferinfo[iop].clear.func(
+ NULL, transferinfo[iop].clear.descr, buffer, transfersize,
+ buf_stride, transferinfo[iop].clear.auxdata) < 0) {
/* Since this should only decrement, it should never error */
assert(0);
return -1;
}
- /*
- * Zero out the memory for safety. For instance,
- * if during iteration some Python code copied an
- * array pointing into the buffer, it will get None
- * values for its references after this.
- */
- memset(buffer, 0, dtypes[iop]->elsize*transfersize);
}
}
@@ -2626,54 +2617,36 @@ npyiter_clear_buffers(NpyIter *iter)
return;
}
- if (!(NIT_ITFLAGS(iter) & NPY_ITFLAG_NEEDSAPI)) {
- /* Buffers do not require clearing, but should not be copied back */
- NBF_SIZE(bufferdata) = 0;
- return;
- }
-
/*
- * The iterator may be using a dtype with references, which always
- * requires the API. In that case, further cleanup may be necessary.
- *
- * TODO: At this time, we assume that a dtype having references
- * implies the need to hold the GIL at all times. In theory
- * we could broaden this definition for a new
- * `PyArray_Item_XDECREF` API and the assumption may become
- * incorrect.
+ * The iterator may be using a dtype with references (as of writing this
+ * means Python objects, but does not need to stay that way).
+ * In that case, further cleanup may be necessary and we clear buffers
+ * explicitly.
*/
PyObject *type, *value, *traceback;
PyErr_Fetch(&type, &value, &traceback);
/* Cleanup any buffers with references */
char **buffers = NBF_BUFFERS(bufferdata);
+
+ NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata);
PyArray_Descr **dtypes = NIT_DTYPES(iter);
npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter);
for (int iop = 0; iop < nop; ++iop, ++buffers) {
- /*
- * We may want to find a better way to do this, on the other hand,
- * this cleanup seems rare and fairly special. A dtype using
- * references (right now only us) must always keep the buffer in
- * a well defined state (either NULL or owning the reference).
- * Only we implement cleanup
- */
- if (!PyDataType_REFCHK(dtypes[iop]) ||
- !(op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) {
+ if (transferinfo[iop].clear.func == NULL ||
+ !(op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) {
continue;
}
if (*buffers == 0) {
continue;
}
int itemsize = dtypes[iop]->elsize;
- for (npy_intp i = 0; i < NBF_SIZE(bufferdata); i++) {
- /*
- * See above comment, if this API is expanded the GIL assumption
- * could become incorrect.
- */
- PyArray_Item_XDECREF(*buffers + (itemsize * i), dtypes[iop]);
+ if (transferinfo[iop].clear.func(NULL,
+ dtypes[iop], *buffers, NBF_SIZE(bufferdata), itemsize,
+ transferinfo[iop].clear.auxdata) < 0) {
+ /* This should never fail; if it does write it out */
+ PyErr_WriteUnraisable(NULL);
}
- /* Clear out the buffer just to be sure */
- memset(*buffers, 0, NBF_SIZE(bufferdata) * itemsize);
}
/* Signal that the buffers are empty */
NBF_SIZE(bufferdata) = 0;
diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c
index b969c9f1d..dfa84c51c 100644
--- a/numpy/core/src/multiarray/nditer_constr.c
+++ b/numpy/core/src/multiarray/nditer_constr.c
@@ -19,6 +19,8 @@
#include "array_coercion.h"
#include "templ_common.h"
#include "array_assign.h"
+#include "dtype_traversal.h"
+
/* Internal helper functions private to this file */
static int
@@ -630,6 +632,18 @@ NpyIter_Copy(NpyIter *iter)
}
}
}
+
+ if (transferinfo[iop].clear.func != NULL) {
+ if (out_of_memory) {
+ transferinfo[iop].clear.func = NULL; /* No cleanup */
+ }
+ else {
+ if (NPY_traverse_info_copy(&transferinfo[iop].clear,
+ &transferinfo[iop].clear) < 0) {
+ out_of_memory = 1;
+ }
+ }
+ }
}
/* Initialize the buffers to the current iterindex */
@@ -706,6 +720,7 @@ NpyIter_Deallocate(NpyIter *iter)
for (iop = 0; iop < nop; ++iop, ++transferinfo) {
NPY_cast_info_xfree(&transferinfo->read);
NPY_cast_info_xfree(&transferinfo->write);
+ NPY_traverse_info_xfree(&transferinfo->clear);
}
}
@@ -1106,7 +1121,8 @@ npyiter_prepare_one_operand(PyArrayObject **op,
NPY_ITEM_IS_POINTER))) != 0)) {
PyErr_SetString(PyExc_TypeError,
"Iterator operand or requested dtype holds "
- "references, but the REFS_OK flag was not enabled");
+ "references, but the NPY_ITER_REFS_OK flag was not "
+ "enabled");
return 0;
}
}
@@ -1118,7 +1134,7 @@ npyiter_prepare_one_operand(PyArrayObject **op,
if (op_request_dtype != NULL) {
/* We just have a borrowed reference to op_request_dtype */
Py_SETREF(*op_dtype, PyArray_AdaptDescriptorToArray(
- *op, (PyObject *)op_request_dtype));
+ *op, NULL, op_request_dtype));
if (*op_dtype == NULL) {
return 0;
}
@@ -1133,7 +1149,7 @@ npyiter_prepare_one_operand(PyArrayObject **op,
PyArray_DescrNewByteorder(*op_dtype, NPY_NATIVE));
if (*op_dtype == NULL) {
return 0;
- }
+ }
NPY_IT_DBG_PRINT("Iterator: Setting NPY_OP_ITFLAG_CAST "
"because of NPY_ITER_NBO\n");
/* Indicate that byte order or alignment needs fixing */
@@ -3186,31 +3202,33 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags);
}
}
- /* If no write back but there are references make a decref fn */
- else if (PyDataType_REFCHK(op_dtype[iop])) {
+ else {
+ transferinfo[iop].write.func = NULL;
+ }
+
+ /* Get the decref function, used for no-writeback and on error */
+ if (PyDataType_REFCHK(op_dtype[iop])) {
/*
* By passing NULL to dst_type and setting move_references
* to 1, we get back a function that just decrements the
* src references.
*/
- if (PyArray_GetDTypeTransferFunction(
+ if (PyArray_GetClearFunction(
(flags & NPY_OP_ITFLAG_ALIGNED) != 0,
- op_dtype[iop]->elsize, 0,
- op_dtype[iop], NULL,
- 1,
- &transferinfo[iop].write,
- &nc_flags) != NPY_SUCCEED) {
+ op_dtype[iop]->elsize, op_dtype[iop],
+ &transferinfo[iop].clear, &nc_flags) < 0) {
goto fail;
}
cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags);
}
else {
- transferinfo[iop].write.func = NULL;
+ transferinfo[iop].clear.func = NULL;
}
}
else {
transferinfo[iop].read.func = NULL;
transferinfo[iop].write.func = NULL;
+ transferinfo[iop].clear.func = NULL;
}
}
diff --git a/numpy/core/src/multiarray/nditer_impl.h b/numpy/core/src/multiarray/nditer_impl.h
index 5bf47fabd..a33e4f90f 100644
--- a/numpy/core/src/multiarray/nditer_impl.h
+++ b/numpy/core/src/multiarray/nditer_impl.h
@@ -23,6 +23,8 @@
#include "lowlevel_strided_loops.h"
#include "dtype_transfer.h"
+#include "dtype_traversal.h"
+
/********** ITERATOR CONSTRUCTION TIMING **************/
#define NPY_IT_CONSTRUCTION_TIMING 0
@@ -242,6 +244,7 @@ typedef npy_int16 npyiter_opitflags;
struct NpyIter_TransferInfo_tag {
NPY_cast_info read;
NPY_cast_info write;
+ NPY_traverse_info clear;
/* Probably unnecessary, but make sure what follows is intp aligned: */
npy_intp _unused_ensure_alignment[];
};
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index df814a796..35e4af79c 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -53,6 +53,8 @@ static PyObject *
array_inplace_remainder(PyArrayObject *m1, PyObject *m2);
static PyObject *
array_inplace_power(PyArrayObject *a1, PyObject *o2, PyObject *NPY_UNUSED(modulo));
+static PyObject *
+array_inplace_matrix_multiply(PyArrayObject *m1, PyObject *m2);
/*
* Dictionary can contain any of the numeric operations, by name.
@@ -339,7 +341,6 @@ array_divmod(PyObject *m1, PyObject *m2)
return PyArray_GenericBinaryFunction(m1, m2, n_ops.divmod);
}
-/* Need this to be version dependent on account of the slot check */
static PyObject *
array_matrix_multiply(PyObject *m1, PyObject *m2)
{
@@ -348,13 +349,70 @@ array_matrix_multiply(PyObject *m1, PyObject *m2)
}
static PyObject *
-array_inplace_matrix_multiply(
- PyArrayObject *NPY_UNUSED(m1), PyObject *NPY_UNUSED(m2))
+array_inplace_matrix_multiply(PyArrayObject *self, PyObject *other)
{
- PyErr_SetString(PyExc_TypeError,
- "In-place matrix multiplication is not (yet) supported. "
- "Use 'a = a @ b' instead of 'a @= b'.");
- return NULL;
+ static PyObject *AxisError_cls = NULL;
+ npy_cache_import("numpy.exceptions", "AxisError", &AxisError_cls);
+ if (AxisError_cls == NULL) {
+ return NULL;
+ }
+
+ INPLACE_GIVE_UP_IF_NEEDED(self, other,
+ nb_inplace_matrix_multiply, array_inplace_matrix_multiply);
+
+ /*
+ * Unlike `matmul(a, b, out=a)` we ensure that the result is not broadcast
+ * if the result without `out` would have less dimensions than `a`.
+ * Since the signature of matmul is '(n?,k),(k,m?)->(n?,m?)' this is the
+ * case exactly when the second operand has both core dimensions.
+ *
+ * The error here will be confusing, but for now, we enforce this by
+ * passing the correct `axes=`.
+ */
+ static PyObject *axes_1d_obj_kwargs = NULL;
+ static PyObject *axes_2d_obj_kwargs = NULL;
+ if (NPY_UNLIKELY(axes_1d_obj_kwargs == NULL)) {
+ axes_1d_obj_kwargs = Py_BuildValue(
+ "{s, [(i), (i, i), (i)]}", "axes", -1, -2, -1, -1);
+ if (axes_1d_obj_kwargs == NULL) {
+ return NULL;
+ }
+ }
+ if (NPY_UNLIKELY(axes_2d_obj_kwargs == NULL)) {
+ axes_2d_obj_kwargs = Py_BuildValue(
+ "{s, [(i, i), (i, i), (i, i)]}", "axes", -2, -1, -2, -1, -2, -1);
+ if (axes_2d_obj_kwargs == NULL) {
+ return NULL;
+ }
+ }
+
+ PyObject *args = PyTuple_Pack(3, self, other, self);
+ if (args == NULL) {
+ return NULL;
+ }
+ PyObject *kwargs;
+ if (PyArray_NDIM(self) == 1) {
+ kwargs = axes_1d_obj_kwargs;
+ }
+ else {
+ kwargs = axes_2d_obj_kwargs;
+ }
+ PyObject *res = PyObject_Call(n_ops.matmul, args, kwargs);
+ Py_DECREF(args);
+
+ if (res == NULL) {
+ /*
+ * AxisError should indicate that the axes argument didn't work out
+ * which should mean the second operand not being 2 dimensional.
+ */
+ if (PyErr_ExceptionMatches(AxisError_cls)) {
+ PyErr_SetString(PyExc_ValueError,
+ "inplace matrix multiplication requires the first operand to "
+ "have at least one and the second at least two dimensions.");
+ }
+ }
+
+ return res;
}
/*
@@ -539,47 +597,10 @@ array_power(PyObject *a1, PyObject *o2, PyObject *modulo)
static PyObject *
array_positive(PyArrayObject *m1)
{
- /*
- * For backwards compatibility, where + just implied a copy,
- * we cannot just call n_ops.positive. Instead, we do the following
- * 1. Try n_ops.positive
- * 2. If we get an exception, check whether __array_ufunc__ is
- * overridden; if so, we live in the future and we allow the
- * TypeError to be passed on.
- * 3. If not, give a deprecation warning and return a copy.
- */
- PyObject *value;
if (can_elide_temp_unary(m1)) {
- value = PyArray_GenericInplaceUnaryFunction(m1, n_ops.positive);
- }
- else {
- value = PyArray_GenericUnaryFunction(m1, n_ops.positive);
+ return PyArray_GenericInplaceUnaryFunction(m1, n_ops.positive);
}
- if (value == NULL) {
- /*
- * We first fetch the error, as it needs to be clear to check
- * for the override. When the deprecation is removed,
- * this whole stanza can be deleted.
- */
- PyObject *exc, *val, *tb;
- PyErr_Fetch(&exc, &val, &tb);
- if (PyUFunc_HasOverride((PyObject *)m1)) {
- PyErr_Restore(exc, val, tb);
- return NULL;
- }
- Py_XDECREF(exc);
- Py_XDECREF(val);
- Py_XDECREF(tb);
-
- /* 2018-06-28, 1.16.0 */
- if (DEPRECATE("Applying '+' to a non-numerical array is "
- "ill-defined. Returning a copy, but in the future "
- "this will error.") < 0) {
- return NULL;
- }
- value = PyArray_Return((PyArrayObject *)PyArray_Copy(m1));
- }
- return value;
+ return PyArray_GenericUnaryFunction(m1, n_ops.positive);
}
static PyObject *
@@ -848,13 +869,11 @@ array_scalar_forward(PyArrayObject *v,
PyObject *(*builtin_func)(PyObject *),
const char *where)
{
- PyObject *scalar;
- if (PyArray_SIZE(v) != 1) {
- PyErr_SetString(PyExc_TypeError, "only size-1 arrays can be"\
- " converted to Python scalars");
+ if (check_is_convertible_to_scalar(v) < 0) {
return NULL;
}
+ PyObject *scalar;
scalar = PyArray_GETITEM(v, PyArray_DATA(v));
if (scalar == NULL) {
return NULL;
diff --git a/numpy/core/src/multiarray/refcount.c b/numpy/core/src/multiarray/refcount.c
index a1c310700..876bb53e1 100644
--- a/numpy/core/src/multiarray/refcount.c
+++ b/numpy/core/src/multiarray/refcount.c
@@ -2,6 +2,10 @@
* This module corresponds to the `Special functions for NPY_OBJECT`
* section in the numpy reference for C-API.
*/
+#include "array_method.h"
+#include "dtype_traversal.h"
+#include "lowlevel_strided_loops.h"
+#include "pyerrors.h"
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
@@ -12,13 +16,101 @@
#include "numpy/arrayobject.h"
#include "numpy/arrayscalars.h"
#include "iterators.h"
+#include "dtypemeta.h"
+#include "refcount.h"
#include "npy_config.h"
#include "npy_pycompat.h"
-static void
-_fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype);
+/*
+ * Helper function to clear a strided memory (normally or always contiguous)
+ * from all Python (or other) references. The function does nothing if the
+ * array dtype does not indicate holding references.
+ *
+ * It is safe to call this function more than once, failing here is usually
+ * critical (during cleanup) and should be set up to minimize the risk or
+ * avoid it fully.
+ */
+NPY_NO_EXPORT int
+PyArray_ClearBuffer(
+ PyArray_Descr *descr, char *data,
+ npy_intp stride, npy_intp size, int aligned)
+{
+ if (!PyDataType_REFCHK(descr)) {
+ return 0;
+ }
+
+ NPY_traverse_info clear_info;
+ /* Flags unused: float errors do not matter and we do not release GIL */
+ NPY_ARRAYMETHOD_FLAGS flags_unused;
+ if (PyArray_GetClearFunction(
+ aligned, stride, descr, &clear_info, &flags_unused) < 0) {
+ return -1;
+ }
+
+ int res = clear_info.func(
+ NULL, clear_info.descr, data, size, stride, clear_info.auxdata);
+ NPY_traverse_info_xfree(&clear_info);
+ return res;
+}
+
+
+/*
+ * Helper function to clear whole array. It seems plausible that we should
+ * be able to get away with assuming the the array is contiguous.
+ *
+ * Must only be called on arrays which own their data (and asserts this).
+ */
+ NPY_NO_EXPORT int
+ PyArray_ClearArray(PyArrayObject *arr)
+ {
+ assert(PyArray_FLAGS(arr) & NPY_ARRAY_OWNDATA);
+
+ PyArray_Descr *descr = PyArray_DESCR(arr);
+ if (!PyDataType_REFCHK(descr)) {
+ return 0;
+ }
+ /*
+ * The contiguous path should cover practically all important cases since
+ * it is difficult to create a non-contiguous array which owns its memory
+ * and only arrays which own their memory should clear it.
+ */
+ int aligned = PyArray_ISALIGNED(arr);
+ if (PyArray_ISCONTIGUOUS(arr)) {
+ return PyArray_ClearBuffer(
+ descr, PyArray_BYTES(arr), descr->elsize,
+ PyArray_SIZE(arr), aligned);
+ }
+ int idim, ndim;
+ npy_intp shape_it[NPY_MAXDIMS], strides_it[NPY_MAXDIMS];
+ npy_intp coord[NPY_MAXDIMS];
+ char *data_it;
+ if (PyArray_PrepareOneRawArrayIter(
+ PyArray_NDIM(arr), PyArray_DIMS(arr),
+ PyArray_BYTES(arr), PyArray_STRIDES(arr),
+ &ndim, shape_it, &data_it, strides_it) < 0) {
+ return -1;
+ }
+ npy_intp inner_stride = strides_it[0];
+ npy_intp inner_shape = shape_it[0];
+ NPY_traverse_info clear_info;
+ /* Flags unused: float errors do not matter and we do not release GIL */
+ NPY_ARRAYMETHOD_FLAGS flags_unused;
+ if (PyArray_GetClearFunction(
+ aligned, inner_stride, descr, &clear_info, &flags_unused) < 0) {
+ return -1;
+ }
+ NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+ /* Process the innermost dimension */
+ if (clear_info.func(NULL, clear_info.descr,
+ data_it, inner_shape, inner_stride, clear_info.auxdata) < 0) {
+ return -1;
+ }
+ } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord,
+ shape_it, data_it, strides_it);
+ return 0;
+}
/*NUMPY_API
@@ -204,6 +296,10 @@ PyArray_INCREF(PyArrayObject *mp)
/*NUMPY_API
Decrement all internal references for object arrays.
(or arrays with object fields)
+
+ The use of this function is strongly discouraged, within NumPy
+ use PyArray_Clear, which DECREF's and sets everything to NULL and can
+ work with any dtype.
*/
NPY_NO_EXPORT int
PyArray_XDECREF(PyArrayObject *mp)
@@ -254,15 +350,28 @@ PyArray_XDECREF(PyArrayObject *mp)
return 0;
}
+
+static void
+_fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype);
+
+
/*NUMPY_API
* Assumes contiguous
*/
NPY_NO_EXPORT void
PyArray_FillObjectArray(PyArrayObject *arr, PyObject *obj)
{
+ PyArray_Descr* descr = PyArray_DESCR(arr);
+
+ // non-legacy dtypes are responsible for initializing
+ // their own internal references
+ if (!NPY_DT_is_legacy(NPY_DTYPE(descr))) {
+ return;
+ }
+
npy_intp i,n;
n = PyArray_SIZE(arr);
- if (PyArray_DESCR(arr)->type_num == NPY_OBJECT) {
+ if (descr->type_num == NPY_OBJECT) {
PyObject **optr;
optr = (PyObject **)(PyArray_DATA(arr));
n = PyArray_SIZE(arr);
@@ -282,8 +391,8 @@ PyArray_FillObjectArray(PyArrayObject *arr, PyObject *obj)
char *optr;
optr = PyArray_DATA(arr);
for (i = 0; i < n; i++) {
- _fillobject(optr, obj, PyArray_DESCR(arr));
- optr += PyArray_DESCR(arr)->elsize;
+ _fillobject(optr, obj, descr);
+ optr += descr->elsize;
}
}
}
diff --git a/numpy/core/src/multiarray/refcount.h b/numpy/core/src/multiarray/refcount.h
index 959eef5ba..16d34e292 100644
--- a/numpy/core/src/multiarray/refcount.h
+++ b/numpy/core/src/multiarray/refcount.h
@@ -1,6 +1,14 @@
#ifndef NUMPY_CORE_SRC_MULTIARRAY_REFCOUNT_H_
#define NUMPY_CORE_SRC_MULTIARRAY_REFCOUNT_H_
+NPY_NO_EXPORT int
+PyArray_ClearBuffer(
+ PyArray_Descr *descr, char *data,
+ npy_intp stride, npy_intp size, int aligned);
+
+NPY_NO_EXPORT int
+PyArray_ClearArray(PyArrayObject *arr);
+
NPY_NO_EXPORT void
PyArray_Item_INCREF(char *data, PyArray_Descr *descr);
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index bb5d36ba1..ff30737b5 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -15,6 +15,7 @@
#include "npy_pycompat.h"
+#include "arraytypes.h"
#include "npy_config.h"
#include "mapping.h"
#include "ctors.h"
@@ -278,7 +279,43 @@ static PyObject *
genint_type_str(PyObject *self)
{
PyObject *item, *item_str;
- item = gentype_generic_method(self, NULL, NULL, "item");
+ PyArray_Descr *descr = PyArray_DescrFromTypeObject((PyObject *)Py_TYPE(self));
+ void *val = scalar_value(self, descr);
+ switch (descr->type_num) {
+ case NPY_BYTE:
+ item = PyLong_FromLong(*(int8_t *)val);
+ break;
+ case NPY_UBYTE:
+ item = PyLong_FromUnsignedLong(*(uint8_t *)val);
+ break;
+ case NPY_SHORT:
+ item = PyLong_FromLong(*(int16_t *)val);
+ break;
+ case NPY_USHORT:
+ item = PyLong_FromUnsignedLong(*(uint16_t *)val);
+ break;
+ case NPY_INT:
+ item = PyLong_FromLong(*(int32_t *)val);
+ break;
+ case NPY_UINT:
+ item = PyLong_FromUnsignedLong(*(uint32_t *)val);
+ break;
 + case NPY_LONG:
 + item = PyLong_FromLong(*(long *)val);
 + break;
 + case NPY_ULONG:
 + item = PyLong_FromUnsignedLong(*(unsigned long *)val);
 + break;
+ case NPY_LONGLONG:
+ item = PyLong_FromLongLong(*(long long *)val);
+ break;
+ case NPY_ULONGLONG:
+ item = PyLong_FromUnsignedLongLong(*(unsigned long long *)val);
+ break;
+ default:
+ item = gentype_generic_method(self, NULL, NULL, "item");
+ break;
+ }
if (item == NULL) {
return NULL;
}
@@ -896,15 +933,21 @@ static PyObject *
@name@type_@kind@_either(npy_@name@ val, TrimMode trim_pos, TrimMode trim_sci,
npy_bool sign)
{
- npy_@name@ absval;
if (npy_legacy_print_mode <= 113) {
return legacy_@name@_format@kind@(val);
}
- absval = val < 0 ? -val : val;
+ int use_positional;
+ if (npy_isnan(val) || val == 0) {
+ use_positional = 1;
+ }
+ else {
+ npy_@name@ absval = val < 0 ? -val : val;
+ use_positional = absval < 1.e16L && absval >= 1.e-4L;
+ }
- if (absval == 0 || (absval < 1.e16L && absval >= 1.e-4L) ) {
+ if (use_positional) {
return format_@name@(val, 0, -1, sign, trim_pos, -1, -1, -1);
}
return format_@name@(val, 1, -1, sign, trim_sci, -1, -1, -1);
@@ -1072,6 +1115,8 @@ gentype_richcompare(PyObject *self, PyObject *other, int cmp_op)
}
}
+ RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+
arr = PyArray_FromScalar(self, NULL);
if (arr == NULL) {
return NULL;
@@ -1844,10 +1889,7 @@ gentype_setflags(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args),
static PyObject *
numbertype_class_getitem_abc(PyObject *cls, PyObject *args)
{
- PyObject *generic_alias;
-
-#ifdef Py_GENERICALIASOBJECT_H
- Py_ssize_t args_len;
+ const Py_ssize_t args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1;
int args_len_expected;
/* complexfloating should take 2 parameters, all others take 1 */
@@ -1859,20 +1901,13 @@ numbertype_class_getitem_abc(PyObject *cls, PyObject *args)
args_len_expected = 1;
}
- args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1;
if ((args_len > args_len_expected) || (args_len == 0)) {
return PyErr_Format(PyExc_TypeError,
"Too %s arguments for %s",
args_len > args_len_expected ? "many" : "few",
((PyTypeObject *)cls)->tp_name);
}
- generic_alias = Py_GenericAlias(cls, args);
-#else
- PyErr_SetString(PyExc_TypeError,
- "Type subscription requires python >= 3.9");
- generic_alias = NULL;
-#endif
- return generic_alias;
+ return Py_GenericAlias(cls, args);
}
/*
@@ -1883,14 +1918,9 @@ numbertype_class_getitem_abc(PyObject *cls, PyObject *args)
static PyObject *
numbertype_class_getitem(PyObject *cls, PyObject *args)
{
-#ifdef Py_GENERICALIASOBJECT_H
PyErr_Format(PyExc_TypeError,
"There are no type variables left in %s",
((PyTypeObject *)cls)->tp_name);
-#else
- PyErr_SetString(PyExc_TypeError,
- "Type subscription requires python >= 3.9");
-#endif
return NULL;
}
@@ -2278,7 +2308,7 @@ static PyGetSetDef inttype_getsets[] = {
};
static PyMethodDef numbertype_methods[] = {
- /* for typing; requires python >= 3.9 */
+ /* for typing */
{"__class_getitem__",
(PyCFunction)numbertype_class_getitem_abc,
METH_CLASS | METH_O, NULL},
@@ -2292,7 +2322,7 @@ static PyMethodDef @name@type_methods[] = {
{"__complex__",
(PyCFunction)@name@_complex,
METH_VARARGS | METH_KEYWORDS, NULL},
- /* for typing; requires python >= 3.9 */
+ /* for typing */
{"__class_getitem__",
(PyCFunction)numbertype_class_getitem,
METH_CLASS | METH_O, NULL},
@@ -2333,7 +2363,7 @@ static PyMethodDef @name@type_methods[] = {
{"is_integer",
(PyCFunction)@name@_is_integer,
METH_NOARGS, NULL},
- /* for typing; requires python >= 3.9 */
+ /* for typing */
{"__class_getitem__",
(PyCFunction)numbertype_class_getitem,
METH_CLASS | METH_O, NULL},
@@ -2345,7 +2375,7 @@ static PyMethodDef @name@type_methods[] = {
* #name = timedelta, cdouble#
*/
static PyMethodDef @name@type_methods[] = {
- /* for typing; requires python >= 3.9 */
+ /* for typing */
{"__class_getitem__",
(PyCFunction)numbertype_class_getitem,
METH_CLASS | METH_O, NULL},
@@ -2358,7 +2388,7 @@ static PyMethodDef @name@type_methods[] = {
* long, ulong, longlong, ulonglong#
*/
static PyMethodDef @name@type_methods[] = {
- /* for typing; requires python >= 3.9 */
+ /* for typing */
{"__class_getitem__",
(PyCFunction)numbertype_class_getitem,
METH_CLASS | METH_O, NULL},
@@ -3588,7 +3618,7 @@ object_arrtype_call(PyObjectScalarObject *obj, PyObject *args, PyObject *kwds)
NPY_NO_EXPORT PyTypeObject PyObjectArrType_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
- .tp_name = "numpy.object_",
+ .tp_name = "numpy." NPY_OBJECT_name,
.tp_basicsize = sizeof(PyObjectScalarObject),
.tp_dealloc = (destructor)object_arrtype_dealloc,
.tp_as_sequence = &object_arrtype_as_sequence,
@@ -3624,60 +3654,29 @@ gen_arrtype_subscript(PyObject *self, PyObject *key)
}
-#define NAME_bool "bool"
-#define NAME_void "void"
-#define NAME_string "bytes"
-#define NAME_unicode "str"
-
/**begin repeat
- * #name = bool, string, unicode, void#
- * #NAME = Bool, String, Unicode, Void#
- * #ex = _,_,_,#
+ * #Name = Bool,
+ * Byte, Short, Int, Long, LongLong,
+ * UByte, UShort, UInt, ULong, ULongLong,
+ * Half, Float, Double, LongDouble,
+ * CFloat, CDouble, CLongDouble,
+ * String, Unicode, Void,
+ * Datetime, Timedelta#
+ * #NAME = BOOL,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * STRING, UNICODE, VOID,
+ * DATETIME, TIMEDELTA#
*/
-NPY_NO_EXPORT PyTypeObject Py@NAME@ArrType_Type = {
- PyVarObject_HEAD_INIT(NULL, 0)
- .tp_name = "numpy." NAME_@name@ "@ex@",
- .tp_basicsize = sizeof(Py@NAME@ScalarObject),
-};
-/**end repeat**/
-
-#undef NAME_bool
-#undef NAME_void
-#undef NAME_string
-#undef NAME_unicode
-/**begin repeat
- * #NAME = Byte, Short, Int, Long, LongLong, UByte, UShort, UInt, ULong,
- * ULongLong, Half, Float, Double, LongDouble, Datetime, Timedelta#
- * #name = int*5, uint*5, float*4, datetime, timedelta#
- * #CNAME = (CHAR, SHORT, INT, LONG, LONGLONG)*2, HALF, FLOAT, DOUBLE,
- * LONGDOUBLE, DATETIME, TIMEDELTA#
- */
-#if NPY_BITSOF_@CNAME@ == 8
-#define _THIS_SIZE "8"
-#elif NPY_BITSOF_@CNAME@ == 16
-#define _THIS_SIZE "16"
-#elif NPY_BITSOF_@CNAME@ == 32
-#define _THIS_SIZE "32"
-#elif NPY_BITSOF_@CNAME@ == 64
-#define _THIS_SIZE "64"
-#elif NPY_BITSOF_@CNAME@ == 80
-#define _THIS_SIZE "80"
-#elif NPY_BITSOF_@CNAME@ == 96
-#define _THIS_SIZE "96"
-#elif NPY_BITSOF_@CNAME@ == 128
-#define _THIS_SIZE "128"
-#elif NPY_BITSOF_@CNAME@ == 256
-#define _THIS_SIZE "256"
-#endif
-NPY_NO_EXPORT PyTypeObject Py@NAME@ArrType_Type = {
+NPY_NO_EXPORT PyTypeObject Py@Name@ArrType_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
- .tp_name = "numpy.@name@" _THIS_SIZE,
- .tp_basicsize = sizeof(Py@NAME@ScalarObject),
+ .tp_name = "numpy." NPY_@NAME@_name,
+ .tp_basicsize = sizeof(Py@Name@ScalarObject),
};
-
-#undef _THIS_SIZE
/**end repeat**/
@@ -3686,37 +3685,6 @@ static PyMappingMethods gentype_as_mapping = {
};
-/**begin repeat
- * #NAME = CFloat, CDouble, CLongDouble#
- * #name = complex*3#
- * #CNAME = FLOAT, DOUBLE, LONGDOUBLE#
- */
-#if NPY_BITSOF_@CNAME@ == 16
-#define _THIS_SIZE "32"
-#elif NPY_BITSOF_@CNAME@ == 32
-#define _THIS_SIZE "64"
-#elif NPY_BITSOF_@CNAME@ == 64
-#define _THIS_SIZE "128"
-#elif NPY_BITSOF_@CNAME@ == 80
-#define _THIS_SIZE "160"
-#elif NPY_BITSOF_@CNAME@ == 96
-#define _THIS_SIZE "192"
-#elif NPY_BITSOF_@CNAME@ == 128
-#define _THIS_SIZE "256"
-#elif NPY_BITSOF_@CNAME@ == 256
-#define _THIS_SIZE "512"
-#endif
-
-NPY_NO_EXPORT PyTypeObject Py@NAME@ArrType_Type = {
- PyVarObject_HEAD_INIT(0, 0)
- .tp_name = "numpy.@name@" _THIS_SIZE,
- .tp_basicsize = sizeof(Py@NAME@ScalarObject),
- .tp_flags = Py_TPFLAGS_DEFAULT,
-};
-#undef _THIS_SIZE
-
-/**end repeat**/
-
/*
* This table maps the built-in type numbers to their scalar
* type numbers. Note that signed integers are mapped to INTNEG_SCALAR,
@@ -4105,36 +4073,6 @@ initialize_numeric_types(void)
PyArrayIter_Type.tp_iter = PyObject_SelfIter;
PyArrayMapIter_Type.tp_iter = PyObject_SelfIter;
-
- /*
- * Give types different names when they are the same size (gh-9799).
- * `np.intX` always refers to the first int of that size in the sequence
- * `['LONG', 'LONGLONG', 'INT', 'SHORT', 'BYTE']`.
- */
-#if (NPY_SIZEOF_BYTE == NPY_SIZEOF_SHORT)
- PyByteArrType_Type.tp_name = "numpy.byte";
- PyUByteArrType_Type.tp_name = "numpy.ubyte";
-#endif
-#if (NPY_SIZEOF_SHORT == NPY_SIZEOF_INT)
- PyShortArrType_Type.tp_name = "numpy.short";
- PyUShortArrType_Type.tp_name = "numpy.ushort";
-#endif
-#if (NPY_SIZEOF_INT == NPY_SIZEOF_LONG)
- PyIntArrType_Type.tp_name = "numpy.intc";
- PyUIntArrType_Type.tp_name = "numpy.uintc";
-#endif
-#if (NPY_SIZEOF_LONGLONG == NPY_SIZEOF_LONG)
- PyLongLongArrType_Type.tp_name = "numpy.longlong";
- PyULongLongArrType_Type.tp_name = "numpy.ulonglong";
-#endif
-
- /*
- Do the same for longdouble
- */
-#if (NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE)
- PyLongDoubleArrType_Type.tp_name = "numpy.longdouble";
- PyCLongDoubleArrType_Type.tp_name = "numpy.clongdouble";
-#endif
}
typedef struct {
diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c
index 25af43bbc..15b166423 100644
--- a/numpy/core/src/multiarray/shape.c
+++ b/numpy/core/src/multiarray/shape.c
@@ -285,7 +285,7 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
Py_TYPE(self), PyArray_DESCR(self),
ndim, dimensions, strides, PyArray_DATA(self),
flags, (PyObject *)self, (PyObject *)self,
- 0, 1);
+ _NPY_ARRAY_ENSURE_DTYPE_IDENTITY);
Py_DECREF(self);
return (PyObject *)ret;
}
diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c
index 9c490b3f7..b8a6943dc 100644
--- a/numpy/core/src/multiarray/textreading/rows.c
+++ b/numpy/core/src/multiarray/textreading/rows.c
@@ -318,10 +318,19 @@ read_rows(stream *s,
}
if (!usecols && (actual_num_fields != current_num_fields)) {
- PyErr_Format(PyExc_ValueError,
+ if (homogeneous) {
+ PyErr_Format(PyExc_ValueError,
"the number of columns changed from %zd to %zd at row %zd; "
"use `usecols` to select a subset and avoid this error",
actual_num_fields, current_num_fields, row_count+1);
+ }
+ else {
+ PyErr_Format(PyExc_ValueError,
+ "the dtype passed requires %zd columns but %zd were found "
+ "at row %zd; "
+ "use `usecols` to select a subset and avoid this error",
+ actual_num_fields, current_num_fields, row_count+1);
+ }
goto error;
}
diff --git a/numpy/core/src/multiarray/textreading/tokenize.cpp b/numpy/core/src/multiarray/textreading/tokenize.cpp
index 02c64263e..e0ddc393d 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.cpp
+++ b/numpy/core/src/multiarray/textreading/tokenize.cpp
@@ -45,7 +45,8 @@ copy_to_field_buffer(tokenizer_state *ts,
const UCS *chunk_start, const UCS *chunk_end)
{
npy_intp chunk_length = chunk_end - chunk_start;
- npy_intp size = chunk_length + ts->field_buffer_pos + 2;
+ /* Space for length +1 termination, +2 additional padding for add_field */
+ npy_intp size = chunk_length + ts->field_buffer_pos + 3;
if (NPY_UNLIKELY(ts->field_buffer_length < size)) {
npy_intp alloc_size = grow_size_and_multiply(&size, 32, sizeof(Py_UCS4));
@@ -104,6 +105,7 @@ add_field(tokenizer_state *ts)
ts->num_fields += 1;
/* Ensure this (currently empty) word is NUL terminated. */
ts->field_buffer[ts->field_buffer_pos] = '\0';
+ assert(ts->field_buffer_length > ts->field_buffer_pos);
return 0;
}
@@ -221,7 +223,8 @@ tokenizer_core(tokenizer_state *ts, parser_config *const config)
}
else {
/* continue parsing as if unquoted */
- ts->state = TOKENIZE_UNQUOTED;
+ /* Set to TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE */
+ ts->state = ts->unquoted_state;
}
break;
@@ -287,7 +290,7 @@ NPY_NO_EXPORT int
npy_tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
{
assert(ts->fields_size >= 2);
- assert(ts->field_buffer_length >= 2*(ssize_t)sizeof(Py_UCS4));
+ assert(ts->field_buffer_length >= 2*(Py_ssize_t)sizeof(Py_UCS4));
int finished_reading_file = 0;
@@ -446,6 +449,8 @@ npy_tokenizer_init(tokenizer_state *ts, parser_config *config)
ts->fields = (field_info *)PyMem_Malloc(4 * sizeof(*ts->fields));
if (ts->fields == nullptr) {
+ PyMem_Free(ts->field_buffer);
+ ts->field_buffer = nullptr;
PyErr_NoMemory();
return -1;
}
diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h
index d0ea46383..53e97760f 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.h
+++ b/numpy/core/src/multiarray/textreading/tokenize.h
@@ -46,8 +46,9 @@ typedef struct {
char *pos;
char *end;
/*
- * Space to copy words into. The buffer must always be at least two NUL
- * entries longer (8 bytes) than the actual word (including initially).
+ * Space to copy words into. Due to `add_field` not growing the buffer
+ * but writing a \0 termination, the buffer must always be two larger
+ * (add_field can be called twice if a row ends in a delimiter: "123,").
* The first byte beyond the current word is always NUL'ed on write, the
* second byte is there to allow easy appending of an additional empty
* word at the end (this word is also NUL terminated).
diff --git a/numpy/core/src/multiarray/usertypes.c b/numpy/core/src/multiarray/usertypes.c
index a338d712d..5e36c914b 100644
--- a/numpy/core/src/multiarray/usertypes.c
+++ b/numpy/core/src/multiarray/usertypes.c
@@ -41,6 +41,7 @@ maintainer email: oliphant.travis@ieee.org
#include "scalartypes.h"
#include "array_method.h"
#include "convert_datatype.h"
+#include "dtype_traversal.h"
#include "legacy_dtype_implementation.h"
@@ -223,6 +224,8 @@ PyArray_RegisterDataType(PyArray_Descr *descr)
PyErr_SetString(PyExc_ValueError, "missing typeobject");
return -1;
}
+
+ int use_void_clearimpl = 0;
if (descr->flags & (NPY_ITEM_IS_POINTER | NPY_ITEM_REFCOUNT)) {
/*
* User dtype can't actually do reference counting, however, there
@@ -231,6 +234,8 @@ PyArray_RegisterDataType(PyArray_Descr *descr)
* so we have to support this. But such a structure must be constant
* (i.e. fixed at registration time, this is the case for `xpress`).
*/
+ use_void_clearimpl = 1;
+
if (descr->names == NULL || descr->fields == NULL ||
!PyDict_CheckExact(descr->fields)) {
PyErr_Format(PyExc_ValueError,
@@ -256,14 +261,53 @@ PyArray_RegisterDataType(PyArray_Descr *descr)
return -1;
}
+ /*
+ * Legacy user DTypes classes cannot have a name, since the user never
+ * defined one. So we create a name for them here. These DTypes are
+ * effectively static types.
+ *
+ * Note: we have no intention of freeing the memory again since this
+ * behaves identically to static type definition.
+ */
+
+ const char *scalar_name = descr->typeobj->tp_name;
+ /*
+ * We have to take only the name, and ignore the module to get
+ * a reasonable __name__, since static types are limited in this regard
+ * (this is not ideal, but not a big issue in practice).
+ * This is what Python does to print __name__ for static types.
+ */
+ const char *dot = strrchr(scalar_name, '.');
+ if (dot) {
+ scalar_name = dot + 1;
+ }
+ Py_ssize_t name_length = strlen(scalar_name) + 14;
+
+ char *name = PyMem_Malloc(name_length);
+ if (name == NULL) {
+ PyErr_NoMemory();
+ return -1;
+ }
+
+ snprintf(name, name_length, "numpy.dtype[%s]", scalar_name);
+
userdescrs[NPY_NUMUSERTYPES++] = descr;
descr->type_num = typenum;
- if (dtypemeta_wrap_legacy_descriptor(descr) < 0) {
+ if (dtypemeta_wrap_legacy_descriptor(descr, name, NULL) < 0) {
descr->type_num = -1;
NPY_NUMUSERTYPES--;
+ PyMem_Free(name); /* free the name only on failure */
return -1;
}
+ if (use_void_clearimpl) {
+ /* See comment where use_void_clearimpl is set... */
+ NPY_DT_SLOTS(NPY_DTYPE(descr))->get_clear_loop = (
+ &npy_get_clear_void_and_legacy_user_dtype_loop);
+ /* Also use the void zerofill since there may be objects */
 + NPY_DT_SLOTS(NPY_DTYPE(descr))->get_fill_zero_loop = (
+ &npy_get_zerofill_void_and_legacy_user_dtype_loop);
+ }
return typenum;
}
@@ -597,7 +641,7 @@ PyArray_AddLegacyWrapping_CastingImpl(
if (from == to) {
spec.flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_SUPPORTS_UNALIGNED;
PyType_Slot slots[] = {
- {NPY_METH_get_loop, &legacy_cast_get_strided_loop},
+ {_NPY_METH_get_loop, &legacy_cast_get_strided_loop},
{NPY_METH_resolve_descriptors, &legacy_same_dtype_resolve_descriptors},
{0, NULL}};
spec.slots = slots;
@@ -606,7 +650,7 @@ PyArray_AddLegacyWrapping_CastingImpl(
else {
spec.flags = NPY_METH_REQUIRES_PYAPI;
PyType_Slot slots[] = {
- {NPY_METH_get_loop, &legacy_cast_get_strided_loop},
+ {_NPY_METH_get_loop, &legacy_cast_get_strided_loop},
{NPY_METH_resolve_descriptors, &simple_cast_resolve_descriptors},
{0, NULL}};
spec.slots = slots;
diff --git a/numpy/core/src/npymath/arm64_exports.c b/numpy/core/src/npymath/arm64_exports.c
index d65bb4f6d..dfa8054d7 100644
--- a/numpy/core/src/npymath/arm64_exports.c
+++ b/numpy/core/src/npymath/arm64_exports.c
@@ -1,12 +1,14 @@
#if defined(__arm64__) && defined(__APPLE__)
#include <math.h>
-/*
+/*
* Export these for scipy, since the SciPy build for macos arm64
* downloads the macos x86_64 NumPy, and does not error when the
* linker fails to use npymathlib.a. Importing numpy will expose
* these external functions
* See https://github.com/numpy/numpy/issues/22673#issuecomment-1327520055
+ *
+ * This file is actually compiled as part of the main module.
*/
double npy_asinh(double x) {
@@ -20,4 +22,9 @@ double npy_copysign(double y, double x) {
double npy_log1p(double x) {
return log1p(x);
}
+
+double npy_nextafter(double x, double y) {
+ return nextafter(x, y);
+}
+
#endif
diff --git a/numpy/core/src/npymath/halffloat.c b/numpy/core/src/npymath/halffloat.c
deleted file mode 100644
index 51948c736..000000000
--- a/numpy/core/src/npymath/halffloat.c
+++ /dev/null
@@ -1,555 +0,0 @@
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#include "numpy/halffloat.h"
-
-/*
- * This chooses between 'ties to even' and 'ties away from zero'.
- */
-#define NPY_HALF_ROUND_TIES_TO_EVEN 1
-/*
- * If these are 1, the conversions try to trigger underflow,
- * overflow, and invalid exceptions in the FP system when needed.
- */
-#define NPY_HALF_GENERATE_OVERFLOW 1
-#define NPY_HALF_GENERATE_UNDERFLOW 1
-#define NPY_HALF_GENERATE_INVALID 1
-
-/*
- ********************************************************************
- * HALF-PRECISION ROUTINES *
- ********************************************************************
- */
-
-float npy_half_to_float(npy_half h)
-{
- union { float ret; npy_uint32 retbits; } conv;
- conv.retbits = npy_halfbits_to_floatbits(h);
- return conv.ret;
-}
-
-double npy_half_to_double(npy_half h)
-{
- union { double ret; npy_uint64 retbits; } conv;
- conv.retbits = npy_halfbits_to_doublebits(h);
- return conv.ret;
-}
-
-npy_half npy_float_to_half(float f)
-{
- union { float f; npy_uint32 fbits; } conv;
- conv.f = f;
- return npy_floatbits_to_halfbits(conv.fbits);
-}
-
-npy_half npy_double_to_half(double d)
-{
- union { double d; npy_uint64 dbits; } conv;
- conv.d = d;
- return npy_doublebits_to_halfbits(conv.dbits);
-}
-
-int npy_half_iszero(npy_half h)
-{
- return (h&0x7fff) == 0;
-}
-
-int npy_half_isnan(npy_half h)
-{
- return ((h&0x7c00u) == 0x7c00u) && ((h&0x03ffu) != 0x0000u);
-}
-
-int npy_half_isinf(npy_half h)
-{
- return ((h&0x7fffu) == 0x7c00u);
-}
-
-int npy_half_isfinite(npy_half h)
-{
- return ((h&0x7c00u) != 0x7c00u);
-}
-
-int npy_half_signbit(npy_half h)
-{
- return (h&0x8000u) != 0;
-}
-
-npy_half npy_half_spacing(npy_half h)
-{
- npy_half ret;
- npy_uint16 h_exp = h&0x7c00u;
- npy_uint16 h_sig = h&0x03ffu;
- if (h_exp == 0x7c00u) {
-#if NPY_HALF_GENERATE_INVALID
- npy_set_floatstatus_invalid();
-#endif
- ret = NPY_HALF_NAN;
- } else if (h == 0x7bffu) {
-#if NPY_HALF_GENERATE_OVERFLOW
- npy_set_floatstatus_overflow();
-#endif
- ret = NPY_HALF_PINF;
- } else if ((h&0x8000u) && h_sig == 0) { /* Negative boundary case */
- if (h_exp > 0x2c00u) { /* If result is normalized */
- ret = h_exp - 0x2c00u;
- } else if(h_exp > 0x0400u) { /* The result is a subnormal, but not the smallest */
- ret = 1 << ((h_exp >> 10) - 2);
- } else {
- ret = 0x0001u; /* Smallest subnormal half */
- }
- } else if (h_exp > 0x2800u) { /* If result is still normalized */
- ret = h_exp - 0x2800u;
- } else if (h_exp > 0x0400u) { /* The result is a subnormal, but not the smallest */
- ret = 1 << ((h_exp >> 10) - 1);
- } else {
- ret = 0x0001u;
- }
-
- return ret;
-}
-
-npy_half npy_half_copysign(npy_half x, npy_half y)
-{
- return (x&0x7fffu) | (y&0x8000u);
-}
-
-npy_half npy_half_nextafter(npy_half x, npy_half y)
-{
- npy_half ret;
-
- if (npy_half_isnan(x) || npy_half_isnan(y)) {
- ret = NPY_HALF_NAN;
- } else if (npy_half_eq_nonan(x, y)) {
- ret = x;
- } else if (npy_half_iszero(x)) {
- ret = (y&0x8000u) + 1; /* Smallest subnormal half */
- } else if (!(x&0x8000u)) { /* x > 0 */
- if ((npy_int16)x > (npy_int16)y) { /* x > y */
- ret = x-1;
- } else {
- ret = x+1;
- }
- } else {
- if (!(y&0x8000u) || (x&0x7fffu) > (y&0x7fffu)) { /* x < y */
- ret = x-1;
- } else {
- ret = x+1;
- }
- }
-#if NPY_HALF_GENERATE_OVERFLOW
- if (npy_half_isinf(ret) && npy_half_isfinite(x)) {
- npy_set_floatstatus_overflow();
- }
-#endif
-
- return ret;
-}
-
-int npy_half_eq_nonan(npy_half h1, npy_half h2)
-{
- return (h1 == h2 || ((h1 | h2) & 0x7fff) == 0);
-}
-
-int npy_half_eq(npy_half h1, npy_half h2)
-{
- /*
- * The equality cases are as follows:
- * - If either value is NaN, never equal.
- * - If the values are equal, equal.
- * - If the values are both signed zeros, equal.
- */
- return (!npy_half_isnan(h1) && !npy_half_isnan(h2)) &&
- (h1 == h2 || ((h1 | h2) & 0x7fff) == 0);
-}
-
-int npy_half_ne(npy_half h1, npy_half h2)
-{
- return !npy_half_eq(h1, h2);
-}
-
-int npy_half_lt_nonan(npy_half h1, npy_half h2)
-{
- if (h1&0x8000u) {
- if (h2&0x8000u) {
- return (h1&0x7fffu) > (h2&0x7fffu);
- } else {
- /* Signed zeros are equal, have to check for it */
- return (h1 != 0x8000u) || (h2 != 0x0000u);
- }
- } else {
- if (h2&0x8000u) {
- return 0;
- } else {
- return (h1&0x7fffu) < (h2&0x7fffu);
- }
- }
-}
-
-int npy_half_lt(npy_half h1, npy_half h2)
-{
- return (!npy_half_isnan(h1) && !npy_half_isnan(h2)) && npy_half_lt_nonan(h1, h2);
-}
-
-int npy_half_gt(npy_half h1, npy_half h2)
-{
- return npy_half_lt(h2, h1);
-}
-
-int npy_half_le_nonan(npy_half h1, npy_half h2)
-{
- if (h1&0x8000u) {
- if (h2&0x8000u) {
- return (h1&0x7fffu) >= (h2&0x7fffu);
- } else {
- return 1;
- }
- } else {
- if (h2&0x8000u) {
- /* Signed zeros are equal, have to check for it */
- return (h1 == 0x0000u) && (h2 == 0x8000u);
- } else {
- return (h1&0x7fffu) <= (h2&0x7fffu);
- }
- }
-}
-
-int npy_half_le(npy_half h1, npy_half h2)
-{
- return (!npy_half_isnan(h1) && !npy_half_isnan(h2)) && npy_half_le_nonan(h1, h2);
-}
-
-int npy_half_ge(npy_half h1, npy_half h2)
-{
- return npy_half_le(h2, h1);
-}
-
-npy_half npy_half_divmod(npy_half h1, npy_half h2, npy_half *modulus)
-{
- float fh1 = npy_half_to_float(h1);
- float fh2 = npy_half_to_float(h2);
- float div, mod;
-
- div = npy_divmodf(fh1, fh2, &mod);
- *modulus = npy_float_to_half(mod);
- return npy_float_to_half(div);
-}
-
-
-
-/*
- ********************************************************************
- * BIT-LEVEL CONVERSIONS *
- ********************************************************************
- */
-
-npy_uint16 npy_floatbits_to_halfbits(npy_uint32 f)
-{
- npy_uint32 f_exp, f_sig;
- npy_uint16 h_sgn, h_exp, h_sig;
-
- h_sgn = (npy_uint16) ((f&0x80000000u) >> 16);
- f_exp = (f&0x7f800000u);
-
- /* Exponent overflow/NaN converts to signed inf/NaN */
- if (f_exp >= 0x47800000u) {
- if (f_exp == 0x7f800000u) {
- /* Inf or NaN */
- f_sig = (f&0x007fffffu);
- if (f_sig != 0) {
- /* NaN - propagate the flag in the significand... */
- npy_uint16 ret = (npy_uint16) (0x7c00u + (f_sig >> 13));
- /* ...but make sure it stays a NaN */
- if (ret == 0x7c00u) {
- ret++;
- }
- return h_sgn + ret;
- } else {
- /* signed inf */
- return (npy_uint16) (h_sgn + 0x7c00u);
- }
- } else {
- /* overflow to signed inf */
-#if NPY_HALF_GENERATE_OVERFLOW
- npy_set_floatstatus_overflow();
-#endif
- return (npy_uint16) (h_sgn + 0x7c00u);
- }
- }
-
- /* Exponent underflow converts to a subnormal half or signed zero */
- if (f_exp <= 0x38000000u) {
- /*
- * Signed zeros, subnormal floats, and floats with small
- * exponents all convert to signed zero half-floats.
- */
- if (f_exp < 0x33000000u) {
-#if NPY_HALF_GENERATE_UNDERFLOW
- /* If f != 0, it underflowed to 0 */
- if ((f&0x7fffffff) != 0) {
- npy_set_floatstatus_underflow();
- }
-#endif
- return h_sgn;
- }
- /* Make the subnormal significand */
- f_exp >>= 23;
- f_sig = (0x00800000u + (f&0x007fffffu));
-#if NPY_HALF_GENERATE_UNDERFLOW
- /* If it's not exactly represented, it underflowed */
- if ((f_sig&(((npy_uint32)1 << (126 - f_exp)) - 1)) != 0) {
- npy_set_floatstatus_underflow();
- }
-#endif
- /*
- * Usually the significand is shifted by 13. For subnormals an
- * additional shift needs to occur. This shift is one for the largest
- * exponent giving a subnormal `f_exp = 0x38000000 >> 23 = 112`, which
- * offsets the new first bit. At most the shift can be 1+10 bits.
- */
- f_sig >>= (113 - f_exp);
- /* Handle rounding by adding 1 to the bit beyond half precision */
-#if NPY_HALF_ROUND_TIES_TO_EVEN
- /*
- * If the last bit in the half significand is 0 (already even), and
- * the remaining bit pattern is 1000...0, then we do not add one
- * to the bit after the half significand. However, the (113 - f_exp)
- * shift can lose up to 11 bits, so the || checks them in the original.
- * In all other cases, we can just add one.
- */
- if (((f_sig&0x00003fffu) != 0x00001000u) || (f&0x000007ffu)) {
- f_sig += 0x00001000u;
- }
-#else
- f_sig += 0x00001000u;
-#endif
- h_sig = (npy_uint16) (f_sig >> 13);
- /*
- * If the rounding causes a bit to spill into h_exp, it will
- * increment h_exp from zero to one and h_sig will be zero.
- * This is the correct result.
- */
- return (npy_uint16) (h_sgn + h_sig);
- }
-
- /* Regular case with no overflow or underflow */
- h_exp = (npy_uint16) ((f_exp - 0x38000000u) >> 13);
- /* Handle rounding by adding 1 to the bit beyond half precision */
- f_sig = (f&0x007fffffu);
-#if NPY_HALF_ROUND_TIES_TO_EVEN
- /*
- * If the last bit in the half significand is 0 (already even), and
- * the remaining bit pattern is 1000...0, then we do not add one
- * to the bit after the half significand. In all other cases, we do.
- */
- if ((f_sig&0x00003fffu) != 0x00001000u) {
- f_sig += 0x00001000u;
- }
-#else
- f_sig += 0x00001000u;
-#endif
- h_sig = (npy_uint16) (f_sig >> 13);
- /*
- * If the rounding causes a bit to spill into h_exp, it will
- * increment h_exp by one and h_sig will be zero. This is the
- * correct result. h_exp may increment to 15, at greatest, in
- * which case the result overflows to a signed inf.
- */
-#if NPY_HALF_GENERATE_OVERFLOW
- h_sig += h_exp;
- if (h_sig == 0x7c00u) {
- npy_set_floatstatus_overflow();
- }
- return h_sgn + h_sig;
-#else
- return h_sgn + h_exp + h_sig;
-#endif
-}
-
-npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d)
-{
- npy_uint64 d_exp, d_sig;
- npy_uint16 h_sgn, h_exp, h_sig;
-
- h_sgn = (d&0x8000000000000000ULL) >> 48;
- d_exp = (d&0x7ff0000000000000ULL);
-
- /* Exponent overflow/NaN converts to signed inf/NaN */
- if (d_exp >= 0x40f0000000000000ULL) {
- if (d_exp == 0x7ff0000000000000ULL) {
- /* Inf or NaN */
- d_sig = (d&0x000fffffffffffffULL);
- if (d_sig != 0) {
- /* NaN - propagate the flag in the significand... */
- npy_uint16 ret = (npy_uint16) (0x7c00u + (d_sig >> 42));
- /* ...but make sure it stays a NaN */
- if (ret == 0x7c00u) {
- ret++;
- }
- return h_sgn + ret;
- } else {
- /* signed inf */
- return h_sgn + 0x7c00u;
- }
- } else {
- /* overflow to signed inf */
-#if NPY_HALF_GENERATE_OVERFLOW
- npy_set_floatstatus_overflow();
-#endif
- return h_sgn + 0x7c00u;
- }
- }
-
- /* Exponent underflow converts to subnormal half or signed zero */
- if (d_exp <= 0x3f00000000000000ULL) {
- /*
- * Signed zeros, subnormal floats, and floats with small
- * exponents all convert to signed zero half-floats.
- */
- if (d_exp < 0x3e60000000000000ULL) {
-#if NPY_HALF_GENERATE_UNDERFLOW
- /* If d != 0, it underflowed to 0 */
- if ((d&0x7fffffffffffffffULL) != 0) {
- npy_set_floatstatus_underflow();
- }
-#endif
- return h_sgn;
- }
- /* Make the subnormal significand */
- d_exp >>= 52;
- d_sig = (0x0010000000000000ULL + (d&0x000fffffffffffffULL));
-#if NPY_HALF_GENERATE_UNDERFLOW
- /* If it's not exactly represented, it underflowed */
- if ((d_sig&(((npy_uint64)1 << (1051 - d_exp)) - 1)) != 0) {
- npy_set_floatstatus_underflow();
- }
-#endif
- /*
- * Unlike floats, doubles have enough room to shift left to align
- * the subnormal significand leading to no loss of the last bits.
- * The smallest possible exponent giving a subnormal is:
- * `d_exp = 0x3e60000000000000 >> 52 = 998`. All larger subnormals are
- * shifted with respect to it. This adds a shift of 10+1 bits the final
- * right shift when comparing it to the one in the normal branch.
- */
- assert(d_exp - 998 >= 0);
- d_sig <<= (d_exp - 998);
- /* Handle rounding by adding 1 to the bit beyond half precision */
-#if NPY_HALF_ROUND_TIES_TO_EVEN
- /*
- * If the last bit in the half significand is 0 (already even), and
- * the remaining bit pattern is 1000...0, then we do not add one
- * to the bit after the half significand. In all other cases, we do.
- */
- if ((d_sig&0x003fffffffffffffULL) != 0x0010000000000000ULL) {
- d_sig += 0x0010000000000000ULL;
- }
-#else
- d_sig += 0x0010000000000000ULL;
-#endif
- h_sig = (npy_uint16) (d_sig >> 53);
- /*
- * If the rounding causes a bit to spill into h_exp, it will
- * increment h_exp from zero to one and h_sig will be zero.
- * This is the correct result.
- */
- return h_sgn + h_sig;
- }
-
- /* Regular case with no overflow or underflow */
- h_exp = (npy_uint16) ((d_exp - 0x3f00000000000000ULL) >> 42);
- /* Handle rounding by adding 1 to the bit beyond half precision */
- d_sig = (d&0x000fffffffffffffULL);
-#if NPY_HALF_ROUND_TIES_TO_EVEN
- /*
- * If the last bit in the half significand is 0 (already even), and
- * the remaining bit pattern is 1000...0, then we do not add one
- * to the bit after the half significand. In all other cases, we do.
- */
- if ((d_sig&0x000007ffffffffffULL) != 0x0000020000000000ULL) {
- d_sig += 0x0000020000000000ULL;
- }
-#else
- d_sig += 0x0000020000000000ULL;
-#endif
- h_sig = (npy_uint16) (d_sig >> 42);
-
- /*
- * If the rounding causes a bit to spill into h_exp, it will
- * increment h_exp by one and h_sig will be zero. This is the
- * correct result. h_exp may increment to 15, at greatest, in
- * which case the result overflows to a signed inf.
- */
-#if NPY_HALF_GENERATE_OVERFLOW
- h_sig += h_exp;
- if (h_sig == 0x7c00u) {
- npy_set_floatstatus_overflow();
- }
- return h_sgn + h_sig;
-#else
- return h_sgn + h_exp + h_sig;
-#endif
-}
-
-npy_uint32 npy_halfbits_to_floatbits(npy_uint16 h)
-{
- npy_uint16 h_exp, h_sig;
- npy_uint32 f_sgn, f_exp, f_sig;
-
- h_exp = (h&0x7c00u);
- f_sgn = ((npy_uint32)h&0x8000u) << 16;
- switch (h_exp) {
- case 0x0000u: /* 0 or subnormal */
- h_sig = (h&0x03ffu);
- /* Signed zero */
- if (h_sig == 0) {
- return f_sgn;
- }
- /* Subnormal */
- h_sig <<= 1;
- while ((h_sig&0x0400u) == 0) {
- h_sig <<= 1;
- h_exp++;
- }
- f_exp = ((npy_uint32)(127 - 15 - h_exp)) << 23;
- f_sig = ((npy_uint32)(h_sig&0x03ffu)) << 13;
- return f_sgn + f_exp + f_sig;
- case 0x7c00u: /* inf or NaN */
- /* All-ones exponent and a copy of the significand */
- return f_sgn + 0x7f800000u + (((npy_uint32)(h&0x03ffu)) << 13);
- default: /* normalized */
- /* Just need to adjust the exponent and shift */
- return f_sgn + (((npy_uint32)(h&0x7fffu) + 0x1c000u) << 13);
- }
-}
-
-npy_uint64 npy_halfbits_to_doublebits(npy_uint16 h)
-{
- npy_uint16 h_exp, h_sig;
- npy_uint64 d_sgn, d_exp, d_sig;
-
- h_exp = (h&0x7c00u);
- d_sgn = ((npy_uint64)h&0x8000u) << 48;
- switch (h_exp) {
- case 0x0000u: /* 0 or subnormal */
- h_sig = (h&0x03ffu);
- /* Signed zero */
- if (h_sig == 0) {
- return d_sgn;
- }
- /* Subnormal */
- h_sig <<= 1;
- while ((h_sig&0x0400u) == 0) {
- h_sig <<= 1;
- h_exp++;
- }
- d_exp = ((npy_uint64)(1023 - 15 - h_exp)) << 52;
- d_sig = ((npy_uint64)(h_sig&0x03ffu)) << 42;
- return d_sgn + d_exp + d_sig;
- case 0x7c00u: /* inf or NaN */
- /* All-ones exponent and a copy of the significand */
- return d_sgn + 0x7ff0000000000000ULL +
- (((npy_uint64)(h&0x03ffu)) << 42);
- default: /* normalized */
- /* Just need to adjust the exponent and shift */
- return d_sgn + (((npy_uint64)(h&0x7fffu) + 0xfc000u) << 42);
- }
-}
diff --git a/numpy/core/src/npymath/halffloat.cpp b/numpy/core/src/npymath/halffloat.cpp
new file mode 100644
index 000000000..aa582c1b9
--- /dev/null
+++ b/numpy/core/src/npymath/halffloat.cpp
@@ -0,0 +1,238 @@
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+/*
+ * If these are 1, the conversions try to trigger underflow,
+ * overflow, and invalid exceptions in the FP system when needed.
+ */
+#define NPY_HALF_GENERATE_OVERFLOW 1
+#define NPY_HALF_GENERATE_INVALID 1
+
+#include "numpy/halffloat.h"
+
+#include "common.hpp"
+/*
+ ********************************************************************
+ * HALF-PRECISION ROUTINES *
+ ********************************************************************
+ */
+using namespace np;
+
+float npy_half_to_float(npy_half h)
+{
+ return static_cast<float>(Half::FromBits(h));
+}
+
+double npy_half_to_double(npy_half h)
+{
+ return static_cast<double>(Half::FromBits(h));
+}
+
+npy_half npy_float_to_half(float f)
+{
+ return Half(f).Bits();
+}
+
+npy_half npy_double_to_half(double d)
+{
+ return Half(d).Bits();
+}
+
+int npy_half_iszero(npy_half h)
+{
+ return (h&0x7fff) == 0;
+}
+
+int npy_half_isnan(npy_half h)
+{
+ return Half::FromBits(h).IsNaN();
+}
+
+int npy_half_isinf(npy_half h)
+{
+ return ((h&0x7fffu) == 0x7c00u);
+}
+
+int npy_half_isfinite(npy_half h)
+{
+ return ((h&0x7c00u) != 0x7c00u);
+}
+
+int npy_half_signbit(npy_half h)
+{
+ return (h&0x8000u) != 0;
+}
+
+npy_half npy_half_spacing(npy_half h)
+{
+ npy_half ret;
+ npy_uint16 h_exp = h&0x7c00u;
+ npy_uint16 h_sig = h&0x03ffu;
+ if (h_exp == 0x7c00u) {
+#if NPY_HALF_GENERATE_INVALID
+ npy_set_floatstatus_invalid();
+#endif
+ ret = NPY_HALF_NAN;
+ } else if (h == 0x7bffu) {
+#if NPY_HALF_GENERATE_OVERFLOW
+ npy_set_floatstatus_overflow();
+#endif
+ ret = NPY_HALF_PINF;
+ } else if ((h&0x8000u) && h_sig == 0) { /* Negative boundary case */
+ if (h_exp > 0x2c00u) { /* If result is normalized */
+ ret = h_exp - 0x2c00u;
+ } else if(h_exp > 0x0400u) { /* The result is a subnormal, but not the smallest */
+ ret = 1 << ((h_exp >> 10) - 2);
+ } else {
+ ret = 0x0001u; /* Smallest subnormal half */
+ }
+ } else if (h_exp > 0x2800u) { /* If result is still normalized */
+ ret = h_exp - 0x2800u;
+ } else if (h_exp > 0x0400u) { /* The result is a subnormal, but not the smallest */
+ ret = 1 << ((h_exp >> 10) - 1);
+ } else {
+ ret = 0x0001u;
+ }
+
+ return ret;
+}
+
+npy_half npy_half_copysign(npy_half x, npy_half y)
+{
+ return (x&0x7fffu) | (y&0x8000u);
+}
+
+npy_half npy_half_nextafter(npy_half x, npy_half y)
+{
+ npy_half ret;
+
+ if (npy_half_isnan(x) || npy_half_isnan(y)) {
+ ret = NPY_HALF_NAN;
+ } else if (npy_half_eq_nonan(x, y)) {
+ ret = x;
+ } else if (npy_half_iszero(x)) {
+ ret = (y&0x8000u) + 1; /* Smallest subnormal half */
+ } else if (!(x&0x8000u)) { /* x > 0 */
+ if ((npy_int16)x > (npy_int16)y) { /* x > y */
+ ret = x-1;
+ } else {
+ ret = x+1;
+ }
+ } else {
+ if (!(y&0x8000u) || (x&0x7fffu) > (y&0x7fffu)) { /* x < y */
+ ret = x-1;
+ } else {
+ ret = x+1;
+ }
+ }
+#if NPY_HALF_GENERATE_OVERFLOW
+ if (npy_half_isinf(ret) && npy_half_isfinite(x)) {
+ npy_set_floatstatus_overflow();
+ }
+#endif
+
+ return ret;
+}
+
+int npy_half_eq_nonan(npy_half h1, npy_half h2)
+{
+ return Half::FromBits(h1).Equal(Half::FromBits(h2));
+}
+
+int npy_half_eq(npy_half h1, npy_half h2)
+{
+ return Half::FromBits(h1) == Half::FromBits(h2);
+}
+
+int npy_half_ne(npy_half h1, npy_half h2)
+{
+ return Half::FromBits(h1) != Half::FromBits(h2);
+}
+
+int npy_half_lt_nonan(npy_half h1, npy_half h2)
+{
+ return Half::FromBits(h1).Less(Half::FromBits(h2));
+}
+
+int npy_half_lt(npy_half h1, npy_half h2)
+{
+ return Half::FromBits(h1) < Half::FromBits(h2);
+}
+
+int npy_half_gt(npy_half h1, npy_half h2)
+{
+ return npy_half_lt(h2, h1);
+}
+
+int npy_half_le_nonan(npy_half h1, npy_half h2)
+{
+ return Half::FromBits(h1).LessEqual(Half::FromBits(h2));
+}
+
+int npy_half_le(npy_half h1, npy_half h2)
+{
+ return Half::FromBits(h1) <= Half::FromBits(h2);
+}
+
+int npy_half_ge(npy_half h1, npy_half h2)
+{
+ return npy_half_le(h2, h1);
+}
+
+npy_half npy_half_divmod(npy_half h1, npy_half h2, npy_half *modulus)
+{
+ float fh1 = npy_half_to_float(h1);
+ float fh2 = npy_half_to_float(h2);
+ float div, mod;
+
+ div = npy_divmodf(fh1, fh2, &mod);
+ *modulus = npy_float_to_half(mod);
+ return npy_float_to_half(div);
+}
+
+
+/*
+ ********************************************************************
+ * BIT-LEVEL CONVERSIONS *
+ ********************************************************************
+ */
+
+npy_uint16 npy_floatbits_to_halfbits(npy_uint32 f)
+{
+ if constexpr (Half::kNativeConversion<float>) {
+ return BitCast<uint16_t>(Half(BitCast<float>(f)));
+ }
+ else {
+ return half_private::FromFloatBits(f);
+ }
+}
+
+npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d)
+{
+ if constexpr (Half::kNativeConversion<double>) {
+ return BitCast<uint16_t>(Half(BitCast<double>(d)));
+ }
+ else {
+ return half_private::FromDoubleBits(d);
+ }
+}
+
+npy_uint32 npy_halfbits_to_floatbits(npy_uint16 h)
+{
+ if constexpr (Half::kNativeConversion<float>) {
+ return BitCast<uint32_t>(static_cast<float>(Half::FromBits(h)));
+ }
+ else {
+ return half_private::ToFloatBits(h);
+ }
+}
+
+npy_uint64 npy_halfbits_to_doublebits(npy_uint16 h)
+{
+ if constexpr (Half::kNativeConversion<double>) {
+ return BitCast<uint64_t>(static_cast<double>(Half::FromBits(h)));
+ }
+ else {
+ return half_private::ToDoubleBits(h);
+ }
+}
+
diff --git a/numpy/core/src/npymath/npy_math_complex.c.src b/numpy/core/src/npymath/npy_math_complex.c.src
index eff7f8e13..eb9d810b7 100644
--- a/numpy/core/src/npymath/npy_math_complex.c.src
+++ b/numpy/core/src/npymath/npy_math_complex.c.src
@@ -535,6 +535,443 @@ npy_cpow@c@ (@ctype@ a, @ctype@ b)
#endif
}
+
+#ifndef HAVE_CCOS@C@
+@ctype@
+npy_ccos@c@(@ctype@ z)
+{
+ /* ccos(z) = ccosh(I * z) */
+ return npy_ccosh@c@(npy_cpack@c@(-npy_cimag@c@(z), npy_creal@c@(z)));
+}
+#endif
+
+#ifndef HAVE_CSIN@C@
+@ctype@
+npy_csin@c@(@ctype@ z)
+{
+ /* csin(z) = -I * csinh(I * z) */
+ z = npy_csinh@c@(npy_cpack@c@(-npy_cimag@c@(z), npy_creal@c@(z)));
+ return npy_cpack@c@(npy_cimag@c@(z), -npy_creal@c@(z));
+}
+#endif
+
+#ifndef HAVE_CTAN@C@
+@ctype@
+npy_ctan@c@(@ctype@ z)
+{
+ /* ctan(z) = -I * ctanh(I * z) */
+ z = npy_ctanh@c@(npy_cpack@c@(-npy_cimag@c@(z), npy_creal@c@(z)));
+ return (npy_cpack@c@(npy_cimag@c@(z), -npy_creal@c@(z)));
+}
+#endif
+
+#ifndef HAVE_CCOSH@C@
+/*
+ * Taken from the msun library in FreeBSD, rev 226599.
+ *
+ * Hyperbolic cosine of a complex argument z = x + i y.
+ *
+ * cosh(z) = cosh(x+iy)
+ * = cosh(x) cos(y) + i sinh(x) sin(y).
+ *
+ * Exceptional values are noted in the comments within the source code.
+ * These values and the return value were taken from n1124.pdf.
+ *
+ * CCOSH_BIG is chosen such that
+ * spacing(0.5 * exp(CCOSH_BIG)) > 0.5*exp(-CCOSH_BIG)
+ * although the exact value assigned to CCOSH_BIG is not so important
+ */
+@ctype@
+npy_ccosh@c@(@ctype@ z)
+{
+#if @precision@ == 1
+ const npy_float CCOSH_BIG = 9.0f;
+ const npy_float CCOSH_HUGE = 1.70141183e+38f;
+#endif
+#if @precision@ == 2
+ const npy_double CCOSH_BIG = 22.0;
+ const npy_double CCOSH_HUGE = 8.9884656743115795e+307;
+#endif
+#if @precision@ >= 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+ const npy_longdouble CCOSH_BIG = 22.0L;
+ const npy_longdouble CCOSH_HUGE = 8.9884656743115795e+307L;
+#else
+ const npy_longdouble CCOSH_BIG = 24.0L;
+ const npy_longdouble CCOSH_HUGE = 5.94865747678615882543e+4931L;
+#endif
+#endif
+
+ @type@ x, y, h, absx;
+ npy_int xfinite, yfinite;
+
+ x = npy_creal@c@(z);
+ y = npy_cimag@c@(z);
+
+ xfinite = npy_isfinite(x);
+ yfinite = npy_isfinite(y);
+
+ /* Handle the nearly-non-exceptional cases where x and y are finite. */
+ if (xfinite && yfinite) {
+ if (y == 0) {
+ return npy_cpack@c@(npy_cosh@c@(x), x * y);
+ }
+ absx = npy_fabs@c@(x);
+ if (absx < CCOSH_BIG) {
+ /* small x: normal case */
+ return npy_cpack@c@(npy_cosh@c@(x) * npy_cos@c@(y),
+ npy_sinh@c@(x) * npy_sin@c@(y));
+ }
+
+ /* |x| >= 22, so cosh(x) ~= exp(|x|) */
+ if (absx < SCALED_CEXP_LOWER@C@) {
+ /* x < 710: exp(|x|) won't overflow */
+ h = npy_exp@c@(absx) * 0.5@c@;
+ return npy_cpack@c@(h * npy_cos@c@(y),
+ npy_copysign@c@(h, x) * npy_sin@c@(y));
+ }
+ else if (absx < SCALED_CEXP_UPPER@C@) {
+ /* x < 1455: scale to avoid overflow */
+ z = _npy_scaled_cexp@c@(absx, y, -1);
+ return npy_cpack@c@(npy_creal@c@(z),
+ npy_cimag@c@(z) * npy_copysign@c@(1, x));
+ }
+ else {
+ /* x >= 1455: the result always overflows */
+ h = CCOSH_HUGE * x;
+ return npy_cpack@c@(h * h * npy_cos@c@(y), h * npy_sin@c@(y));
+ }
+ }
+
+ /*
+ * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0.
+ * The sign of 0 in the result is unspecified. Choice = normally
+ * the same as dNaN. Raise the invalid floating-point exception.
+ *
+ * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0.
+ * The sign of 0 in the result is unspecified. Choice = normally
+ * the same as d(NaN).
+ */
+ if (x == 0 && !yfinite) {
+ return npy_cpack@c@(y - y, npy_copysign@c@(0, x * (y - y)));
+ }
+
+ /*
+ * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0.
+ *
+ * cosh(NaN +- I 0) = d(NaN) + I sign(d(NaN, +-0))0.
+ * The sign of 0 in the result is unspecified.
+ */
+ if (y == 0 && !xfinite) {
+ return npy_cpack@c@(x * x, npy_copysign@c@(0, x) * y);
+ }
+
+ /*
+ * cosh(x +- I Inf) = dNaN + I dNaN.
+ * Raise the invalid floating-point exception for finite nonzero x.
+ *
+ * cosh(x + I NaN) = d(NaN) + I d(NaN).
+ * Optionally raises the invalid floating-point exception for finite
+ * nonzero x. Choice = don't raise (except for signaling NaNs).
+ */
+ if (xfinite && !yfinite) {
+ return npy_cpack@c@(y - y, x * (y - y));
+ }
+
+ /*
+ * cosh(+-Inf + I NaN) = +Inf + I d(NaN).
+ *
+ * cosh(+-Inf +- I Inf) = +Inf + I dNaN.
+ * The sign of Inf in the result is unspecified. Choice = always +.
+ * Raise the invalid floating-point exception.
+ *
+ * cosh(+-Inf + I y) = +Inf cos(y) +- I Inf sin(y)
+ */
+ if (npy_isinf(x)) {
+ if (!yfinite) {
+ return npy_cpack@c@(x * x, x * (y - y));
+ }
+ return npy_cpack@c@((x * x) * npy_cos@c@(y), x * npy_sin@c@(y));
+ }
+
+ /*
+ * cosh(NaN + I NaN) = d(NaN) + I d(NaN).
+ *
+ * cosh(NaN +- I Inf) = d(NaN) + I d(NaN).
+ * Optionally raises the invalid floating-point exception.
+ * Choice = raise.
+ *
+ * cosh(NaN + I y) = d(NaN) + I d(NaN).
+ * Optionally raises the invalid floating-point exception for finite
+ * nonzero y. Choice = don't raise (except for signaling NaNs).
+ */
+ return npy_cpack@c@((x * x) * (y - y), (x + x) * (y - y));
+}
+#undef CCOSH_BIG
+#undef CCOSH_HUGE
+#endif
+
+#ifndef HAVE_CSINH@C@
+/*
+ * Taken from the msun library in FreeBSD, rev 226599.
+ *
+ * Hyperbolic sine of a complex argument z = x + i y.
+ *
+ * sinh(z) = sinh(x+iy)
+ * = sinh(x) cos(y) + i cosh(x) sin(y).
+ *
+ * Exceptional values are noted in the comments within the source code.
+ * These values and the return value were taken from n1124.pdf.
+ */
+@ctype@
+npy_csinh@c@(@ctype@ z)
+{
+#if @precision@ == 1
+ const npy_float CSINH_BIG = 9.0f;
+ const npy_float CSINH_HUGE = 1.70141183e+38f;
+#endif
+#if @precision@ == 2
+ const npy_double CSINH_BIG = 22.0;
+ const npy_double CSINH_HUGE = 8.9884656743115795e+307;
+#endif
+#if @precision@ >= 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+ const npy_longdouble CSINH_BIG = 22.0L;
+ const npy_longdouble CSINH_HUGE = 8.9884656743115795e+307L;
+#else
+ const npy_longdouble CSINH_BIG = 24.0L;
+ const npy_longdouble CSINH_HUGE = 5.94865747678615882543e+4931L;
+#endif
+#endif
+
+ @type@ x, y, h, absx;
+ npy_int xfinite, yfinite;
+
+ x = npy_creal@c@(z);
+ y = npy_cimag@c@(z);
+
+ xfinite = npy_isfinite(x);
+ yfinite = npy_isfinite(y);
+
+ /* Handle the nearly-non-exceptional cases where x and y are finite. */
+ if (xfinite && yfinite) {
+ if (y == 0) {
+ return npy_cpack@c@(npy_sinh@c@(x), y);
+ }
+ absx = npy_fabs@c@(x);
+ if (absx < CSINH_BIG) {
+ /* small x: normal case */
+ return npy_cpack@c@(npy_sinh@c@(x) * npy_cos@c@(y),
+ npy_cosh@c@(x) * npy_sin@c@(y));
+ }
+
+ /* |x| >= 22, so cosh(x) ~= exp(|x|) */
+ if (absx < SCALED_CEXP_LOWER@C@) {
+ /* x < 710: exp(|x|) won't overflow */
+ h = npy_exp@c@(npy_fabs@c@(x)) * 0.5@c@;
+ return npy_cpack@c@(npy_copysign@c@(h, x) * npy_cos@c@(y),
+ h * npy_sin@c@(y));
+ }
+ else if (x < SCALED_CEXP_UPPER@C@) {
+ /* x < 1455: scale to avoid overflow */
+ z = _npy_scaled_cexp@c@(absx, y, -1);
+ return npy_cpack@c@(npy_creal@c@(z) * npy_copysign@c@(1, x),
+ npy_cimag@c@(z));
+ }
+ else {
+ /* x >= 1455: the result always overflows */
+ h = CSINH_HUGE * x;
+ return npy_cpack@c@(h * npy_cos@c@(y), h * h * npy_sin@c@(y));
+ }
+ }
+
+ /*
+ * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
+ * The sign of 0 in the result is unspecified. Choice = normally
+ * the same as dNaN. Raise the invalid floating-point exception.
+ *
+ * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN).
+ * The sign of 0 in the result is unspecified. Choice = normally
+ * the same as d(NaN).
+ */
+ if (x == 0 && !yfinite) {
+ return npy_cpack@c@(npy_copysign@c@(0, x * (y - y)), y - y);
+ }
+
+ /*
+ * sinh(+-Inf +- I 0) = +-Inf + I +-0.
+ *
+ * sinh(NaN +- I 0) = d(NaN) + I +-0.
+ */
+ if (y == 0 && !xfinite) {
+ if (npy_isnan(x)) {
+ return z;
+ }
+ return npy_cpack@c@(x, npy_copysign@c@(0, y));
+ }
+
+ /*
+ * sinh(x +- I Inf) = dNaN + I dNaN.
+ * Raise the invalid floating-point exception for finite nonzero x.
+ *
+ * sinh(x + I NaN) = d(NaN) + I d(NaN).
+ * Optionally raises the invalid floating-point exception for finite
+ * nonzero x. Choice = don't raise (except for signaling NaNs).
+ */
+ if (xfinite && !yfinite) {
+ return npy_cpack@c@(y - y, x * (y - y));
+ }
+
+ /*
+ * sinh(+-Inf + I NaN) = +-Inf + I d(NaN).
+ * The sign of Inf in the result is unspecified. Choice = normally
+ * the same as d(NaN).
+ *
+ * sinh(+-Inf +- I Inf) = +Inf + I dNaN.
+ * The sign of Inf in the result is unspecified. Choice = always +.
+ * Raise the invalid floating-point exception.
+ *
+ * sinh(+-Inf + I y) = +-Inf cos(y) + I Inf sin(y)
+ */
+ if (!xfinite && !npy_isnan(x)) {
+ if (!yfinite) {
+ return npy_cpack@c@(x * x, x * (y - y));
+ }
+ return npy_cpack@c@(x * npy_cos@c@(y),
+ NPY_INFINITY@C@ * npy_sin@c@(y));
+ }
+
+ /*
+ * sinh(NaN + I NaN) = d(NaN) + I d(NaN).
+ *
+ * sinh(NaN +- I Inf) = d(NaN) + I d(NaN).
+ * Optionally raises the invalid floating-point exception.
+ * Choice = raise.
+ *
+ * sinh(NaN + I y) = d(NaN) + I d(NaN).
+ * Optionally raises the invalid floating-point exception for finite
+ * nonzero y. Choice = don't raise (except for signaling NaNs).
+ */
+ return npy_cpack@c@((x * x) * (y - y), (x + x) * (y - y));
+}
+#undef CSINH_BIG
+#undef CSINH_HUGE
+#endif
+
+#ifndef HAVE_CTANH@C@
+/*
+ * Taken from the msun library in FreeBSD, rev 226600.
+ *
+ * Hyperbolic tangent of a complex argument z = x + i y.
+ *
+ * The algorithm is from:
+ *
+ * W. Kahan. Branch Cuts for Complex Elementary Functions or Much
+ * Ado About Nothing's Sign Bit. In The State of the Art in
+ * Numerical Analysis, pp. 165 ff. Iserles and Powell, eds., 1987.
+ *
+ * Method:
+ *
+ * Let t = tan(x)
+ * beta = 1/cos^2(y)
+ * s = sinh(x)
+ * rho = cosh(x)
+ *
+ * We have:
+ *
+ * tanh(z) = sinh(z) / cosh(z)
+ *
+ * sinh(x) cos(y) + i cosh(x) sin(y)
+ * = ---------------------------------
+ * cosh(x) cos(y) + i sinh(x) sin(y)
+ *
+ * cosh(x) sinh(x) / cos^2(y) + i tan(y)
+ * = -------------------------------------
+ * 1 + sinh^2(x) / cos^2(y)
+ *
+ * beta rho s + i t
+ * = ----------------
+ * 1 + beta s^2
+ *
+ * Modifications:
+ *
+ * I omitted the original algorithm's handling of overflow in tan(x) after
+ * verifying with nearpi.c that this can't happen in IEEE single or double
+ * precision. I also handle large x differently.
+ */
+
+#define TANH_HUGE 22.0
+#define TANHF_HUGE 11.0F
+#define TANHL_HUGE 42.0L
+
+@ctype@
+npy_ctanh@c@(@ctype@ z)
+{
+ @type@ x, y;
+ @type@ t, beta, s, rho, denom;
+
+ x = npy_creal@c@(z);
+ y = npy_cimag@c@(z);
+
+ /*
+ * ctanh(NaN + i 0) = NaN + i 0
+ *
+ * ctanh(NaN + i y) = NaN + i NaN for y != 0
+ *
+ * The imaginary part has the sign of x*sin(2*y), but there's no
+ * special effort to get this right.
+ *
+ * ctanh(+-Inf +- i Inf) = +-1 +- 0
+ *
+ * ctanh(+-Inf + i y) = +-1 + 0 sin(2y) for y finite
+ *
+ * The imaginary part of the sign is unspecified. This special
+ * case is only needed to avoid a spurious invalid exception when
+ * y is infinite.
+ */
+ if (!npy_isfinite(x)) {
+ if (npy_isnan(x)) {
+ return npy_cpack@c@(x, (y == 0 ? y : x * y));
+ }
+ return npy_cpack@c@(npy_copysign@c@(1,x),
+ npy_copysign@c@(0,
+ npy_isinf(y) ?
+ y : npy_sin@c@(y) * npy_cos@c@(y)));
+ }
+
+ /*
+ * ctanh(x + i NAN) = NaN + i NaN
+ * ctanh(x +- i Inf) = NaN + i NaN
+ */
+ if (!npy_isfinite(y)) {
+ return (npy_cpack@c@(y - y, y - y));
+ }
+
+ /*
+ * ctanh(+-huge + i +-y) ~= +-1 +- i 2sin(2y)/exp(2x), using the
+ * approximation sinh^2(huge) ~= exp(2*huge) / 4.
+ * We use a modified formula to avoid spurious overflow.
+ */
+ if (npy_fabs@c@(x) >= TANH@C@_HUGE) {
+ @type@ exp_mx = npy_exp@c@(-npy_fabs@c@(x));
+ return npy_cpack@c@(npy_copysign@c@(1, x),
+ 4 * npy_sin@c@(y) * npy_cos@c@(y) *
+ exp_mx * exp_mx);
+ }
+
+ /* Kahan's algorithm */
+ t = npy_tan@c@(y);
+ beta = 1 + t * t; /* = 1 / cos^2(y) */
+ s = npy_sinh@c@(x);
+ rho = npy_sqrt@c@(1 + s * s); /* = cosh(x) */
+ denom = 1 + beta * s * s;
+ return (npy_cpack@c@((beta * rho * s) / denom, t / denom));
+}
+#undef TANH_HUGE
+#undef TANHF_HUGE
+#undef TANHL_HUGE
+#endif
+
#if !defined (HAVE_CACOS@C@) || !defined (HAVE_CASINH@C@)
/*
* Complex inverse trig functions taken from the msum library in FreeBSD
@@ -1327,8 +1764,10 @@ npy_@kind@@c@(@ctype@ z)
/**end repeat1**/
/**begin repeat1
- * #kind = cexp,clog,csqrt,cacos,casin,catan,cacosh,casinh,catanh#
- * #KIND = CEXP,CLOG,CSQRT,CACOS,CASIN,CATAN,CACOSH,CASINH,CATANH#
+ * #kind = cexp,clog,csqrt,ccos,csin,ctan,ccosh,csinh,ctanh,
+ * cacos,casin,catan,cacosh,casinh,catanh#
+ * #KIND = CEXP,CLOG,CSQRT,CCOS,CSIN,CTAN,CCOSH,CSINH,CTANH,
+ * CACOS,CASIN,CATAN,CACOSH,CASINH,CATANH#
*/
#ifdef HAVE_@KIND@@C@
@ctype@
@@ -1343,20 +1782,5 @@ npy_@kind@@c@(@ctype@ z)
#endif
/**end repeat1**/
-/* Mandatory complex functions */
-
-/**begin repeat1
- * #kind = csin,csinh,ccos,ccosh,ctan,ctanh#
- */
-@ctype@
-npy_@kind@@c@(@ctype@ z)
-{
- __@ctype@_to_c99_cast z1;
- __@ctype@_to_c99_cast ret;
- z1.npy_z = z;
- ret.c99_z = @kind@@c@(z1.c99_z);
- return ret.npy_z;
-}
-/**end repeat1**/
/**end repeat**/
diff --git a/numpy/core/src/npymath/npy_math_private.h b/numpy/core/src/npymath/npy_math_private.h
index a474b3de3..20c94f98a 100644
--- a/numpy/core/src/npymath/npy_math_private.h
+++ b/numpy/core/src/npymath/npy_math_private.h
@@ -21,6 +21,7 @@
#include <Python.h>
#ifdef __cplusplus
#include <cmath>
+#include <complex>
using std::isgreater;
using std::isless;
#else
@@ -494,8 +495,9 @@ do { \
* Microsoft C defines _MSC_VER
* Intel compiler does not use MSVC complex types, but defines _MSC_VER by
* default.
+ * Since C++17, MSVC no longer supports them.
*/
-#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#if !defined(__cplusplus) && defined(_MSC_VER) && !defined(__INTEL_COMPILER)
typedef union {
npy_cdouble npy_z;
_Dcomplex c99_z;
diff --git a/numpy/core/src/npysort/heapsort.cpp b/numpy/core/src/npysort/heapsort.cpp
index de39367c2..77a4bda74 100644
--- a/numpy/core/src/npysort/heapsort.cpp
+++ b/numpy/core/src/npysort/heapsort.cpp
@@ -21,7 +21,7 @@
*
* The merge sort is *stable*, meaning that equal components
* are unmoved from their entry versions, so it can be used to
- * implement lexigraphic sorting on multiple keys.
+ * implement lexicographic sorting on multiple keys.
*
* The heap sort is included for completeness.
*/
@@ -55,6 +55,9 @@ npy_heapsort(void *start, npy_intp num, void *varr)
PyArrayObject *arr = (PyArrayObject *)varr;
npy_intp elsize = PyArray_ITEMSIZE(arr);
PyArray_CompareFunc *cmp = PyArray_DESCR(arr)->f->compare;
+ if (elsize == 0) {
+ return 0; /* no need for sorting elements of no size */
+ }
char *tmp = (char *)malloc(elsize);
char *a = (char *)start - elsize;
npy_intp i, j, l;
diff --git a/numpy/core/src/npysort/mergesort.cpp b/numpy/core/src/npysort/mergesort.cpp
index f892dd185..f53feb320 100644
--- a/numpy/core/src/npysort/mergesort.cpp
+++ b/numpy/core/src/npysort/mergesort.cpp
@@ -21,7 +21,7 @@
*
* The merge sort is *stable*, meaning that equal components
* are unmoved from their entry versions, so it can be used to
- * implement lexigraphic sorting on multiple keys.
+ * implement lexicographic sorting on multiple keys.
*
* The heap sort is included for completeness.
*/
@@ -171,6 +171,7 @@ amergesort_(type *v, npy_intp *tosort, npy_intp num)
}
/*
+
*****************************************************************************
** STRING SORTS **
*****************************************************************************
diff --git a/numpy/core/src/npysort/npysort_heapsort.h b/numpy/core/src/npysort/npysort_heapsort.h
index 442320094..16750b817 100644
--- a/numpy/core/src/npysort/npysort_heapsort.h
+++ b/numpy/core/src/npysort/npysort_heapsort.h
@@ -128,6 +128,10 @@ int string_heapsort_(type *start, npy_intp n, void *varr)
{
PyArrayObject *arr = (PyArrayObject *)varr;
size_t len = PyArray_ITEMSIZE(arr) / sizeof(type);
+ if (len == 0) {
+ return 0; /* no need for sorting if strings are empty */
+ }
+
type *tmp = (type *)malloc(PyArray_ITEMSIZE(arr));
type *a = (type *)start - len;
npy_intp i, j, l;
diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp
index 3e351dd84..7497ebaa3 100644
--- a/numpy/core/src/npysort/quicksort.cpp
+++ b/numpy/core/src/npysort/quicksort.cpp
@@ -54,15 +54,11 @@
#include "npysort_common.h"
#include "npysort_heapsort.h"
#include "numpy_tag.h"
+#include "simd_qsort.hpp"
-#include "x86-qsort.h"
#include <cstdlib>
#include <utility>
-#ifndef NPY_DISABLE_OPTIMIZATION
-#include "x86-qsort.dispatch.h"
-#endif
-
#define NOT_USED NPY_UNUSED(unused)
/*
* pushing largest partition has upper bound of log2(n) space
@@ -73,70 +69,50 @@
#define SMALL_MERGESORT 20
#define SMALL_STRING 16
+// Temporarily disable AVX512 sorting on WIN32 and CYGWIN until we can figure
+// out why it has test failures
+#if defined(_MSC_VER) || defined(__CYGWIN__)
+template<typename T>
+inline bool quicksort_dispatch(T*, npy_intp)
+{
+ return false;
+}
+#else
+template<typename T>
+inline bool quicksort_dispatch(T *start, npy_intp num)
+{
+ using TF = typename np::meta::FixedWidth<T>::Type;
+ void (*dispfunc)(TF*, intptr_t) = nullptr;
+ if (sizeof(T) == sizeof(uint16_t)) {
+ #ifndef NPY_DISABLE_OPTIMIZATION
+ #include "simd_qsort_16bit.dispatch.h"
+ #endif
+ NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
+ }
+ else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
+ #ifndef NPY_DISABLE_OPTIMIZATION
+ #include "simd_qsort.dispatch.h"
+ #endif
+ NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
+ }
+ if (dispfunc) {
+ (*dispfunc)(reinterpret_cast<TF*>(start), static_cast<intptr_t>(num));
+ return true;
+ }
+ return false;
+}
+#endif // _MSC_VER || CYGWIN
+
/*
*****************************************************************************
** NUMERIC SORTS **
*****************************************************************************
*/
-namespace {
-
-template <typename Tag>
-struct x86_dispatch {
- static bool quicksort(typename Tag::type *, npy_intp) { return false; }
-};
-
-template <>
-struct x86_dispatch<npy::int_tag> {
- static bool quicksort(npy_int *start, npy_intp num)
- {
- void (*dispfunc)(void *, npy_intp) = nullptr;
- NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_int);
- if (dispfunc) {
- (*dispfunc)(start, num);
- return true;
- }
- return false;
- }
-};
-
-template <>
-struct x86_dispatch<npy::uint_tag> {
- static bool quicksort(npy_uint *start, npy_intp num)
- {
- void (*dispfunc)(void *, npy_intp) = nullptr;
- NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_uint);
- if (dispfunc) {
- (*dispfunc)(start, num);
- return true;
- }
- return false;
- }
-};
-
-template <>
-struct x86_dispatch<npy::float_tag> {
- static bool quicksort(npy_float *start, npy_intp num)
- {
- void (*dispfunc)(void *, npy_intp) = nullptr;
- NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_float);
- if (dispfunc) {
- (*dispfunc)(start, num);
- return true;
- }
- return false;
- }
-};
-
-} // namespace
-
template <typename Tag, typename type>
static int
quicksort_(type *start, npy_intp num)
{
- if (x86_dispatch<Tag>::quicksort(start, num))
- return 0;
-
type vp;
type *pl = start;
type *pr = pl + num - 1;
@@ -729,56 +705,89 @@ quicksort_ubyte(void *start, npy_intp n, void *NPY_UNUSED(varr))
NPY_NO_EXPORT int
quicksort_short(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_short *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::short_tag>((npy_short *)start, n);
}
NPY_NO_EXPORT int
quicksort_ushort(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_ushort *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::ushort_tag>((npy_ushort *)start, n);
}
NPY_NO_EXPORT int
quicksort_int(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_int *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::int_tag>((npy_int *)start, n);
}
NPY_NO_EXPORT int
quicksort_uint(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_uint *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::uint_tag>((npy_uint *)start, n);
}
NPY_NO_EXPORT int
quicksort_long(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_long *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::long_tag>((npy_long *)start, n);
}
NPY_NO_EXPORT int
quicksort_ulong(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_ulong *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::ulong_tag>((npy_ulong *)start, n);
}
NPY_NO_EXPORT int
quicksort_longlong(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_longlong *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::longlong_tag>((npy_longlong *)start, n);
}
NPY_NO_EXPORT int
quicksort_ulonglong(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_ulonglong *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::ulonglong_tag>((npy_ulonglong *)start, n);
}
NPY_NO_EXPORT int
quicksort_half(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((np::Half *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::half_tag>((npy_half *)start, n);
}
NPY_NO_EXPORT int
quicksort_float(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_float *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::float_tag>((npy_float *)start, n);
}
NPY_NO_EXPORT int
quicksort_double(void *start, npy_intp n, void *NPY_UNUSED(varr))
{
+ if (quicksort_dispatch((npy_double *)start, n)) {
+ return 0;
+ }
return quicksort_<npy::double_tag>((npy_double *)start, n);
}
NPY_NO_EXPORT int
diff --git a/numpy/core/src/npysort/selection.cpp b/numpy/core/src/npysort/selection.cpp
index ebe188b71..450858df9 100644
--- a/numpy/core/src/npysort/selection.cpp
+++ b/numpy/core/src/npysort/selection.cpp
@@ -423,8 +423,8 @@ static constexpr std::array<arg_map, sizeof...(Tags)>
make_partition_map(npy::taglist<Tags...>)
{
return std::array<arg_map, sizeof...(Tags)>{
- arg_map{Tags::type_value, &introselect_noarg<Tags>,
- &introselect_arg<Tags>}...};
+ arg_map{Tags::type_value, {&introselect_noarg<Tags>},
+ {&introselect_arg<Tags>}}...};
}
struct partition_t {
diff --git a/numpy/core/src/npysort/simd_qsort.dispatch.cpp b/numpy/core/src/npysort/simd_qsort.dispatch.cpp
new file mode 100644
index 000000000..101bb3dcc
--- /dev/null
+++ b/numpy/core/src/npysort/simd_qsort.dispatch.cpp
@@ -0,0 +1,44 @@
+/*@targets
+ * $maxopt $keep_baseline avx512_skx
+ */
+// The $keep_baseline policy is used to avoid skipping the build of
+// avx512_skx when it is part of the baseline features (--cpu-baseline),
+// since the 'baseline' option isn't specified within targets.
+
+#include "simd_qsort.hpp"
+
+#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER)
+ #include "x86-simd-sort/src/avx512-32bit-qsort.hpp"
+ #include "x86-simd-sort/src/avx512-64bit-qsort.hpp"
+#endif
+
+namespace np { namespace qsort_simd {
+
+#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER)
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+#endif // NPY_HAVE_AVX512_SKX
+
+}} // namespace np::qsort_simd
diff --git a/numpy/core/src/npysort/simd_qsort.hpp b/numpy/core/src/npysort/simd_qsort.hpp
new file mode 100644
index 000000000..7cdee774d
--- /dev/null
+++ b/numpy/core/src/npysort/simd_qsort.hpp
@@ -0,0 +1,19 @@
+#ifndef NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
+#define NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
+
+#include "common.hpp"
+
+namespace np { namespace qsort_simd {
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "simd_qsort.dispatch.h"
+#endif
+NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t size))
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "simd_qsort_16bit.dispatch.h"
+#endif
+NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t size))
+
+} } // np::qsort_simd
+#endif // NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
diff --git a/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp
new file mode 100644
index 000000000..3f5099758
--- /dev/null
+++ b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp
@@ -0,0 +1,39 @@
+/*@targets
+ * $maxopt $keep_baseline avx512_icl avx512_spr
+ */
+// The $keep_baseline policy is used to avoid skipping the build of
+// avx512_icl/avx512_spr when they are part of the baseline features
+// (--cpu-baseline), since the 'baseline' option isn't specified within targets.
+
+#include "simd_qsort.hpp"
+
+#if defined(NPY_HAVE_AVX512_SPR) && !defined(_MSC_VER)
+ #include "x86-simd-sort/src/avx512fp16-16bit-qsort.hpp"
+#elif defined(NPY_HAVE_AVX512_ICL) && !defined(_MSC_VER)
+ #include "x86-simd-sort/src/avx512-16bit-qsort.hpp"
+#endif
+
+namespace np { namespace qsort_simd {
+
+#if !defined(_MSC_VER)
+#if defined(NPY_HAVE_AVX512_ICL) || defined(NPY_HAVE_AVX512_SPR)
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(Half *arr, intptr_t size)
+{
+#if defined(NPY_HAVE_AVX512_SPR)
+ avx512_qsort(reinterpret_cast<_Float16*>(arr), size);
+#else
+ avx512_qsort_fp16(reinterpret_cast<uint16_t*>(arr), size);
+#endif
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint16_t *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int16_t *arr, intptr_t size)
+{
+ avx512_qsort(arr, size);
+}
+#endif // NPY_HAVE_AVX512_ICL || SPR
+#endif // _MSC_VER
+
+}} // namespace np::qsort_simd
diff --git a/numpy/core/src/npysort/timsort.cpp b/numpy/core/src/npysort/timsort.cpp
index 27294af0c..9438fb293 100644
--- a/numpy/core/src/npysort/timsort.cpp
+++ b/numpy/core/src/npysort/timsort.cpp
@@ -21,7 +21,7 @@
*
* The merge sort is *stable*, meaning that equal components
* are unmoved from their entry versions, so it can be used to
- * implement lexigraphic sorting on multiple keys.
+ * implement lexicographic sorting on multiple keys.
*
* The heap sort is included for completeness.
*/
diff --git a/numpy/core/src/npysort/x86-qsort.dispatch.cpp b/numpy/core/src/npysort/x86-qsort.dispatch.cpp
deleted file mode 100644
index 8e88cc667..000000000
--- a/numpy/core/src/npysort/x86-qsort.dispatch.cpp
+++ /dev/null
@@ -1,835 +0,0 @@
-/*@targets
- * $maxopt $keep_baseline avx512_skx
- */
-// policy $keep_baseline is used to avoid skip building avx512_skx
-// when its part of baseline features (--cpu-baseline), since
-// 'baseline' option isn't specified within targets.
-
-#include "x86-qsort.h"
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#ifdef NPY_HAVE_AVX512_SKX
-#include "numpy/npy_math.h"
-
-#include "npy_sort.h"
-#include "numpy_tag.h"
-
-#include "simd/simd.h"
-#include <immintrin.h>
-
-template <typename Tag, typename type>
-NPY_NO_EXPORT int
-heapsort_(type *start, npy_intp n);
-
-/*
- * Quicksort using AVX-512 for int, uint32 and float. The ideas and code are
- * based on these two research papers:
- * (1) Fast and Robust Vectorized In-Place Sorting of Primitive Types
- * https://drops.dagstuhl.de/opus/volltexte/2021/13775/
- * (2) A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel
- * Skylake https://arxiv.org/pdf/1704.08579.pdf
- *
- * High level idea: Vectorize the quicksort partitioning using AVX-512
- * compressstore instructions. The algorithm to pick the pivot is to use median
- * of 72 elements picked at random. If the array size is < 128, then use
- * Bitonic sorting network. Good resource for bitonic sorting network:
- * http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030
- *
- * Refer to https://github.com/numpy/numpy/pull/20133#issuecomment-958110340
- * for potential problems when converting this code to universal intrinsics
- * framework.
- */
-
-/*
- * Constants used in sorting 16 elements in a ZMM registers. Based on Bitonic
- * sorting network (see
- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)
- */
-#define NETWORK1 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
-#define NETWORK2 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
-#define NETWORK3 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
-#define NETWORK4 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2
-#define NETWORK5 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-#define NETWORK6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4
-#define NETWORK7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
-#define ZMM_MAX_FLOAT _mm512_set1_ps(NPY_INFINITYF)
-#define ZMM_MAX_UINT _mm512_set1_epi32(NPY_MAX_UINT32)
-#define ZMM_MAX_INT _mm512_set1_epi32(NPY_MAX_INT32)
-#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d
-#define SHUFFLE_ps(ZMM, MASK) _mm512_shuffle_ps(zmm, zmm, MASK)
-#define SHUFFLE_epi32(ZMM, MASK) _mm512_shuffle_epi32(zmm, MASK)
-
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-
-/*
- * Vectorized random number generator xoroshiro128+. Broken into 2 parts:
- * (1) vnext generates 2 64-bit random integers
- * (2) rnd_epu32 converts this to 4 32-bit random integers and bounds it to
- * the length of the array
- */
-#define VROTL(x, k) /* rotate each uint64_t value in vector */ \
- _mm256_or_si256(_mm256_slli_epi64((x), (k)), \
- _mm256_srli_epi64((x), 64 - (k)))
-
-static inline __m256i
-vnext(__m256i *s0, __m256i *s1)
-{
- *s1 = _mm256_xor_si256(*s0, *s1); /* modify vectors s1 and s0 */
- *s0 = _mm256_xor_si256(_mm256_xor_si256(VROTL(*s0, 24), *s1),
- _mm256_slli_epi64(*s1, 16));
- *s1 = VROTL(*s1, 37);
- return _mm256_add_epi64(*s0, *s1); /* return random vector */
-}
-
-/* transform random numbers to the range between 0 and bound - 1 */
-static inline __m256i
-rnd_epu32(__m256i rnd_vec, __m256i bound)
-{
- __m256i even = _mm256_srli_epi64(_mm256_mul_epu32(rnd_vec, bound), 32);
- __m256i odd = _mm256_mul_epu32(_mm256_srli_epi64(rnd_vec, 32), bound);
- return _mm256_blend_epi32(odd, even, 0b01010101);
-}
-
-template <typename type>
-struct vector;
-
-template <>
-struct vector<npy_int> {
- using tag = npy::int_tag;
- using type_t = npy_int;
- using zmm_t = __m512i;
- using ymm_t = __m256i;
-
- static type_t type_max() { return NPY_MAX_INT32; }
- static type_t type_min() { return NPY_MIN_INT32; }
- static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); }
-
- static __mmask16 ge(zmm_t x, zmm_t y)
- {
- return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT);
- }
- template <int scale>
- static ymm_t i64gather(__m512i index, void const *base)
- {
- return _mm512_i64gather_epi32(index, base, scale);
- }
- static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
- static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epi32(x, y); }
- static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x)
- {
- return _mm512_mask_compressstoreu_epi32(mem, mask, x);
- }
- static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem)
- {
- return _mm512_mask_loadu_epi32(x, mask, mem);
- }
- static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y)
- {
- return _mm512_mask_mov_epi32(x, mask, y);
- }
- static void mask_storeu(void *mem, __mmask16 mask, zmm_t x)
- {
- return _mm512_mask_storeu_epi32(mem, mask, x);
- }
- static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epi32(x, y); }
- static zmm_t permutexvar(__m512i idx, zmm_t zmm)
- {
- return _mm512_permutexvar_epi32(idx, zmm);
- }
- static type_t reducemax(zmm_t v) { return npyv_reduce_max_s32(v); }
- static type_t reducemin(zmm_t v) { return npyv_reduce_min_s32(v); }
- static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); }
- template<__mmask16 mask>
- static zmm_t shuffle(zmm_t zmm)
- {
- return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask);
- }
- static void storeu(void *mem, zmm_t x)
- {
- return _mm512_storeu_si512(mem, x);
- }
-
- static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epi32(x, y); }
- static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epi32(x, y); }
-};
-template <>
-struct vector<npy_uint> {
- using tag = npy::uint_tag;
- using type_t = npy_uint;
- using zmm_t = __m512i;
- using ymm_t = __m256i;
-
- static type_t type_max() { return NPY_MAX_UINT32; }
- static type_t type_min() { return 0; }
- static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); }
-
- template<int scale>
- static ymm_t i64gather(__m512i index, void const *base)
- {
- return _mm512_i64gather_epi32(index, base, scale);
- }
- static __mmask16 ge(zmm_t x, zmm_t y)
- {
- return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT);
- }
- static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
- static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epu32(x, y); }
- static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x)
- {
- return _mm512_mask_compressstoreu_epi32(mem, mask, x);
- }
- static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem)
- {
- return _mm512_mask_loadu_epi32(x, mask, mem);
- }
- static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y)
- {
- return _mm512_mask_mov_epi32(x, mask, y);
- }
- static void mask_storeu(void *mem, __mmask16 mask, zmm_t x)
- {
- return _mm512_mask_storeu_epi32(mem, mask, x);
- }
- static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epu32(x, y); }
- static zmm_t permutexvar(__m512i idx, zmm_t zmm)
- {
- return _mm512_permutexvar_epi32(idx, zmm);
- }
- static type_t reducemax(zmm_t v) { return npyv_reduce_max_u32(v); }
- static type_t reducemin(zmm_t v) { return npyv_reduce_min_u32(v); }
- static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); }
- template<__mmask16 mask>
- static zmm_t shuffle(zmm_t zmm)
- {
- return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask);
- }
- static void storeu(void *mem, zmm_t x)
- {
- return _mm512_storeu_si512(mem, x);
- }
-
- static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epu32(x, y); }
- static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epu32(x, y); }
-};
-template <>
-struct vector<npy_float> {
- using tag = npy::float_tag;
- using type_t = npy_float;
- using zmm_t = __m512;
- using ymm_t = __m256;
-
- static type_t type_max() { return NPY_INFINITYF; }
- static type_t type_min() { return -NPY_INFINITYF; }
- static zmm_t zmm_max() { return _mm512_set1_ps(type_max()); }
-
- static __mmask16 ge(zmm_t x, zmm_t y)
- {
- return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ);
- }
- template<int scale>
- static ymm_t i64gather(__m512i index, void const *base)
- {
- return _mm512_i64gather_ps(index, base, scale);
- }
- static zmm_t loadu(void const *mem) { return _mm512_loadu_ps(mem); }
- static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_ps(x, y); }
- static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x)
- {
- return _mm512_mask_compressstoreu_ps(mem, mask, x);
- }
- static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem)
- {
- return _mm512_mask_loadu_ps(x, mask, mem);
- }
- static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y)
- {
- return _mm512_mask_mov_ps(x, mask, y);
- }
- static void mask_storeu(void *mem, __mmask16 mask, zmm_t x)
- {
- return _mm512_mask_storeu_ps(mem, mask, x);
- }
- static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_ps(x, y); }
- static zmm_t permutexvar(__m512i idx, zmm_t zmm)
- {
- return _mm512_permutexvar_ps(idx, zmm);
- }
- static type_t reducemax(zmm_t v) { return npyv_reduce_max_f32(v); }
- static type_t reducemin(zmm_t v) { return npyv_reduce_min_f32(v); }
- static zmm_t set1(type_t v) { return _mm512_set1_ps(v); }
- template<__mmask16 mask>
- static zmm_t shuffle(zmm_t zmm)
- {
- return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask);
- }
- static void storeu(void *mem, zmm_t x) { return _mm512_storeu_ps(mem, x); }
-
- static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_ps(x, y); }
- static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_ps(x, y); }
-};
-
-/*
- * COEX == Compare and Exchange two registers by swapping min and max values
- */
-template <typename vtype, typename mm_t>
-void
-COEX(mm_t &a, mm_t &b)
-{
- mm_t temp = a;
- a = vtype::min(a, b);
- b = vtype::max(temp, b);
-}
-
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline zmm_t
-cmp_merge(zmm_t in1, zmm_t in2, __mmask16 mask)
-{
- zmm_t min = vtype::min(in2, in1);
- zmm_t max = vtype::max(in2, in1);
- return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max
-}
-
-/*
- * Assumes zmm is random and performs a full sorting network defined in
- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
- */
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline zmm_t
-sort_zmm(zmm_t zmm)
-{
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
- 0xAAAA);
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(0, 1, 2, 3)>(zmm),
- 0xCCCC);
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
- 0xAAAA);
- zmm = cmp_merge<vtype>(
- zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK3), zmm), 0xF0F0);
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm),
- 0xCCCC);
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
- 0xAAAA);
- zmm = cmp_merge<vtype>(
- zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm), 0xFF00);
- zmm = cmp_merge<vtype>(
- zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK6), zmm), 0xF0F0);
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm),
- 0xCCCC);
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
- 0xAAAA);
- return zmm;
-}
-
-// Assumes zmm is bitonic and performs a recursive half cleaner
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline zmm_t
-bitonic_merge_zmm(zmm_t zmm)
-{
- // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc ..
- zmm = cmp_merge<vtype>(
- zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK7), zmm), 0xFF00);
- // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc ..
- zmm = cmp_merge<vtype>(
- zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK6), zmm), 0xF0F0);
- // 3) half_cleaner[4]
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm),
- 0xCCCC);
- // 3) half_cleaner[1]
- zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
- 0xAAAA);
- return zmm;
-}
-
-// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline void
-bitonic_merge_two_zmm(zmm_t *zmm1, zmm_t *zmm2)
-{
- // 1) First step of a merging network: coex of zmm1 and zmm2 reversed
- *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), *zmm2);
- zmm_t zmm3 = vtype::min(*zmm1, *zmm2);
- zmm_t zmm4 = vtype::max(*zmm1, *zmm2);
- // 2) Recursive half cleaner for each
- *zmm1 = bitonic_merge_zmm<vtype>(zmm3);
- *zmm2 = bitonic_merge_zmm<vtype>(zmm4);
-}
-
-// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
-// half cleaner
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline void
-bitonic_merge_four_zmm(zmm_t *zmm)
-{
- zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[2]);
- zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[3]);
- zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r);
- zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r);
- zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
- vtype::max(zmm[1], zmm2r));
- zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
- vtype::max(zmm[0], zmm3r));
- zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2);
- zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2);
- zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4);
- zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4);
- zmm[0] = bitonic_merge_zmm<vtype>(zmm0);
- zmm[1] = bitonic_merge_zmm<vtype>(zmm1);
- zmm[2] = bitonic_merge_zmm<vtype>(zmm2);
- zmm[3] = bitonic_merge_zmm<vtype>(zmm3);
-}
-
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline void
-bitonic_merge_eight_zmm(zmm_t *zmm)
-{
- zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[4]);
- zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[5]);
- zmm_t zmm6r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[6]);
- zmm_t zmm7r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[7]);
- zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r);
- zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r);
- zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r);
- zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r);
- zmm_t zmm_t5 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
- vtype::max(zmm[3], zmm4r));
- zmm_t zmm_t6 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
- vtype::max(zmm[2], zmm5r));
- zmm_t zmm_t7 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
- vtype::max(zmm[1], zmm6r));
- zmm_t zmm_t8 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
- vtype::max(zmm[0], zmm7r));
- COEX<vtype>(zmm_t1, zmm_t3);
- COEX<vtype>(zmm_t2, zmm_t4);
- COEX<vtype>(zmm_t5, zmm_t7);
- COEX<vtype>(zmm_t6, zmm_t8);
- COEX<vtype>(zmm_t1, zmm_t2);
- COEX<vtype>(zmm_t3, zmm_t4);
- COEX<vtype>(zmm_t5, zmm_t6);
- COEX<vtype>(zmm_t7, zmm_t8);
- zmm[0] = bitonic_merge_zmm<vtype>(zmm_t1);
- zmm[1] = bitonic_merge_zmm<vtype>(zmm_t2);
- zmm[2] = bitonic_merge_zmm<vtype>(zmm_t3);
- zmm[3] = bitonic_merge_zmm<vtype>(zmm_t4);
- zmm[4] = bitonic_merge_zmm<vtype>(zmm_t5);
- zmm[5] = bitonic_merge_zmm<vtype>(zmm_t6);
- zmm[6] = bitonic_merge_zmm<vtype>(zmm_t7);
- zmm[7] = bitonic_merge_zmm<vtype>(zmm_t8);
-}
-
-template <typename vtype, typename type_t>
-static inline void
-sort_16(type_t *arr, npy_int N)
-{
- __mmask16 load_mask = (0x0001 << N) - 0x0001;
- typename vtype::zmm_t zmm =
- vtype::mask_loadu(vtype::zmm_max(), load_mask, arr);
- vtype::mask_storeu(arr, load_mask, sort_zmm<vtype>(zmm));
-}
-
-template <typename vtype, typename type_t>
-static inline void
-sort_32(type_t *arr, npy_int N)
-{
- if (N <= 16) {
- sort_16<vtype>(arr, N);
- return;
- }
- using zmm_t = typename vtype::zmm_t;
- zmm_t zmm1 = vtype::loadu(arr);
- __mmask16 load_mask = (0x0001 << (N - 16)) - 0x0001;
- zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 16);
- zmm1 = sort_zmm<vtype>(zmm1);
- zmm2 = sort_zmm<vtype>(zmm2);
- bitonic_merge_two_zmm<vtype>(&zmm1, &zmm2);
- vtype::storeu(arr, zmm1);
- vtype::mask_storeu(arr + 16, load_mask, zmm2);
-}
-
-template <typename vtype, typename type_t>
-static inline void
-sort_64(type_t *arr, npy_int N)
-{
- if (N <= 32) {
- sort_32<vtype>(arr, N);
- return;
- }
- using zmm_t = typename vtype::zmm_t;
- zmm_t zmm[4];
- zmm[0] = vtype::loadu(arr);
- zmm[1] = vtype::loadu(arr + 16);
- __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF;
- if (N < 48) {
- load_mask1 = (0x0001 << (N - 32)) - 0x0001;
- load_mask2 = 0x0000;
- }
- else if (N < 64) {
- load_mask2 = (0x0001 << (N - 48)) - 0x0001;
- }
- zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32);
- zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 48);
- zmm[0] = sort_zmm<vtype>(zmm[0]);
- zmm[1] = sort_zmm<vtype>(zmm[1]);
- zmm[2] = sort_zmm<vtype>(zmm[2]);
- zmm[3] = sort_zmm<vtype>(zmm[3]);
- bitonic_merge_two_zmm<vtype>(&zmm[0], &zmm[1]);
- bitonic_merge_two_zmm<vtype>(&zmm[2], &zmm[3]);
- bitonic_merge_four_zmm<vtype>(zmm);
- vtype::storeu(arr, zmm[0]);
- vtype::storeu(arr + 16, zmm[1]);
- vtype::mask_storeu(arr + 32, load_mask1, zmm[2]);
- vtype::mask_storeu(arr + 48, load_mask2, zmm[3]);
-}
-
-template <typename vtype, typename type_t>
-static inline void
-sort_128(type_t *arr, npy_int N)
-{
- if (N <= 64) {
- sort_64<vtype>(arr, N);
- return;
- }
- using zmm_t = typename vtype::zmm_t;
- zmm_t zmm[8];
- zmm[0] = vtype::loadu(arr);
- zmm[1] = vtype::loadu(arr + 16);
- zmm[2] = vtype::loadu(arr + 32);
- zmm[3] = vtype::loadu(arr + 48);
- zmm[0] = sort_zmm<vtype>(zmm[0]);
- zmm[1] = sort_zmm<vtype>(zmm[1]);
- zmm[2] = sort_zmm<vtype>(zmm[2]);
- zmm[3] = sort_zmm<vtype>(zmm[3]);
- __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF;
- __mmask16 load_mask3 = 0xFFFF, load_mask4 = 0xFFFF;
- if (N < 80) {
- load_mask1 = (0x0001 << (N - 64)) - 0x0001;
- load_mask2 = 0x0000;
- load_mask3 = 0x0000;
- load_mask4 = 0x0000;
- }
- else if (N < 96) {
- load_mask2 = (0x0001 << (N - 80)) - 0x0001;
- load_mask3 = 0x0000;
- load_mask4 = 0x0000;
- }
- else if (N < 112) {
- load_mask3 = (0x0001 << (N - 96)) - 0x0001;
- load_mask4 = 0x0000;
- }
- else {
- load_mask4 = (0x0001 << (N - 112)) - 0x0001;
- }
- zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64);
- zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 80);
- zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 96);
- zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 112);
- zmm[4] = sort_zmm<vtype>(zmm[4]);
- zmm[5] = sort_zmm<vtype>(zmm[5]);
- zmm[6] = sort_zmm<vtype>(zmm[6]);
- zmm[7] = sort_zmm<vtype>(zmm[7]);
- bitonic_merge_two_zmm<vtype>(&zmm[0], &zmm[1]);
- bitonic_merge_two_zmm<vtype>(&zmm[2], &zmm[3]);
- bitonic_merge_two_zmm<vtype>(&zmm[4], &zmm[5]);
- bitonic_merge_two_zmm<vtype>(&zmm[6], &zmm[7]);
- bitonic_merge_four_zmm<vtype>(zmm);
- bitonic_merge_four_zmm<vtype>(zmm + 4);
- bitonic_merge_eight_zmm<vtype>(zmm);
- vtype::storeu(arr, zmm[0]);
- vtype::storeu(arr + 16, zmm[1]);
- vtype::storeu(arr + 32, zmm[2]);
- vtype::storeu(arr + 48, zmm[3]);
- vtype::mask_storeu(arr + 64, load_mask1, zmm[4]);
- vtype::mask_storeu(arr + 80, load_mask2, zmm[5]);
- vtype::mask_storeu(arr + 96, load_mask3, zmm[6]);
- vtype::mask_storeu(arr + 112, load_mask4, zmm[7]);
-}
-
-template <typename type_t>
-static inline void
-swap(type_t *arr, npy_intp ii, npy_intp jj)
-{
- type_t temp = arr[ii];
- arr[ii] = arr[jj];
- arr[jj] = temp;
-}
-
-// Median of 3 strategy
-// template<typename type_t>
-// static inline
-// npy_intp get_pivot_index(type_t *arr, const npy_intp left, const npy_intp
-// right) {
-// return (rand() % (right + 1 - left)) + left;
-// //npy_intp middle = ((right-left)/2) + left;
-// //type_t a = arr[left], b = arr[middle], c = arr[right];
-// //if ((b >= a && b <= c) || (b <= a && b >= c))
-// // return middle;
-// //if ((a >= b && a <= c) || (a <= b && a >= c))
-// // return left;
-// //else
-// // return right;
-//}
-
-/*
- * Picking the pivot: Median of 72 array elements chosen at random.
- */
-
-template <typename vtype, typename type_t>
-static inline type_t
-get_pivot(type_t *arr, const npy_intp left, const npy_intp right)
-{
- /* seeds for vectorized random number generator */
- __m256i s0 = _mm256_setr_epi64x(8265987198341093849, 3762817312854612374,
- 1324281658759788278, 6214952190349879213);
- __m256i s1 = _mm256_setr_epi64x(2874178529384792648, 1257248936691237653,
- 7874578921548791257, 1998265912745817298);
- s0 = _mm256_add_epi64(s0, _mm256_set1_epi64x(left));
- s1 = _mm256_sub_epi64(s1, _mm256_set1_epi64x(right));
-
- npy_intp arrsize = right - left + 1;
- __m256i bound =
- _mm256_set1_epi32(arrsize > INT32_MAX ? INT32_MAX : arrsize);
- __m512i left_vec = _mm512_set1_epi64(left);
- __m512i right_vec = _mm512_set1_epi64(right);
- using ymm_t = typename vtype::ymm_t;
- ymm_t v[9];
- /* fill 9 vectors with random numbers */
- for (npy_int i = 0; i < 9; ++i) {
- __m256i rand_64 = vnext(&s0, &s1); /* vector with 4 random uint64_t */
- __m512i rand_32 = _mm512_cvtepi32_epi64(rnd_epu32(
- rand_64, bound)); /* random numbers between 0 and bound - 1 */
- __m512i indices;
- if (i < 5)
- indices =
- _mm512_add_epi64(left_vec, rand_32); /* indices for arr */
- else
- indices =
- _mm512_sub_epi64(right_vec, rand_32); /* indices for arr */
-
- v[i] = vtype::template i64gather<sizeof(type_t)>(indices, arr);
- }
-
- /* median network for 9 elements */
- COEX<vtype>(v[0], v[1]);
- COEX<vtype>(v[2], v[3]);
- COEX<vtype>(v[4], v[5]);
- COEX<vtype>(v[6], v[7]);
- COEX<vtype>(v[0], v[2]);
- COEX<vtype>(v[1], v[3]);
- COEX<vtype>(v[4], v[6]);
- COEX<vtype>(v[5], v[7]);
- COEX<vtype>(v[0], v[4]);
- COEX<vtype>(v[1], v[2]);
- COEX<vtype>(v[5], v[6]);
- COEX<vtype>(v[3], v[7]);
- COEX<vtype>(v[1], v[5]);
- COEX<vtype>(v[2], v[6]);
- COEX<vtype>(v[3], v[5]);
- COEX<vtype>(v[2], v[4]);
- COEX<vtype>(v[3], v[4]);
- COEX<vtype>(v[3], v[8]);
- COEX<vtype>(v[4], v[8]);
-
- // technically v[4] needs to be sorted before we pick the correct median,
- // picking the 4th element works just as well for performance
- type_t *temp = (type_t *)&v[4];
-
- return temp[4];
-}
-
-/*
- * Partition one ZMM register based on the pivot and returns the index of the
- * last element that is less than equal to the pivot.
- */
-template <typename vtype, typename type_t, typename zmm_t>
-static inline npy_int
-partition_vec(type_t *arr, npy_intp left, npy_intp right, const zmm_t curr_vec,
- const zmm_t pivot_vec, zmm_t *smallest_vec, zmm_t *biggest_vec)
-{
- /* which elements are larger than the pivot */
- __mmask16 gt_mask = vtype::ge(curr_vec, pivot_vec);
- npy_int amount_gt_pivot = _mm_popcnt_u32((npy_int)gt_mask);
- vtype::mask_compressstoreu(arr + left, _mm512_knot(gt_mask), curr_vec);
- vtype::mask_compressstoreu(arr + right - amount_gt_pivot, gt_mask,
- curr_vec);
- *smallest_vec = vtype::min(curr_vec, *smallest_vec);
- *biggest_vec = vtype::max(curr_vec, *biggest_vec);
- return amount_gt_pivot;
-}
-
-/*
- * Partition an array based on the pivot and returns the index of the
- * last element that is less than equal to the pivot.
- */
-template <typename vtype, typename type_t>
-static inline npy_intp
-partition_avx512(type_t *arr, npy_intp left, npy_intp right, type_t pivot,
- type_t *smallest, type_t *biggest)
-{
- /* make array length divisible by 16 , shortening the array */
- for (npy_int i = (right - left) % 16; i > 0; --i) {
- *smallest = MIN(*smallest, arr[left]);
- *biggest = MAX(*biggest, arr[left]);
- if (arr[left] > pivot) {
- swap(arr, left, --right);
- }
- else {
- ++left;
- }
- }
-
- if (left == right)
- return left; /* less than 16 elements in the array */
-
- using zmm_t = typename vtype::zmm_t;
- zmm_t pivot_vec = vtype::set1(pivot);
- zmm_t min_vec = vtype::set1(*smallest);
- zmm_t max_vec = vtype::set1(*biggest);
-
- if (right - left == 16) {
- zmm_t vec = vtype::loadu(arr + left);
- npy_int amount_gt_pivot = partition_vec<vtype>(
- arr, left, left + 16, vec, pivot_vec, &min_vec, &max_vec);
- *smallest = vtype::reducemin(min_vec);
- *biggest = vtype::reducemax(max_vec);
- return left + (16 - amount_gt_pivot);
- }
-
- // first and last 16 values are partitioned at the end
- zmm_t vec_left = vtype::loadu(arr + left);
- zmm_t vec_right = vtype::loadu(arr + (right - 16));
- // store points of the vectors
- npy_intp r_store = right - 16;
- npy_intp l_store = left;
- // indices for loading the elements
- left += 16;
- right -= 16;
- while (right - left != 0) {
- zmm_t curr_vec;
- /*
- * if fewer elements are stored on the right side of the array,
- * then next elements are loaded from the right side,
- * otherwise from the left side
- */
- if ((r_store + 16) - right < left - l_store) {
- right -= 16;
- curr_vec = vtype::loadu(arr + right);
- }
- else {
- curr_vec = vtype::loadu(arr + left);
- left += 16;
- }
- // partition the current vector and save it on both sides of the array
- npy_int amount_gt_pivot =
- partition_vec<vtype>(arr, l_store, r_store + 16, curr_vec,
- pivot_vec, &min_vec, &max_vec);
- ;
- r_store -= amount_gt_pivot;
- l_store += (16 - amount_gt_pivot);
- }
-
- /* partition and save vec_left and vec_right */
- npy_int amount_gt_pivot =
- partition_vec<vtype>(arr, l_store, r_store + 16, vec_left,
- pivot_vec, &min_vec, &max_vec);
- l_store += (16 - amount_gt_pivot);
- amount_gt_pivot =
- partition_vec<vtype>(arr, l_store, l_store + 16, vec_right,
- pivot_vec, &min_vec, &max_vec);
- l_store += (16 - amount_gt_pivot);
- *smallest = vtype::reducemin(min_vec);
- *biggest = vtype::reducemax(max_vec);
- return l_store;
-}
-
-template <typename vtype, typename type_t>
-static inline void
-qsort_(type_t *arr, npy_intp left, npy_intp right, npy_int max_iters)
-{
- /*
- * Resort to heapsort if quicksort isn't making any progress
- */
- if (max_iters <= 0) {
- heapsort_<typename vtype::tag>(arr + left, right + 1 - left);
- return;
- }
- /*
- * Base case: use bitonic networks to sort arrays <= 128
- */
- if (right + 1 - left <= 128) {
- sort_128<vtype>(arr + left, (npy_int)(right + 1 - left));
- return;
- }
-
- type_t pivot = get_pivot<vtype>(arr, left, right);
- type_t smallest = vtype::type_max();
- type_t biggest = vtype::type_min();
- npy_intp pivot_index = partition_avx512<vtype>(arr, left, right + 1, pivot,
- &smallest, &biggest);
- if (pivot != smallest)
- qsort_<vtype>(arr, left, pivot_index - 1, max_iters - 1);
- if (pivot != biggest)
- qsort_<vtype>(arr, pivot_index, right, max_iters - 1);
-}
-
-static inline npy_intp
-replace_nan_with_inf(npy_float *arr, npy_intp arrsize)
-{
- npy_intp nan_count = 0;
- __mmask16 loadmask = 0xFFFF;
- while (arrsize > 0) {
- if (arrsize < 16) {
- loadmask = (0x0001 << arrsize) - 0x0001;
- }
- __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr);
- __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ);
- nan_count += _mm_popcnt_u32((npy_int)nanmask);
- _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT);
- arr += 16;
- arrsize -= 16;
- }
- return nan_count;
-}
-
-static inline void
-replace_inf_with_nan(npy_float *arr, npy_intp arrsize, npy_intp nan_count)
-{
- for (npy_intp ii = arrsize - 1; nan_count > 0; --ii) {
- arr[ii] = NPY_NANF;
- nan_count -= 1;
- }
-}
-
-/***************************************
- * C > C++ dispatch
- ***************************************/
-
-NPY_NO_EXPORT void
-NPY_CPU_DISPATCH_CURFX(x86_quicksort_int)(void *arr, npy_intp arrsize)
-{
- if (arrsize > 1) {
- qsort_<vector<npy_int>, npy_int>((npy_int *)arr, 0, arrsize - 1,
- 2 * (npy_int)log2(arrsize));
- }
-}
-
-NPY_NO_EXPORT void
-NPY_CPU_DISPATCH_CURFX(x86_quicksort_uint)(void *arr, npy_intp arrsize)
-{
- if (arrsize > 1) {
- qsort_<vector<npy_uint>, npy_uint>((npy_uint *)arr, 0, arrsize - 1,
- 2 * (npy_int)log2(arrsize));
- }
-}
-
-NPY_NO_EXPORT void
-NPY_CPU_DISPATCH_CURFX(x86_quicksort_float)(void *arr, npy_intp arrsize)
-{
- if (arrsize > 1) {
- npy_intp nan_count = replace_nan_with_inf((npy_float *)arr, arrsize);
- qsort_<vector<npy_float>, npy_float>((npy_float *)arr, 0, arrsize - 1,
- 2 * (npy_int)log2(arrsize));
- replace_inf_with_nan((npy_float *)arr, arrsize, nan_count);
- }
-}
-
-#endif // NPY_HAVE_AVX512_SKX
diff --git a/numpy/core/src/npysort/x86-qsort.h b/numpy/core/src/npysort/x86-qsort.h
deleted file mode 100644
index 6340e2bc7..000000000
--- a/numpy/core/src/npysort/x86-qsort.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#include "numpy/npy_common.h"
-
-#include "npy_cpu_dispatch.h"
-
-#ifndef NPY_NO_EXPORT
-#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
-#endif
-
-#ifndef NPY_DISABLE_OPTIMIZATION
-#include "x86-qsort.dispatch.h"
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_int,
- (void *start, npy_intp num))
-
-NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_uint,
- (void *start, npy_intp num))
-
-NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_float,
- (void *start, npy_intp num))
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/numpy/core/src/npysort/x86-simd-sort b/numpy/core/src/npysort/x86-simd-sort
new file mode 160000
+Subproject ac6c10cd71a8aaa93996860f33dd81e15447f58
diff --git a/numpy/core/src/umath/_scaled_float_dtype.c b/numpy/core/src/umath/_scaled_float_dtype.c
index 2d6ba3574..c26ace9f1 100644
--- a/numpy/core/src/umath/_scaled_float_dtype.c
+++ b/numpy/core/src/umath/_scaled_float_dtype.c
@@ -26,6 +26,14 @@
#include "dispatching.h"
+/* TODO: from wrapping_array_method.c, use proper public header eventually */
+NPY_NO_EXPORT int
+PyUFunc_AddWrappingLoop(PyObject *ufunc_obj,
+ PyArray_DTypeMeta *new_dtypes[], PyArray_DTypeMeta *wrapped_dtypes[],
+ translate_given_descrs_func *translate_given_descrs,
+ translate_loop_descrs_func *translate_loop_descrs);
+
+
typedef struct {
PyArray_Descr base;
double scaling;
@@ -240,7 +248,7 @@ static PyArray_DTypeMeta PyArray_SFloatDType = {{{
}},
.type_num = -1,
.scalar_type = NULL,
- .flags = NPY_DT_PARAMETRIC,
+ .flags = NPY_DT_PARAMETRIC | NPY_DT_NUMERIC,
.dt_slots = &sfloat_slots,
};
@@ -439,7 +447,7 @@ sfloat_to_bool_resolve_descriptors(
static int
-init_casts(void)
+sfloat_init_casts(void)
{
PyArray_DTypeMeta *dtypes[2] = {&PyArray_SFloatDType, &PyArray_SFloatDType};
PyType_Slot slots[4] = {{0, NULL}};
@@ -645,13 +653,55 @@ add_sfloats_resolve_descriptors(
}
+/*
+ * We define the hypot loop using the "PyUFunc_AddWrappingLoop" API.
+ * We use this very narrowly for mapping to the double hypot loop currently.
+ */
static int
-add_loop(const char *ufunc_name,
- PyArray_DTypeMeta *dtypes[3], PyObject *meth_or_promoter)
+translate_given_descrs_to_double(
+ int nin, int nout, PyArray_DTypeMeta *wrapped_dtypes[],
+ PyArray_Descr *given_descrs[], PyArray_Descr *new_descrs[])
+{
+ assert(nin == 2 && nout == 1);
+ for (int i = 0; i < 3; i++) {
+ if (given_descrs[i] == NULL) {
+ new_descrs[i] = NULL;
+ }
+ else {
+ new_descrs[i] = PyArray_DescrFromType(NPY_DOUBLE);
+ }
+ }
+ return 0;
+}
+
+
+static int
+translate_loop_descrs(
+ int nin, int nout, PyArray_DTypeMeta *new_dtypes[],
+ PyArray_Descr *given_descrs[],
+ PyArray_Descr *NPY_UNUSED(original_descrs[]),
+ PyArray_Descr *loop_descrs[])
+{
+ assert(nin == 2 && nout == 1);
+ loop_descrs[0] = sfloat_common_instance(
+ given_descrs[0], given_descrs[1]);
+ if (loop_descrs[0] == 0) {
+ return -1;
+ }
+ Py_INCREF(loop_descrs[0]);
+ loop_descrs[1] = loop_descrs[0];
+ Py_INCREF(loop_descrs[0]);
+ loop_descrs[2] = loop_descrs[0];
+ return 0;
+}
+
+
+static PyObject *
+sfloat_get_ufunc(const char *ufunc_name)
{
PyObject *mod = PyImport_ImportModule("numpy");
if (mod == NULL) {
- return -1;
+ return NULL;
}
PyObject *ufunc = PyObject_GetAttrString(mod, ufunc_name);
Py_DECREF(mod);
@@ -659,6 +709,18 @@ add_loop(const char *ufunc_name,
Py_DECREF(ufunc);
PyErr_Format(PyExc_TypeError,
"numpy.%s was not a ufunc!", ufunc_name);
+ return NULL;
+ }
+ return ufunc;
+}
+
+
+static int
+sfloat_add_loop(const char *ufunc_name,
+ PyArray_DTypeMeta *dtypes[3], PyObject *meth_or_promoter)
+{
+ PyObject *ufunc = sfloat_get_ufunc(ufunc_name);
+ if (ufunc == NULL) {
return -1;
}
PyObject *dtype_tup = PyArray_TupleFromItems(3, (PyObject **)dtypes, 1);
@@ -679,6 +741,24 @@ add_loop(const char *ufunc_name,
}
+static int
+sfloat_add_wrapping_loop(const char *ufunc_name, PyArray_DTypeMeta *dtypes[3])
+{
+ PyObject *ufunc = sfloat_get_ufunc(ufunc_name);
+ if (ufunc == NULL) {
+ return -1;
+ }
+ PyArray_DTypeMeta *double_dt = PyArray_DTypeFromTypeNum(NPY_DOUBLE);
+ PyArray_DTypeMeta *wrapped_dtypes[3] = {double_dt, double_dt, double_dt};
+ int res = PyUFunc_AddWrappingLoop(
+ ufunc, dtypes, wrapped_dtypes, &translate_given_descrs_to_double,
+ &translate_loop_descrs);
+ Py_DECREF(ufunc);
+ Py_DECREF(double_dt);
+
+ return res;
+}
+
/*
* We add some very basic promoters to allow multiplying normal and scaled
@@ -706,7 +786,7 @@ promote_to_sfloat(PyUFuncObject *NPY_UNUSED(ufunc),
* get less so with the introduction of public API).
*/
static int
-init_ufuncs(void) {
+sfloat_init_ufuncs(void) {
PyArray_DTypeMeta *dtypes[3] = {
&PyArray_SFloatDType, &PyArray_SFloatDType, &PyArray_SFloatDType};
PyType_Slot slots[3] = {{0, NULL}};
@@ -727,7 +807,7 @@ init_ufuncs(void) {
if (bmeth == NULL) {
return -1;
}
- int res = add_loop("multiply",
+ int res = sfloat_add_loop("multiply",
bmeth->dtypes, (PyObject *)bmeth->method);
Py_DECREF(bmeth);
if (res < 0) {
@@ -745,13 +825,18 @@ init_ufuncs(void) {
if (bmeth == NULL) {
return -1;
}
- res = add_loop("add",
+ res = sfloat_add_loop("add",
bmeth->dtypes, (PyObject *)bmeth->method);
Py_DECREF(bmeth);
if (res < 0) {
return -1;
}
+ /* N.B.: Wrapping isn't actually correct if scaling can be negative */
+ if (sfloat_add_wrapping_loop("hypot", dtypes) < 0) {
+ return -1;
+ }
+
/*
* Add a promoter for both directions of multiply with double.
*/
@@ -766,14 +851,14 @@ init_ufuncs(void) {
if (promoter == NULL) {
return -1;
}
- res = add_loop("multiply", promoter_dtypes, promoter);
+ res = sfloat_add_loop("multiply", promoter_dtypes, promoter);
if (res < 0) {
Py_DECREF(promoter);
return -1;
}
promoter_dtypes[0] = double_DType;
promoter_dtypes[1] = &PyArray_SFloatDType;
- res = add_loop("multiply", promoter_dtypes, promoter);
+ res = sfloat_add_loop("multiply", promoter_dtypes, promoter);
Py_DECREF(promoter);
if (res < 0) {
return -1;
@@ -814,11 +899,11 @@ get_sfloat_dtype(PyObject *NPY_UNUSED(mod), PyObject *NPY_UNUSED(args))
return NULL;
}
- if (init_casts() < 0) {
+ if (sfloat_init_casts() < 0) {
return NULL;
}
- if (init_ufuncs() < 0) {
+ if (sfloat_init_ufuncs() < 0) {
return NULL;
}
diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src
index 1bf459ce6..b427991e5 100644
--- a/numpy/core/src/umath/_umath_tests.c.src
+++ b/numpy/core/src/umath/_umath_tests.c.src
@@ -9,6 +9,9 @@
#include <Python.h>
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#if defined(NPY_INTERNAL_BUILD)
+#undef NPY_INTERNAL_BUILD
+#endif
#include "numpy/arrayobject.h"
#include "numpy/ufuncobject.h"
#include "numpy/npy_math.h"
@@ -19,6 +22,9 @@
#include "npy_cpu_features.h"
#include "npy_cpu_dispatch.h"
#include "numpy/npy_cpu.h"
+#include "npy_import.h"
+#include "numpy/experimental_dtype_api.h"
+
/*
*****************************************************************************
@@ -300,7 +306,7 @@ static void
ptr_this += stride_d;
ptr_that += stride_d;
}
- *(@typ@ *)data_out = npy_@sqrt_func@(out);
+ *(@typ@ *)data_out = @sqrt_func@(out);
data_that += stride_n;
data_out += stride_p;
}
@@ -343,6 +349,50 @@ static void
/**end repeat**/
+static int
+INT32_negative(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions,
+ npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+ npy_intp di = dimensions[0];
+ npy_intp i;
+ npy_intp is=steps[0], os=steps[1];
+ char *ip=args[0], *op=args[1];
+ for (i = 0; i < di; i++, ip += is, op += os) {
+ if (i == 3) {
+ *(int32_t *)op = - 100;
+ } else {
+ *(int32_t *)op = - *(int32_t *)ip;
+ }
+ }
+ return 0;
+}
+
+
+static int
+INT32_negative_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char * const*args, npy_intp const *dimensions,
+ npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ npy_intp is1 = steps[0], isindex = steps[1];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ int32_t *indexed;
+ for(i = 0; i < n; i++, indx += isindex) {
+ indexed = (int32_t *)(ip1 + is1 * *(npy_intp *)indx);
+ if (i == 3) {
+ *indexed = -200;
+ } else {
+ *indexed = - *indexed;
+ }
+ }
+ return 0;
+}
+
+
+
/* The following lines were generated using a slightly modified
version of code_generators/generate_umath.py and adding these
lines to defdict:
@@ -671,6 +721,43 @@ err:
return NULL;
}
+static int
+add_INT32_negative_indexed(PyObject *module, PyObject *dict) {
+ if (import_experimental_dtype_api(__EXPERIMENTAL_DTYPE_API_VERSION) < 0) {
+ return -1;
+ }
+
+ PyObject * negative = PyUFunc_FromFuncAndData(NULL, NULL, NULL, 0, 1, 1,
+ PyUFunc_Zero, "indexed_negative", NULL, 0);
+ if (negative == NULL) {
+ return -1;
+ }
+ PyArray_DTypeMeta *dtypes[] = {&PyArray_Int32DType, &PyArray_Int32DType};
+
+ PyType_Slot slots[] = {
+ {NPY_METH_contiguous_indexed_loop, INT32_negative_indexed},
+ {NPY_METH_strided_loop, INT32_negative},
+ {0, NULL}
+ };
+
+ PyArrayMethod_Spec spec = {
+ .name = "negative_indexed_loop",
+ .nin = 1,
+ .nout = 1,
+ .dtypes = dtypes,
+ .slots = slots,
+ .flags = NPY_METH_NO_FLOATINGPOINT_ERRORS
+ };
+
+ if (PyUFunc_AddLoopFromSpec(negative, &spec) < 0) {
+ Py_DECREF(negative);
+ return -1;
+ }
+ PyDict_SetItemString(dict, "indexed_negative", negative);
+ Py_DECREF(negative);
+ return 0;
+}
+
static PyMethodDef UMath_TestsMethods[] = {
{"test_signature", UMath_Tests_test_signature, METH_VARARGS,
"Test signature parsing of ufunc. \n"
@@ -733,5 +820,13 @@ PyMODINIT_FUNC PyInit__umath_tests(void) {
"cannot load _umath_tests module.");
return NULL;
}
+
+ if (add_INT32_negative_indexed(m, d) < 0) {
+ Py_DECREF(m);
+ PyErr_Print();
+ PyErr_SetString(PyExc_RuntimeError,
+ "cannot load _umath_tests module.");
+ return NULL;
+ }
return m;
}
diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c
index 6d6c481fb..fee77b2d6 100644
--- a/numpy/core/src/umath/dispatching.c
+++ b/numpy/core/src/umath/dispatching.c
@@ -914,8 +914,8 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
int nin = ufunc->nin, nargs = ufunc->nargs;
/*
- * Get the actual DTypes we operate with by mixing the operand array
- * ones with the passed signature.
+ * Get the actual DTypes we operate with by setting op_dtypes[i] from
+ * signature[i].
*/
for (int i = 0; i < nargs; i++) {
if (signature[i] != NULL) {
@@ -1240,3 +1240,40 @@ install_logical_ufunc_promoter(PyObject *ufunc)
return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0);
}
+
+/*
+ * Return the PyArrayMethodObject or PyCapsule that matches a registered
+ * tuple of identical dtypes. Return a borrowed ref of the first match.
+ */
+NPY_NO_EXPORT PyObject *
+get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype,
+ int ndtypes)
+{
+ PyObject *t_dtypes = PyTuple_New(ndtypes);
+ if (t_dtypes == NULL) {
+ return NULL;
+ }
+ for (int i=0; i < ndtypes; i++) {
+ PyTuple_SetItem(t_dtypes, i, (PyObject *)op_dtype);
+ }
+ PyObject *loops = ufunc->_loops;
+ Py_ssize_t length = PyList_Size(loops);
+ for (Py_ssize_t i = 0; i < length; i++) {
+ PyObject *item = PyList_GetItem(loops, i);
+ PyObject *cur_DType_tuple = PyTuple_GetItem(item, 0);
+ int cmp = PyObject_RichCompareBool(cur_DType_tuple,
+ t_dtypes, Py_EQ);
+ if (cmp < 0) {
+ Py_DECREF(t_dtypes);
+ return NULL;
+ }
+ if (cmp == 0) {
+ continue;
+ }
+ /* Got the match */
+ Py_DECREF(t_dtypes);
+ return PyTuple_GetItem(item, 1);
+ }
+ Py_DECREF(t_dtypes);
+ Py_RETURN_NONE;
+}
diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index 5274957f7..b8c1926b2 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -12,6 +12,19 @@
#include <assert.h>
+#include "simd/simd.h"
+
+/*
+ * largest simd vector size in bytes numpy supports
+ * it is currently a extremely large value as it is only used for memory
+ * overlap checks
+ */
+#if NPY_SIMD > 0
+ // Enough for compiler unroll
+ #define AUTOVEC_OVERLAP_SIZE NPY_SIMD_WIDTH*4
+#else
+ #define AUTOVEC_OVERLAP_SIZE 1024
+#endif
/*
* MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc.
* Very large step size can be as slow as processing it using scalar. The
@@ -219,11 +232,11 @@ abs_ptrdiff(char *a, char *b)
/* condition allows compiler to optimize the generic macro */ \
if (IS_BINARY_CONT(tin, tout)) { \
if (abs_ptrdiff(args[2], args[0]) == 0 && \
- abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
+ abs_ptrdiff(args[2], args[1]) >= AUTOVEC_OVERLAP_SIZE) { \
BASE_BINARY_LOOP_INP(tin, tout, op) \
} \
else if (abs_ptrdiff(args[2], args[1]) == 0 && \
- abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
+ abs_ptrdiff(args[2], args[0]) >= AUTOVEC_OVERLAP_SIZE) { \
BASE_BINARY_LOOP_INP(tin, tout, op) \
} \
else { \
@@ -378,19 +391,6 @@ abs_ptrdiff(char *a, char *b)
#undef abs_ptrdiff
-#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \
- (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
- npy_is_aligned(args[1], (esize)) && \
- npy_is_aligned(args[0], (esize)))
-
-#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \
- (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
- npy_is_aligned(args[1], (esize)))
-
-#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \
- (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
- npy_is_aligned(args[0], (esize)))
-
/* align var to alignment */
#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
diff --git a/numpy/core/src/umath/legacy_array_method.c b/numpy/core/src/umath/legacy_array_method.c
index 56ac96598..965a0eb83 100644
--- a/numpy/core/src/umath/legacy_array_method.c
+++ b/numpy/core/src/umath/legacy_array_method.c
@@ -13,10 +13,13 @@
#include "convert_datatype.h"
#include "array_method.h"
+#include "array_coercion.h"
#include "dtype_transfer.h"
#include "legacy_array_method.h"
#include "dtypemeta.h"
+#include "ufunc_object.h"
+
typedef struct {
NpyAuxData base;
@@ -91,7 +94,6 @@ generic_wrapped_legacy_loop(PyArrayMethod_Context *NPY_UNUSED(context),
return 0;
}
-
/*
* Signal that the old type-resolution function must be used to resolve
* the descriptors (mainly/only used for datetimes due to the unit).
@@ -234,6 +236,97 @@ get_wrapped_legacy_ufunc_loop(PyArrayMethod_Context *context,
}
+
+/*
+ * We can shave off a bit of time by just caching the initial and this is
+ * trivial for all internal numeric types. (Wrapped ufuncs never use
+ * byte-swapping.)
+ */
+static int
+copy_cached_initial(
+ PyArrayMethod_Context *context, npy_bool NPY_UNUSED(reduction_is_empty),
+ char *initial)
+{
+ memcpy(initial, context->method->legacy_initial,
+ context->descriptors[0]->elsize);
+ return 1;
+}
+
+
+/*
+ * The default `get_reduction_initial` attempts to look up the identity
+ * from the calling ufunc. This might fail, so we only call it when necessary.
+ *
+ * For internal number dtypes, we can easily cache it, so do so after the
+ * first call by overriding the function with `copy_cache_initial`.
+ * This path is not publicly available. That could be added, and for a
+ * custom initial getter it should be static/compile time data anyway.
+ */
+static int
+get_initial_from_ufunc(
+ PyArrayMethod_Context *context, npy_bool reduction_is_empty,
+ char *initial)
+{
+ if (context->caller == NULL
+ || !PyObject_TypeCheck(context->caller, &PyUFunc_Type)) {
+ /* Impossible in NumPy 1.24; guard in case it becomes possible. */
+ PyErr_SetString(PyExc_ValueError,
+ "getting initial failed because it can only done for legacy "
+ "ufunc loops when the ufunc is provided.");
+ return -1;
+ }
+ npy_bool reorderable;
+ PyObject *identity_obj = PyUFunc_GetDefaultIdentity(
+ (PyUFuncObject *)context->caller, &reorderable);
+ if (identity_obj == NULL) {
+ return -1;
+ }
+ if (identity_obj == Py_None) {
+ /* UFunc has no idenity (should not happen) */
+ Py_DECREF(identity_obj);
+ return 0;
+ }
+ if (PyTypeNum_ISUNSIGNED(context->descriptors[1]->type_num)
+ && PyLong_CheckExact(identity_obj)) {
+ /*
+ * This is a bit of a hack until we have truly loop specific
+ * identities. Python -1 cannot be cast to unsigned so convert
+ * it to a NumPy scalar, but we use -1 for bitwise functions to
+ * signal all 1s.
+ * (A builtin identity would not overflow here, although we may
+ * unnecessary convert 0 and 1.)
+ */
+ Py_SETREF(identity_obj, PyObject_CallFunctionObjArgs(
+ (PyObject *)&PyLongArrType_Type, identity_obj, NULL));
+ if (identity_obj == NULL) {
+ return -1;
+ }
+ }
+ else if (context->descriptors[0]->type_num == NPY_OBJECT
+ && !reduction_is_empty) {
+ /* Allows `sum([object()])` to work, but use 0 when empty. */
+ Py_DECREF(identity_obj);
+ return 0;
+ }
+
+ int res = PyArray_Pack(context->descriptors[0], initial, identity_obj);
+ Py_DECREF(identity_obj);
+ if (res < 0) {
+ return -1;
+ }
+
+ if (PyTypeNum_ISNUMBER(context->descriptors[0]->type_num)) {
+ /* For numbers we can cache to avoid going via Python ints */
+ memcpy(context->method->legacy_initial, initial,
+ context->descriptors[0]->elsize);
+ context->method->get_reduction_initial = &copy_cached_initial;
+ }
+
+ /* Reduction can use the initial value */
+ return 1;
+}
+
+
/*
* Get the unbound ArrayMethod which wraps the instances of the ufunc.
* Note that this function stores the result on the ufunc and then only
@@ -273,6 +366,27 @@ PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc,
flags = _NPY_METH_FORCE_CAST_INPUTS;
}
+ get_reduction_initial_function *get_reduction_intial = NULL;
+ if (ufunc->nin == 2 && ufunc->nout == 1) {
+ npy_bool reorderable = NPY_FALSE;
+ PyObject *identity_obj = PyUFunc_GetDefaultIdentity(
+ ufunc, &reorderable);
+ if (identity_obj == NULL) {
+ return NULL;
+ }
+ /*
+ * TODO: For object, "reorderable" is needed(?), because otherwise
+ * we disable multi-axis reductions `arr.sum(0, 1)`. But for
+ * `arr = array([["a", "b"], ["c", "d"]], dtype="object")`
+ * it isn't actually reorderable (order changes result).
+ */
+ if (reorderable) {
+ flags |= NPY_METH_IS_REORDERABLE;
+ }
+ if (identity_obj != Py_None) {
+ get_reduction_intial = &get_initial_from_ufunc;
+ }
+ }
for (int i = 0; i < ufunc->nin+ufunc->nout; i++) {
if (signature[i]->singleton->flags & (
NPY_ITEM_REFCOUNT | NPY_ITEM_IS_POINTER | NPY_NEEDS_PYAPI)) {
@@ -283,9 +397,10 @@ PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc,
}
}
- PyType_Slot slots[3] = {
- {NPY_METH_get_loop, &get_wrapped_legacy_ufunc_loop},
+ PyType_Slot slots[4] = {
+ {_NPY_METH_get_loop, &get_wrapped_legacy_ufunc_loop},
{NPY_METH_resolve_descriptors, &simple_legacy_resolve_descriptors},
+ {NPY_METH_get_reduction_initial, get_reduction_intial},
{0, NULL},
};
if (any_output_flexible) {
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 0b4856847..97a74b425 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -13,6 +13,7 @@
#include "numpy/npy_math.h"
#include "numpy/halffloat.h"
#include "lowlevel_strided_loops.h"
+#include "loops_utils.h"
#include "npy_pycompat.h"
@@ -31,27 +32,9 @@
*/
#define PW_BLOCKSIZE 128
-
-/*
- * largest simd vector size in bytes numpy supports
- * it is currently a extremely large value as it is only used for memory
- * overlap checks
- */
-#ifndef NPY_MAX_SIMD_SIZE
-#define NPY_MAX_SIMD_SIZE 1024
-#endif
-
/** Provides the various *_LOOP macros */
#include "fast_loop_macros.h"
-/*
- * include vectorized functions and dispatchers
- * this file is safe to include also for generic builds
- * platform specific instructions are either masked via the proprocessor or
- * runtime detected
- */
-#include "simd.inc"
-
/******************************************************************************
** GENERIC FLOAT LOOPS **
*****************************************************************************/
@@ -416,98 +399,6 @@ PyUFunc_On_Om(char **args, npy_intp const *dimensions, npy_intp const *steps, vo
*****************************************************************************
*/
-/**begin repeat
- * #kind = logical_and, logical_or#
- * #OP = &&, ||#
- * #SC = ==, !=#
- * #and = 1, 0#
- **/
-
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if(IS_BINARY_REDUCE) {
-#ifdef NPY_HAVE_SSE2_INTRINSICS
- /*
- * stick with our variant for more reliable performance, only known
- * platform which outperforms it by ~20% is an i7 with glibc 2.17
- */
- if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
- return;
- }
-#else
- /* for now only use libc on 32-bit/non-x86 */
- if (steps[1] == 1) {
- npy_bool * op = (npy_bool *)args[0];
-#if @and@
- /* np.all(), search for a zero (false) */
- if (*op) {
- *op = memchr(args[1], 0, dimensions[0]) == NULL;
- }
-#else
- /*
- * np.any(), search for a non-zero (true) via comparing against
- * zero blocks, memcmp is faster than memchr on SSE4 machines
- * with glibc >= 2.12 and memchr can only check for equal 1
- */
- static const npy_bool zero[4096]; /* zero by C standard */
- npy_uintp i, n = dimensions[0];
-
- for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
- *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
- }
- if (!*op && n - i > 0) {
- *op = memcmp(&args[1][i], zero, n - i) != 0;
- }
-#endif
- return;
- }
-#endif
- else {
- BINARY_REDUCE_LOOP(npy_bool) {
- const npy_bool in2 = *(npy_bool *)ip2;
- io1 = io1 @OP@ in2;
- if (io1 @SC@ 0) {
- break;
- }
- }
- *((npy_bool *)iop1) = io1;
- }
- }
- else {
- if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
- return;
- }
- else {
- BINARY_LOOP {
- const npy_bool in1 = *(npy_bool *)ip1;
- const npy_bool in2 = *(npy_bool *)ip2;
- *((npy_bool *)op1) = in1 @OP@ in2;
- }
- }
- }
-}
-/**end repeat**/
-
-/**begin repeat
- * #kind = absolute, logical_not#
- * #OP = !=, ==#
- **/
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) {
- return;
- }
- else {
- UNARY_LOOP {
- npy_bool in1 = *(npy_bool *)ip1;
- *((npy_bool *)op1) = in1 @OP@ 0;
- }
- }
-}
-/**end repeat**/
-
NPY_NO_EXPORT void
BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -516,24 +407,6 @@ BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps,
}
}
-
-/**begin repeat
- * #kind = isnan, isinf, isfinite#
- * #func = npy_isnan, npy_isinf, npy_isfinite#
- * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
- **/
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- /*
- * The (void)in; suppresses an unused variable warning raised by gcc and allows
- * us to re-use this macro even though we do not depend on in
- */
- UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = @val@);
-}
-
-/**end repeat**/
-
/*
*****************************************************************************
** INTEGER LOOPS
@@ -552,8 +425,11 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
*/
#define @TYPE@_floor_divide @TYPE@_divide
+#define @TYPE@_floor_divide_indexed @TYPE@_divide_indexed
#define @TYPE@_fmax @TYPE@_maximum
+#define @TYPE@_fmax_indexed @TYPE@_maximum_indexed
#define @TYPE@_fmin @TYPE@_minimum
+#define @TYPE@_fmin_indexed @TYPE@_minimum_indexed
NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
@@ -562,155 +438,30 @@ NPY_NO_EXPORT void
*((@type@ *)op1) = 1;
}
}
-
-NPY_NO_EXPORT void
-@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = +in);
-}
-
/**begin repeat1
- * #isa = , _avx2#
- * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)#
- * #ATTR = , NPY_GCC_TARGET_AVX2#
- */
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
-}
-#endif
-
-/**begin repeat2
* Arithmetic
* #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor#
* #OP = +, -, *, &, |, ^#
*/
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (IS_BINARY_REDUCE) {
- BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2);
- }
- else {
- BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
- }
-}
-#endif
-
-/**end repeat2**/
-
-/*
- * Arithmetic bit shift operations.
- *
- * Intel hardware masks bit shift values, so large shifts wrap around
- * and can produce surprising results. The special handling ensures that
- * behavior is independent of compiler or hardware.
- * TODO: We could implement consistent behavior for negative shifts,
- * which is undefined in C.
- */
-
-#define INT_left_shift_needs_clear_floatstatus
-#define UINT_left_shift_needs_clear_floatstatus
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_left_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
- void *NPY_UNUSED(func))
-{
- BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
-
-#ifdef @TYPE@_left_shift_needs_clear_floatstatus
- // For some reason, our macOS CI sets an "invalid" flag here, but only
- // for some types.
- npy_clear_floatstatus_barrier((char*)dimensions);
-#endif
-}
-#endif
-
-#undef INT_left_shift_needs_clear_floatstatus
-#undef UINT_left_shift_needs_clear_floatstatus
-
-#if @CHK@
-NPY_NO_EXPORT
-#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
-NPY_GCC_OPT_3
-#endif
-void
-@TYPE@_right_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
- void *NPY_UNUSED(func))
-{
- BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
-}
-#endif
-
-/**begin repeat2
- * #kind = logical_and, logical_or#
- * #OP = &&, ||#
- */
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- /*
- * gcc vectorization of this is not good (PR60575) but manual integer
- * vectorization is too tedious to be worthwhile
- */
- BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
-}
-#endif
-
-/**end repeat2**/
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
{
- BINARY_LOOP {
- const int t1 = !!*(@type@ *)ip1;
- const int t2 = !!*(@type@ *)ip2;
- *((npy_bool *)op1) = (t1 != t2);
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ @type@ *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+ *indexed = *indexed @OP@ *(@type@ *)value;
}
+ return 0;
}
-#endif
-
/**end repeat1**/
NPY_NO_EXPORT void
@@ -752,23 +503,6 @@ NPY_NO_EXPORT void
*((@type@ *) op1) = out;
}
}
-
-/**begin repeat1
- * #kind = isnan, isinf, isfinite#
- * #func = npy_isnan, npy_isinf, npy_isfinite#
- * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
- **/
-NPY_NO_EXPORT void
-@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- /*
- * The (void)in; suppresses an unused variable warning raised by gcc and allows
- * us to re-use this macro even though we do not depend on in
- */
- UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@);
-}
-/**end repeat1**/
-
/**end repeat**/
/**begin repeat
@@ -776,19 +510,6 @@ NPY_NO_EXPORT void
* #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
* #c = ,,,l,ll#
*/
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in);
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
-}
-
/**begin repeat1
* #kind = gcd, lcm#
**/
@@ -802,7 +523,6 @@ NPY_NO_EXPORT void
}
}
/**end repeat1**/
-
/**end repeat**/
/**begin repeat
@@ -810,19 +530,6 @@ NPY_NO_EXPORT void
* #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
* #c = u,u,u,ul,ull#
*/
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in);
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
-}
-
/**begin repeat1
* #kind = gcd, lcm#
**/
@@ -836,9 +543,50 @@ NPY_NO_EXPORT void
}
}
/**end repeat1**/
+/**end repeat**/
+
+
+/*
+ * NOTE: It may be nice to vectorize these, OTOH, these are still faster
+ * than the cast we used to do.
+ */
+
+/**begin repeat
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
+ * #OP = ==, !=, <, <=, >, >=#
+ */
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_ulonglong in1 = *(npy_ulonglong *)ip1;
+ const npy_longlong in2 = *(npy_longlong *)ip2;
+ if (in2 < 0) {
+ *(npy_bool *)op1 = 0 @OP@ in2;
+ }
+ else {
+ *(npy_bool *)op1 = in1 @OP@ (npy_ulonglong)in2;
+ }
+ }
+}
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_longlong in1 = *(npy_longlong *)ip1;
+ const npy_ulonglong in2 = *(npy_ulonglong *)ip2;
+ if (in1 < 0) {
+ *(npy_bool *)op1 = in1 @OP@ 0;
+ }
+ else {
+ *(npy_bool *)op1 = (npy_ulonglong)in1 @OP@ in2;
+ }
+ }
+}
/**end repeat**/
+
/*
*****************************************************************************
** DATETIME LOOPS **
@@ -915,12 +663,6 @@ NPY_NO_EXPORT void
}
NPY_NO_EXPORT void
-@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = NPY_FALSE);
-}
-
-NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
OUTPUT_LOOP {
@@ -979,6 +721,7 @@ NPY_NO_EXPORT void
}
}
}
+
/**end repeat1**/
/**begin repeat1
@@ -1398,6 +1141,8 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
* #c = f, , l#
* #C = F, , L#
+ * #fd = 1, 1, 0#
+ * #VCHK = 1, 1, 0#
*/
/**begin repeat1
* #kind = logical_and, logical_or#
@@ -1434,32 +1179,22 @@ NPY_NO_EXPORT void
}
}
+#if !@fd@
/**begin repeat1
* #kind = isnan, isinf, isfinite, signbit#
* #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit#
**/
-
-/**begin repeat2
- * #ISA = , _avx512_skx#
- * #isa = simd, avx512_skx#
- * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX512_SKX)#
- **/
-
-#if @CHK@
NPY_NO_EXPORT void
-@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- if (!run_@kind@_@isa@_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *((npy_bool *)op1) = @func@(in1) != 0;
- }
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((npy_bool *)op1) = @func@(in1) != 0;
}
npy_clear_floatstatus_barrier((char*)dimensions);
}
-#endif
-/**end repeat2**/
/**end repeat1**/
+#endif
NPY_NO_EXPORT void
@TYPE@_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
@@ -1500,6 +1235,25 @@ NPY_NO_EXPORT void
}
}
+NPY_NO_EXPORT int
+@TYPE@_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ @type@ *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+ *indexed = npy_floor_divide@c@(*indexed, *(@type@ *)value);
+ }
+ return 0;
+}
+
NPY_NO_EXPORT void
@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -1634,6 +1388,26 @@ LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps
}
}
}
+
+NPY_NO_EXPORT int
+LONGDOUBLE_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ npy_longdouble *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (npy_longdouble *)(ip1 + is1 * *(npy_intp *)indx);
+ *indexed = *indexed @OP@ *(npy_longdouble *)value;
+ }
+ return 0;
+}
+
/**end repeat**/
/**begin repeat
@@ -1739,6 +1513,26 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
}
}
}
+
+NPY_NO_EXPORT int
+HALF_@kind@_indexed(void *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ npy_half *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx);
+ const float v = npy_half_to_float(*(npy_half *)value);
+ *indexed = npy_float_to_half(npy_half_to_float(*indexed) @OP@ v);
+ }
+ return 0;
+}
/**end repeat**/
#define _HALF_LOGICAL_AND(a,b) (!npy_half_iszero(a) && !npy_half_iszero(b))
@@ -1840,6 +1634,27 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
}
/* npy_half_isnan will never set floatstatus_invalid, so do not clear */
}
+
+NPY_NO_EXPORT int
+HALF_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ npy_half *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx);
+ npy_half v = *(npy_half *)value;
+ *indexed = (@OP@(*indexed, v) || npy_half_isnan(*indexed)) ? *indexed : v;
+ }
+ return 0;
+}
+
/**end repeat**/
/**begin repeat
@@ -1855,8 +1670,29 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
const npy_half in2 = *(npy_half *)ip2;
*((npy_half *)op1) = (@OP@(in1, in2) || npy_half_isnan(in2)) ? in1 : in2;
}
- /* npy_half_isnan will never set floatstatus_invalid, so do not clear */
+ /* no need to clear floatstatus_invalid */
}
+
+NPY_NO_EXPORT int
+HALF_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ char *ip1 = args[0]; /* base of the in-place (indexed) operand */
+ char *indx = args[1]; /* npy_intp indices into ip1 */
+ char *value = args[2]; /* second-operand values */
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ npy_half *indexed;
+ for (i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx);
+ npy_half v = *(npy_half *)value;
+ *indexed = (@OP@(*indexed, v) || npy_half_isnan(v)) ? *indexed: v; /* fmax/fmin: keep *indexed if it wins or if v is NaN (NaN ignored) */
+ }
+ return 0;
+}
+
/**end repeat**/
NPY_NO_EXPORT void
@@ -1875,6 +1711,27 @@ HALF_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps
}
}
+NPY_NO_EXPORT int
+HALF_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ char *ip1 = args[0]; /* base of the in-place (indexed) operand */
+ char *indx = args[1]; /* npy_intp indices into ip1 */
+ char *value = args[2]; /* divisor values */
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ npy_half *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx);
+ float v = npy_half_to_float(*(npy_half *)value); /* compute in float32: no native half arithmetic */
+ float div = npy_floor_dividef(npy_half_to_float(*indexed), v);
+ *indexed = npy_float_to_half(div);
+ }
+ return 0;
+}
+
NPY_NO_EXPORT void
HALF_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -1934,12 +1791,6 @@ HALF_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, v
}
}
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-HALF_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu);
-}
-
NPY_NO_EXPORT void
HALF_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -2091,6 +1942,26 @@ NPY_NO_EXPORT void
}
}
}
+
+NPY_NO_EXPORT int @TYPE@_@kind@_indexed
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+ char *ip1 = args[0]; /* base of the in-place (indexed) complex operand */
+ char *indx = args[1]; /* npy_intp indices into ip1 */
+ char *value = args[2]; /* second-operand complex values */
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ @ftype@ *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (@ftype@ *)(ip1 + is1 * *(npy_intp *)indx);
+ const @ftype@ b_r = ((@ftype@ *)value)[0]; /* real part */
+ const @ftype@ b_i = ((@ftype@ *)value)[1]; /* imaginary part */
+ indexed[0] @OP@= b_r; /* add/subtract act component-wise on complex */
+ indexed[1] @OP@= b_i;
+ }
+ return 0;
+}
/**end repeat1**/
NPY_NO_EXPORT void
@@ -2105,6 +1976,28 @@ NPY_NO_EXPORT void
((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
}
}
+
+NPY_NO_EXPORT int @TYPE@_multiply_indexed
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+ char *ip1 = args[0]; /* base of the in-place (indexed) complex operand */
+ char *indx = args[1]; /* npy_intp indices into ip1 */
+ char *value = args[2]; /* second-operand complex values */
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ @ftype@ *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (@ftype@ *)(ip1 + is1 * *(npy_intp *)indx);
+ const @ftype@ a_r = indexed[0]; /* load both factors before overwriting indexed[] */
+ const @ftype@ a_i = indexed[1];
+ const @ftype@ b_r = ((@ftype@ *)value)[0];
+ const @ftype@ b_i = ((@ftype@ *)value)[1];
+ indexed[0] = a_r*b_r - a_i*b_i; /* (a_r + i a_i)(b_r + i b_i) */
+ indexed[1] = a_r*b_i + a_i*b_r;
+ }
+ return 0;
+}
#endif // !SIMD
NPY_NO_EXPORT void
@@ -2216,6 +2109,8 @@ NPY_NO_EXPORT void
}
/**end repeat1**/
+#if !@SIMD@
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
NPY_NO_EXPORT void
@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -2226,6 +2121,7 @@ NPY_NO_EXPORT void
((@ftype@ *)op1)[1] = in1r*in1i + in1i*in1r;
}
}
+#endif
NPY_NO_EXPORT void
@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
@@ -2256,6 +2152,8 @@ NPY_NO_EXPORT void
}
}
+#if !@SIMD@
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
NPY_NO_EXPORT void
@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
UNARY_LOOP {
@@ -2275,20 +2173,6 @@ NPY_NO_EXPORT void
*((@ftype@ *)op1) = npy_hypot@c@(in1r, in1i);
}
}
-
-#if @SIMD@ && defined(HAVE_ATTRIBUTE_TARGET_AVX512F)
-/**begin repeat1
- * arithmetic
- * #kind = conjugate, square, absolute#
- */
-NPY_NO_EXPORT void
-@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
-{
- if (!run_unary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
- @TYPE@_@kind@(args, dimensions, steps, func);
- }
-}
-/**end repeat1**/
#endif
NPY_NO_EXPORT void
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index e3a410968..cce73aff8 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -10,24 +10,32 @@
#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
#endif
-#define BOOL_invert BOOL_logical_not
-#define BOOL_add BOOL_logical_or
-#define BOOL_bitwise_and BOOL_logical_and
-#define BOOL_bitwise_or BOOL_logical_or
-#define BOOL_logical_xor BOOL_not_equal
-#define BOOL_bitwise_xor BOOL_logical_xor
-#define BOOL_multiply BOOL_logical_and
-#define BOOL_maximum BOOL_logical_or
-#define BOOL_minimum BOOL_logical_and
-#define BOOL_fmax BOOL_maximum
-#define BOOL_fmin BOOL_minimum
-
/*
*****************************************************************************
** BOOLEAN LOOPS **
*****************************************************************************
*/
+/*
+ * Following functions are defined by umath generator
+ * to enable runtime dispatching without the need
+ * to redefine them within dispatch-able sources.
+ */
+// #define BOOL_invert BOOL_logical_not
+// #define BOOL_add BOOL_logical_or
+// #define BOOL_bitwise_and BOOL_logical_and
+// #define BOOL_bitwise_or BOOL_logical_or
+// #define BOOL_logical_xor BOOL_not_equal
+// #define BOOL_bitwise_xor BOOL_logical_xor
+// #define BOOL_multiply BOOL_logical_and
+// #define BOOL_maximum BOOL_logical_or
+// #define BOOL_minimum BOOL_logical_and
+// #define BOOL_fmax BOOL_maximum
+// #define BOOL_fmin BOOL_minimum
+
+typedef struct PyArrayMethod_Context_tag PyArrayMethod_Context;
+typedef struct NpyAuxData_tag NpyAuxData;
+
#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_comparison.dispatch.h"
#endif
@@ -39,11 +47,15 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
/**end repeat**/
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_logical.dispatch.h"
+#endif
+
/**begin repeat
- * #kind = logical_and, logical_or, absolute, logical_not#
- **/
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+ * #kind = logical_and, logical_or, logical_not, absolute#
+ */
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
/**end repeat**/
NPY_NO_EXPORT void
@@ -56,6 +68,17 @@ NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_autovec.dispatch.h"
+#endif
+/**begin repeat
+ * #kind = isnan, isinf, isfinite#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat**/
+
/*
*****************************************************************************
** INTEGER LOOPS
@@ -72,6 +95,11 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+@TYPE@_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
/**end repeat**/
#ifndef NPY_DISABLE_OPTIMIZATION
@@ -106,19 +134,40 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
/**end repeat1**/
/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_autovec.dispatch.h"
+#endif
/**begin repeat
- * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ BYTE, SHORT, INT, LONG, LONGLONG#
*/
+/**begin repeat1
+ * #kind = invert, logical_not, conjugate, reciprocal, square, add,
+ * subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
+ * left_shift, right_shift, logical_and, logical_or,
+ * logical_xor, isnan, isinf, isfinite,
+ * absolute, sign#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
+/**begin repeat
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ */
/**begin repeat1
* both signed and unsigned integer types
* #s = , u#
* #S = , U#
*/
-
#define @S@@TYPE@_floor_divide @S@@TYPE@_divide
+#define @S@@TYPE@_floor_divide_indexed @S@@TYPE@_divide_indexed
#define @S@@TYPE@_fmax @S@@TYPE@_maximum
#define @S@@TYPE@_fmin @S@@TYPE@_minimum
+#define @S@@TYPE@_fmax_indexed @S@@TYPE@_maximum_indexed
+#define @S@@TYPE@_fmin_indexed @S@@TYPE@_minimum_indexed
NPY_NO_EXPORT void
@S@@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
@@ -127,83 +176,53 @@ NPY_NO_EXPORT void
@S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**begin repeat2
- * #isa = , _avx2#
- */
-
-NPY_NO_EXPORT void
-@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**begin repeat3
* Arithmetic
* #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
* left_shift, right_shift#
* #OP = +, -,*, &, |, ^, <<, >>#
*/
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**end repeat3**/
-
-/**begin repeat3
- * #kind = logical_and, logical_or#
- * #OP = &&, ||#
- */
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT int
+@S@@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+ npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
-/**end repeat3**/
-
-NPY_NO_EXPORT void
-@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat2**/
/**begin repeat2
* #kind = maximum, minimum#
- * #OP = >, <#
**/
NPY_NO_EXPORT void
@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-/**end repeat2**/
-NPY_NO_EXPORT void
-@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT int
+@S@@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
-NPY_NO_EXPORT void
-@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
NPY_NO_EXPORT void
-@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
@S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
@S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**begin repeat2
- * #kind = isnan, isinf, isfinite#
- **/
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat2**/
/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
+ * #OP = ==, !=, <, <=, >, >=#
+ */
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat**/
-
+
#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_unary.dispatch.h"
#endif
@@ -240,6 +259,20 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
/**end repeat**/
#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_unary_fp_le.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ */
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_unary.dispatch.h"
#endif
/**begin repeat
@@ -262,10 +295,13 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
/**begin repeat1
* Arithmetic
* # kind = add, subtract, multiply, divide#
- * # OP = +, -, *, /#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
/**end repeat1**/
/**end repeat**/
@@ -311,15 +347,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_@func@,
/**end repeat**/
/**begin repeat
- * #func = sin, cos#
- */
-
-NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@,
- (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
-
-/**end repeat**/
-
-/**begin repeat
* #TYPE = FLOAT, DOUBLE#
*/
/**begin repeat1
@@ -332,27 +359,20 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
/**end repeat1**/
/**end repeat**/
-/**begin repeat
- * #TYPE = FLOAT, DOUBLE#
- */
-/**begin repeat1
- * #func = maximum, minimum#
- */
-NPY_NO_EXPORT void
-@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**end repeat1**/
-/**end repeat**/
-
#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_trigonometric.dispatch.h"
#endif
+
/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ */
+/**begin repeat1
* #func = sin, cos#
*/
-NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, (
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, (
char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
))
+/**end repeat1**/
/**end repeat**/
#ifndef NPY_DISABLE_OPTIMIZATION
@@ -391,6 +411,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
* #c = f, f, , l#
* #C = F, F, , L#
* #half = 1, 0, 0, 0#
+ * #fd = 0, 1, 1, 0#
*/
/**begin repeat1
@@ -400,6 +421,11 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
*/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat1**/
/**end repeat1**/
/**begin repeat1
@@ -419,35 +445,44 @@ NPY_NO_EXPORT void
/**begin repeat1
* #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing#
* #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing#
+ * #dispatched = 1, 1, 1, 1, 0, 0, 0#
**/
-/**begin repeat2
- * #ISA = , _avx512_skx#
- **/
+#if !@fd@ || !@dispatched@
NPY_NO_EXPORT void
-@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
/**end repeat2**/
/**end repeat1**/
/**begin repeat1
* #kind = maximum, minimum#
- * #OP = >=, <=#
**/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
/**end repeat1**/
/**begin repeat1
* #kind = fmax, fmin#
- * #OP = >=, <=#
**/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
/**end repeat1**/
NPY_NO_EXPORT void
@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT int
+@TYPE@_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
NPY_NO_EXPORT void
@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -504,6 +539,19 @@ NPY_NO_EXPORT void
/**end repeat1**/
/**end repeat**/
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_autovec.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = HALF#
+ */
+/**begin repeat1
+ * #kind = absolute#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
/*
*****************************************************************************
** COMPLEX LOOPS **
@@ -516,7 +564,21 @@ NPY_NO_EXPORT void
* #TYPE = CFLOAT, CDOUBLE#
*/
/**begin repeat1
- * #kind = add, subtract, multiply#
+ * #kind = add, subtract, multiply, conjugate, square#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_unary_complex.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = CFLOAT, CDOUBLE#
+ */
+/**begin repeat1
+ * #kind = absolute#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
@@ -543,6 +605,9 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
*/
NPY_NO_EXPORT void
C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+C@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
/**end repeat1**/
NPY_NO_EXPORT void
@@ -585,19 +650,14 @@ C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *step
NPY_NO_EXPORT void
C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-/**begin repeat1
- * #isa = , _avx512f#
- */
-
NPY_NO_EXPORT void
-C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+C@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
-C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+C@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
-C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
-/**end repeat1**/
+C@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
NPY_NO_EXPORT void
C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -658,9 +718,6 @@ NPY_NO_EXPORT void
NPY_NO_EXPORT void
@TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-NPY_NO_EXPORT void
-@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
#define @TYPE@_isnan @TYPE@_isnat
NPY_NO_EXPORT void
@@ -736,6 +793,21 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
#define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide
/* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_autovec.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = TIMEDELTA, DATETIME#
+ */
+/**begin repeat1
+ * #kind = isinf#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
+
/*
*****************************************************************************
** OBJECT LOOPS **
diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
index f637ecfe4..3ab5a968d 100644
--- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -1,6 +1,8 @@
/*@targets
** $maxopt baseline
- ** sse2 avx2 avx512f
+ ** sse2 (avx2 fma3)
+ ** neon asimd
+ ** vsx2 vsx3
** vx vxe
**/
#define _UMATHMODULE
@@ -14,707 +16,392 @@
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"
-// TODO: replace raw SIMD with NPYV
+/**
+ * TODO:
+ * - Improve the implementation of SIMD complex absolute,
+ * current one kinda slow and it can be optimized by
+ * at least avoiding the division and keep sqrt.
+ * - Vectorize reductions
+ * - Add support for ASIMD/VCMLA through universal intrinsics.
+ */
+
//###############################################################################
//## Real Single/Double precision
//###############################################################################
/********************************************************************************
- ** Defining the SIMD kernels
+ ** Defining ufunc inner functions
********************************************************************************/
-#ifdef NPY_HAVE_SSE2
+
+/*
+ * clang has a bug that's present at -O1 or greater. When partially loading a
+ * vector register for a divide operation, the remaining elements are set
+ * to 1 to avoid divide-by-zero. The partial load is paired with a partial
+ * store after the divide operation. clang notices that the entire register
+ * is not needed for the store and optimizes out the fill of 1 to the remaining
+ * elements. This causes either a divide-by-zero or 0/0 with invalid exception
+ * that we were trying to avoid by filling.
+ *
+ * Using a dummy variable marked 'volatile' convinces clang not to ignore
+ * the explicit fill of remaining elements. If `-ftrapping-math` is
+ * supported, then it'll also avoid the bug. `-ftrapping-math` is supported
+ * on Apple clang v12+ for x86_64. It is not currently supported for arm64.
+ * `-ftrapping-math` is set by default for NumPy builds in
+ * numpy/distutils/ccompiler.py.
+ *
+ * Note: Apple clang and clang upstream have different versions that overlap
+ */
+#if defined(__clang__)
+ #if defined(__apple_build_version__)
+ // Apple Clang
+ #if __apple_build_version__ < 12000000
+ // Apple Clang before v12
+ #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+ #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
+ // Apple Clang after v12, targeting i386 or x86_64
+ #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
+ #else
+ // Apple Clang after v12, not targeting i386 or x86_64
+ #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+ #endif
+ #else
+ // Clang, not Apple Clang
+ #if __clang_major__ < 10
+ // Clang before v10
+ #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+ #elif defined(_MSC_VER)
+ // clang-cl has the same bug
+ #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+ #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
+ // Clang v10+, targeting i386 or x86_64
+ #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
+ #else
+ // Clang v10+, not targeting i386 or x86_64
+ #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+ #endif
+ #endif
+#else
+// Not a Clang compiler
+#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
+#endif
+
/**begin repeat
+ * Float types
* #type = npy_float, npy_double#
* #TYPE = FLOAT, DOUBLE#
- * #scalarf = npy_sqrtf, npy_sqrt#
+ * #sfx = f32, f64#
* #c = f, #
- * #vtype = __m128, __m128d#
- * #vtype256 = __m256, __m256d#
- * #vtype512 = __m512, __m512d#
- * #vpre = _mm, _mm#
- * #vpre256 = _mm256, _mm256#
- * #vpre512 = _mm512, _mm512#
- * #vsuf = ps, pd#
- * #vsufs = ss, sd#
- * #nan = NPY_NANF, NPY_NAN#
- * #double = 0, 1#
- * #cast = _mm_castps_si128, _mm_castpd_si128#
+ * #C = F, #
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
*/
/**begin repeat1
-* Arithmetic
-* # kind = add, subtract, multiply, divide#
-* # OP = +, -, *, /#
-* # VOP = add, sub, mul, div#
-*/
-static void
-sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+ * Arithmetic
+ * # kind = add, subtract, multiply, divide#
+ * # intrin = add, sub, mul, div#
+ * # OP = +, -, *, /#
+ * # PW = 1, 0, 0, 0#
+ * # is_div = 0*3, 1#
+ * # is_mul = 0*2, 1, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
-#ifdef NPY_HAVE_AVX512F
- const npy_intp vector_size_bytes = 64;
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[i];
- /* lots of specializations, to squeeze out max performance */
- if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
- @vpre512@_store_@vsuf@(&op[i], c);
+ npy_intp len = dimensions[0];
+ char *src0 = args[0], *src1 = args[1], *dst = args[2];
+ npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
+ // reduce
+ if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
+ #if @PW@
+ *((@type@*)src0) @OP@= @TYPE@_pairwise_sum(src1, len, ssrc1);
+ #else
+ @type@ acc = *((@type@*)src0);
+ if (ssrc1 == sizeof(@type@)) {
+ for (; len > 0; --len, src1 += sizeof(@type@)) {
+ acc @OP@= *(@type@ *)src1;
}
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
+ } else {
+ for (; len > 0; --len, src1 += ssrc1) {
+ acc @OP@= *(@type@ *)src1;
}
}
+ *((@type@*)src0) = acc;
+ #endif
+ return;
}
- else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
- @vpre512@_store_@vsuf@(&op[i], c);
+#if @VECTOR@
+ if (len > npyv_nlanes_@sfx@*2 &&
+ !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
+ !is_mem_overlap(src1, ssrc1, dst, sdst, len)
+ ) {
+ const int vstep = npyv_nlanes_u8;
+ const int wstep = vstep * 2;
+ const int hstep = npyv_nlanes_@sfx@;
+ const int lstep = hstep * 2;
+ // lots of specializations, to squeeze out max performance
+ if (ssrc0 == sizeof(@type@) && ssrc0 == ssrc1 && ssrc0 == sdst) {
+ for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0);
+ npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep));
+ npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1);
+ npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep));
+ npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b0);
+ npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b1);
+ npyv_store_@sfx@((@type@*)dst, r0);
+ npyv_store_@sfx@((@type@*)(dst + vstep), r1);
}
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
+ #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ const int vstop = hstep - 1;
+ #else
+ const int vstop = 0;
+ #endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ for (; len > vstop; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
+ #if @is_div@
+ npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
+ npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
+ #else
+ npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len);
+ npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len);
+ #endif
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+ npyv_store_till_@sfx@((@type@*)dst, len, r);
}
- }
- }
-#elif defined NPY_HAVE_AVX2
- const npy_intp vector_size_bytes = 32;
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[i];
- /* lots of specializations, to squeeze out max performance */
- if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
- npy_is_aligned(&ip2[i], vector_size_bytes)) {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
- @vpre256@_store_@vsuf@(&op[i], c);
+ #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ // last partial iteration for divide and working around clang partial load bug
+ if(len > 0){
+ npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
+ volatile npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+ npyv_store_till_@sfx@((@type@*)dst, len, r);
}
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
+ #endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ }
+ else if (ssrc0 == 0 && ssrc1 == sizeof(@type@) && sdst == ssrc1) {
+ npyv_@sfx@ a = npyv_setall_@sfx@(*((@type@*)src0));
+ for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
+ npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1);
+ npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep));
+ npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a, b0);
+ npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a, b1);
+ npyv_store_@sfx@((@type@*)dst, r0);
+ npyv_store_@sfx@((@type@*)(dst + vstep), r1);
}
- }
- }
- else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
+ #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ const int vstop = hstep - 1;
+ #else
+ const int vstop = 0;
+ #endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ for (; len > vstop; len -= hstep, src1 += vstep, dst += vstep) {
+ #if @is_div@ || @is_mul@
+ npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
+ #else
+ npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len);
+ #endif
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+ npyv_store_till_@sfx@((@type@*)dst, len, r);
}
- }
- }
-#else
- const npy_intp vector_size_bytes = 16;
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[i];
- /* lots of specializations, to squeeze out max performance */
- if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
- npy_is_aligned(&ip2[i], vector_size_bytes)) {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
+ #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ // last partial iteration for multiply / divide and working around clang partial load bug
+ if(len > 0){
+ volatile npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+ npyv_store_till_@sfx@((@type@*)dst, len, r);
}
- }
- }
- else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
- @vpre@_store_@vsuf@(&op[i], c);
+ #endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ }
+ else if (ssrc1 == 0 && ssrc0 == sizeof(@type@) && sdst == ssrc0) {
+ npyv_@sfx@ b = npyv_setall_@sfx@(*((@type@*)src1));
+ for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0);
+ npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep));
+ npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b);
+ npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b);
+ npyv_store_@sfx@((@type@*)dst, r0);
+ npyv_store_@sfx@((@type@*)(dst + vstep), r1);
}
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
+ for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+ #if @is_div@ || @is_mul@
+ npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
+ #else
+ npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len);
+ #endif
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+ npyv_store_till_@sfx@((@type@*)dst, len, r);
}
+ } else {
+ goto loop_scalar;
}
+ npyv_cleanup();
+ return;
}
+loop_scalar:
#endif
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[i];
- }
-}
-
-static void
-sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
-#ifdef NPY_HAVE_AVX512F
- const npy_intp vector_size_bytes = 64;
- const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[0] @OP@ ip2[i];
- if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
-
-
-#elif defined NPY_HAVE_AVX2
- const npy_intp vector_size_bytes = 32;
- const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[0] @OP@ ip2[i];
- if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
-#else
- const npy_intp vector_size_bytes = 16;
- const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[0] @OP@ ip2[i];
- if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
-#endif
- LOOP_BLOCKED_END {
- op[i] = ip1[0] @OP@ ip2[i];
+ for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
+ const @type@ a = *((@type@*)src0);
+ const @type@ b = *((@type@*)src1);
+ *((@type@*)dst) = a @OP@ b;
}
}
-static void
-sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
-#ifdef NPY_HAVE_AVX512F
- const npy_intp vector_size_bytes = 64;
- const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[0];
- if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
-
-#elif defined NPY_HAVE_AVX2
- const npy_intp vector_size_bytes = 32;
- const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[0];
- if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
-#else
- const npy_intp vector_size_bytes = 16;
- const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[0];
- if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
-#endif
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[0];
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ @type@ *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+ *indexed = *indexed @OP@ *(@type@ *)value;
}
+ return 0;
}
/**end repeat1**/
/**end repeat**/
-#else // NPY_HAVE_SSE2
+#undef WORKAROUND_CLANG_PARTIAL_LOAD_BUG
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #sfx = f32, f64#
- * #CHK = _F32, _F64#
- */
-#if NPY_SIMD@CHK@
-/**begin repeat1
-* Arithmetic
-* # kind = add, subtract, multiply, divide#
-* # OP = +, -, *, /#
-* # VOP = add, sub, mul, div#
-*/
-
-static void
-simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
- LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
- op[i] = ip1[i] @OP@ ip2[i];
- }
- /* lots of specializations, to squeeze out max performance */
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
- npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a);
- npyv_store_@sfx@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
- npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]);
- npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b);
- npyv_store_@sfx@(&op[i], c);
- }
- }
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[i];
- }
-}
+//###############################################################################
+//## Complex Single/Double precision
+//###############################################################################
-static void
-simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
- const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
- op[i] = ip1[0] @OP@ ip2[i];
- }
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]);
- npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
- npyv_store_@sfx@(&op[i], v3);
- }
- LOOP_BLOCKED_END {
- op[i] = ip1[0] @OP@ ip2[i];
- }
-}
+/********************************************************************************
+ ** op intrinics
+ ********************************************************************************/
-static void
-simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_f32x2 simd_set2_f32(const float *a)
{
- const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
- op[i] = ip1[i] @OP@ ip2[0];
- }
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
- npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
- npyv_store_@sfx@(&op[i], v3);
- }
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[0];
- }
+ npyv_f32 fill = npyv_reinterpret_f32_u64(npyv_setall_u64(*(npy_uint64*)a));
+ npyv_f32x2 r;
+ r.val[0] = fill;
+ r.val[1] = fill;
+ return r;
}
-/**end repeat1**/
-#endif /* NPY_SIMD@CHK@ */
-/**end repeat**/
-#endif // NPY_HAVE_SSE2
-/**begin repeat
- * Float types
- * #type = npy_float, npy_double, npy_longdouble#
- * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- * #vector = 1, 1, 0#
- * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64, 0 #
- */
-/**begin repeat1
- * Arithmetic
- * # kind = add, subtract, multiply, divide#
- */
-static inline int
-run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+NPY_FINLINE npyv_f32
+simd_cconjugate_f32(npyv_f32 x)
{
-#if @vector@ && defined NPY_HAVE_SSE2
- @type@ * ip1 = (@type@ *)args[0];
- @type@ * ip2 = (@type@ *)args[1];
- @type@ * op = (@type@ *)args[2];
- npy_intp n = dimensions[0];
-#if defined NPY_HAVE_AVX512F
- const npy_uintp vector_size_bytes = 64;
-#elif defined NPY_HAVE_AVX2
- const npy_uintp vector_size_bytes = 32;
+#if NPY_SIMD_BIGENDIAN
+ const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x80000000));
#else
- const npy_uintp vector_size_bytes = 32;
-#endif
- /* argument one scalar */
- if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) {
- sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- /* argument two scalar */
- else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) {
- sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) {
- sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
-#elif @VECTOR@
- @type@ * ip1 = (@type@ *)args[0];
- @type@ * ip2 = (@type@ *)args[1];
- @type@ * op = (@type@ *)args[2];
- npy_intp n = dimensions[0];
- /* argument one scalar */
- if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- /* argument two scalar */
- else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
+ const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x8000000000000000ULL));
#endif
- return 0;
+ return npyv_xor_f32(x, mask);
}
-/**end repeat1**/
-/**end repeat**/
-/********************************************************************************
- ** Defining ufunc inner functions
- ********************************************************************************/
-/**begin repeat
- * Float types
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #c = f, #
- * #C = F, #
- */
-/**begin repeat1
- * Arithmetic
- * # kind = add, subtract, multiply, divide#
- * # OP = +, -, *, /#
- * # PW = 1, 0, 0, 0#
- */
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
-(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+NPY_FINLINE npyv_f32
+simd_cmul_f32(npyv_f32 a, npyv_f32 b)
{
- if (IS_BINARY_REDUCE) {
-#if @PW@
- @type@ * iop1 = (@type@ *)args[0];
- npy_intp n = dimensions[0];
-
- *iop1 @OP@= @TYPE@_pairwise_sum(args[1], n, steps[1]);
-#else
- BINARY_REDUCE_LOOP(@type@) {
- io1 @OP@= *(@type@ *)ip2;
- }
- *((@type@ *)iop1) = io1;
-#endif
- }
- else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
- BINARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- const @type@ in2 = *(@type@ *)ip2;
- *((@type@ *)op1) = in1 @OP@ in2;
- }
- }
+ npyv_f32 b_rev = npyv_permi128_f32(b, 1, 0, 3, 2);
+ npyv_f32 a_re = npyv_permi128_f32(a, 0, 0, 2, 2);
+ npyv_f32 a_im = npyv_permi128_f32(a, 1, 1, 3, 3);
+ // a_im * b_im, a_im * b_re
+ npyv_f32 ab_iiir = npyv_mul_f32(a_im, b_rev);
+ return npyv_muladdsub_f32(a_re, b, ab_iiir);
}
-/**end repeat1**/
-/**end repeat**/
-//###############################################################################
-//## Complex Single/Double precision
-//###############################################################################
-/********************************************************************************
- ** Defining the SIMD kernels
- ********************************************************************************/
-#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512F)
- /**
- * For somehow MSVC commit aggressive optimization lead
- * to raises 'RuntimeWarning: invalid value encountered in multiply'
- *
- * the issue mainly caused by '_mm512_maskz_loadu_ps', we need to
- * investigate about it while moving to NPYV.
- */
- #define AVX512F_NOMSVC
+NPY_FINLINE npyv_f32
+simd_csquare_f32(npyv_f32 x) // complex square of packed single-precision (re, im) pairs
+{ return simd_cmul_f32(x, x); } // defined as the complex product x*x via simd_cmul_f32
#endif
-#ifdef AVX512F_NOMSVC
-NPY_FINLINE __mmask16
-avx512_get_full_load_mask_ps(void)
-{
- return 0xFFFF;
-}
-
-NPY_FINLINE __mmask8
-avx512_get_full_load_mask_pd(void)
-{
- return 0xFF;
-}
-NPY_FINLINE __m512
-avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
-{
- return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
-}
+#if NPY_SIMD_F64
-NPY_FINLINE __m512d
-avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
+NPY_FINLINE npyv_f64x2 simd_set2_f64(const double *a)
{
- return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
+ npyv_f64 r = npyv_setall_f64(a[0]);
+ npyv_f64 i = npyv_setall_f64(a[1]);
+ return npyv_zip_f64(r, i);
}
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
+NPY_FINLINE npyv_f64
+simd_cconjugate_f64(npyv_f64 x)
{
- return (0x0001 << num_elem) - 0x0001;
+ const npyv_f64 mask = npyv_reinterpret_f64_u64(npyv_set_u64(
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL
+ ));
+ return npyv_xor_f64(x, mask);
}
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
-{
- return (0x01 << num_elem) - 0x01;
-}
-/**begin repeat
- * #vsub = ps, pd#
- * #type= npy_float, npy_double#
- * #epi_vsub = epi32, epi64#
- * #vtype = __m512, __m512d#
- * #mask = __mmask16, __mmask8#
- * #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
- * #neg_mask = 0x80000000, 0x8000000000000000#
- * #perm_ = 0xb1, 0x55#
- * #cmpx_img_mask = 0xAAAA, 0xAA#
- * #cmpx_re_mask = 0x5555, 0x55#
- * #INF = NPY_INFINITYF, NPY_INFINITY#
- * #NAN = NPY_NANF, NPY_NAN#
- */
-NPY_FINLINE @vtype@
-avx512_hadd_@vsub@(const @vtype@ x)
+NPY_FINLINE npyv_f64
+simd_cmul_f64(npyv_f64 a, npyv_f64 b)
{
- return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
+ npyv_f64 b_rev = npyv_permi128_f64(b, 1, 0);
+ npyv_f64 a_re = npyv_permi128_f64(a, 0, 0);
+ npyv_f64 a_im = npyv_permi128_f64(a, 1, 1);
+ // a_im * b_im, a_im * b_re
+ npyv_f64 ab_iiir = npyv_mul_f64(a_im, b_rev);
+ return npyv_muladdsub_f64(a_re, b, ab_iiir);
}
-NPY_FINLINE @vtype@
-avx512_hsub_@vsub@(const @vtype@ x)
-{
- return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
-}
-NPY_FINLINE @vtype@
-avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
-{
- // x1 = r1, i1
- // x2 = r2, i2
- @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2
- @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2
- @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1
- @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2
- @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2
- return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
-}
-/**end repeat**/
+NPY_FINLINE npyv_f64
+simd_csquare_f64(npyv_f64 x) // complex square of packed double-precision (re, im) pairs
+{ return simd_cmul_f64(x, x); } // defined as the complex product x*x via simd_cmul_f64
#endif
/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
* #type = npy_float, npy_double#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #epi_vsub = epi32, epi64#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #storemask = 0xFF, 0xF#
- * #IS_FLOAT = 1, 0#
- */
-/**begin repeat1
- * #func = add, subtract, multiply#
- * #vectorf = _mm512_add, _mm512_sub, avx512_cmul#
- */
-#if defined AVX512F_NOMSVC
-static inline void
-AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
-{
- const npy_intp array_size = dimensions[0];
- npy_intp num_remaining_elements = 2*array_size;
- @type@* ip1 = (@type@*) args[0];
- @type@* ip2 = (@type@*) args[1];
- @type@* op = (@type@*) args[2];
-
- @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < @num_lanes@) {
- load_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements, @num_lanes@);
- }
- @vtype@ x1, x2;
- x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
- x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
-
- @vtype@ out = @vectorf@_@vsuffix@(x1, x2);
-
- _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
-
- ip1 += @num_lanes@;
- ip2 += @num_lanes@;
- op += @num_lanes@;
- num_remaining_elements -= @num_lanes@;
- }
-}
-#endif // AVX512F_NOMSVC
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
- * #type= npy_float, npy_double#
- * #esize = 8, 16#
- */
-/**begin repeat1
- * #func = add, subtract, multiply#
+ * #sfx = f32, f64#
+ * #bsfx = b32, b64#
+ * #usfx = b32, u64#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_double = 0, 1#
+ * #c = f, #
+ * #INF = NPY_INFINITYF, NPY_INFINITY#
+ * #NAN = NPY_NANF, NPY_NAN#
*/
-static inline int
-run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
+#if @VECTOR@
+NPY_FINLINE npyv_@sfx@
+simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im)
{
-#if defined AVX512F_NOMSVC
- if (IS_BINARY_STRIDE_ONE(@esize@, 64)) {
- AVX512F_@func@_@TYPE@(args, dimensions, steps);
- return 1;
- }
- else
- return 0;
-#endif
- return 0;
+ const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@);
+ const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@);
+
+ re = npyv_abs_@sfx@(re);
+ im = npyv_abs_@sfx@(im);
+ /*
+ * If real or imag = INF, then convert it to inf + j*inf
+ * Handles: inf + j*nan, nan + j*inf
+ */
+ npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf);
+ npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf);
+ im = npyv_select_@sfx@(re_infmask, inf, im);
+ re = npyv_select_@sfx@(im_infmask, inf, re);
+ /*
+ * If real or imag = NAN, then convert it to nan + j*nan
+ * Handles: x + j*nan, nan + j*x
+ */
+ npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re);
+ npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im);
+ im = npyv_select_@sfx@(re_nnanmask, im, nan);
+ re = npyv_select_@sfx@(im_nnanmask, re, nan);
+
+ npyv_@sfx@ larger = npyv_max_@sfx@(re, im);
+ npyv_@sfx@ smaller = npyv_min_@sfx@(im, re);
+ /*
+ * Calculate div_mask to prevent 0./0. and inf/inf operations in div
+ */
+ npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@());
+ npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf);
+ npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask));
+
+ npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger);
+ npyv_@sfx@ hypot = npyv_sqrt_@sfx@(
+ npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@)
+ ));
+ return npyv_mul_@sfx@(hypot, larger);
}
-/**end repeat1**/
+#endif // VECTOR
/**end repeat**/
/********************************************************************************
@@ -724,55 +411,345 @@ run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const
* complex types
* #TYPE = CFLOAT, CDOUBLE#
* #ftype = npy_float, npy_double#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #sfx = f32, f64#
* #c = f, #
* #C = F, #
*/
/**begin repeat1
* arithmetic
- * #kind = add, subtract#
- * #OP = +, -#
- * #PW = 1, 0#
+ * #kind = add, subtract, multiply#
+ * #vectorf = npyv_add, npyv_sub, simd_cmul#
+ * #OP = +, -, *#
+ * #PW = 1, 0, 0#
+ * #is_mul = 0*2, 1#
*/
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- // Parenthesis around @PW@ tells clang dead code is intentional
- if (IS_BINARY_REDUCE && (@PW@)) {
- npy_intp n = dimensions[0];
- @ftype@ * or = ((@ftype@ *)args[0]);
- @ftype@ * oi = ((@ftype@ *)args[0]) + 1;
+ npy_intp len = dimensions[0];
+ char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];
+ npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];
+#if @PW@
+ // reduce
+ if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
+ b_ssrc1 % (sizeof(@ftype@)*2) == 0
+ ) {
+ @ftype@ *rl_im = (@ftype@ *)b_src0;
@ftype@ rr, ri;
-
- @TYPE@_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2);
- *or @OP@= rr;
- *oi @OP@= ri;
+ @TYPE@_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
+ rl_im[0] @OP@= rr;
+ rl_im[1] @OP@= ri;
return;
}
- if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
- BINARY_LOOP {
- const @ftype@ in1r = ((@ftype@ *)ip1)[0];
- const @ftype@ in1i = ((@ftype@ *)ip1)[1];
- const @ftype@ in2r = ((@ftype@ *)ip2)[0];
- const @ftype@ in2i = ((@ftype@ *)ip2)[1];
- ((@ftype@ *)op1)[0] = in1r @OP@ in2r;
- ((@ftype@ *)op1)[1] = in1i @OP@ in2i;
+#endif
+#if @VECTOR@
+ if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||
+ is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
+ b_sdst % sizeof(@ftype@) != 0 || b_sdst == 0 ||
+ b_ssrc0 % sizeof(@ftype@) != 0 ||
+ b_ssrc1 % sizeof(@ftype@) != 0
+ ) {
+ goto loop_scalar;
+ }
+ const @ftype@ *src0 = (@ftype@*)b_src0;
+ const @ftype@ *src1 = (@ftype@*)b_src1;
+ @ftype@ *dst = (@ftype@*)b_dst;
+
+ const npy_intp ssrc0 = b_ssrc0 / sizeof(@ftype@);
+ const npy_intp ssrc1 = b_ssrc1 / sizeof(@ftype@);
+ const npy_intp sdst = b_sdst / sizeof(@ftype@);
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * 2;
+ const int hstep = vstep / 2;
+
+ const int loadable0 = npyv_loadable_stride_s64(ssrc0);
+ const int loadable1 = npyv_loadable_stride_s64(ssrc1);
+ const int storable = npyv_storable_stride_s64(sdst);
+
+ // lots**lots of specializations, to squeeze out max performance
+ // contig
+ if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
+ for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@(src0);
+ npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep);
+ npyv_@sfx@ b0 = npyv_load_@sfx@(src1);
+ npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1);
+ npyv_store_@sfx@(dst, r0);
+ npyv_store_@sfx@(dst + vstep, r1);
+ }
+ for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
+ npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len);
+ npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len);
+ npyv_@sfx@ r = @vectorf@_@sfx@(a, b);
+ npyv_store2_till_@sfx@(dst, len, r);
+ }
+ }
+ // scalar 0
+ else if (ssrc0 == 0) {
+ npyv_@sfx@x2 a = simd_set2_@sfx@(src0);
+ // contig
+ if (ssrc1 == 2 && sdst == ssrc1) {
+ for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
+ npyv_@sfx@ b0 = npyv_load_@sfx@(src1);
+ npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1);
+ npyv_store_@sfx@(dst, r0);
+ npyv_store_@sfx@(dst + vstep, r1);
+ }
+ for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+ #if @is_mul@
+ npyv_@sfx@ b = npyv_load2_till_@sfx@(src1, len, 1.0@c@, 1.0@c@);
+ #else
+ npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len);
+ #endif
+ npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b);
+ npyv_store2_till_@sfx@(dst, len, r);
+ }
+ }
+ // non-contig
+ else if (loadable1 && storable) {
+ for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
+ npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1);
+ npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1);
+ npyv_storen2_@sfx@(dst, sdst, r0);
+ npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+ }
+ for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {
+ #if @is_mul@
+ npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@);
+ #else
+ npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len);
+ #endif
+ npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b);
+ npyv_storen2_till_@sfx@(dst, sdst, len, r);
+ }
+ }
+ else {
+ goto loop_scalar;
+ }
+ }
+ // scalar 1
+ else if (ssrc1 == 0) {
+ npyv_@sfx@x2 b = simd_set2_@sfx@(src1);
+ if (ssrc0 == 2 && sdst == ssrc0) {
+ for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@(src0);
+ npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]);
+ npyv_store_@sfx@(dst, r0);
+ npyv_store_@sfx@(dst + vstep, r1);
+ }
+ for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+ #if @is_mul@
+ npyv_@sfx@ a = npyv_load2_till_@sfx@(src0, len, 1.0@c@, 1.0@c@);
+ #else
+ npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len);
+ #endif
+ npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]);
+ npyv_store2_till_@sfx@(dst, len, r);
+ }
+ }
+ // non-contig
+ else if (loadable0 && storable) {
+ for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
+ npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0);
+ npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]);
+ npyv_storen2_@sfx@(dst, sdst, r0);
+ npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+ }
+ for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {
+ #if @is_mul@
+ npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@);
+ #else
+ npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len);
+ #endif
+ npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]);
+ npyv_storen2_till_@sfx@(dst, sdst, len, r);
+ }
}
+ else {
+ goto loop_scalar;
+ }
+ }
+ #if @is_mul@
+ // non-contig
+ else if (loadable0 && loadable1 && storable) {
+ for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
+ src1 += ssrc1*vstep, dst += sdst*vstep
+ ) {
+ npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0);
+ npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0);
+ npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1);
+ npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1);
+ npyv_storen2_@sfx@(dst, sdst, r0);
+ npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+ }
+ for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
+ src1 += ssrc1*hstep, dst += sdst*hstep
+ ) {
+ #if @is_mul@
+ npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@);
+ npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@);
+ #else
+ npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len);
+ npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len);
+ #endif
+ npyv_@sfx@ r = @vectorf@_@sfx@(a, b);
+ npyv_storen2_till_@sfx@(dst, sdst, len, r);
+ }
+ }
+ #endif
+ else {
+ goto loop_scalar;
}
+ npyv_cleanup();
+ return;
+loop_scalar:
+#endif
+ for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {
+ const @ftype@ a_r = ((@ftype@ *)b_src0)[0];
+ const @ftype@ a_i = ((@ftype@ *)b_src0)[1];
+ const @ftype@ b_r = ((@ftype@ *)b_src1)[0];
+ const @ftype@ b_i = ((@ftype@ *)b_src1)[1];
+ #if @is_mul@
+ ((@ftype@ *)b_dst)[0] = a_r*b_r - a_i*b_i;
+ ((@ftype@ *)b_dst)[1] = a_r*b_i + a_i*b_r;
+ #else
+ ((@ftype@ *)b_dst)[0] = a_r @OP@ b_r;
+ ((@ftype@ *)b_dst)[1] = a_i @OP@ b_i;
+ #endif
+ }
+}
+
+/*
+ * Indexed (ufunc.at-style) inner loop for complex @kind@.
+ * args = {base array, npy_intp index array, per-index values}; each index
+ * selects one complex element (two consecutive @ftype@ words) of the base
+ * array, which is updated in place with the matching value.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];    // base array data
+    char *indx = args[1];   // indices into the base array
+    char *value = args[2];  // complex operands, one per index
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    @ftype@ *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        // is1 is a byte stride; the index scales it to the selected element
+        indexed = (@ftype@ *)(ip1 + is1 * *(npy_intp *)indx);
+        const @ftype@ b_r = ((@ftype@ *)value)[0];
+        const @ftype@ b_i = ((@ftype@ *)value)[1];
+    #if @is_mul@
+        // complex multiply: (a_r + i*a_i)*(b_r + i*b_i)
+        const @ftype@ a_r = indexed[0];
+        const @ftype@ a_i = indexed[1];
+        indexed[0] = a_r*b_r - a_i*b_i;
+        indexed[1] = a_r*b_i + a_i*b_r;
+    #else
+        // add/subtract apply @OP@ componentwise
+        indexed[0] @OP@= b_r;
+        indexed[1] @OP@= b_i;
+    #endif
+    }
+    return 0;
+}
/**end repeat1**/
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_multiply)
+/**begin repeat1
+ * #kind = conjugate, square#
+ * #is_square = 0, 1#
+ */
+/*
+ * Unary complex loop for conjugate/square. Replaces the old AVX512F-only
+ * multiply path with universal-intrinsic (npyv_*) kernels for the common
+ * stride layouts, falling back to a scalar loop otherwise.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    if (!run_binary_avx512f_multiply_@TYPE@(args, dimensions, steps)) {
-        BINARY_LOOP {
-            const @ftype@ in1r = ((@ftype@ *)ip1)[0];
-            const @ftype@ in1i = ((@ftype@ *)ip1)[1];
-            const @ftype@ in2r = ((@ftype@ *)ip2)[0];
-            const @ftype@ in2i = ((@ftype@ *)ip2)[1];
-            ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i;
-            ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
+    npy_intp len = dimensions[0];
+    char *b_src = args[0], *b_dst = args[1];
+    npy_intp b_ssrc = steps[0], b_sdst = steps[1];
+#if @VECTOR@
+    // SIMD requires non-overlapping buffers and byte strides that are
+    // whole multiples of the @ftype@ element size.
+    if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) ||
+        b_sdst % sizeof(@ftype@) != 0 ||
+        b_ssrc % sizeof(@ftype@) != 0
+    ) {
+        goto loop_scalar;
+    }
+    const @ftype@ *src = (@ftype@*)b_src;
+    @ftype@ *dst = (@ftype@*)b_dst;
+    // strides in @ftype@ units; a complex element is two @ftype@ words
+    const npy_intp ssrc = b_ssrc / sizeof(@ftype@);
+    const npy_intp sdst = b_sdst / sizeof(@ftype@);
+
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * 2;   // @ftype@ words consumed by two vectors
+    const int hstep = vstep / 2;   // complex elements held by one vector
+
+    // contiguous src and dst
+    if (ssrc == 2 && ssrc == sdst) {
+        for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) {
+            npyv_@sfx@ a0 = npyv_load_@sfx@(src);
+            npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep);
+            npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0);
+            npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1);
+            npyv_store_@sfx@(dst, r0);
+            npyv_store_@sfx@(dst + vstep, r1);
+        }
+        // partial-vector tail: masked load/store of complex pairs
+        for (; len > 0; len -= hstep, src += vstep, dst += vstep) {
+            npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len);
+            npyv_@sfx@ r = simd_c@kind@_@sfx@(a);
+            npyv_store2_till_@sfx@(dst, len, r);
+        }
+    }
+    // contiguous src, strided dst
+    else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) {
+        for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) {
+            npyv_@sfx@ a0 = npyv_load_@sfx@(src);
+            npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep);
+            npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0);
+            npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1);
+            npyv_storen2_@sfx@(dst, sdst, r0);
+            npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+        }
+        for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) {
+            npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len);
+            npyv_@sfx@ r = simd_c@kind@_@sfx@(a);
+            npyv_storen2_till_@sfx@(dst, sdst, len, r);
+        }
+    }
+    // strided src, contiguous dst
+    else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) {
+        for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) {
+            npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src, ssrc);
+            npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc);
+            npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0);
+            npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1);
+            npyv_store_@sfx@(dst, r0);
+            npyv_store_@sfx@(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) {
+            npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@((@ftype@*)src, ssrc, len);
+            npyv_@sfx@ r = simd_c@kind@_@sfx@(a);
+            npyv_store2_till_@sfx@(dst, len, r);
         }
     }
+    // both strided: no SIMD kernel, take the scalar path
+    else {
+        goto loop_scalar;
+    }
+    npyv_cleanup();
+    return;
+loop_scalar:
+#endif
+    for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) {
+        const @ftype@ rl = ((@ftype@ *)b_src)[0];
+        const @ftype@ im = ((@ftype@ *)b_src)[1];
+    #if @is_square@
+        // (rl + i*im)^2
+        ((@ftype@ *)b_dst)[0] = rl*rl - im*im;
+        ((@ftype@ *)b_dst)[1] = rl*im + im*rl;
+    #else
+        // conjugate: negate the imaginary part
+        ((@ftype@ *)b_dst)[0] = rl;
+        ((@ftype@ *)b_dst)[1] = -im;
+    #endif
+    }
 }
+/**end repeat1**/
/**end repeat**/
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index ea8685002..b6f126298 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -395,6 +395,24 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
}
}
}
+
+/*
+ * Indexed (ufunc.at-style) floor division for signed integers.
+ * Delegates per-element semantics (rounding toward -inf, zero divisor
+ * handling) to floor_div_@TYPE@.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];    // base array data
+    char *indx = args[1];   // indices into the base array
+    char *value = args[2];  // divisors, one per index
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    @type@ *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+        *indexed = floor_div_@TYPE@(*indexed, *(@type@ *)value);
+    }
+    return 0;
+}
+
/**end repeat**/
/**begin repeat
@@ -463,4 +481,28 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
}
}
}
+
+/*
+ * Indexed (ufunc.at-style) division for unsigned integers.
+ * Division by zero raises the FPE divide-by-zero flag and stores 0,
+ * matching the non-indexed unsigned divide loop.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];    // base array data
+    char *indx = args[1];   // indices into the base array
+    char *value = args[2];  // divisors, one per index
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    @type@ *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+        @type@ in2 = *(@type@ *)value;
+        if (NPY_UNLIKELY(in2 == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *indexed = 0;
+        } else {
+            *indexed = *indexed / in2;
+        }
+    }
+    return 0;
+}
+
/**end repeat**/
diff --git a/numpy/core/src/umath/loops_autovec.dispatch.c.src b/numpy/core/src/umath/loops_autovec.dispatch.c.src
new file mode 100644
index 000000000..bdbfa0f86
--- /dev/null
+++ b/numpy/core/src/umath/loops_autovec.dispatch.c.src
@@ -0,0 +1,287 @@
+/*@targets
+ ** $maxopt $autovec baseline
+ ** sse2 avx2
+ ** neon
+ ** vsx2
+ ** vx
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*
+ *****************************************************************************
+ ** INTEGER LOOPS
+ *****************************************************************************
+ */
+/*
+ * Arithmetic bit shift operations.
+ *
+ * Intel hardware masks bit shift values, so large shifts wrap around
+ * and can produce surprising results. The special handling ensures that
+ * behavior is independent of compiler or hardware.
+ * TODO: We could implement consistent behavior for negative shifts,
+ * which is undefined in C.
+ */
+#define INT_left_shift_needs_clear_floatstatus
+#define UINT_left_shift_needs_clear_floatstatus
+
+/**begin repeat
+ * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ * #ftype = npy_float, npy_float, npy_float, npy_float, npy_double, npy_double,
+ * npy_double, npy_double, npy_double, npy_double#
+ * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
+ * #c = hh,uhh,h,uh,,u,l,ul,ll,ull#
+ */
+
+// unary +: identity copy, relies on compiler auto-vectorization
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_positive)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = +in);
+}
+
+// integer square (wraps on overflow per C unsigned / implementation-defined signed rules)
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_square)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
+}
+
+// 1.0/in computed in floating point, then truncated back to @type@ by the store
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_reciprocal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
+}
+
+/**begin repeat1
+ * Arithmetic
+ * #kind = add, subtract, multiply#
+ * #OP = +, -, *#
+ */
+/*
+ * Integer add/subtract/multiply. The reduce form accumulates into io1;
+ * both forms rely on compiler auto-vectorization of the *_FAST macros.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2);
+    }
+    else {
+        BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
+    }
+}
+/**end repeat1**/
+
+/*
+ * Left shift via npy_lshift@c@, which normalizes over-wide shift counts
+ * (hardware otherwise masks them; see the file-level comment above).
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
+#ifdef @TYPE@_left_shift_needs_clear_floatstatus
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+/*
+ * Right shift via npy_rshift@c@. The no-optimize variant exists for
+ * platforms where the auto-vectorized form miscompiles/underperforms.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
+    BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
+#else
+    BINARY_LOOP {
+        @type@ in1 = *(@type@ *)ip1;
+        @type@ in2 = *(@type@ *)ip2;
+        *(@type@ *)op1 = npy_rshift@c@(in1, in2);
+    }
+#endif
+}
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** UNSIGNED INTEGER LOOPS
+ *****************************************************************************
+ */
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
+ * #c = u,u,u,ul,ull#
+ */
+// unsigned absolute value is the identity
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = in);
+}
+
+// unsigned sign: 1 for non-zero, 0 for zero
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
+}
+
+/**begin repeat1
+ * Arithmetic
+ * #kind = bitwise_and, bitwise_or, bitwise_xor#
+ * #OP = &, |, ^#
+ */
+/*
+ * Bitwise and/or/xor; reduce form accumulates into io1.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2);
+    }
+    else {
+        BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
+    }
+}
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = logical_and, logical_or#
+ * #OP = &&, ||#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * gcc vectorization of this is not good (PR60575) but manual integer
+     * vectorization is too tedious to be worthwhile
+     */
+    BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
+}
+/**end repeat1**/
+
+// truth-value xor: normalize both operands with !! before comparing
+NPY_FINLINE npy_bool @TYPE@_logical_xor_(@type@ in1, @type@ in2)
+{ return (!!in1) != (!!in2); }
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_FAST(@type@, npy_bool, *out = @TYPE@_logical_xor_(in1, in2));
+}
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite#
+ * #func = npy_isnan, npy_isinf, npy_isfinite#
+ * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
+ **/
+/*
+ * isnan/isinf/isfinite for integers are compile-time constants
+ * (integers are never nan/inf, always finite).
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * The (void)in; suppresses an unused variable warning raised by gcc and allows
+     * us to re-use this macro even though we do not depend on in
+     */
+    UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@);
+}
+/**end repeat1**/
+
+// integer conjugate is the identity
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = in);
+}
+
+// boolean negation of the truth value
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
+}
+
+// bitwise complement
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
+}
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** SIGNED! INTEGER LOOPS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
+ * #c = ,,,l,ll#
+ */
+
+// signed absolute value; NOTE(review): most-negative value stays negative
+// due to two's-complement wraparound — presumably matches historic behavior
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in);
+}
+
+// three-way sign: 1, -1, or 0
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
+}
+
+/**begin repeat1
+ * #kind = conjugate, invert, isnan, isinf, isfinite,
+ *         logical_and, logical_or, logical_xor, logical_not,
+ *         bitwise_and, bitwise_or, bitwise_xor#
+ **/
+// these ops are sign-agnostic: forward to the unsigned implementation
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(U@TYPE@_@kind@)(args, dimensions, steps, func);
+}
+/**end repeat1**/
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** BOOLEAN LOOPS **
+ *****************************************************************************
+ */
+/**begin repeat
+ * #kind = isnan, isinf, isfinite#
+ * #func = npy_isnan, npy_isinf, npy_isfinite#
+ * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
+ **/
+// booleans share the byte-wise constant-result predicates with UBYTE
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_@kind@)(args, dimensions, steps, func);
+}
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** HALF-FLOAT LOOPS **
+ *****************************************************************************
+ */
+
+// half-precision abs: clear the IEEE-754 binary16 sign bit (bit 15)
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu);
+}
+
+/*
+ *****************************************************************************
+ ** DATETIME LOOPS **
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #type = npy_datetime, npy_timedelta#
+ * #TYPE = DATETIME, TIMEDELTA#
+ */
+// datetime/timedelta are 64-bit ints: reuse the ULONGLONG constant-false isinf
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_isinf)(args, dimensions, steps, func);
+}
+/**end repeat**/
diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src b/numpy/core/src/umath/loops_comparison.dispatch.c.src
index 2cd76c35e..751080871 100644
--- a/numpy/core/src/umath/loops_comparison.dispatch.c.src
+++ b/numpy/core/src/umath/loops_comparison.dispatch.c.src
@@ -234,7 +234,7 @@ static void simd_binary_@kind@_b8(char **args, npy_intp len)
npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src1), vzero);
npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src2), vzero);
npyv_b8 c = npyv_@VOP@_b8(a, b);
- npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
+ npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
}
for (; len > 0; --len, ++src1, ++src2, ++dst) {
@@ -258,7 +258,7 @@ static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len)
for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
npyv_b8 c = npyv_@VOP@_b8(a, b);
- npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
+ npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
}
for (; len > 0; --len, ++src, ++dst) {
@@ -281,7 +281,7 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len)
for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
npyv_b8 c = npyv_@VOP@_b8(a, b);
- npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
+ npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
}
for (; len > 0; --len, ++src, ++dst) {
@@ -312,19 +312,23 @@ static inline void
run_binary_simd_@kind@_@sfx@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if @VECTOR@
- /* argument one scalar */
- if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]);
- return;
- }
- /* argument two scalar */
- else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]);
- return;
- }
- else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_@kind@_@sfx@(args, dimensions[0]);
- return;
+ if (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) &&
+ !is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0])
+ ) {
+ /* argument one scalar */
+ if (IS_BINARY_CONT_S1(@type@, npy_bool)) {
+ simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]);
+ return;
+ }
+ /* argument two scalar */
+ else if (IS_BINARY_CONT_S2(@type@, npy_bool)) {
+ simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]);
+ return;
+ }
+ else if (IS_BINARY_CONT(@type@, npy_bool)) {
+ simd_binary_@kind@_@sfx@(args, dimensions[0]);
+ return;
+ }
}
#endif
diff --git a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
index 182c57b01..1fac3c150 100644
--- a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
+++ b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
@@ -239,7 +239,7 @@ fma_scalef_ps(__m256 poly, __m256 quadrant)
#ifdef SIMD_AVX512F
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE __mmask16
avx512_get_full_load_mask_ps(void)
{
return 0xFFFF;
diff --git a/numpy/core/src/umath/loops_logical.dispatch.c.src b/numpy/core/src/umath/loops_logical.dispatch.c.src
new file mode 100644
index 000000000..c07525be4
--- /dev/null
+++ b/numpy/core/src/umath/loops_logical.dispatch.c.src
@@ -0,0 +1,377 @@
+/*@targets
+ ** $maxopt baseline
+ ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
+ ** vx
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*******************************************************************************
+ ** Defining the SIMD kernels
+ ******************************************************************************/
+
+#if NPY_SIMD
+/*
+ * convert any bit set to boolean true so vectorized and normal operations are
+ * consistent, should not be required if bool is used correctly everywhere but
+ * you never know
+ */
+NPY_FINLINE npyv_u8 byte_to_true(npyv_u8 v)
+{
+    const npyv_u8 zero = npyv_zero_u8();
+    const npyv_u8 truemask = npyv_setall_u8(1 == 1);
+    // cmpeq(v, 0) turns 0x00 -> 0xff and non-zero -> 0x00
+    npyv_u8 tmp = npyv_cvt_u8_b8(npyv_cmpeq_u8(v, zero));
+    // tmp is filled with 0xff/0x00, negate and mask to boolean true
+    // (andc = and-complement: truemask & ~tmp)
+    return npyv_andc_u8(truemask, tmp);
+}
+/*
+ * convert mask vector (0xff/0x00) to boolean true. similar to byte_to_true(),
+ * but we've already got a mask and can skip negation.
+ */
+// Lanes of v are already 0xff/0x00; masking with 0x01 yields canonical
+// boolean true/false without the extra negation byte_to_true() needs.
+NPY_FINLINE npyv_u8 mask_to_true(npyv_b8 v)
+{
+    const npyv_u8 truemask = npyv_setall_u8(1 == 1);
+    return npyv_and_u8(truemask, npyv_cvt_u8_b8(v));
+}
+/*
+ * For logical_and, we have to be careful to handle non-bool inputs where
+ * bits of each operand might not overlap. Example: a = 0x01, b = 0x80
+ * Both evaluate to boolean true, however, a & b is false. Return value
+ * should be consistent with byte_to_true().
+ */
+NPY_FINLINE npyv_u8 simd_logical_and_u8(npyv_u8 a, npyv_u8 b)
+{
+    const npyv_u8 zero = npyv_zero_u8();
+    const npyv_u8 truemask = npyv_setall_u8(1 == 1);
+    // compare each operand against zero separately, so non-overlapping
+    // bit patterns (e.g. 0x01 && 0x80) still evaluate as true && true
+    npyv_b8 ma = npyv_cmpeq_u8(a, zero);
+    npyv_b8 mb = npyv_cmpeq_u8(b, zero);
+    // r has 0xff where a==0 OR b==0, i.e. where the AND is false
+    npyv_u8 r = npyv_cvt_u8_b8(npyv_or_b8(ma, mb));
+    // invert and mask down to canonical 0x01/0x00
+    return npyv_andc_u8(truemask, r);
+}
+/*
+ * We don't really need the following, but it simplifies the templating code
+ * below since it is paired with simd_logical_and_u8() above.
+ */
+// Bitwise OR is already truth-correct for OR (any set bit means true);
+// byte_to_true() canonicalizes the result to 0x01/0x00.
+NPY_FINLINE npyv_u8 simd_logical_or_u8(npyv_u8 a, npyv_u8 b)
+{
+    npyv_u8 r = npyv_or_u8(a, b);
+    return byte_to_true(r);
+}
+
+
+/**begin repeat
+ * #kind = logical_and, logical_or#
+ * #and = 1, 0#
+ * #scalar_op = &&, ||#
+ * #intrin = and, or#
+ * #reduce = min, max#
+ * #scalar_cmp = ==, !=#
+ * #anyall = all, any#
+ */
+/*
+ * Element-wise boolean @kind@ over contiguous buffers:
+ * 16x-unrolled vector loop, then single-vector loop, then scalar tail.
+ * Output bytes are canonical 0x01/0x00.
+ */
+static void
+simd_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp len)
+{
+    #define UNROLL 16
+
+    const int vstep = npyv_nlanes_u8;
+    const int wstep = vstep * UNROLL;
+
+    // Unrolled vectors loop
+    for (; len >= wstep; len -= wstep, ip1 += wstep, ip2 += wstep, op += wstep) {
+        /**begin repeat1
+         * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+         */
+        #if UNROLL > @unroll@
+        npyv_u8 a@unroll@ = npyv_load_u8(ip1 + vstep * @unroll@);
+        npyv_u8 b@unroll@ = npyv_load_u8(ip2 + vstep * @unroll@);
+        npyv_u8 r@unroll@ = simd_logical_@intrin@_u8(a@unroll@, b@unroll@);
+        npyv_store_u8(op + vstep * @unroll@, r@unroll@);
+        #endif
+        /**end repeat1**/
+    }
+    #undef UNROLL
+
+    // Single vectors loop
+    for (; len >= vstep; len -= vstep, ip1 += vstep, ip2 += vstep, op += vstep) {
+        npyv_u8 a = npyv_load_u8(ip1);
+        npyv_u8 b = npyv_load_u8(ip2);
+        npyv_u8 r = simd_logical_@intrin@_u8(a, b);
+        npyv_store_u8(op, r);
+    }
+
+    // Scalar loop to finish off
+    for (; len > 0; len--, ip1++, ip2++, op++) {
+        *op = *ip1 @scalar_op@ *ip2;
+    }
+}
+
+/*
+ * Short-circuiting boolean reduction (np.all / np.any style).
+ * *op is the running accumulator; the loop bails out as soon as the
+ * absorbing element is seen (a false for AND, a true for OR), writing
+ * !@and@ and returning early.
+ */
+static void
+simd_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
+{
+    #define UNROLL 8
+
+    const int vstep = npyv_nlanes_u8;
+    const int wstep = vstep * UNROLL;
+
+    // Unrolled vectors loop
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+        #if defined(NPY_HAVE_SSE2)
+            NPY_PREFETCH(ip + wstep, 0, 3);
+        #endif
+        npyv_u8 v0 = npyv_load_u8(ip + vstep * 0);
+        npyv_u8 v1 = npyv_load_u8(ip + vstep * 1);
+        npyv_u8 v2 = npyv_load_u8(ip + vstep * 2);
+        npyv_u8 v3 = npyv_load_u8(ip + vstep * 3);
+        npyv_u8 v4 = npyv_load_u8(ip + vstep * 4);
+        npyv_u8 v5 = npyv_load_u8(ip + vstep * 5);
+        npyv_u8 v6 = npyv_load_u8(ip + vstep * 6);
+        npyv_u8 v7 = npyv_load_u8(ip + vstep * 7);
+
+        // pairwise min (AND) / max (OR) tree combining the 8 vectors
+        npyv_u8 m01 = npyv_@reduce@_u8(v0, v1);
+        npyv_u8 m23 = npyv_@reduce@_u8(v2, v3);
+        npyv_u8 m45 = npyv_@reduce@_u8(v4, v5);
+        npyv_u8 m67 = npyv_@reduce@_u8(v6, v7);
+
+        npyv_u8 m0123 = npyv_@reduce@_u8(m01, m23);
+        npyv_u8 m4567 = npyv_@reduce@_u8(m45, m67);
+
+        npyv_u8 mv = npyv_@reduce@_u8(m0123, m4567);
+
+        // all/any collapses the vector to a scalar truth value
+        if(npyv_@anyall@_u8(mv) @scalar_cmp@ 0){
+            *op = !@and@;
+            return;
+        }
+    }
+
+    // Single vectors loop
+    for (; len >= vstep; len -= vstep, ip += vstep) {
+        npyv_u8 v0 = npyv_load_u8(ip);
+        if(npyv_@anyall@_u8(v0) @scalar_cmp@ 0){
+            *op = !@and@;
+            return;
+        }
+    }
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ++ip) {
+        *op = *op @scalar_op@ *ip;
+        if (*op @scalar_cmp@ 0) {
+            return;
+        }
+    }
+#undef UNROLL
+}
+/**end repeat**/
+
+/**begin repeat
+ * #kind = logical_not, absolute#
+ * #op = ==, !=#
+ * #not = 1, 0#
+ */
+/*
+ * Unary boolean kernel: logical_not (@not@ == 1) compares against zero,
+ * absolute canonicalizes any non-zero byte to 0x01 via byte_to_true().
+ */
+static void
+simd_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
+{
+    #define UNROLL 16
+
+    const int vstep = npyv_nlanes_u8;
+    const int wstep = vstep * UNROLL;
+
+    #if @not@
+    const npyv_u8 zero = npyv_zero_u8();
+    #endif
+
+    // Unrolled vectors loop
+    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+        /**begin repeat1
+         * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+         */
+        #if UNROLL > @unroll@
+        npyv_u8 v@unroll@ = npyv_load_u8(ip + vstep * @unroll@);
+#if @not@
+        npyv_u8 r@unroll@ = mask_to_true(npyv_cmpeq_u8(v@unroll@, zero));
+#else
+        npyv_u8 r@unroll@ = byte_to_true(v@unroll@);
+#endif
+        npyv_store_u8(op + vstep * @unroll@, r@unroll@);
+        #endif
+        /**end repeat1**/
+    }
+    #undef UNROLL
+
+    // Single vectors loop
+    for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) {
+        npyv_u8 v = npyv_load_u8(ip);
+#if @not@
+        npyv_u8 r = mask_to_true(npyv_cmpeq_u8(v, zero));
+#else
+        npyv_u8 r = byte_to_true(v);
+#endif
+        npyv_store_u8(op, r);
+    }
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ++ip, ++op) {
+        *op = (*ip @op@ 0);
+    }
+}
+/**end repeat**/
+
+#endif // NPY_SIMD
+
+/*******************************************************************************
+ ** Defining ufunc inner functions
+ ******************************************************************************/
+
+/**begin repeat
+ * # kind = logical_or, logical_and#
+ */
+// Dispatch helper: run the SIMD binary kernel when layout allows;
+// returns 1 if handled, 0 so the caller falls back to the scalar loop.
+static NPY_INLINE int
+run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    if (sizeof(npy_bool) == 1 &&
+            IS_BLOCKABLE_BINARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
+        simd_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
+                               (npy_bool*)args[1], dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+
+
+// Dispatch helper for the reduce form (args[0] is the accumulator);
+// returns 1 if the SIMD kernel handled it, 0 for scalar fallback.
+static NPY_INLINE int
+run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    if (sizeof(npy_bool) == 1 &&
+            IS_BLOCKABLE_REDUCE(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
+        simd_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
+                                dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+/**end repeat**/
+
+/**begin repeat
+ * #kind = logical_not, absolute#
+ */
+// Dispatch helper for unary boolean kernels (logical_not/absolute);
+// returns 1 if the SIMD kernel handled it, 0 for scalar fallback.
+static NPY_INLINE int
+run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    if (sizeof(npy_bool) == 1 &&
+            IS_BLOCKABLE_UNARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
+        simd_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+/**end repeat**/
+
+
+/**begin repeat
+ * #kind = logical_and, logical_or#
+ * #OP = &&, ||#
+ * #SC = ==, !=#
+ * #and = 1, 0#
+ */
+/*
+ * Boolean logical_and / logical_or ufunc inner loop.
+ * Reduce path: SIMD early-exit kernel when NPY_SIMD, otherwise a
+ * libc-based scan (memchr/memcmp) for contiguous input. Note the `else`
+ * after the preprocessor block attaches to whichever `if` the
+ * preprocessor kept, giving a scalar short-circuit fallback either way.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if(IS_BINARY_REDUCE) {
+#if NPY_SIMD
+        /*
+         * stick with our variant for more reliable performance, only known
+         * platform which outperforms it by ~20% is an i7 with glibc 2.17
+         */
+        if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
+            return;
+        }
+#else
+        /* for now only use libc on 32-bit/non-x86 */
+        if (steps[1] == 1) {
+            npy_bool * op = (npy_bool *)args[0];
+#if @and@
+            /* np.all(), search for a zero (false) */
+            if (*op) {
+                *op = memchr(args[1], 0, dimensions[0]) == NULL;
+            }
+#else
+            /*
+             * np.any(), search for a non-zero (true) via comparing against
+             * zero blocks, memcmp is faster than memchr on SSE4 machines
+             * with glibc >= 2.12 and memchr can only check for equal 1
+             */
+            static const npy_bool zero[4096]; /* zero by C standard */
+            npy_uintp i, n = dimensions[0];
+
+            for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
+                *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
+            }
+            if (!*op && n - i > 0) {
+                *op = memcmp(&args[1][i], zero, n - i) != 0;
+            }
+#endif
+            return;
+        }
+#endif
+        else {
+            // scalar reduce with short-circuit on the absorbing element
+            BINARY_REDUCE_LOOP(npy_bool) {
+                const npy_bool in2 = *(npy_bool *)ip2;
+                io1 = io1 @OP@ in2;
+                if (io1 @SC@ 0) {
+                    break;
+                }
+            }
+            *((npy_bool *)iop1) = io1;
+        }
+    }
+    else {
+        if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
+            return;
+        }
+        else {
+            BINARY_LOOP {
+                const npy_bool in1 = *(npy_bool *)ip1;
+                const npy_bool in2 = *(npy_bool *)ip2;
+                *((npy_bool *)op1) = in1 @OP@ in2;
+            }
+        }
+    }
+}
+/**end repeat**/
+
+/**begin repeat
+ * #kind = logical_not, absolute#
+ * #OP = ==, !=#
+ **/
+/*
+ * Boolean logical_not / absolute inner loop: SIMD kernel when layout
+ * allows, scalar compare-against-zero fallback otherwise.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) {
+        return;
+    }
+    else {
+        UNARY_LOOP {
+            npy_bool in1 = *(npy_bool *)ip1;
+            *((npy_bool *)op1) = in1 @OP@ 0;
+        }
+    }
+}
+/**end repeat**/
+
diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src
index 237c8e933..9d8667d38 100644
--- a/numpy/core/src/umath/loops_minmax.dispatch.c.src
+++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src
@@ -451,6 +451,24 @@ clear_fp:
#endif
}
+
+/*
+ * Indexed (ufunc.at-style) min/max: each index selects one element of
+ * the base array, updated in place with SCALAR_OP(current, value).
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];    // base array data
+    char *indx = args[1];   // indices into the base array
+    char *value = args[2];  // comparison operands, one per index
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    @type@ *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+        *indexed = SCALAR_OP(*indexed, *(@type@ *)value);
+    }
+    return 0;
+}
+
#undef SCALAR_OP
#endif // !fp_only || (is_fp && fp_only)
diff --git a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
index 78685e807..1b77592c8 100644
--- a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
+++ b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
@@ -9,12 +9,18 @@
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
+#include "fast_loop_macros.h"
/*
* TODO:
* - use vectorized version of Payne-Hanek style reduction for large elements or
* when there's no native FUSED support instead of fallback to libc
*/
-#if NPY_SIMD_F32 && NPY_SIMD_FMA3 // native support
+#if NPY_SIMD_FMA3 // native support
+/**begin repeat
+ * #check = F64, F32#
+ * #sfx = f64, f32#
+ */
+#if NPY_SIMD_@check@
/*
* Vectorized Cody-Waite range reduction technique
* Performs the reduction step x* = x - y*C in three steps:
@@ -23,14 +29,189 @@
* 3) x* = x - y*c3
* c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
*/
-NPY_FINLINE npyv_f32
-simd_range_reduction_f32(npyv_f32 x, npyv_f32 y, npyv_f32 c1, npyv_f32 c2, npyv_f32 c3)
+NPY_FINLINE npyv_@sfx@
+simd_range_reduction_@sfx@(npyv_@sfx@ x, npyv_@sfx@ y, npyv_@sfx@ c1, npyv_@sfx@ c2, npyv_@sfx@ c3)
{
- npyv_f32 reduced_x = npyv_muladd_f32(y, c1, x);
- reduced_x = npyv_muladd_f32(y, c2, reduced_x);
- reduced_x = npyv_muladd_f32(y, c3, reduced_x);
+ npyv_@sfx@ reduced_x = npyv_muladd_@sfx@(y, c1, x);
+ reduced_x = npyv_muladd_@sfx@(y, c2, reduced_x);
+ reduced_x = npyv_muladd_@sfx@(y, c3, reduced_x);
return reduced_x;
}
+#endif
+/**end repeat**/
+
+#if NPY_SIMD_F64
+/**begin repeat
+ * #op = cos, sin#
+ */
+#if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN)
+NPY_FINLINE npyv_f64
+#else
+NPY_NOINLINE npyv_f64
+#endif
+simd_@op@_scalar_f64(npyv_f64 out, npy_uint64 cmp_bits)
+{
+ // MSVC doesn't compile with direct vector access, so we copy it here
+ // as we have no npyv_get_lane/npyv_set_lane intrinsics
+ npy_double NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) out_copy[npyv_nlanes_f64];
+ npyv_storea_f64(out_copy, out);
+
+ for (unsigned i = 0; i < npyv_nlanes_f64; ++i) {
+ if (cmp_bits & (1 << i)) {
+ out_copy[i] = npy_@op@(out_copy[i]);
+ }
+ }
+
+ return npyv_loada_f64(out_copy);
+}
+/**end repeat**/
+
+/*
+ * Approximate sine algorithm for x \in [-pi/2, pi/2]
+ * worst-case error is 3.5 ulp.
+ * abs error: 0x1.be222a58p-53 in [-pi/2, pi/2].
+ */
+NPY_FINLINE npyv_f64
+simd_approx_sine_poly_f64(npyv_f64 r)
+{
+ const npyv_f64 poly1 = npyv_setall_f64(-0x1.9f4a9c8b21dc9p-41);
+ const npyv_f64 poly2 = npyv_setall_f64(0x1.60e88a10163f2p-33);
+ const npyv_f64 poly3 = npyv_setall_f64(-0x1.ae6361b7254e7p-26);
+ const npyv_f64 poly4 = npyv_setall_f64(0x1.71de382e8d62bp-19);
+ const npyv_f64 poly5 = npyv_setall_f64(-0x1.a01a019aeb4ffp-13);
+ const npyv_f64 poly6 = npyv_setall_f64(0x1.111111110b25ep-7);
+ const npyv_f64 poly7 = npyv_setall_f64(-0x1.55555555554c3p-3);
+
+ npyv_f64 r2 = npyv_mul_f64(r, r);
+ npyv_f64 y = npyv_muladd_f64(poly1, r2, poly2);
+ y = npyv_muladd_f64(y, r2, poly3);
+ y = npyv_muladd_f64(y, r2, poly4);
+ y = npyv_muladd_f64(y, r2, poly5);
+ y = npyv_muladd_f64(y, r2, poly6);
+ y = npyv_muladd_f64(y, r2, poly7);
+ y = npyv_muladd_f64(npyv_mul_f64(y, r2), r, r);
+
+ return y;
+}
+
+/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+NPY_FINLINE npyv_f64
+simd_range_reduction_pi2(npyv_f64 r, npyv_f64 n) {
+ const npyv_f64 pi1 = npyv_setall_f64(-0x1.921fb54442d18p+1);
+ const npyv_f64 pi2 = npyv_setall_f64(-0x1.1a62633145c06p-53);
+ const npyv_f64 pi3 = npyv_setall_f64(-0x1.c1cd129024e09p-106);
+
+ return simd_range_reduction_f64(r, n, pi1, pi2, pi3);
+}
+
+NPY_FINLINE npyv_b64 simd_sin_range_check_f64(npyv_u64 ir) {
+ const npyv_u64 tiny_bound = npyv_setall_u64(0x202); /* top12 (asuint64 (0x1p-509)). */
+ const npyv_u64 simd_thresh = npyv_setall_u64(0x214); /* top12 (asuint64 (RangeVal)) - SIMD_TINY_BOUND. */
+
+ return npyv_cmpge_u64(npyv_sub_u64(npyv_shri_u64(ir, 52), tiny_bound), simd_thresh);
+}
+
+NPY_FINLINE npyv_b64 simd_cos_range_check_f64(npyv_u64 ir) {
+ const npyv_f64 range_val = npyv_setall_f64(0x1p23);
+
+ return npyv_cmpge_u64(ir, npyv_reinterpret_u64_f64(range_val));
+}
+
+NPY_FINLINE npyv_f64
+simd_cos_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
+{
+ const npyv_f64 inv_pi = npyv_setall_f64(0x1.45f306dc9c883p-2);
+ const npyv_f64 half_pi = npyv_setall_f64(0x1.921fb54442d18p+0);
+ const npyv_f64 shift = npyv_setall_f64(0x1.8p52);
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
+ npyv_f64 n = npyv_muladd_f64(inv_pi, npyv_add_f64(r, half_pi), shift);
+ npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63);
+ n = npyv_sub_f64(n, shift);
+ n = npyv_sub_f64(n, npyv_setall_f64(0.5));
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = simd_range_reduction_pi2(r, n);
+
+ /* sin(r) poly approx. */
+ npyv_f64 y = simd_approx_sine_poly_f64(r);
+
+ /* sign. */
+ return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), odd));
+}
+
+NPY_FINLINE npyv_f64
+simd_sin_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
+{
+ const npyv_f64 inv_pi = npyv_setall_f64(0x1.45f306dc9c883p-2);
+ const npyv_f64 shift = npyv_setall_f64(0x1.8p52);
+
+ /* n = rint(|x|/pi). */
+ npyv_f64 n = npyv_muladd_f64(inv_pi, r, shift);
+ npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63);
+ n = npyv_sub_f64(n, shift);
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = simd_range_reduction_pi2(r, n);
+
+ /* sin(r) poly approx. */
+ npyv_f64 y = simd_approx_sine_poly_f64(r);
+
+ /* sign. */
+ return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), sign), odd));
+}
+
+/**begin repeat
+ * #op = cos, sin#
+ */
+NPY_FINLINE void
+simd_@op@_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len)
+{
+ const npyv_u64 abs_mask = npyv_setall_u64(0x7fffffffffffffff);
+ const int vstep = npyv_nlanes_f64;
+
+ npyv_f64 out = npyv_zero_f64();
+ npyv_f64 x_in;
+
+ for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ if (ssrc == 1) {
+ x_in = npyv_load_tillz_f64(src, len);
+ } else {
+ x_in = npyv_loadn_tillz_f64(src, ssrc, len);
+ }
+
+ npyv_u64 ir = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), abs_mask);
+ npyv_f64 r = npyv_reinterpret_f64_u64(ir);
+ npyv_u64 sign = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), npyv_not_u64(abs_mask));
+
+ npyv_b64 cmp = simd_@op@_range_check_f64(ir);
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+ scalar loop later. */
+ r = npyv_select_f64(cmp, npyv_setall_f64(1.0), r);
+
+ // Some in range, at least one calculation is useful
+ if (!npyv_all_b64(cmp)) {
+ out = simd_@op@_poly_f64(r, ir, sign);
+ }
+
+ if (npyv_any_b64(cmp)) {
+ out = npyv_select_f64(cmp, x_in, out);
+ out = simd_@op@_scalar_f64(out, npyv_tobits_b64(cmp));
+ }
+
+ if (sdst == 1) {
+ npyv_store_till_f64(dst, len, out);
+ } else {
+ npyv_storen_till_f64(dst, sdst, len, out);
+ }
+ }
+ npyv_cleanup();
+}
+/**end repeat**/
+#endif // NPY_SIMD_F64
+
+#if NPY_SIMD_F32
/*
* Approximate cosine algorithm for x \in [-PI/4, PI/4]
* Maximum ULP across all 32-bit floats = 0.875
@@ -124,6 +305,11 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
} else {
x_in = npyv_loadn_tillz_f32(src, ssrc, len);
}
+ npyv_b32 nnan_mask = npyv_notnan_f32(x_in);
+ #if NPY_SIMD_CMPSIGNAL
+ // Eliminate NaN to avoid FP invalid exception
+ x_in = npyv_and_f32(x_in, npyv_reinterpret_f32_u32(npyv_cvt_u32_b32(nnan_mask)));
+ #endif
npyv_b32 simd_mask = npyv_cmple_f32(npyv_abs_f32(x_in), max_cody);
npy_uint64 simd_maski = npyv_tobits_b32(simd_mask);
/*
@@ -132,7 +318,6 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
* these numbers
*/
if (simd_maski != 0) {
- npyv_b32 nnan_mask = npyv_notnan_f32(x_in);
npyv_f32 x = npyv_select_f32(npyv_and_b32(nnan_mask, simd_mask), x_in, zerosf);
npyv_f32 quadrant = npyv_mul_f32(x, two_over_pi);
@@ -169,7 +354,7 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
npyv_storen_till_f32(dst, sdst, len, cos);
}
}
- if (simd_maski != ((1 << vstep) - 1)) {
+ if (simd_maski != (npy_uint64)((1 << vstep) - 1)) {
float NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) ip_fback[npyv_nlanes_f32];
npyv_storea_f32(ip_fback, x_in);
@@ -194,24 +379,58 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
}
npyv_cleanup();
}
-#endif // NPY_SIMD_FMA3
+#endif // NPY_SIMD_FP32
+#endif // NYP_SIMD_FMA3
/**begin repeat
* #func = cos, sin#
- * #enum = SIMD_COMPUTE_COS, SIMD_COMPUTE_SIN#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD_F64 && NPY_SIMD_FMA3
+ const double *src = (double*)args[0];
+ double *dst = (double*)args[1];
+ const int lsize = sizeof(src[0]);
+ const npy_intp ssrc = steps[0] / lsize;
+ const npy_intp sdst = steps[1] / lsize;
+ npy_intp len = dimensions[0];
+ assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+
+ if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
+ !npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst)
+ ) {
+ for (; len > 0; --len, src += ssrc, dst += sdst) {
+ simd_@func@_f64(src, 1, dst, 1, 1);
+ }
+ } else {
+ simd_@func@_f64(src, ssrc, dst, sdst, len);
+ }
+#else
+ UNARY_LOOP {
+ const npy_double in1 = *(npy_double *)ip1;
+ *(npy_double *)op1 = npy_@func@(in1);
+ }
+#endif
+}
+/**end repeat**/
+
+/**begin repeat
+ * #func = sin, cos#
+ * #enum = SIMD_COMPUTE_SIN, SIMD_COMPUTE_COS#
*/
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
- const float *src = (float*)args[0];
- float *dst = (float*)args[1];
+#if NPY_SIMD_F32 && NPY_SIMD_FMA3
+ const npy_float *src = (npy_float*)args[0];
+ npy_float *dst = (npy_float*)args[1];
const int lsize = sizeof(src[0]);
const npy_intp ssrc = steps[0] / lsize;
const npy_intp sdst = steps[1] / lsize;
npy_intp len = dimensions[0];
assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
-#if NPY_SIMD_F32 && NPY_SIMD_FMA3
if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
!npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)
) {
@@ -222,9 +441,9 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@)
simd_sincos_f32(src, ssrc, dst, sdst, len, @enum@);
}
#else
- for (; len > 0; --len, src += ssrc, dst += sdst) {
- const float src0 = *src;
- *dst = npy_@func@f(src0);
+ UNARY_LOOP {
+ const npy_float in1 = *(npy_float *)ip1;
+ *(npy_float *)op1 = npy_@func@f(in1);
}
#endif
}
diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
index 46ce51824..89999e879 100644
--- a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
@@ -12,10 +12,12 @@
/**begin repeat
* #sfx = f32, f64#
* #func_suffix = f16, 8#
+ * #len = 32, 64#
*/
/**begin repeat1
* #func = exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
* #default_val = 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0#
+ * #fxnan = 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0#
*/
static void
simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc,
@@ -37,7 +39,15 @@ simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc,
x = npyv_loadn_tillz_@sfx@(src, ssrc, len);
}
#endif
+ #if @fxnan@ == @len@
+ // Workaround, invalid value encountered when x is set to nan
+ npyv_b@len@ nnan_mask = npyv_notnan_@sfx@(x);
+ npyv_@sfx@ x_exnan = npyv_select_@sfx@(nnan_mask, x, npyv_setall_@sfx@(@default_val@));
+ npyv_@sfx@ out = __svml_@func@@func_suffix@(x_exnan);
+ out = npyv_select_@sfx@(nnan_mask, out, x);
+ #else
npyv_@sfx@ out = __svml_@func@@func_suffix@(x);
+ #endif
if (sdst == 1) {
npyv_store_till_@sfx@(dst, len, out);
} else {
@@ -50,32 +60,6 @@ simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc,
/**end repeat**/
/**begin repeat
- * #func = sin, cos#
- */
-static void
-simd_@func@_f64(const double *src, npy_intp ssrc,
- double *dst, npy_intp sdst, npy_intp len)
-{
- const int vstep = npyv_nlanes_f64;
- for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
- npyv_f64 x;
- if (ssrc == 1) {
- x = npyv_load_tillz_f64(src, len);
- } else {
- x = npyv_loadn_tillz_f64(src, ssrc, len);
- }
- npyv_f64 out = __svml_@func@8(x);
- if (sdst == 1) {
- npyv_store_till_f64(dst, len, out);
- } else {
- npyv_storen_till_f64(dst, sdst, len, out);
- }
- }
- npyv_cleanup();
-}
-/**end repeat**/
-
-/**begin repeat
* #sfx = f32, f64#
* #func_suffix = f16, 8#
*/
@@ -267,31 +251,3 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
}
/**end repeat1**/
/**end repeat**/
-
-/**begin repeat
- * #func = sin, cos#
- */
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
-(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
- const double *src = (double*)args[0];
- double *dst = (double*)args[1];
- const int lsize = sizeof(src[0]);
- const npy_intp ssrc = steps[0] / lsize;
- const npy_intp sdst = steps[1] / lsize;
- const npy_intp len = dimensions[0];
- assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
- if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
- npyv_loadable_stride_f64(ssrc) &&
- npyv_storable_stride_f64(sdst)) {
- simd_@func@_f64(src, ssrc, dst, sdst, len);
- return;
- }
-#endif
- UNARY_LOOP {
- const npy_double in1 = *(npy_double *)ip1;
- *(npy_double *)op1 = npy_@func@(in1);
- }
-}
-/**end repeat**/
diff --git a/numpy/core/src/umath/loops_unary_complex.dispatch.c.src b/numpy/core/src/umath/loops_unary_complex.dispatch.c.src
new file mode 100644
index 000000000..052ad464c
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary_complex.dispatch.c.src
@@ -0,0 +1,139 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 (avx2 fma3) avx512f
+ ** neon asimd
+ ** vsx2 vsx3
+ ** vx vxe
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #sfx = f32, f64#
+ * #bsfx = b32, b64#
+ * #usfx = b32, u64#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_double = 0, 1#
+ * #c = f, #
+ * #INF = NPY_INFINITYF, NPY_INFINITY#
+ * #NAN = NPY_NANF, NPY_NAN#
+ */
+#if @VECTOR@
+NPY_FINLINE npyv_@sfx@
+simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im)
+{
+ const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@);
+ const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@);
+
+ re = npyv_abs_@sfx@(re);
+ im = npyv_abs_@sfx@(im);
+ /*
+ * If real or imag = INF, then convert it to inf + j*inf
+ * Handles: inf + j*nan, nan + j*inf
+ */
+ npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf);
+ npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf);
+ im = npyv_select_@sfx@(re_infmask, inf, im);
+ re = npyv_select_@sfx@(im_infmask, inf, re);
+ /*
+ * If real or imag = NAN, then convert it to nan + j*nan
+ * Handles: x + j*nan, nan + j*x
+ */
+ npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re);
+ npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im);
+ im = npyv_select_@sfx@(re_nnanmask, im, nan);
+ re = npyv_select_@sfx@(im_nnanmask, re, nan);
+
+ npyv_@sfx@ larger = npyv_max_@sfx@(re, im);
+ npyv_@sfx@ smaller = npyv_min_@sfx@(im, re);
+ /*
+ * Calculate div_mask to prevent 0./0. and inf/inf operations in div
+ */
+ npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@());
+ npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf);
+ npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask));
+
+ npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger);
+ npyv_@sfx@ hypot = npyv_sqrt_@sfx@(
+ npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@)
+ ));
+ return npyv_mul_@sfx@(hypot, larger);
+}
+#endif // VECTOR
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * complex types
+ * #TYPE = CFLOAT, CDOUBLE#
+ * #ftype = npy_float, npy_double#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #sfx = f32, f64#
+ * #c = f, #
+ * #C = F, #
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if @VECTOR@
+ npy_intp len = dimensions[0];
+ npy_intp ssrc = steps[0] / sizeof(@ftype@);
+ npy_intp sdst = steps[1] / sizeof(@ftype@);
+
+ if (!is_mem_overlap(args[0], steps[0], args[1], steps[1], len) &&
+ npyv_loadable_stride_@sfx@(ssrc) && npyv_storable_stride_@sfx@(sdst)
+ && steps[0] % sizeof(@ftype@) == 0
+ && steps[1] % sizeof(@ftype@) == 0
+ ) {
+ const @ftype@ *src = (@ftype@*)args[0];
+ @ftype@ *dst = (@ftype@*)args[1];
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * 2;
+ const int hstep = vstep / 2;
+
+ if (ssrc == 2 && sdst == 1) {
+ for (; len >= vstep; len -= vstep, src += wstep, dst += vstep) {
+ npyv_@sfx@x2 ab = npyv_load_@sfx@x2(src);
+ npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]);
+ npyv_store_@sfx@(dst, r);
+ }
+ }
+ else {
+ for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ npyv_@sfx@ re_im0 = npyv_loadn2_@sfx@(src, ssrc);
+ npyv_@sfx@ re_im1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc);
+ npyv_@sfx@x2 ab = npyv_unzip_@sfx@(re_im0, re_im1);
+ npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]);
+ npyv_storen_@sfx@(dst, sdst, r);
+ }
+ }
+ for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ npyv_@sfx@ rl = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+ npyv_@sfx@ im = npyv_loadn_tillz_@sfx@(src + 1, ssrc, len);
+ npyv_@sfx@ r = simd_cabsolute_@sfx@(rl, im);
+ npyv_storen_till_@sfx@(dst, sdst, len, r);
+ }
+ npyv_cleanup();
+ npy_clear_floatstatus_barrier((char*)&len);
+ return;
+ }
+#endif
+ UNARY_LOOP {
+ const @ftype@ re = ((@ftype@ *)ip1)[0];
+ const @ftype@ im = ((@ftype@ *)ip1)[1];
+ *((@ftype@ *)op1) = npy_hypot@c@(re, im);
+ }
+}
+/**end repeat**/
diff --git a/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src
new file mode 100644
index 000000000..ba133dc1e
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src
@@ -0,0 +1,565 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 sse41
+ ** vsx2
+ ** neon asimd
+ **/
+
+/**
+ * Force use SSE only on x86, even if AVX2 or AVX512F are enabled
+ * through the baseline, since scatter(AVX512F) and gather very costly
+ * to handle non-contiguous memory access comparing with SSE for
+ * such small operations that this file covers.
+ */
+#define NPY_SIMD_FORCE_128
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include <float.h>
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/**
+ * This code should really be merged into loops_unary_fp.dispatch.c.src
+ * However there is an issue with enabling the code here for VX and VXE
+ * as the shifts don't behave as expected.
+ * See the code below that references NPY__CPU_TARGET_VX and
+ * NPY_BIG_ENDIAN. Suspect that this is a big endian vector issue.
+ *
+ * Splitting the files out allows us to keep loops_unary_fp.dispatch.c.src
+ * building for VX and VXE so we don't regress performance while adding this
+ * code for other platforms.
+ */
+// TODO(@seiko2plus): add support for big-endian
+#if NPY_SIMD_BIGENDIAN
+ #undef NPY_SIMD
+ #undef NPY_SIMD_F32
+ #undef NPY_SIMD_F64
+ #define NPY_SIMD 0
+ #define NPY_SIMD_F32 0
+ #define NPY_SIMD_F64 0
+#endif
+
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+
+/**
+ * We define intrinsics for isnan, isinf, isfinite, and signbit below. There's
+ * a few flavors of each. We'll use f32 as an example although f64 versions
+ * are also defined.
+ *
+ * npyv_u32 npyv_KIND_f32(npyv_f32 v)
+ * These are mainly used for the single vector loops. As such, result should
+ * be bool true / false, ready to write back.
+ *
+ * npyv_b32 _npyv_KIND_f32(npyv_f32 v)
+ * These are used by the geneal intrinsics above as well as the multi-vector
+ * packing intrinsics. The multi-vector packing intrinsics are the ones
+ * utilized in the unrolled vector loops. Results should be vector masks
+ * of 0x00/0xff.
+ *
+ * npyv_u8 npyv_pack_KIND_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+ * These are the multi-vector packing intrinsics utilized by unrolled vector
+ * loops. They perform the operation on all input vectors and pack the
+ * results to a single npyv_u8. Assuming NPY_SIMD == 128, that means we
+ * can pack results from 4x npyv_f32 or 8x npyv_64 in a single npyv_u8.
+ * Result should be bool true / false, ready to write back.
+ */
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_u32
+npyv_isnan_f32(npyv_f32 v)
+{
+ const npyv_u8 truemask = npyv_reinterpret_u8_u32(npyv_setall_u32(1==1));
+ npyv_u8 notnan = npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notnan_f32(v)));
+ return npyv_reinterpret_u32_u8(npyv_andc_u8(truemask, notnan));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isnan_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+ const npyv_u8 truemask = npyv_setall_u8(1==1);
+ npyv_b32 b0 = npyv_notnan_f32(v0);
+ npyv_b32 b1 = npyv_notnan_f32(v1);
+ npyv_b32 b2 = npyv_notnan_f32(v2);
+ npyv_b32 b3 = npyv_notnan_f32(v3);
+ npyv_b8 notnan = npyv_pack_b8_b32(b0, b1, b2, b3);
+ return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan));
+}
+#endif
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_u64
+npyv_isnan_f64(npyv_f64 v)
+{
+ const npyv_u8 truemask = npyv_reinterpret_u8_u64(npyv_setall_u64(1==1));
+ npyv_u8 notnan = npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notnan_f64(v)));
+ return npyv_reinterpret_u64_u8(npyv_andc_u8(truemask, notnan));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isnan_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+ npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+ const npyv_u8 truemask = npyv_setall_u8(1==1);
+ npyv_b64 b0 = npyv_notnan_f64(v0);
+ npyv_b64 b1 = npyv_notnan_f64(v1);
+ npyv_b64 b2 = npyv_notnan_f64(v2);
+ npyv_b64 b3 = npyv_notnan_f64(v3);
+ npyv_b64 b4 = npyv_notnan_f64(v4);
+ npyv_b64 b5 = npyv_notnan_f64(v5);
+ npyv_b64 b6 = npyv_notnan_f64(v6);
+ npyv_b64 b7 = npyv_notnan_f64(v7);
+ npyv_b8 notnan = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7);
+ return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan));
+}
+#endif
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_b32
+_npyv_isinf_f32(npyv_f32 v)
+{
+#if defined(NPY_HAVE_NEON)
+ // abs(v) > FLT_MAX
+ const npyv_f32 fltmax = npyv_setall_f32(FLT_MAX);
+ return vcagtq_f32(v, fltmax);
+#else
+ // cast out the sign and check if all exponent bits are set.
+ const npyv_u32 exp_mask = npyv_setall_u32(0xff000000);
+ npyv_u32 bits = npyv_shli_u32(npyv_reinterpret_u32_f32(v), 1);
+ return npyv_cmpeq_u32(bits, exp_mask);
+#endif
+}
+NPY_FINLINE npyv_u32
+npyv_isinf_f32(npyv_f32 v)
+{
+ const npyv_u32 truemask = npyv_setall_u32(1==1);
+ return npyv_and_u32(truemask, npyv_cvt_u32_b32(_npyv_isinf_f32(v)));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isinf_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+ const npyv_u8 truemask = npyv_setall_u8(1==1);
+ npyv_b32 b0 = _npyv_isinf_f32(v0);
+ npyv_b32 b1 = _npyv_isinf_f32(v1);
+ npyv_b32 b2 = _npyv_isinf_f32(v2);
+ npyv_b32 b3 = _npyv_isinf_f32(v3);
+ npyv_b8 isinf = npyv_pack_b8_b32(b0, b1, b2, b3);
+ return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf));
+}
+#endif // NPY_SIMD_F32
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_b64
+_npyv_isinf_f64(npyv_f64 v)
+{
+#if defined(NPY_HAVE_NEON)
+ // abs(v) > DBL_MAX
+ const npyv_f64 fltmax = npyv_setall_f64(DBL_MAX);
+ return vcagtq_f64(v, fltmax);
+#else
+ // cast out the sign and check if all exponent bits are set.
+ const npyv_u64 exp_mask = npyv_setall_u64(0xffe0000000000000);
+ npyv_u64 bits = npyv_shli_u64(npyv_reinterpret_u64_f64(v), 1);
+ return npyv_cmpeq_u64(bits, exp_mask);
+#endif
+}
+NPY_FINLINE npyv_u64
+npyv_isinf_f64(npyv_f64 v)
+{
+ const npyv_u64 truemask = npyv_setall_u64(1==1);
+ return npyv_and_u64(truemask, npyv_cvt_u64_b64(_npyv_isinf_f64(v)));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isinf_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+ npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+ const npyv_u8 truemask = npyv_setall_u8(1==1);
+ npyv_b64 b0 = _npyv_isinf_f64(v0);
+ npyv_b64 b1 = _npyv_isinf_f64(v1);
+ npyv_b64 b2 = _npyv_isinf_f64(v2);
+ npyv_b64 b3 = _npyv_isinf_f64(v3);
+ npyv_b64 b4 = _npyv_isinf_f64(v4);
+ npyv_b64 b5 = _npyv_isinf_f64(v5);
+ npyv_b64 b6 = _npyv_isinf_f64(v6);
+ npyv_b64 b7 = _npyv_isinf_f64(v7);
+ npyv_b8 isinf = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7);
+ return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf));
+}
+#endif // NPY_SIMD_F64
+
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_b32
+npyv_notfinite_f32(npyv_f32 v)
+{
+ // cast out the sign and check if all exponent bits are set
+ // no matter the mentissa is.
+ const npyv_u32 exp_mask = npyv_setall_u32(0x7f800000);
+ npyv_u32 bits = npyv_reinterpret_u32_f32(v);
+ bits = npyv_and_u32(bits, exp_mask);
+ return npyv_cmpeq_u32(bits, exp_mask);
+}
+NPY_FINLINE npyv_u32
+npyv_isfinite_f32(npyv_f32 v)
+{
+ const npyv_u8 truemask = npyv_reinterpret_u8_u32(npyv_setall_u32(1==1));
+ npyv_u8 notfinite = npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notfinite_f32(v)));
+ return npyv_reinterpret_u32_u8(npyv_andc_u8(truemask, notfinite));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isfinite_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+ // F32 exponent is 8-bits, which means we can pack multiple into
+ // a single vector. We shift out sign bit so that we're left
+ // with only exponent in high byte. If not all bits are set,
+ // then we've got a finite number.
+ uint8x16x4_t tbl;
+ tbl.val[0] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v0), 1));
+ tbl.val[1] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v1), 1));
+ tbl.val[2] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v2), 1));
+ tbl.val[3] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v3), 1));
+
+ const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
+ npyv_u8 r = vqtbl4q_u8(tbl, permute);
+
+ const npyv_u8 expmask = npyv_setall_u8(0xff);
+ r = npyv_cmpneq_u8(r, expmask);
+ r = vshrq_n_u8(r, 7);
+ return r;
+#else
+ const npyv_u8 truemask = npyv_setall_u8(1==1);
+ npyv_b32 b0 = npyv_notfinite_f32(v0);
+ npyv_b32 b1 = npyv_notfinite_f32(v1);
+ npyv_b32 b2 = npyv_notfinite_f32(v2);
+ npyv_b32 b3 = npyv_notfinite_f32(v3);
+ npyv_b8 notfinite = npyv_pack_b8_b32(b0, b1, b2, b3);
+ return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite));
+#endif
+}
+#endif // NPY_SIMD_F32
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_b64
+npyv_notfinite_f64(npyv_f64 v)
+{
+ // cast out the sign and check if all exponent bits are set
+ // no matter the mantissa is.
+ const npyv_u64 exp_mask = npyv_setall_u64(0x7ff0000000000000);
+ npyv_u64 bits = npyv_reinterpret_u64_f64(v);
+ bits = npyv_and_u64(bits, exp_mask);
+ return npyv_cmpeq_u64(bits, exp_mask);
+}
+NPY_FINLINE npyv_u64
+npyv_isfinite_f64(npyv_f64 v)
+{
+ const npyv_u8 truemask = npyv_reinterpret_u8_u64(npyv_setall_u64(1==1));
+ npyv_u8 notfinite = npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notfinite_f64(v)));
+ return npyv_reinterpret_u64_u8(npyv_andc_u8(truemask, notfinite));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isfinite_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+ npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+ // F64 exponent is 11-bits, which means we can pack multiple into
+ // a single vector. We'll need to use u16 to fit all exponent
+ // bits. If not all bits are set, then we've got a finite number.
+ uint8x16x4_t t0123, t4567;
+ t0123.val[0] = npyv_reinterpret_u8_f64(v0);
+ t0123.val[1] = npyv_reinterpret_u8_f64(v1);
+ t0123.val[2] = npyv_reinterpret_u8_f64(v2);
+ t0123.val[3] = npyv_reinterpret_u8_f64(v3);
+ t4567.val[0] = npyv_reinterpret_u8_f64(v4);
+ t4567.val[1] = npyv_reinterpret_u8_f64(v5);
+ t4567.val[2] = npyv_reinterpret_u8_f64(v6);
+ t4567.val[3] = npyv_reinterpret_u8_f64(v7);
+
+ const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63};
+ npyv_u16 r0 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t0123, permute));
+ npyv_u16 r1 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t4567, permute));
+
+ const npyv_u16 expmask = npyv_setall_u16(0x7ff0);
+ r0 = npyv_and_u16(r0, expmask);
+ r0 = npyv_cmpneq_u16(r0, expmask);
+ r0 = npyv_shri_u16(r0, 15);
+ r1 = npyv_and_u16(r1, expmask);
+ r1 = npyv_cmpneq_u16(r1, expmask);
+ r1 = npyv_shri_u16(r1, 15);
+
+ npyv_u8 r = npyv_pack_b8_b16(r0, r1);
+ return r;
+#else
+ const npyv_u8 truemask = npyv_setall_u8(1==1);
+ npyv_b64 b0 = npyv_notfinite_f64(v0);
+ npyv_b64 b1 = npyv_notfinite_f64(v1);
+ npyv_b64 b2 = npyv_notfinite_f64(v2);
+ npyv_b64 b3 = npyv_notfinite_f64(v3);
+ npyv_b64 b4 = npyv_notfinite_f64(v4);
+ npyv_b64 b5 = npyv_notfinite_f64(v5);
+ npyv_b64 b6 = npyv_notfinite_f64(v6);
+ npyv_b64 b7 = npyv_notfinite_f64(v7);
+ npyv_b8 notfinite = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7);
+ return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite));
+#endif
+}
+#endif // NPY_SIMD_F64
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_u32
+npyv_signbit_f32(npyv_f32 v)
+{
+ return npyv_shri_u32(npyv_reinterpret_u32_f32(v), (sizeof(npyv_lanetype_f32)*8)-1);
+}
+NPY_FINLINE npyv_u8
+npyv_pack_signbit_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+ // We only need high byte for signbit, which means we can pack
+ // multiple inputs into a single vector.
+ uint8x16x4_t tbl;
+ tbl.val[0] = npyv_reinterpret_u8_f32(v0);
+ tbl.val[1] = npyv_reinterpret_u8_f32(v1);
+ tbl.val[2] = npyv_reinterpret_u8_f32(v2);
+ tbl.val[3] = npyv_reinterpret_u8_f32(v3);
+
+ const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
+ npyv_u8 r = vqtbl4q_u8(tbl, permute);
+ r = vshrq_n_u8(r, 7);
+ return r;
+#else
+ npyv_b32 b0 = npyv_cvt_b32_u32(npyv_signbit_f32(v0));
+ npyv_b32 b1 = npyv_cvt_b32_u32(npyv_signbit_f32(v1));
+ npyv_b32 b2 = npyv_cvt_b32_u32(npyv_signbit_f32(v2));
+ npyv_b32 b3 = npyv_cvt_b32_u32(npyv_signbit_f32(v3));
+ npyv_b8 signbit = npyv_pack_b8_b32(b0, b1, b2, b3);
+ return npyv_cvt_u8_b8(signbit);
+#endif
+}
+#endif // NPY_SIMD_F32
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_u64
+npyv_signbit_f64(npyv_f64 v)
+{
+ return npyv_shri_u64(npyv_reinterpret_u64_f64(v), (sizeof(npyv_lanetype_f64)*8)-1);
+}
+NPY_FINLINE npyv_u8
+npyv_pack_signbit_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+ npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+ // We only need high byte for signbit, which means we can pack
+ // multiple inputs into a single vector.
+
+ // vuzp2 faster than vtbl for f64
+ npyv_u32 v01 = vuzp2q_u32(npyv_reinterpret_u32_f64(v0), npyv_reinterpret_u32_f64(v1));
+ npyv_u32 v23 = vuzp2q_u32(npyv_reinterpret_u32_f64(v2), npyv_reinterpret_u32_f64(v3));
+ npyv_u32 v45 = vuzp2q_u32(npyv_reinterpret_u32_f64(v4), npyv_reinterpret_u32_f64(v5));
+ npyv_u32 v67 = vuzp2q_u32(npyv_reinterpret_u32_f64(v6), npyv_reinterpret_u32_f64(v7));
+
+ npyv_u16 v0123 = vuzp2q_u16(npyv_reinterpret_u16_u32(v01), npyv_reinterpret_u16_u32(v23));
+ npyv_u16 v4567 = vuzp2q_u16(npyv_reinterpret_u16_u32(v45), npyv_reinterpret_u16_u32(v67));
+
+ npyv_u8 r = vuzp2q_u8(npyv_reinterpret_u8_u16(v0123), npyv_reinterpret_u8_u16(v4567));
+ r = vshrq_n_u8(r, 7);
+ return r;
+#else
+ npyv_b64 b0 = npyv_cvt_b64_u64(npyv_signbit_f64(v0));
+ npyv_b64 b1 = npyv_cvt_b64_u64(npyv_signbit_f64(v1));
+ npyv_b64 b2 = npyv_cvt_b64_u64(npyv_signbit_f64(v2));
+ npyv_b64 b3 = npyv_cvt_b64_u64(npyv_signbit_f64(v3));
+ npyv_b64 b4 = npyv_cvt_b64_u64(npyv_signbit_f64(v4));
+ npyv_b64 b5 = npyv_cvt_b64_u64(npyv_signbit_f64(v5));
+ npyv_b64 b6 = npyv_cvt_b64_u64(npyv_signbit_f64(v6));
+ npyv_b64 b7 = npyv_cvt_b64_u64(npyv_signbit_f64(v7));
+ npyv_b8 signbit = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7);
+ return npyv_cvt_u8_b8(signbit);
+#endif
+}
+#endif // NPY_SIMD_F64
+
+#endif // NPY_SIMD
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/** Notes:
+ * - avoid the use of libmath to unify fp/domain errors
+ * for both scalars and vectors among all compilers/architectures.
+ * - use intrinsic npyv_load_till_* instead of npyv_load_tillz_
+ * to fill the remind lanes with 1.0 to avoid divide by zero fp
+ * exception in reciprocal.
+ */
+#define CONTIG 0
+#define NCONTIG 1
+
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #ssfx = 32, 64#
+ */
+#if @VCHK@
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ */
+/**begin repeat2
+ * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
+ * #DTYPE = CONTIG, CONTIG, NCONTIG, NCONTIG#
+ */
+static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+ const npyv_lanetype_@sfx@ *ip = src;
+ npy_bool *op = dst;
+
+ // How many vectors can be packed into a u8 / bool vector?
+ #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_@sfx@)
+ assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * PACK_FACTOR;
+
+ // unrolled iterations
+ for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+ // Load vectors
+ #if @STYPE@ == CONTIG
+ // contiguous input
+ npyv_@sfx@ v0 = npyv_load_@sfx@(ip + vstep * 0);
+ npyv_@sfx@ v1 = npyv_load_@sfx@(ip + vstep * 1);
+ npyv_@sfx@ v2 = npyv_load_@sfx@(ip + vstep * 2);
+ npyv_@sfx@ v3 = npyv_load_@sfx@(ip + vstep * 3);
+ #if PACK_FACTOR == 8
+ npyv_@sfx@ v4 = npyv_load_@sfx@(ip + vstep * 4);
+ npyv_@sfx@ v5 = npyv_load_@sfx@(ip + vstep * 5);
+ npyv_@sfx@ v6 = npyv_load_@sfx@(ip + vstep * 6);
+ npyv_@sfx@ v7 = npyv_load_@sfx@(ip + vstep * 7);
+ #endif
+ #else
+ // non-contiguous input
+ npyv_@sfx@ v0 = npyv_loadn_@sfx@(ip + istride * vstep * 0, istride);
+ npyv_@sfx@ v1 = npyv_loadn_@sfx@(ip + istride * vstep * 1, istride);
+ npyv_@sfx@ v2 = npyv_loadn_@sfx@(ip + istride * vstep * 2, istride);
+ npyv_@sfx@ v3 = npyv_loadn_@sfx@(ip + istride * vstep * 3, istride);
+ #if PACK_FACTOR == 8
+ npyv_@sfx@ v4 = npyv_loadn_@sfx@(ip + istride * vstep * 4, istride);
+ npyv_@sfx@ v5 = npyv_loadn_@sfx@(ip + istride * vstep * 5, istride);
+ npyv_@sfx@ v6 = npyv_loadn_@sfx@(ip + istride * vstep * 6, istride);
+ npyv_@sfx@ v7 = npyv_loadn_@sfx@(ip + istride * vstep * 7, istride);
+ #endif
+ #endif
+
+ #if PACK_FACTOR == 4
+ npyv_u8 r = npyv_pack_@kind@_@sfx@(v0, v1, v2, v3);
+ #elif PACK_FACTOR == 8
+ npyv_u8 r = npyv_pack_@kind@_@sfx@(v0, v1, v2, v3, v4, v5, v6, v7);
+ #endif
+
+ #if @DTYPE@ == CONTIG
+ npyv_store_u8(op, r);
+ #else // @DTYPE@ == CONTIG
+ // Results are packed, so we can just loop over them
+ npy_uint8 lane[npyv_nlanes_u8];
+ npyv_store_u8(lane, r);
+ for (int ln=0; (ln * sizeof(npyv_lanetype_@sfx@)) < npyv_nlanes_u8; ++ln){
+ op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_@sfx@)];
+ }
+ #endif // @DTYPE@ == CONTIG
+ }
+
+ // vector-sized iterations
+ for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+ #if @STYPE@ == CONTIG
+ npyv_@sfx@ v = npyv_load_@sfx@(ip);
+ #else
+ npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+ #endif
+
+ npyv_u@ssfx@ r = npyv_@kind@_@sfx@(v);
+
+ npy_uint8 lane[npyv_nlanes_u8];
+ npyv_store_u8(lane, npyv_reinterpret_u8_u@ssfx@(r));
+
+ op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)];
+ op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)];
+ #if npyv_nlanes_@sfx@ == 4
+ op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)];
+ op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)];
+ #endif
+ }
+
+ #undef PACK_FACTOR
+
+ // Scalar loop to finish off
+ for (; len > 0; --len, ip += istride, op += ostride) {
+ *op = (npy_@kind@(*ip) != 0);
+ }
+
+ npyv_cleanup();
+}
+/**end repeat2**/
+/**end repeat1**/
+
+#endif // @VCHK@
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ */
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if @VCHK@
+ const char *ip = args[0];
+ char *op = args[1];
+ const npy_intp istep = steps[0];
+ const npy_intp ostep = steps[1];
+ npy_intp len = dimensions[0];
+ const int ilsize = sizeof(npyv_lanetype_@sfx@);
+ const int olsize = sizeof(npy_bool);
+ const npy_intp istride = istep / ilsize;
+ const npy_intp ostride = ostep / olsize;
+ assert(len <= 1 || ostep % olsize == 0);
+
+ if ((istep % ilsize == 0) &&
+ !is_mem_overlap(ip, istep, op, ostep, len) &&
+ npyv_loadable_stride_@sfx@(istride) &&
+ npyv_storable_stride_@sfx@(ostride))
+ {
+ if (istride == 1 && ostride == 1) {
+ simd_unary_@kind@_@TYPE@_CONTIG_CONTIG(ip, 1, op, 1, len);
+ }
+ else if (ostride == 1) {
+ simd_unary_@kind@_@TYPE@_NCONTIG_CONTIG(ip, istride, op, 1, len);
+ }
+ else if (istride == 1) {
+ simd_unary_@kind@_@TYPE@_CONTIG_NCONTIG(ip, 1, op, ostride, len);
+ } else {
+ simd_unary_@kind@_@TYPE@_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
+ }
+ } else
+#endif // @VCHK@
+ {
+ UNARY_LOOP {
+ const npyv_lanetype_@sfx@ in = *(npyv_lanetype_@sfx@ *)ip1;
+ *((npy_bool *)op1) = (npy_@kind@(in) != 0);
+ }
+ }
+
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+/**end repeat1**/
+/**end repeat**/
diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c
index d247c2639..167164163 100644
--- a/numpy/core/src/umath/override.c
+++ b/numpy/core/src/umath/override.c
@@ -23,18 +23,19 @@
* Returns -1 on failure.
*/
static int
-get_array_ufunc_overrides(PyObject *in_args, PyObject *out_args,
+get_array_ufunc_overrides(PyObject *in_args, PyObject *out_args, PyObject *wheremask_obj,
PyObject **with_override, PyObject **methods)
{
int i;
int num_override_args = 0;
- int narg, nout;
+ int narg, nout, nwhere;
narg = (int)PyTuple_GET_SIZE(in_args);
/* It is valid for out_args to be NULL: */
nout = (out_args != NULL) ? (int)PyTuple_GET_SIZE(out_args) : 0;
+ nwhere = (wheremask_obj != NULL) ? 1: 0;
- for (i = 0; i < narg + nout; ++i) {
+ for (i = 0; i < narg + nout + nwhere; ++i) {
PyObject *obj;
int j;
int new_class = 1;
@@ -42,9 +43,12 @@ get_array_ufunc_overrides(PyObject *in_args, PyObject *out_args,
if (i < narg) {
obj = PyTuple_GET_ITEM(in_args, i);
}
- else {
+ else if (i < narg + nout){
obj = PyTuple_GET_ITEM(out_args, i - narg);
}
+ else {
+ obj = wheremask_obj;
+ }
/*
* Have we seen this class before? If so, ignore.
*/
@@ -208,7 +212,7 @@ copy_positional_args_to_kwargs(const char **keywords,
*/
NPY_NO_EXPORT int
PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method,
- PyObject *in_args, PyObject *out_args,
+ PyObject *in_args, PyObject *out_args, PyObject *wheremask_obj,
PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames,
PyObject **result)
{
@@ -227,7 +231,7 @@ PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method,
* Check inputs for overrides
*/
num_override_args = get_array_ufunc_overrides(
- in_args, out_args, with_override, array_ufunc_methods);
+ in_args, out_args, wheremask_obj, with_override, array_ufunc_methods);
if (num_override_args == -1) {
goto fail;
}
diff --git a/numpy/core/src/umath/override.h b/numpy/core/src/umath/override.h
index 4e9a323ca..20621bb19 100644
--- a/numpy/core/src/umath/override.h
+++ b/numpy/core/src/umath/override.h
@@ -6,7 +6,7 @@
NPY_NO_EXPORT int
PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method,
- PyObject *in_args, PyObject *out_args,
+ PyObject *in_args, PyObject *out_args, PyObject *wheremask_obj,
PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames,
PyObject **result);
diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c
index 817f99a04..9416e9a29 100644
--- a/numpy/core/src/umath/reduction.c
+++ b/numpy/core/src/umath/reduction.c
@@ -17,6 +17,9 @@
#include "numpy/arrayobject.h"
#include "npy_pycompat.h"
+#include "array_assign.h"
+#include "array_coercion.h"
+#include "array_method.h"
#include "ctors.h"
#include "numpy/ufuncobject.h"
@@ -148,24 +151,15 @@ PyArray_CopyInitialReduceValues(
* context : The ArrayMethod context (with ufunc, method, and descriptors).
* operand : The array to be reduced.
* out : NULL, or the array into which to place the result.
- * wheremask : NOT YET SUPPORTED, but this parameter is placed here
- * so that support can be added in the future without breaking
- * API compatibility. Pass in NULL.
+ * wheremask : Reduction mask of valid values used for `where=`.
* axis_flags : Flags indicating the reduction axes of 'operand'.
- * reorderable : If True, the reduction being done is reorderable, which
- * means specifying multiple axes of reduction at once is ok,
- * and the reduction code may calculate the reduction in an
- * arbitrary order. The calculation may be reordered because
- * of cache behavior or multithreading requirements.
* keepdims : If true, leaves the reduction dimensions in the result
* with size one.
* subok : If true, the result uses the subclass of operand, otherwise
* it is always a base class ndarray.
- * identity : If Py_None, PyArray_CopyInitialReduceValues is used, otherwise
- * this value is used to initialize the result to
- * the reduction's unit.
+ * initial : Initial value, if NULL the default is fetched from the
+ * ArrayMethod (typically as the default from the ufunc).
* loop : `reduce_loop` from `ufunc_object.c`. TODO: Refactor
- * data : Data which is passed to the inner loop.
* buffersize : Buffer size for the iterator. For the default, pass in 0.
* funcname : The name of the reduction function, for error messages.
* errormask : forwarded from _get_bufsize_errmask
@@ -182,9 +176,9 @@ PyArray_CopyInitialReduceValues(
NPY_NO_EXPORT PyArrayObject *
PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
PyArrayObject *operand, PyArrayObject *out, PyArrayObject *wheremask,
- npy_bool *axis_flags, int reorderable, int keepdims,
- PyObject *identity, PyArray_ReduceLoopFunc *loop,
- void *data, npy_intp buffersize, const char *funcname, int errormask)
+ npy_bool *axis_flags, int keepdims,
+ PyObject *initial, PyArray_ReduceLoopFunc *loop,
+ npy_intp buffersize, const char *funcname, int errormask)
{
assert(loop != NULL);
PyArrayObject *result = NULL;
@@ -198,38 +192,35 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
/* Loop auxdata (must be freed on error) */
NpyAuxData *auxdata = NULL;
- /* More than one axis means multiple orders are possible */
- if (!reorderable && count_axes(PyArray_NDIM(operand), axis_flags) > 1) {
- PyErr_Format(PyExc_ValueError,
- "reduction operation '%s' is not reorderable, "
- "so at most one axis may be specified",
- funcname);
- return NULL;
- }
- /* Can only use where with an initial ( from identity or argument) */
- if (wheremask != NULL && identity == Py_None) {
- PyErr_Format(PyExc_ValueError,
- "reduction operation '%s' does not have an identity, "
- "so to use a where mask one has to specify 'initial'",
- funcname);
- return NULL;
- }
-
-
/* Set up the iterator */
op[0] = out;
op[1] = operand;
op_dtypes[0] = context->descriptors[0];
op_dtypes[1] = context->descriptors[1];
+ /* Buffer to use when we need an initial value */
+ char *initial_buf = NULL;
+
+ /* More than one axis means multiple orders are possible */
+ if (!(context->method->flags & NPY_METH_IS_REORDERABLE)
+ && count_axes(PyArray_NDIM(operand), axis_flags) > 1) {
+ PyErr_Format(PyExc_ValueError,
+ "reduction operation '%s' is not reorderable, "
+ "so at most one axis may be specified",
+ funcname);
+ goto fail;
+ }
+
it_flags = NPY_ITER_BUFFERED |
NPY_ITER_EXTERNAL_LOOP |
NPY_ITER_GROWINNER |
- NPY_ITER_DONT_NEGATE_STRIDES |
NPY_ITER_ZEROSIZE_OK |
NPY_ITER_REFS_OK |
NPY_ITER_DELAY_BUFALLOC |
NPY_ITER_COPY_IF_OVERLAP;
+ if (!(context->method->flags & NPY_METH_IS_REORDERABLE)) {
+ it_flags |= NPY_ITER_DONT_NEGATE_STRIDES;
+ }
op_flags[0] = NPY_ITER_READWRITE |
NPY_ITER_ALIGNED |
NPY_ITER_ALLOCATE |
@@ -297,8 +288,52 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
goto fail;
}
+ npy_bool empty_iteration = NpyIter_GetIterSize(iter) == 0;
result = NpyIter_GetOperandArray(iter)[0];
+ /*
+ * Get the initial value (if it exists). If the iteration is empty
+ * then we assume the reduction is also empty. The reason is that when
+ * the outer iteration is empty we just won't use the initial value
+ * in any case. (`np.sum(np.zeros((0, 3)), axis=0)` is a length 3
+ * reduction but has an empty result.)
+ */
+ if ((initial == NULL && context->method->get_reduction_initial == NULL)
+ || initial == Py_None) {
+ /* There is no initial value, or initial value was explicitly unset */
+ }
+ else {
+ /* Not all functions will need initialization, but init always: */
+ initial_buf = PyMem_Calloc(1, op_dtypes[0]->elsize);
+ if (initial_buf == NULL) {
+ PyErr_NoMemory();
+ goto fail;
+ }
+ if (initial != NULL) {
+ /* must use user provided initial value */
+ if (PyArray_Pack(op_dtypes[0], initial_buf, initial) < 0) {
+ goto fail;
+ }
+ }
+ else {
+ /*
+ * Fetch initial from ArrayMethod, we pretend the reduction is
+ * empty when the iteration is. This may be wrong, but when it is,
+ * we will not need the identity as the result is also empty.
+ */
+ int has_initial = context->method->get_reduction_initial(
+ context, empty_iteration, initial_buf);
+ if (has_initial < 0) {
+ goto fail;
+ }
+ if (!has_initial) {
+ /* We have no initial value available, free buffer to indicate */
+ PyMem_FREE(initial_buf);
+ initial_buf = NULL;
+ }
+ }
+ }
+
PyArrayMethod_StridedLoop *strided_loop;
NPY_ARRAYMETHOD_FLAGS flags = 0;
@@ -313,12 +348,27 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
* Initialize the result to the reduction unit if possible,
* otherwise copy the initial values and get a view to the rest.
*/
- if (identity != Py_None) {
- if (PyArray_FillWithScalar(result, identity) < 0) {
+ if (initial_buf != NULL) {
+ /* Loop provided an identity or default value, assign to result. */
+ int ret = raw_array_assign_scalar(
+ PyArray_NDIM(result), PyArray_DIMS(result),
+ PyArray_DESCR(result),
+ PyArray_BYTES(result), PyArray_STRIDES(result),
+ op_dtypes[0], initial_buf);
+ if (ret < 0) {
goto fail;
}
}
else {
+ /* Can only use where with an initial (from identity or argument) */
+ if (wheremask != NULL) {
+ PyErr_Format(PyExc_ValueError,
+ "reduction operation '%s' does not have an identity, "
+ "so to use a where mask one has to specify 'initial'",
+ funcname);
+ return NULL;
+ }
+
/*
* For 1-D skip_first_count could be optimized to 0, but no-identity
* reductions are not super common.
@@ -354,7 +404,7 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
}
}
- if (NpyIter_GetIterSize(iter) != 0) {
+ if (!empty_iteration) {
NpyIter_IterNextFunc *iternext;
char **dataptr;
npy_intp *strideptr;
@@ -387,6 +437,10 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
}
Py_INCREF(result);
+ if (initial_buf != NULL && PyDataType_REFCHK(PyArray_DESCR(result))) {
+ PyArray_Item_XDECREF(initial_buf, PyArray_DESCR(result));
+ }
+ PyMem_FREE(initial_buf);
NPY_AUXDATA_FREE(auxdata);
if (!NpyIter_Deallocate(iter)) {
Py_DECREF(result);
@@ -395,6 +449,10 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
return result;
fail:
+ if (initial_buf != NULL && PyDataType_REFCHK(op_dtypes[0])) {
+ PyArray_Item_XDECREF(initial_buf, op_dtypes[0]);
+ }
+ PyMem_FREE(initial_buf);
NPY_AUXDATA_FREE(auxdata);
if (iter != NULL) {
NpyIter_Deallocate(iter);
diff --git a/numpy/core/src/umath/reduction.h b/numpy/core/src/umath/reduction.h
index 2170e27a7..d2cbe4849 100644
--- a/numpy/core/src/umath/reduction.h
+++ b/numpy/core/src/umath/reduction.h
@@ -64,8 +64,8 @@ typedef int (PyArray_ReduceLoopFunc)(PyArrayMethod_Context *context,
NPY_NO_EXPORT PyArrayObject *
PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
PyArrayObject *operand, PyArrayObject *out, PyArrayObject *wheremask,
- npy_bool *axis_flags, int reorderable, int keepdims,
- PyObject *identity, PyArray_ReduceLoopFunc *loop,
- void *data, npy_intp buffersize, const char *funcname, int errormask);
+ npy_bool *axis_flags, int keepdims,
+ PyObject *initial, PyArray_ReduceLoopFunc *loop,
+ npy_intp buffersize, const char *funcname, int errormask);
#endif
diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src
index 70fe963b9..a159fdc12 100644
--- a/numpy/core/src/umath/scalarmath.c.src
+++ b/numpy/core/src/umath/scalarmath.c.src
@@ -453,7 +453,7 @@ static inline int
@name@_ctype_divide(@type@ a, @type@ b, @type@ *out)
{
char *args[3] = {(char *)&a, (char *)&b, (char *)out};
- npy_intp steps[3];
+ npy_intp steps[3] = {0, 0, 0};
npy_intp size = 1;
@TYPE@_divide(args, &size, steps, NULL);
return 0;
@@ -806,7 +806,7 @@ typedef enum {
*/
CONVERT_PYSCALAR,
/*
- * Other object is an unkown scalar or array-like, we (typically) use
+ * Other object is an unknown scalar or array-like, we (typically) use
* the generic path, which normally ends up in the ufunc machinery.
*/
OTHER_IS_UNKNOWN_OBJECT,
@@ -1004,7 +1004,7 @@ convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring)
if (overflow) {
/* handle as if "unsafe" */
if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
- return PROMOTION_REQUIRED;
+ return OTHER_IS_UNKNOWN_OBJECT;
}
return CONVERT_PYSCALAR;
}
@@ -1179,6 +1179,11 @@ convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring)
* (Half, Float, Double, LongDouble,
* CFloat, CDouble, CLongDouble)*4,
* (Half, Float, Double, LongDouble)*3#
+ * #NAME = (BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG)*12,
+ * (HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE)*4,
+ * (HALF, FLOAT, DOUBLE, LONGDOUBLE)*3#
* #type = (npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
* npy_long, npy_ulong, npy_longlong, npy_ulonglong)*12,
* (npy_half, npy_float, npy_double, npy_longdouble,
@@ -1202,24 +1207,12 @@ convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring)
* (npy_half, npy_float, npy_double, npy_longdouble,
* npy_cfloat, npy_cdouble, npy_clongdouble)*4,
* (npy_half, npy_float, npy_double, npy_longdouble)*3#
- * #oname = (byte, ubyte, short, ushort, int, uint,
- * long, ulong, longlong, ulonglong)*11,
- * double*10,
- * (half, float, double, longdouble,
- * cfloat, cdouble, clongdouble)*4,
- * (half, float, double, longdouble)*3#
* #OName = (Byte, UByte, Short, UShort, Int, UInt,
* Long, ULong, LongLong, ULongLong)*11,
* Double*10,
* (Half, Float, Double, LongDouble,
* CFloat, CDouble, CLongDouble)*4,
* (Half, Float, Double, LongDouble)*3#
- * #ONAME = (BYTE, UBYTE, SHORT, USHORT, INT, UINT,
- * LONG, ULONG, LONGLONG, ULONGLONG)*11,
- * DOUBLE*10,
- * (HALF, FLOAT, DOUBLE, LONGDOUBLE,
- * CFLOAT, CDOUBLE, CLONGDOUBLE)*4,
- * (HALF, FLOAT, DOUBLE, LONGDOUBLE)*3#
*/
#define IS_@name@
/* drop the "true_" from "true_divide" for floating point warnings: */
@@ -1234,7 +1227,7 @@ static PyObject *
@name@_@oper@(PyObject *a, PyObject *b)
{
PyObject *ret;
- @otype@ arg1, arg2, other_val;
+ @type@ arg1, arg2, other_val;
/*
* Check if this operation may be considered forward. Note `is_forward`
@@ -1263,7 +1256,7 @@ static PyObject *
PyObject *other = is_forward ? b : a;
npy_bool may_need_deferring;
- conversion_result res = convert_to_@oname@(
+ conversion_result res = convert_to_@name@(
other, &other_val, &may_need_deferring);
if (res == CONVERSION_ERROR) {
return NULL; /* an error occurred (should never happen) */
@@ -1305,7 +1298,7 @@ static PyObject *
*/
return PyGenericArrType_Type.tp_as_number->nb_@oper@(a,b);
case CONVERT_PYSCALAR:
- if (@ONAME@_setitem(other, (char *)&other_val, NULL) < 0) {
+ if (@NAME@_setitem(other, (char *)&other_val, NULL) < 0) {
return NULL;
}
break;
@@ -1345,7 +1338,7 @@ static PyObject *
#if @twoout@
int retstatus = @name@_ctype_@oper@(arg1, arg2, &out, &out2);
#else
- int retstatus = @oname@_ctype_@oper@(arg1, arg2, &out);
+ int retstatus = @name@_ctype_@oper@(arg1, arg2, &out);
#endif
#if @fperr@
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
deleted file mode 100644
index 6fc1501c9..000000000
--- a/numpy/core/src/umath/simd.inc.src
+++ /dev/null
@@ -1,1148 +0,0 @@
-
-
-/*
- * This file is for the definitions of simd vectorized operations.
- *
- * Currently contains sse2 functions that are built on amd64, x32 or
- * non-generic builds (CFLAGS=-march=...)
- * In future it may contain other instruction sets like AVX or NEON detected
- * at runtime in which case it needs to be included indirectly via a file
- * compiled with special options (or use gcc target attributes) so the binary
- * stays portable.
- */
-
-
-#ifndef __NPY_SIMD_INC
-#define __NPY_SIMD_INC
-
-#include "lowlevel_strided_loops.h"
-#include "numpy/npy_common.h"
-#include "numpy/npy_math.h"
-#include "npy_simd_data.h"
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-#include <emmintrin.h>
-#if !defined(_MSC_VER) || _MSC_VER >= 1600
-#include <immintrin.h>
-#else
-#undef __AVX2__
-#undef __AVX512F__
-#endif
-#endif
-#include "loops_utils.h" // nomemoverlap
-#include <assert.h>
-#include <stdlib.h>
-#include <float.h>
-#include <string.h> /* for memcpy */
-
-#define VECTOR_SIZE_BYTES 16
-
-/*
- * Dispatcher functions
- * decide whether the operation can be vectorized and run it
- * if it was run returns true and false if nothing was done
- */
-
-/*
- *****************************************************************************
- ** CMPLX DISPATCHERS
- *****************************************************************************
- */
-
-/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
- * #type= npy_float, npy_double#
- * #esize = 8, 16#
- */
-
-/**begin repeat1
- * #func = square, absolute, conjugate#
- * #outsize = 1, 2, 1#
- * #max_stride = 2, 8, 8#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static inline NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(@type@*, @type@*, const npy_intp n, const npy_intp stride);
-#endif
-
-static inline int
-run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
- if ((IS_OUTPUT_BLOCKABLE_UNARY(@esize@, (npy_uint)(@esize@/@outsize@), 64)) && (labs(steps[0]) < 2*@max_stride@*@esize@)) {
- AVX512F_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
- return 1;
- }
- else
- return 0;
-#endif
- return 0;
-}
-
-/**end repeat1**/
-/**end repeat**/
-
-/*
- *****************************************************************************
- ** FLOAT DISPATCHERS
- *****************************************************************************
- */
-
-/**begin repeat
- * #type = npy_float, npy_double, npy_longdouble#
- * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- * #EXISTS = 1, 1, 0#
- */
-
-/**begin repeat1
- * #func = isnan, isfinite, isinf, signbit#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-static inline NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(npy_bool*, @type@*, const npy_intp n, const npy_intp stride);
-#endif
-
-static inline int
-run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
- if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), sizeof(npy_bool), 64)) {
- AVX512_SKX_@func@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
- return 1;
- }
- else {
- return 0;
- }
-#endif
- return 0;
-}
-
-
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * Float types
- * #type = npy_float, npy_double, npy_longdouble#
- * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- * #vector = 1, 1, 0#
- * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
- */
-/**begin repeat1
- * #kind = isnan, isfinite, isinf, signbit#
- */
-#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
-
-static void
-sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n);
-
-#endif
-
-static inline int
-run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
- if (steps[0] == sizeof(@type@) && steps[1] == 1 &&
- npy_is_aligned(args[0], sizeof(@type@))) {
- sse2_@kind@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0]);
- return 1;
- }
-#endif
- return 0;
-}
-/**end repeat1**/
-/**end repeat**/
-
-/*
- *****************************************************************************
- ** BOOL DISPATCHERS
- *****************************************************************************
- */
-
-/**begin repeat
- * # kind = logical_or, logical_and#
- */
-
-#if defined NPY_HAVE_SSE2_INTRINSICS
-static void
-sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2,
- npy_intp n);
-
-static void
-sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n);
-#endif
-
-static inline int
-run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined NPY_HAVE_SSE2_INTRINSICS
- if (sizeof(npy_bool) == 1 &&
- IS_BLOCKABLE_BINARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
- sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
- (npy_bool*)args[1], dimensions[0]);
- return 1;
- }
-#endif
- return 0;
-}
-
-
-static inline int
-run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined NPY_HAVE_SSE2_INTRINSICS
- if (sizeof(npy_bool) == 1 &&
- IS_BLOCKABLE_REDUCE(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
- sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
- dimensions[0]);
- return 1;
- }
-#endif
- return 0;
-}
-
-/**end repeat**/
-
-/**begin repeat
- * # kind = absolute, logical_not#
- */
-
-#if defined NPY_HAVE_SSE2_INTRINSICS
-static void
-sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n);
-#endif
-
-static inline int
-run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined NPY_HAVE_SSE2_INTRINSICS
- if (sizeof(npy_bool) == 1 &&
- IS_BLOCKABLE_UNARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
- sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
- return 1;
- }
-#endif
- return 0;
-}
-
-/**end repeat**/
-
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-
-/*
- * Vectorized operations
- */
-/*
- *****************************************************************************
- ** FLOAT LOOPS
- *****************************************************************************
- */
-
-/**begin repeat
-* horizontal reductions on a vector
-* # VOP = min, max#
-*/
-
-NPY_FINLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
-{
- npy_float r;
- __m128 tmp = _mm_movehl_ps(v, v); /* c d ... */
- __m128 m = _mm_@VOP@_ps(v, tmp); /* m(ac) m(bd) ... */
- tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));/* m(bd) m(bd) ... */
- _mm_store_ss(&r, _mm_@VOP@_ps(tmp, m)); /* m(acbd) ... */
- return r;
-}
-
-NPY_FINLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
-{
- npy_double r;
- __m128d tmp = _mm_unpackhi_pd(v, v); /* b b */
- _mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */
- return r;
-}
-/**end repeat**/
-
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #scalarf = npy_sqrtf, npy_sqrt#
- * #c = f, #
- * #vtype = __m128, __m128d#
- * #vtype256 = __m256, __m256d#
- * #vtype512 = __m512, __m512d#
- * #vpre = _mm, _mm#
- * #vpre256 = _mm256, _mm256#
- * #vpre512 = _mm512, _mm512#
- * #vsuf = ps, pd#
- * #vsufs = ss, sd#
- * #nan = NPY_NANF, NPY_NAN#
- * #double = 0, 1#
- * #cast = _mm_castps_si128, _mm_castpd_si128#
- */
-/*
- * compress 4 vectors to 4/8 bytes in op with filled with 0 or 1
- * the last vector is passed as a pointer as MSVC 2010 is unable to ignore the
- * calling convention leading to C2719 on 32 bit, see #4795
- */
-NPY_FINLINE void
-sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
- npy_bool * op)
-{
- const __m128i mask = @vpre@_set1_epi8(0x1);
- __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
- __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(*r4));
- __m128i rr = @vpre@_packs_epi16(ir1, ir2);
-#if @double@
- rr = @vpre@_packs_epi16(rr, rr);
- rr = @vpre@_and_si128(rr, mask);
- @vpre@_storel_epi64((__m128i*)op, rr);
-#else
- rr = @vpre@_and_si128(rr, mask);
- @vpre@_storeu_si128((__m128i*)op, rr);
-#endif
-}
-
-static void
-sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
-{
- LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
- op[i] = npy_signbit(ip1[i]) != 0;
- }
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- int r = @vpre@_movemask_@vsuf@(a);
- if (sizeof(@type@) == 8) {
- op[i] = r & 1;
- op[i + 1] = (r >> 1);
- }
- else {
- op[i] = r & 1;
- op[i + 1] = (r >> 1) & 1;
- op[i + 2] = (r >> 2) & 1;
- op[i + 3] = (r >> 3);
- }
- }
- LOOP_BLOCKED_END {
- op[i] = npy_signbit(ip1[i]) != 0;
- }
-}
-
-/**begin repeat1
- * #kind = isnan, isfinite, isinf#
- * #var = 0, 1, 2#
- */
-
-static void
-sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
-{
-#if @var@ != 0 /* isinf/isfinite */
- /* signbit mask 0x7FFFFFFF after andnot */
- const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
- const @vtype@ ones = @vpre@_cmpeq_@vsuf@(@vpre@_setzero_@vsuf@(),
- @vpre@_setzero_@vsuf@());
-#if @double@
- const @vtype@ fltmax = @vpre@_set1_@vsuf@(DBL_MAX);
-#else
- const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX);
-#endif
-#endif
- LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
- op[i] = npy_@kind@(ip1[i]) != 0;
- }
- LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ r1, r2, r3, r4;
-#if @var@ != 0 /* isinf/isfinite */
- /* fabs via masking of sign bit */
- r1 = @vpre@_andnot_@vsuf@(mask, a);
- r2 = @vpre@_andnot_@vsuf@(mask, b);
- r3 = @vpre@_andnot_@vsuf@(mask, c);
- r4 = @vpre@_andnot_@vsuf@(mask, d);
-#if @var@ == 1 /* isfinite */
- /* negative compare against max float, nan is always true */
- r1 = @vpre@_cmpnle_@vsuf@(r1, fltmax);
- r2 = @vpre@_cmpnle_@vsuf@(r2, fltmax);
- r3 = @vpre@_cmpnle_@vsuf@(r3, fltmax);
- r4 = @vpre@_cmpnle_@vsuf@(r4, fltmax);
-#else /* isinf */
- r1 = @vpre@_cmpnlt_@vsuf@(fltmax, r1);
- r2 = @vpre@_cmpnlt_@vsuf@(fltmax, r2);
- r3 = @vpre@_cmpnlt_@vsuf@(fltmax, r3);
- r4 = @vpre@_cmpnlt_@vsuf@(fltmax, r4);
-#endif
- /* flip results to what we want (andnot as there is no sse not) */
- r1 = @vpre@_andnot_@vsuf@(r1, ones);
- r2 = @vpre@_andnot_@vsuf@(r2, ones);
- r3 = @vpre@_andnot_@vsuf@(r3, ones);
- r4 = @vpre@_andnot_@vsuf@(r4, ones);
-#endif
-#if @var@ == 0 /* isnan */
- r1 = @vpre@_cmpneq_@vsuf@(a, a);
- r2 = @vpre@_cmpneq_@vsuf@(b, b);
- r3 = @vpre@_cmpneq_@vsuf@(c, c);
- r4 = @vpre@_cmpneq_@vsuf@(d, d);
-#endif
- sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
- }
- LOOP_BLOCKED_END {
- op[i] = npy_@kind@(ip1[i]) != 0;
- }
-}
-
-/**end repeat1**/
-/**end repeat**/
-
-/* bunch of helper functions used in ISA_exp/log_FLOAT*/
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_full_load_mask_ps(void)
-{
- return _mm256_set1_ps(-1.0);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
-fma_get_full_load_mask_pd(void)
-{
- return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
-{
- float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
- 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
- float* addr = maskint + num_lanes - num_elem;
- return _mm256_loadu_ps(addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
-fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
-{
- npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
- npy_int* addr = maskint + 2*num_lanes - 2*num_elem;
- return _mm256_loadu_si256((__m256i*) addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_gather_ps(__m256 src,
- npy_float* addr,
- __m256i vindex,
- __m256 mask)
-{
- return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
-fma_masked_gather_pd(__m256d src,
- npy_double* addr,
- __m128i vindex,
- __m256d mask)
-{
- return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_load_ps(__m256 mask, npy_float* addr)
-{
- return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
-fma_masked_load_pd(__m256i mask, npy_double* addr)
-{
- return _mm256_maskload_pd(addr, mask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
-{
- return _mm256_blendv_ps(x, val, mask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
-fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
-{
- return _mm256_blendv_pd(x, val, mask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_blend(__m256 x, __m256 y, __m256 ymask)
-{
- return _mm256_blendv_ps(x, y, ymask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_invert_mask_ps(__m256 ymask)
-{
- return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
-fma_invert_mask_pd(__m256i ymask)
-{
- return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
-}
-
-/**begin repeat
- * #vsub = ps, pd#
- * #vtype = __m256, __m256d#
- */
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_abs_@vsub@(@vtype@ x)
-{
- return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_reciprocal_@vsub@(@vtype@ x)
-{
- return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_rint_@vsub@(@vtype@ x)
-{
- return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEAREST_INT);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_floor_@vsub@(@vtype@ x)
-{
- return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEG_INF);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_trunc_@vsub@(@vtype@ x)
-{
- return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO);
-}
-/**end repeat**/
-#endif
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_full_load_mask_ps(void)
-{
- return 0xFFFF;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_get_full_load_mask_pd(void)
-{
- return 0xFF;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
-{
- return (0x0001 << num_elem) - 0x0001;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
-{
- return (0x01 << num_elem) - 0x01;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_gather_ps(__m512 src,
- npy_float* addr,
- __m512i vindex,
- __mmask16 kmask)
-{
- return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
-avx512_masked_gather_pd(__m512d src,
- npy_double* addr,
- __m256i vindex,
- __mmask8 kmask)
-{
- return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
-{
- return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
-avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
-{
- return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
-{
- return _mm512_mask_blend_ps(mask, x, val);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
-avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
-{
- return _mm512_mask_blend_pd(mask, x, val);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
-{
- return _mm512_mask_mov_ps(x, ymask, y);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_invert_mask_ps(__mmask16 ymask)
-{
- return _mm512_knot(ymask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_invert_mask_pd(__mmask8 ymask)
-{
- return _mm512_knot(ymask);
-}
-
-/**begin repeat
- * #vsub = ps, pd#
- * #type= npy_float, npy_double#
- * #epi_vsub = epi32, epi64#
- * #vtype = __m512, __m512d#
- * #mask = __mmask16, __mmask8#
- * #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
- * #neg_mask = 0x80000000, 0x8000000000000000#
- * #perm_ = 0xb1, 0x55#
- * #cmpx_img_mask = 0xAAAA, 0xAA#
- * #cmpx_re_mask = 0x5555, 0x55#
- * #INF = NPY_INFINITYF, NPY_INFINITY#
- * #NAN = NPY_NANF, NPY_NAN#
- */
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_abs_@vsub@(@vtype@ x)
-{
- return (@vtype@) _mm512_and_@epi_vsub@((__m512i) x,
- _mm512_set1_@epi_vsub@ (@and_const@));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_reciprocal_@vsub@(@vtype@ x)
-{
- return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_rint_@vsub@(@vtype@ x)
-{
- return _mm512_roundscale_@vsub@(x, 0x08);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_floor_@vsub@(@vtype@ x)
-{
- return _mm512_roundscale_@vsub@(x, 0x09);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_trunc_@vsub@(@vtype@ x)
-{
- return _mm512_roundscale_@vsub@(x, 0x0B);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_hadd_@vsub@(const @vtype@ x)
-{
- return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_hsub_@vsub@(const @vtype@ x)
-{
- return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_cabsolute_@vsub@(const @vtype@ x1,
- const @vtype@ x2,
- const __m512i re_indices,
- const __m512i im_indices)
-{
- @vtype@ inf = _mm512_set1_@vsub@(@INF@);
- @vtype@ nan = _mm512_set1_@vsub@(@NAN@);
- @vtype@ x1_abs = avx512_abs_@vsub@(x1);
- @vtype@ x2_abs = avx512_abs_@vsub@(x2);
- @vtype@ re = _mm512_permutex2var_@vsub@(x1_abs, re_indices, x2_abs);
- @vtype@ im = _mm512_permutex2var_@vsub@(x1_abs, im_indices , x2_abs);
- /*
- * If real or imag = INF, then convert it to inf + j*inf
- * Handles: inf + j*nan, nan + j*inf
- */
- @mask@ re_infmask = _mm512_cmp_@vsub@_mask(re, inf, _CMP_EQ_OQ);
- @mask@ im_infmask = _mm512_cmp_@vsub@_mask(im, inf, _CMP_EQ_OQ);
- im = _mm512_mask_mov_@vsub@(im, re_infmask, inf);
- re = _mm512_mask_mov_@vsub@(re, im_infmask, inf);
-
- /*
- * If real or imag = NAN, then convert it to nan + j*nan
- * Handles: x + j*nan, nan + j*x
- */
- @mask@ re_nanmask = _mm512_cmp_@vsub@_mask(re, re, _CMP_NEQ_UQ);
- @mask@ im_nanmask = _mm512_cmp_@vsub@_mask(im, im, _CMP_NEQ_UQ);
- im = _mm512_mask_mov_@vsub@(im, re_nanmask, nan);
- re = _mm512_mask_mov_@vsub@(re, im_nanmask, nan);
-
- @vtype@ larger = _mm512_max_@vsub@(re, im);
- @vtype@ smaller = _mm512_min_@vsub@(im, re);
-
- /*
- * Calculate div_mask to prevent 0./0. and inf/inf operations in div
- */
- @mask@ zeromask = _mm512_cmp_@vsub@_mask(larger, _mm512_setzero_@vsub@(), _CMP_EQ_OQ);
- @mask@ infmask = _mm512_cmp_@vsub@_mask(smaller, inf, _CMP_EQ_OQ);
- @mask@ div_mask = _mm512_knot(_mm512_kor(zeromask, infmask));
- @vtype@ ratio = _mm512_maskz_div_@vsub@(div_mask, smaller, larger);
- @vtype@ hypot = _mm512_sqrt_@vsub@(_mm512_fmadd_@vsub@(
- ratio, ratio, _mm512_set1_@vsub@(1.0f)));
- return _mm512_mul_@vsub@(hypot, larger);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_conjugate_@vsub@(const @vtype@ x)
-{
- /*
- * __mm512_mask_xor_ps/pd requires AVX512DQ. We cast it to __m512i and
- * use the xor_epi32/64 uinstruction instead. Cast is a zero latency instruction
- */
- __m512i cast_x = _mm512_cast@vsub@_si512(x);
- __m512i res = _mm512_mask_xor_@epi_vsub@(cast_x, @cmpx_img_mask@,
- cast_x, _mm512_set1_@epi_vsub@(@neg_mask@));
- return _mm512_castsi512_@vsub@(res);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
-{
- // x1 = r1, i1
- // x2 = r2, i2
- @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2
- @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2
- @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1
- @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2
- @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2
- return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_csquare_@vsub@(@vtype@ x)
-{
- return avx512_cmul_@vsub@(x, x);
-}
-
-/**end repeat**/
-#endif
-
-/**begin repeat
- * #ISA = FMA, AVX512F#
- * #isa = fma, avx512#
- * #vtype = __m256, __m512#
- * #vsize = 256, 512#
- * #or = or_ps, kor#
- * #vsub = , _mask#
- * #mask = __m256, __mmask16#
- * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
- **/
-
-#if defined @CHK@
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
-@isa@_sqrt_ps(@vtype@ x)
-{
- return _mm@vsize@_sqrt_ps(x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
-@isa@_sqrt_pd(@vtype@d x)
-{
- return _mm@vsize@_sqrt_pd(x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
-@isa@_square_ps(@vtype@ x)
-{
- return _mm@vsize@_mul_ps(x,x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
-@isa@_square_pd(@vtype@d x)
-{
- return _mm@vsize@_mul_pd(x,x);
-}
-
-#endif
-/**end repeat**/
-
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #episize = epi32, epi64#
- */
-
-/**begin repeat1
- * #func = isnan, isfinite, isinf, signbit#
- * #IMM8 = 0x81, 0x99, 0x18, 0x04#
- * #is_finite = 0, 1, 0, 0#
- * #is_signbit = 0, 0, 0, 1#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static inline NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps)
-{
- const npy_intp stride_ip = steps/(npy_intp)sizeof(@type@);
- npy_intp num_remaining_elements = array_size;
-
- @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-#if @is_signbit@
- @vtype@ signbit = _mm512_set1_@vsuffix@(-0.0);
-#endif
-
- /*
- * Note: while generally indices are npy_intp, we ensure that our maximum
- * index will fit in an int32 as a precondition for this function via
- * IS_OUTPUT_BLOCKABLE_UNARY
- */
-
- npy_int32 index_ip[@num_lanes@];
- for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
- index_ip[ii] = ii*stride_ip;
- }
- @vindextype@ vindex_ip = @vindexload@((@vindextype@*)&index_ip[0]);
- @vtype@ zeros_f = _mm512_setzero_@vsuffix@();
- __m512i ones = _mm512_set1_@episize@(1);
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < @num_lanes@) {
- load_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements, @num_lanes@);
- }
- @vtype@ x1;
- if (stride_ip == 1) {
- x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
- }
- else {
- x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip, vindex_ip, load_mask);
- }
-#if @is_signbit@
- x1 = _mm512_and_@vsuffix@(x1,signbit);
-#endif
-
- @mask@ fpclassmask = _mm512_fpclass_@vsuffix@_mask(x1, @IMM8@);
-#if @is_finite@
- fpclassmask = _mm512_knot(fpclassmask);
-#endif
-
- __m128i out =_mm512_maskz_cvts@episize@_epi8(fpclassmask, ones);
- _mm_mask_storeu_epi8(op, load_mask, out);
-
- ip += @num_lanes@*stride_ip;
- op += @num_lanes@;
- num_remaining_elements -= @num_lanes@;
- }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
- * #type = npy_float, npy_double#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #epi_vsub = epi32, epi64#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #storemask = 0xFF, 0xF#
- * #IS_FLOAT = 1, 0#
- */
-
-/**begin repeat1
- * #func = square, conjugate#
- * #vectorf = avx512_csquare, avx512_conjugate#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_GCC_OPT_3 inline NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(@type@ * op,
- @type@ * ip,
- const npy_intp array_size,
- const npy_intp steps)
-{
- npy_intp num_remaining_elements = 2*array_size;
- const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
-
- /*
- * Note: while generally indices are npy_intp, we ensure that our maximum index
- * will fit in an int32 as a precondition for this function via max_stride
- */
- npy_int32 index_ip1[16];
- for (npy_int32 ii = 0; ii < @num_lanes@; ii=ii+2) {
- index_ip1[ii] = ii*stride_ip1;
- index_ip1[ii+1] = ii*stride_ip1 + 1;
- }
- @vindextype@ vindex = @vindexload@((@vindextype@*)index_ip1);
- @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
- @vtype@ zeros = _mm512_setzero_@vsuffix@();
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < @num_lanes@) {
- load_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements, @num_lanes@);
- }
- @vtype@ x1;
- if (stride_ip1 == 1) {
- x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
- }
- else {
- x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex, load_mask);
- }
-
- @vtype@ out = @vectorf@_@vsuffix@(x1);
-
- _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
- op += @num_lanes@;
- ip += @num_lanes@*stride_ip1;
- num_remaining_elements -= @num_lanes@;
- }
-}
-#endif
-/**end repeat1**/
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_GCC_OPT_3 inline NPY_GCC_TARGET_AVX512F void
-AVX512F_absolute_@TYPE@(@type@ * op,
- @type@ * ip,
- const npy_intp array_size,
- const npy_intp steps)
-{
- npy_intp num_remaining_elements = 2*array_size;
- const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
-
- /*
- * Note: while generally indices are npy_intp, we ensure that our maximum index
- * will fit in an int32 as a precondition for this function via max_stride
- */
- npy_int32 index_ip[32];
- for (npy_int32 ii = 0; ii < 2*@num_lanes@; ii=ii+2) {
- index_ip[ii] = ii*stride_ip1;
- index_ip[ii+1] = ii*stride_ip1 + 1;
- }
- @vindextype@ vindex1 = @vindexload@((@vindextype@*)index_ip);
- @vindextype@ vindex2 = @vindexload@((@vindextype@*)(index_ip+@num_lanes@));
-
- @mask@ load_mask1 = avx512_get_full_load_mask_@vsuffix@();
- @mask@ load_mask2 = avx512_get_full_load_mask_@vsuffix@();
- @mask@ store_mask = avx512_get_full_load_mask_@vsuffix@();
- @vtype@ zeros = _mm512_setzero_@vsuffix@();
-
-#if @IS_FLOAT@
- __m512i re_index = _mm512_set_epi32(30,28,26,24,22,20,18,16,14,12,10,8,6,4,2,0);
- __m512i im_index = _mm512_set_epi32(31,29,27,25,23,21,19,17,15,13,11,9,7,5,3,1);
-#else
- __m512i re_index = _mm512_set_epi64(14,12,10,8,6,4,2,0);
- __m512i im_index = _mm512_set_epi64(15,13,11,9,7,5,3,1);
-#endif
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < @num_lanes@) {
- load_mask1 = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements, @num_lanes@);
- load_mask2 = 0x0000;
- store_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements/2, @num_lanes@);
- } else if (num_remaining_elements < 2*@num_lanes@) {
- load_mask1 = avx512_get_full_load_mask_@vsuffix@();
- load_mask2 = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements - @num_lanes@, @num_lanes@);
- store_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements/2, @num_lanes@);
- }
- @vtype@ x1, x2;
- if (stride_ip1 == 1) {
- x1 = avx512_masked_load_@vsuffix@(load_mask1, ip);
- x2 = avx512_masked_load_@vsuffix@(load_mask2, ip+@num_lanes@);
- }
- else {
- x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex1, load_mask1);
- x2 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex2, load_mask2);
- }
-
- @vtype@ out = avx512_cabsolute_@vsuffix@(x1, x2, re_index, im_index);
-
- _mm512_mask_storeu_@vsuffix@(op, store_mask, out);
- op += @num_lanes@;
- ip += 2*@num_lanes@*stride_ip1;
- num_remaining_elements -= 2*@num_lanes@;
- }
- npy_clear_floatstatus_barrier((char*)&num_remaining_elements);
-}
-
-#endif
-/**end repeat**/
-
-/*
- *****************************************************************************
- ** BOOL LOOPS
- *****************************************************************************
- */
-
-/**begin repeat
- * # kind = logical_or, logical_and#
- * # and = 0, 1#
- * # op = ||, &&#
- * # sc = !=, ==#
- * # vpre = _mm*2#
- * # vsuf = si128*2#
- * # vtype = __m128i*2#
- * # type = npy_bool*2#
- * # vload = _mm_load_si128*2#
- * # vloadu = _mm_loadu_si128*2#
- * # vstore = _mm_store_si128*2#
- */
-
-/*
- * convert any bit set to boolean true so vectorized and normal operations are
- * consistent, should not be required if bool is used correctly everywhere but
- * you never know
- */
-#if !@and@
-NPY_FINLINE @vtype@ byte_to_true(@vtype@ v)
-{
- const @vtype@ zero = @vpre@_setzero_@vsuf@();
- const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
- /* get 0xFF for zeros */
- @vtype@ tmp = @vpre@_cmpeq_epi8(v, zero);
- /* filled with 0xFF/0x00, negate and mask to boolean true */
- return @vpre@_andnot_@vsuf@(tmp, truemask);
-}
-#endif
-
-static void
-sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n)
-{
- LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
- op[i] = ip1[i] @op@ ip2[i];
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vloadu@((@vtype@*)&ip1[i]);
- @vtype@ b = @vloadu@((@vtype@*)&ip2[i]);
-#if @and@
- const @vtype@ zero = @vpre@_setzero_@vsuf@();
- /* get 0xFF for non zeros*/
- @vtype@ tmp = @vpre@_cmpeq_epi8(a, zero);
- /* andnot -> 0x00 for zeros xFF for non zeros, & with ip2 */
- tmp = @vpre@_andnot_@vsuf@(tmp, b);
-#else
- @vtype@ tmp = @vpre@_or_@vsuf@(a, b);
-#endif
-
- @vstore@((@vtype@*)&op[i], byte_to_true(tmp));
- }
- LOOP_BLOCKED_END {
- op[i] = (ip1[i] @op@ ip2[i]);
- }
-}
-
-
-static void
-sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
-{
- const @vtype@ zero = @vpre@_setzero_@vsuf@();
- LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, VECTOR_SIZE_BYTES) {
- *op = *op @op@ ip[i];
- if (*op @sc@ 0) {
- return;
- }
- }
- /* unrolled once to replace a slow movmsk with a fast pmaxb */
- LOOP_BLOCKED(npy_bool, 2 * VECTOR_SIZE_BYTES) {
- @vtype@ v = @vload@((@vtype@*)&ip[i]);
- @vtype@ v2 = @vload@((@vtype@*)&ip[i + VECTOR_SIZE_BYTES]);
- v = @vpre@_cmpeq_epi8(v, zero);
- v2 = @vpre@_cmpeq_epi8(v2, zero);
-#if @and@
- if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) {
- *op = 0;
-#else
- if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) {
- *op = 1;
-#endif
- return;
- }
- }
- LOOP_BLOCKED_END {
- *op = *op @op@ ip[i];
- if (*op @sc@ 0) {
- return;
- }
- }
-}
-
-/**end repeat**/
-
-/**begin repeat
- * # kind = absolute, logical_not#
- * # op = !=, ==#
- * # not = 0, 1#
- * # vpre = _mm*2#
- * # vsuf = si128*2#
- * # vtype = __m128i*2#
- * # type = npy_bool*2#
- * # vloadu = _mm_loadu_si128*2#
- * # vstore = _mm_store_si128*2#
- */
-
-static void
-sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
-{
- LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
- op[i] = (ip[i] @op@ 0);
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vloadu@((@vtype@*)&ip[i]);
-#if @not@
- const @vtype@ zero = @vpre@_setzero_@vsuf@();
- const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
- /* equivalent to byte_to_true but can skip the negation */
- a = @vpre@_cmpeq_epi8(a, zero);
- a = @vpre@_and_@vsuf@(a, truemask);
-#else
- /* abs is kind of pointless but maybe its used for byte_to_true */
- a = byte_to_true(a);
-#endif
- @vstore@((@vtype@*)&op[i], a);
- }
- LOOP_BLOCKED_END {
- op[i] = (ip[i] @op@ 0);
- }
-}
-
-/**end repeat**/
-
-#undef VECTOR_SIZE_BYTES
-#endif /* NPY_HAVE_SSE2_INTRINSICS */
-#endif
-
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 4e628f59a..94cd73ef6 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -1011,6 +1011,34 @@ convert_ufunc_arguments(PyUFuncObject *ufunc,
* necessary to propagate the information to the legacy type resolution.
*/
if (npy_mark_tmp_array_if_pyscalar(obj, out_op[i], &out_op_DTypes[i])) {
+ if (PyArray_FLAGS(out_op[i]) & NPY_ARRAY_WAS_PYTHON_INT
+ && PyArray_TYPE(out_op[i]) != NPY_LONG) {
+ /*
+ * When `np.array(integer)` is not the default integer (mainly
+ * object dtype), this confuses many type resolvers. Simply
+ * forcing a default integer array is unfortunately easiest.
+ * In this disables the optional NEP 50 warnings, but in
+ * practice when this happens we should _usually_ pick the
+ * default integer loop and that raises an error.
+ * (An exception is `float64(1.) + 10**100` which silently
+ * will give a float64 result rather than a Python float.)
+ *
+ * TODO: Just like the general dual NEP 50/legacy promotion
+ * support this is meant as a temporary hack for NumPy 1.25.
+ */
+ static PyArrayObject *zero_arr = NULL;
+ if (NPY_UNLIKELY(zero_arr == NULL)) {
+ zero_arr = (PyArrayObject *)PyArray_ZEROS(
+ 0, NULL, NPY_LONG, NPY_FALSE);
+ if (zero_arr == NULL) {
+ goto fail;
+ }
+ ((PyArrayObject_fields *)zero_arr)->flags |= (
+ NPY_ARRAY_WAS_PYTHON_INT|NPY_ARRAY_WAS_INT_AND_REPLACED);
+ }
+ Py_INCREF(zero_arr);
+ Py_SETREF(out_op[i], zero_arr);
+ }
*promoting_pyscalars = NPY_TRUE;
}
}
@@ -2085,12 +2113,16 @@ _get_coredim_sizes(PyUFuncObject *ufunc, PyArrayObject **op,
}
/*
- * Returns a new reference
+ * Returns a new reference to the ufunc identity. Note that this identity
+ * is only a default identity value stored on the ufunc, since the invidiual
+ * ufunc loop (ArrayMethod) is queried for the actual identity.
+ *
* TODO: store a reference in the ufunc object itself, rather than
* constructing one each time
*/
-static PyObject *
-_get_identity(PyUFuncObject *ufunc, npy_bool *reorderable) {
+NPY_NO_EXPORT PyObject *
+PyUFunc_GetDefaultIdentity(PyUFuncObject *ufunc, npy_bool *reorderable)
+{
switch(ufunc->identity) {
case PyUFunc_One:
*reorderable = 1;
@@ -3006,10 +3038,8 @@ PyUFunc_Reduce(PyUFuncObject *ufunc,
PyObject *initial, PyArrayObject *wheremask)
{
int iaxes, ndim;
- npy_bool reorderable;
npy_bool axis_flags[NPY_MAXDIMS];
- PyArrayObject *result = NULL;
- PyObject *identity;
+
const char *ufunc_name = ufunc_get_name_cstr(ufunc);
/* These parameters come from a TLS global */
int buffersize = 0, errormask = 0;
@@ -3034,9 +3064,6 @@ PyUFunc_Reduce(PyUFuncObject *ufunc,
return NULL;
}
- /*
- * Promote and fetch ufuncimpl (currently needed to fix up the identity).
- */
PyArray_Descr *descrs[3];
PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc,
arr, out, signature, NPY_FALSE, descrs, NPY_UNSAFE_CASTING, "reduce");
@@ -3044,62 +3071,19 @@ PyUFunc_Reduce(PyUFuncObject *ufunc,
return NULL;
}
- /* Get the identity */
- /* TODO: Both of these should be provided by the ArrayMethod! */
- identity = _get_identity(ufunc, &reorderable);
- if (identity == NULL) {
- goto finish;
- }
-
- /* Get the initial value */
- if (initial == NULL) {
- initial = identity;
-
- /*
- * The identity for a dynamic dtype like
- * object arrays can't be used in general
- */
- if (initial != Py_None && PyArray_ISOBJECT(arr) && PyArray_SIZE(arr) != 0) {
- Py_DECREF(initial);
- initial = Py_None;
- Py_INCREF(initial);
- }
- else if (PyTypeNum_ISUNSIGNED(descrs[2]->type_num)
- && PyLong_CheckExact(initial)) {
- /*
- * This is a bit of a hack until we have truly loop specific
- * identities. Python -1 cannot be cast to unsigned so convert
- * it to a NumPy scalar, but we use -1 for bitwise functions to
- * signal all 1s.
- * (A builtin identity would not overflow here, although we may
- * unnecessary convert 0 and 1.)
- */
- Py_SETREF(initial, PyObject_CallFunctionObjArgs(
- (PyObject *)&PyLongArrType_Type, initial, NULL));
- if (initial == NULL) {
- goto finish;
- }
- }
- } else {
- Py_DECREF(identity);
- Py_INCREF(initial); /* match the reference count in the if above */
- }
-
PyArrayMethod_Context context = {
.caller = (PyObject *)ufunc,
.method = ufuncimpl,
.descriptors = descrs,
};
- result = PyUFunc_ReduceWrapper(&context,
- arr, out, wheremask, axis_flags, reorderable, keepdims,
- initial, reduce_loop, ufunc, buffersize, ufunc_name, errormask);
+ PyArrayObject *result = PyUFunc_ReduceWrapper(&context,
+ arr, out, wheremask, axis_flags, keepdims,
+ initial, reduce_loop, buffersize, ufunc_name, errormask);
- finish:
for (int i = 0; i < 3; i++) {
Py_DECREF(descrs[i]);
}
- Py_XDECREF(initial);
return result;
}
@@ -4115,7 +4099,7 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc,
/* We now have all the information required to check for Overrides */
PyObject *override = NULL;
int errval = PyUFunc_CheckOverride(ufunc, _reduce_type[operation],
- full_args.in, full_args.out, args, len_args, kwnames, &override);
+ full_args.in, full_args.out, wheremask_obj, args, len_args, kwnames, &override);
if (errval) {
return NULL;
}
@@ -4887,7 +4871,7 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc,
/* We now have all the information required to check for Overrides */
PyObject *override = NULL;
errval = PyUFunc_CheckOverride(ufunc, method,
- full_args.in, full_args.out,
+ full_args.in, full_args.out, where_obj,
args, len_args, kwnames, &override);
if (errval) {
goto fail;
@@ -4973,9 +4957,13 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc,
if (!(orig_flags & NPY_ARRAY_WAS_PYTHON_LITERAL)) {
continue;
}
- /* If the descriptor matches, no need to worry about conversion */
- if (PyArray_EquivTypes(
- PyArray_DESCR(operands[i]), operation_descrs[i])) {
+ /*
+ * If descriptor matches, no need to convert, but integers may
+ * have been too large.
+ */
+ if (!(orig_flags & NPY_ARRAY_WAS_INT_AND_REPLACED)
+ && PyArray_EquivTypes(
+ PyArray_DESCR(operands[i]), operation_descrs[i])) {
continue;
}
/* Otherwise, replace the operand with a new array */
@@ -5079,14 +5067,11 @@ ufunc_generic_vectorcall(PyObject *ufunc,
NPY_NO_EXPORT PyObject *
-ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args)
+ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(arg))
{
PyObject *thedict;
PyObject *res;
- if (!PyArg_ParseTuple(args, "")) {
- return NULL;
- }
thedict = PyThreadState_GetDict();
if (thedict == NULL) {
thedict = PyEval_GetBuiltins();
@@ -5112,16 +5097,13 @@ ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args)
NPY_NO_EXPORT PyObject *
-ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *args)
+ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *arg)
{
PyObject *thedict;
int res;
- PyObject *val;
+ PyObject *val = arg;
static char *msg = "Error object must be a list of length 3";
- if (!PyArg_ParseTuple(args, "O:seterrobj", &val)) {
- return NULL;
- }
if (!PyList_CheckExact(val) || PyList_GET_SIZE(val) != 3) {
PyErr_SetString(PyExc_ValueError, msg);
return NULL;
@@ -5937,6 +5919,7 @@ static inline PyArrayObject *
new_array_op(PyArrayObject *op_array, char *data)
{
npy_intp dims[1] = {1};
+ Py_INCREF(PyArray_DESCR(op_array)); /* NewFromDescr steals a reference */
PyObject *r = PyArray_NewFromDescr(&PyArray_Type, PyArray_DESCR(op_array),
1, dims, NULL, data,
NPY_ARRAY_WRITEABLE, NULL);
@@ -5944,206 +5927,171 @@ new_array_op(PyArrayObject *op_array, char *data)
}
/*
- * Call ufunc only on selected array items and store result in first operand.
- * For add ufunc, method call is equivalent to op1[idx] += op2 with no
- * buffering of the first operand.
- * Arguments:
- * op1 - First operand to ufunc
- * idx - Indices that are applied to first operand. Equivalent to op1[idx].
- * op2 - Second operand to ufunc (if needed). Must be able to broadcast
- * over first operand.
+ * Use an indexed loop to do the work
+ * Returns 0 if successful
*/
-static PyObject *
-ufunc_at(PyUFuncObject *ufunc, PyObject *args)
+static int
+trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags,
+ PyArrayMapIterObject *iter,
+ PyArrayObject *op1_array, PyArrayObject *op2_array,
+ PyArrayMethod_Context *context)
{
- PyObject *op1 = NULL;
- PyObject *idx = NULL;
- PyObject *op2 = NULL;
- PyArrayObject *op1_array = NULL;
- PyArrayObject *op2_array = NULL;
- PyArrayMapIterObject *iter = NULL;
- PyArrayIterObject *iter2 = NULL;
- PyArrayObject *operands[3] = {NULL, NULL, NULL};
- PyArrayObject *array_operands[3] = {NULL, NULL, NULL};
-
- PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL};
- PyArray_DTypeMeta *operand_DTypes[3] = {NULL, NULL, NULL};
- PyArray_Descr *operation_descrs[3] = {NULL, NULL, NULL};
-
- int nop;
-
- /* override vars */
- int errval;
- PyObject *override = NULL;
-
- NpyIter *iter_buffer;
- NpyIter_IterNextFunc *iternext;
- npy_uint32 op_flags[NPY_MAXARGS];
- int buffersize;
- int errormask = 0;
- char * err_msg = NULL;
-
- PyArrayMethod_StridedLoop *strided_loop;
- NpyAuxData *auxdata = NULL;
+ int buffersize=0, errormask = 0;
+ int res;
+ char *args[3];
+ npy_intp steps[3];
+ args[0] = (char *) iter->baseoffset;
+ steps[0] = iter->fancy_strides[0];
+ if (ufuncimpl->nin == 1) {
+ args[2] = NULL;
+ steps[2] = 0;
+ } else {
+ args[2] = (char *)PyArray_DATA(op2_array);
+ if (PyArray_NDIM(op2_array) == 0
+ || PyArray_DIM(op2_array, 0) <= 1) {
+ steps[2] = 0;
+ } else {
+ steps[2] = PyArray_STRIDE(op2_array, 0);
+ }
+ }
- NPY_BEGIN_THREADS_DEF;
+ npy_intp *inner_size = NpyIter_GetInnerLoopSizePtr(iter->outer);
- if (ufunc->core_enabled) {
- PyErr_Format(PyExc_TypeError,
- "%s.at does not support ufunc with non-trivial signature: %s has signature %s.",
- ufunc->name, ufunc->name, ufunc->core_signature);
- return NULL;
+ if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+ npy_clear_floatstatus_barrier((char *)context);
}
- if (ufunc->nin > 2) {
- PyErr_SetString(PyExc_ValueError,
- "Only unary and binary ufuncs supported at this time");
- return NULL;
- }
+ do {
+ args[1] = (char *) iter->outer_ptrs[0];
+ steps[1] = iter->outer_strides[0];
- if (ufunc->nout != 1) {
- PyErr_SetString(PyExc_ValueError,
- "Only single output ufuncs supported at this time");
- return NULL;
- }
+ res = ufuncimpl->contiguous_indexed_loop(
+ context, args, inner_size, steps, NULL);
+ if (args[2] != NULL) {
+ args[2] += (*inner_size) * steps[2];
+ }
+ } while (res == 0 && iter->outer_next(iter->outer));
- if (!PyArg_ParseTuple(args, "OO|O:at", &op1, &idx, &op2)) {
- return NULL;
+ if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+ const char * ufunc_name =
+ ufunc_get_name_cstr((PyUFuncObject *)context->caller);
+ if (_get_bufsize_errmask(NULL, ufunc_name,
+ &buffersize, &errormask) < 0) {
+ return -1;
+ }
+ res = _check_ufunc_fperr(errormask, NULL, ufunc_name);
}
+ return res;
+}
- if (ufunc->nin == 2 && op2 == NULL) {
- PyErr_SetString(PyExc_ValueError,
- "second operand needed for ufunc");
- return NULL;
- }
- errval = PyUFunc_CheckOverride(ufunc, "at",
- args, NULL, NULL, 0, NULL, &override);
+static int
+ufunc_at__fast_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags,
+ PyArrayMapIterObject *iter, PyArrayIterObject *iter2,
+ PyArrayObject *op1_array, PyArrayObject *op2_array,
+ PyArrayMethod_StridedLoop *strided_loop,
+ PyArrayMethod_Context *context,
+ NpyAuxData *auxdata
+ )
+{
+ int buffersize;
+ int errormask = 0;
+ int res = 0;
+ NPY_BEGIN_THREADS_DEF;
- if (errval) {
- return NULL;
- }
- else if (override) {
- return override;
+ if (_get_bufsize_errmask(NULL, ufunc->name, &buffersize, &errormask) < 0) {
+ return -1;
}
-
- if (!PyArray_Check(op1)) {
- PyErr_SetString(PyExc_TypeError,
- "first operand must be array");
- return NULL;
+ int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0;
+ if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+ /* Start with the floating-point exception flags cleared */
+ npy_clear_floatstatus_barrier((char*)&iter);
}
- op1_array = (PyArrayObject *)op1;
-
- /* Create second operand from number array if needed. */
- if (op2 != NULL) {
- op2_array = (PyArrayObject *)PyArray_FromAny(op2, NULL,
- 0, 0, 0, NULL);
- if (op2_array == NULL) {
- goto fail;
- }
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
}
- /* Create map iterator */
- iter = (PyArrayMapIterObject *)PyArray_MapIterArrayCopyIfOverlap(
- op1_array, idx, 1, op2_array);
- if (iter == NULL) {
- goto fail;
- }
- op1_array = iter->array; /* May be updateifcopied on overlap */
+ npy_intp strides[3] = {0, 0, 0};
+ /*
+ * Iterate over first and second operands and call ufunc
+ * for each pair of inputs
+ */
+ for (npy_intp i = iter->size; i > 0; i--)
+ {
+ char *dataptr[3];
+ /* one element at a time, no stride required but read by innerloop */
+ npy_intp count = 1;
- if (op2 != NULL) {
/*
- * May need to swap axes so that second operand is
- * iterated over correctly
+ * Set up data pointers for either one or two input operands.
+ * The output data pointer points to the first operand data.
*/
- if ((iter->subspace != NULL) && (iter->consec)) {
- PyArray_MapIterSwapAxes(iter, &op2_array, 0);
- if (op2_array == NULL) {
- goto fail;
- }
+ dataptr[0] = iter->dataptr;
+ if (iter2 != NULL) {
+ dataptr[1] = PyArray_ITER_DATA(iter2);
+ dataptr[2] = iter->dataptr;
+ }
+ else {
+ dataptr[1] = iter->dataptr;
+ dataptr[2] = NULL;
}
- /*
- * Create array iter object for second operand that
- * "matches" the map iter object for the first operand.
- * Then we can just iterate over the first and second
- * operands at the same time and not have to worry about
- * picking the correct elements from each operand to apply
- * the ufunc to.
- */
- if ((iter2 = (PyArrayIterObject *)\
- PyArray_BroadcastToShape((PyObject *)op2_array,
- iter->dimensions, iter->nd))==NULL) {
- goto fail;
+ res = strided_loop(context, dataptr, &count, strides, auxdata);
+ if (res != 0) {
+ break;
}
- }
- /*
- * Create dtypes array for either one or two input operands.
- * Compare to the logic in `convert_ufunc_arguments`.
- * TODO: It may be good to review some of this behaviour, since the
- * operand array is special (it is written to) similar to reductions.
- * Using unsafe-casting as done here, is likely not desirable.
- */
- operands[0] = op1_array;
- operand_DTypes[0] = NPY_DTYPE(PyArray_DESCR(op1_array));
- Py_INCREF(operand_DTypes[0]);
- int force_legacy_promotion = 0;
- int allow_legacy_promotion = NPY_DT_is_legacy(operand_DTypes[0]);
+ PyArray_MapIterNext(iter);
+ if (iter2 != NULL) {
+ PyArray_ITER_NEXT(iter2);
+ }
+ }
- if (op2_array != NULL) {
- operands[1] = op2_array;
- operand_DTypes[1] = NPY_DTYPE(PyArray_DESCR(op2_array));
- Py_INCREF(operand_DTypes[1]);
- allow_legacy_promotion &= NPY_DT_is_legacy(operand_DTypes[1]);
- operands[2] = operands[0];
- operand_DTypes[2] = operand_DTypes[0];
- Py_INCREF(operand_DTypes[2]);
+ NPY_END_THREADS;
- nop = 3;
- if (allow_legacy_promotion && ((PyArray_NDIM(op1_array) == 0)
- != (PyArray_NDIM(op2_array) == 0))) {
- /* both are legacy and only one is 0-D: force legacy */
- force_legacy_promotion = should_use_min_scalar(2, operands, 0, NULL);
- }
- }
- else {
- operands[1] = operands[0];
- operand_DTypes[1] = operand_DTypes[0];
- Py_INCREF(operand_DTypes[1]);
- operands[2] = NULL;
- nop = 2;
+ if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+ /* NOTE: We could check float errors even when `res < 0` */
+ res = _check_ufunc_fperr(errormask, NULL, "at");
}
+ return res;
+}
- PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc,
- operands, signature, operand_DTypes,
- force_legacy_promotion, allow_legacy_promotion,
- NPY_FALSE, NPY_FALSE);
- if (ufuncimpl == NULL) {
- goto fail;
- }
+static int
+ufunc_at__slow_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags,
+ PyArrayMapIterObject *iter, PyArrayIterObject *iter2,
+ PyArrayObject *op1_array, PyArrayObject *op2_array,
+ PyArray_Descr *operation_descrs[3],
+ PyArrayMethod_StridedLoop *strided_loop,
+ PyArrayMethod_Context *context,
+ NpyAuxData *auxdata
+ )
+{
+ NpyIter *iter_buffer = NULL;
+ PyArrayObject *array_operands[3] = {NULL, NULL, NULL};
+ int buffersize;
+ int errormask = 0;
+ int res = 0;
+ int nop = 0;
+ NpyIter_IterNextFunc *iternext;
+ char * err_msg = NULL;
+ NPY_BEGIN_THREADS_DEF;
- /* Find the correct descriptors for the operation */
- if (resolve_descriptors(nop, ufunc, ufuncimpl,
- operands, operation_descrs, signature, NPY_UNSAFE_CASTING) < 0) {
- goto fail;
+ if (_get_bufsize_errmask(NULL, ufunc->name, &buffersize, &errormask) < 0) {
+ return -1;
}
-
- Py_INCREF(PyArray_DESCR(op1_array));
array_operands[0] = new_array_op(op1_array, iter->dataptr);
if (iter2 != NULL) {
- Py_INCREF(PyArray_DESCR(op2_array));
array_operands[1] = new_array_op(op2_array, PyArray_ITER_DATA(iter2));
- Py_INCREF(PyArray_DESCR(op1_array));
array_operands[2] = new_array_op(op1_array, iter->dataptr);
+ nop = 3;
}
else {
- Py_INCREF(PyArray_DESCR(op1_array));
array_operands[1] = new_array_op(op1_array, iter->dataptr);
array_operands[2] = NULL;
+ nop = 2;
}
-
/* Set up the flags */
+ npy_uint32 op_flags[3];
op_flags[0] = NPY_ITER_READONLY|
NPY_ITER_ALIGNED;
@@ -6163,11 +6111,6 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
NPY_ITER_NO_BROADCAST|
NPY_ITER_NO_SUBTYPE;
}
-
- if (_get_bufsize_errmask(NULL, ufunc->name, &buffersize, &errormask) < 0) {
- goto fail;
- }
-
/*
* Create NpyIter object to "iterate" over single element of each input
* operand. This is an easy way to reuse the NpyIter logic for dealing
@@ -6190,33 +6133,23 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
-1, NULL, NULL, buffersize);
if (iter_buffer == NULL) {
- goto fail;
+ /* will fail only on memory allocation errors */
+ for (int i = 0; i < 3; i++) {
+ Py_XDECREF(array_operands[i]);
+ }
+ return -1;
}
iternext = NpyIter_GetIterNext(iter_buffer, NULL);
if (iternext == NULL) {
+ /* cannot really happen, iter_buffer creation is tightly controlled */
NpyIter_Deallocate(iter_buffer);
- goto fail;
- }
-
- PyArrayMethod_Context context = {
- .caller = (PyObject *)ufunc,
- .method = ufuncimpl,
- .descriptors = operation_descrs,
- };
-
- NPY_ARRAYMETHOD_FLAGS flags;
- /* Use contiguous strides; if there is such a loop it may be faster */
- npy_intp strides[3] = {
- operation_descrs[0]->elsize, operation_descrs[1]->elsize, 0};
- if (nop == 3) {
- strides[2] = operation_descrs[2]->elsize;
+ for (int i = 0; i < 3; i++) {
+ Py_XDECREF(array_operands[i]);
+ }
+ return -1;
}
- if (ufuncimpl->get_strided_loop(&context, 1, 0, strides,
- &strided_loop, &auxdata, &flags) < 0) {
- goto fail;
- }
int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0;
needs_api |= NpyIter_IterationNeedsAPI(iter_buffer);
if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
@@ -6224,6 +6157,7 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
npy_clear_floatstatus_barrier((char*)&iter);
}
+ npy_intp strides[3] = {0, 0, 0};
if (!needs_api) {
NPY_BEGIN_THREADS;
}
@@ -6232,7 +6166,6 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
* Iterate over first and second operands and call ufunc
* for each pair of inputs
*/
- int res = 0;
for (npy_intp i = iter->size; i > 0; i--)
{
char *dataptr[3];
@@ -6263,7 +6196,7 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
buffer_dataptr = NpyIter_GetDataPtrArray(iter_buffer);
- res = strided_loop(&context, buffer_dataptr, &count, strides, auxdata);
+ res = strided_loop(context, buffer_dataptr, &count, strides, auxdata);
if (res != 0) {
break;
}
@@ -6289,18 +6222,284 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
/* NOTE: We could check float errors even when `res < 0` */
res = _check_ufunc_fperr(errormask, NULL, "at");
}
+ NpyIter_Deallocate(iter_buffer);
+ for (int i = 0; i < 3; i++) {
+ Py_XDECREF(array_operands[i]);
+ }
+ return res;
+}
+
+/*
+ * Call ufunc only on selected array items and store result in first operand.
+ * For add ufunc, method call is equivalent to op1[idx] += op2 with no
+ * buffering of the first operand.
+ * Arguments:
+ * op1 - First operand to ufunc
+ * idx - Indices that are applied to first operand. Equivalent to op1[idx].
+ * op2 - Second operand to ufunc (if needed). Must be able to broadcast
+ * over first operand.
+ */
+static PyObject *
+ufunc_at(PyUFuncObject *ufunc, PyObject *args)
+{
+ PyObject *op1 = NULL;
+ PyObject *idx = NULL;
+ PyObject *op2 = NULL;
+ PyArrayObject *op1_array = NULL;
+ PyArrayObject *op2_array = NULL;
+ PyArrayMapIterObject *iter = NULL;
+ PyArrayIterObject *iter2 = NULL;
+ PyArray_Descr *operation_descrs[3] = {NULL, NULL, NULL};
+
+ int nop;
+
+ /* override vars */
+ int errval;
+ PyObject *override = NULL;
+ int res = -1; /* start with fail condition so "goto fail" will error */
+
+ PyArrayMethod_StridedLoop *strided_loop;
+ NpyAuxData *auxdata = NULL;
+
+ if (ufunc->core_enabled) {
+ PyErr_Format(PyExc_TypeError,
+ "%s.at does not support ufunc with non-trivial signature: %s has signature %s.",
+ ufunc->name, ufunc->name, ufunc->core_signature);
+ return NULL;
+ }
+
+ if (ufunc->nin > 2) {
+ PyErr_SetString(PyExc_ValueError,
+ "Only unary and binary ufuncs supported at this time");
+ return NULL;
+ }
+
+ if (ufunc->nout != 1) {
+ PyErr_SetString(PyExc_ValueError,
+ "Only single output ufuncs supported at this time");
+ return NULL;
+ }
+
+ if (!PyArg_ParseTuple(args, "OO|O:at", &op1, &idx, &op2)) {
+ return NULL;
+ }
+
+ if (ufunc->nin == 2 && op2 == NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "second operand needed for ufunc");
+ return NULL;
+ }
+
+ if (ufunc->nin == 1 && op2 != NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "second operand provided when ufunc is unary");
+ return NULL;
+ }
+ errval = PyUFunc_CheckOverride(ufunc, "at",
+ args, NULL, NULL, NULL, 0, NULL, &override);
+
+ if (errval) {
+ return NULL;
+ }
+ else if (override) {
+ return override;
+ }
+
+ if (!PyArray_Check(op1)) {
+ PyErr_SetString(PyExc_TypeError,
+ "first operand must be array");
+ return NULL;
+ }
+
+ op1_array = (PyArrayObject *)op1;
+
+ /* Create second operand from number array if needed. */
+ if (op2 == NULL) {
+ nop = 2;
+ }
+ else {
+ nop = 3;
+ op2_array = (PyArrayObject *)PyArray_FromAny(op2, NULL,
+ 0, 0, 0, NULL);
+ if (op2_array == NULL) {
+ goto fail;
+ }
+ }
+
+ PyArrayMethodObject *ufuncimpl = NULL;
+ {
+ /* Do all the dtype handling and find the correct ufuncimpl */
+
+ PyArrayObject *tmp_operands[3] = {NULL, NULL, NULL};
+ PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL};
+ PyArray_DTypeMeta *operand_DTypes[3] = {NULL, NULL, NULL};
+ /*
+ * Create dtypes array for either one or two input operands.
+ * Compare to the logic in `convert_ufunc_arguments`.
+ * TODO: It may be good to review some of this behaviour, since the
+ * operand array is special (it is written to) similar to reductions.
+ * Using unsafe-casting as done here, is likely not desirable.
+ */
+ tmp_operands[0] = op1_array;
+ operand_DTypes[0] = NPY_DTYPE(PyArray_DESCR(op1_array));
+ Py_INCREF(operand_DTypes[0]);
+ int force_legacy_promotion = 0;
+ int allow_legacy_promotion = NPY_DT_is_legacy(operand_DTypes[0]);
+
+ if (op2_array != NULL) {
+ tmp_operands[1] = op2_array;
+ operand_DTypes[1] = NPY_DTYPE(PyArray_DESCR(op2_array));
+ Py_INCREF(operand_DTypes[1]);
+ allow_legacy_promotion &= NPY_DT_is_legacy(operand_DTypes[1]);
+ tmp_operands[2] = tmp_operands[0];
+ operand_DTypes[2] = operand_DTypes[0];
+ Py_INCREF(operand_DTypes[2]);
+
+ if (allow_legacy_promotion && ((PyArray_NDIM(op1_array) == 0)
+ != (PyArray_NDIM(op2_array) == 0))) {
+ /* both are legacy and only one is 0-D: force legacy */
+ force_legacy_promotion = should_use_min_scalar(2, tmp_operands, 0, NULL);
+ }
+ }
+ else {
+ tmp_operands[1] = tmp_operands[0];
+ operand_DTypes[1] = operand_DTypes[0];
+ Py_INCREF(operand_DTypes[1]);
+ tmp_operands[2] = NULL;
+ }
+
+ ufuncimpl = promote_and_get_ufuncimpl(ufunc, tmp_operands, signature,
+ operand_DTypes, force_legacy_promotion,
+ allow_legacy_promotion, NPY_FALSE, NPY_FALSE);
+ if (ufuncimpl == NULL) {
+ for (int i = 0; i < 3; i++) {
+ Py_XDECREF(signature[i]);
+ Py_XDECREF(operand_DTypes[i]);
+ }
+ goto fail;
+ }
+
+ /* Find the correct operation_descrs for the operation */
+ int resolve_result = resolve_descriptors(nop, ufunc, ufuncimpl,
+ tmp_operands, operation_descrs, signature, NPY_UNSAFE_CASTING);
+ for (int i = 0; i < 3; i++) {
+ Py_XDECREF(signature[i]);
+ Py_XDECREF(operand_DTypes[i]);
+ }
+ if (resolve_result < 0) {
+ goto fail;
+ }
+ }
+ iter = (PyArrayMapIterObject *)PyArray_MapIterArrayCopyIfOverlap(
+ op1_array, idx, 1, op2_array);
+ if (iter == NULL) {
+ goto fail;
+ }
+ op1_array = iter->array; /* May be updateifcopied on overlap */
+
+ if (op2_array != NULL) {
+ /*
+ * May need to swap axes so that second operand is
+ * iterated over correctly
+ */
+ if ((iter->subspace != NULL) && (iter->consec)) {
+ PyArray_MapIterSwapAxes(iter, &op2_array, 0);
+ if (op2_array == NULL) {
+ /* only on memory allocation failure */
+ goto fail;
+ }
+ }
+
+ /*
+ * Create array iter object for second operand that
+ * "matches" the map iter object for the first operand.
+ * Then we can just iterate over the first and second
+ * operands at the same time and not have to worry about
+ * picking the correct elements from each operand to apply
+ * the ufunc to.
+ */
+ if ((iter2 = (PyArrayIterObject *)\
+ PyArray_BroadcastToShape((PyObject *)op2_array,
+ iter->dimensions, iter->nd))==NULL) {
+ goto fail;
+ }
+ }
+
+ PyArrayMethod_Context context = {
+ .caller = (PyObject *)ufunc,
+ .method = ufuncimpl,
+ .descriptors = operation_descrs,
+ };
+
+ /* Use contiguous strides; if there is such a loop it may be faster */
+ npy_intp strides[3] = {
+ operation_descrs[0]->elsize, operation_descrs[1]->elsize, 0};
+ if (nop == 3) {
+ strides[2] = operation_descrs[2]->elsize;
+ }
+
+ NPY_ARRAYMETHOD_FLAGS flags;
+ if (ufuncimpl->get_strided_loop(&context, 1, 0, strides,
+ &strided_loop, &auxdata, &flags) < 0) {
+ goto fail;
+ }
+ int fast_path = 1;
+ /* check no casting, alignment */
+ if (PyArray_DESCR(op1_array) != operation_descrs[0]) {
+ fast_path = 0;
+ }
+ if (PyArray_DESCR(op1_array) != operation_descrs[nop - 1]) {
+ /* output casting */
+ fast_path = 0;
+ }
+ if (!PyArray_ISALIGNED(op1_array)) {
+ fast_path = 0;
+ }
+ if (nop > 2) {
+ if (PyArray_DESCR(op2_array) != operation_descrs[1]) {
+ fast_path = 0;
+ }
+ if (!PyArray_ISALIGNED(op2_array)) {
+ fast_path = 0;
+ }
+ }
+ if (fast_path == 1) {
+ /*
+ * Try to use trivial loop (1d, no casting, aligned) if
+ * - the matching info has a indexed loop
+ * - idx must be exactly one integer index array
+ * - all operands are 1d
+ * A future enhancement could loosen the restriction on 1d operands
+ * by adding an iteration loop inside trivial_at_loop
+ */
+ if ((ufuncimpl->contiguous_indexed_loop != NULL) &&
+ (PyArray_NDIM(op1_array) == 1) &&
+ (op2_array == NULL || PyArray_NDIM(op2_array) <= 1) &&
+ (iter->subspace_iter == NULL) && (iter->numiter == 1)) {
+ res = trivial_at_loop(ufuncimpl, flags, iter, op1_array,
+ op2_array, &context);
+
+ }
+ else {
+ /* Couldn't use the fastest path, use the faster path */
+ res = ufunc_at__fast_iter(ufunc, flags, iter, iter2, op1_array,
+ op2_array, strided_loop, &context, auxdata);
+ }
+ } else {
+ res = ufunc_at__slow_iter(ufunc, flags, iter, iter2, op1_array,
+ op2_array, operation_descrs, strided_loop, &context,
+ auxdata);
+ }
+
+fail:
NPY_AUXDATA_FREE(auxdata);
- NpyIter_Deallocate(iter_buffer);
Py_XDECREF(op2_array);
Py_XDECREF(iter);
Py_XDECREF(iter2);
for (int i = 0; i < nop; i++) {
- Py_DECREF(signature[i]);
- Py_XDECREF(operand_DTypes[i]);
Py_XDECREF(operation_descrs[i]);
- Py_XDECREF(array_operands[i]);
}
/*
@@ -6309,29 +6508,15 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
* (e.g. `power` released the GIL but manually set an Exception).
*/
if (res != 0 || PyErr_Occurred()) {
+ /* iter_buffer has already been deallocated, don't use NpyIter_Dealloc */
+ if (PyArray_FLAGS(op1_array) & NPY_ARRAY_WRITEBACKIFCOPY) {
+ PyArray_DiscardWritebackIfCopy(op1_array);
+ }
return NULL;
}
else {
Py_RETURN_NONE;
}
-
-fail:
- /* iter_buffer has already been deallocated, don't use NpyIter_Dealloc */
- if (op1_array != (PyArrayObject*)op1) {
- PyArray_DiscardWritebackIfCopy(op1_array);
- }
- Py_XDECREF(op2_array);
- Py_XDECREF(iter);
- Py_XDECREF(iter2);
- for (int i = 0; i < 3; i++) {
- Py_XDECREF(signature[i]);
- Py_XDECREF(operand_DTypes[i]);
- Py_XDECREF(operation_descrs[i]);
- Py_XDECREF(array_operands[i]);
- }
- NPY_AUXDATA_FREE(auxdata);
-
- return NULL;
}
@@ -6454,16 +6639,10 @@ py_resolve_dtypes_generic(PyUFuncObject *ufunc, npy_bool return_context,
Py_INCREF(descr);
dummy_arrays[i] = (PyArrayObject *)PyArray_NewFromDescr_int(
&PyArray_Type, descr, 0, NULL, NULL, NULL,
- 0, NULL, NULL, 0, 1);
+ 0, NULL, NULL, _NPY_ARRAY_ENSURE_DTYPE_IDENTITY);
if (dummy_arrays[i] == NULL) {
goto finish;
}
- if (PyArray_DESCR(dummy_arrays[i]) != descr) {
- PyErr_SetString(PyExc_NotImplementedError,
- "dtype was replaced during array creation, the dtype is "
- "unsupported currently (a subarray dtype?).");
- goto finish;
- }
DTypes[i] = NPY_DTYPE(descr);
Py_INCREF(DTypes[i]);
if (!NPY_DT_is_legacy(DTypes[i])) {
@@ -6473,7 +6652,6 @@ py_resolve_dtypes_generic(PyUFuncObject *ufunc, npy_bool return_context,
/* Explicitly allow int, float, and complex for the "weak" types. */
else if (descr_obj == (PyObject *)&PyLong_Type) {
descr = PyArray_DescrFromType(NPY_LONG);
- Py_INCREF(descr);
dummy_arrays[i] = (PyArrayObject *)PyArray_Empty(0, NULL, descr, 0);
if (dummy_arrays[i] == NULL) {
goto finish;
@@ -6485,7 +6663,6 @@ py_resolve_dtypes_generic(PyUFuncObject *ufunc, npy_bool return_context,
}
else if (descr_obj == (PyObject *)&PyFloat_Type) {
descr = PyArray_DescrFromType(NPY_DOUBLE);
- Py_INCREF(descr);
dummy_arrays[i] = (PyArrayObject *)PyArray_Empty(0, NULL, descr, 0);
if (dummy_arrays[i] == NULL) {
goto finish;
@@ -6497,7 +6674,6 @@ py_resolve_dtypes_generic(PyUFuncObject *ufunc, npy_bool return_context,
}
else if (descr_obj == (PyObject *)&PyComplex_Type) {
descr = PyArray_DescrFromType(NPY_CDOUBLE);
- Py_INCREF(descr);
dummy_arrays[i] = (PyArrayObject *)PyArray_Empty(0, NULL, descr, 0);
if (dummy_arrays[i] == NULL) {
goto finish;
@@ -6887,7 +7063,7 @@ static PyObject *
ufunc_get_identity(PyUFuncObject *ufunc, void *NPY_UNUSED(ignored))
{
npy_bool reorderable;
- return _get_identity(ufunc, &reorderable);
+ return PyUFunc_GetDefaultIdentity(ufunc, &reorderable);
}
static PyObject *
diff --git a/numpy/core/src/umath/ufunc_object.h b/numpy/core/src/umath/ufunc_object.h
index 32af6c58e..1b80bb2df 100644
--- a/numpy/core/src/umath/ufunc_object.h
+++ b/numpy/core/src/umath/ufunc_object.h
@@ -4,14 +4,17 @@
#include <numpy/ufuncobject.h>
NPY_NO_EXPORT PyObject *
-ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args);
+ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(arg));
NPY_NO_EXPORT PyObject *
-ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *args);
+ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *arg);
NPY_NO_EXPORT const char*
ufunc_get_name_cstr(PyUFuncObject *ufunc);
+NPY_NO_EXPORT PyObject *
+PyUFunc_GetDefaultIdentity(PyUFuncObject *ufunc, npy_bool *reorderable);
+
/* strings from umathmodule.c that are interned on umath import */
NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_ufunc;
NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_prepare;
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index a0a16a0f9..decd26580 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -362,7 +362,7 @@ PyUFunc_SimpleBinaryComparisonTypeResolver(PyUFuncObject *ufunc,
&& PyArray_ISDATETIME(operands[1])
&& type_num1 != type_num2) {
/*
- * Reject mixed datetime and timedelta explictly, this was always
+ * Reject mixed datetime and timedelta explicitly, this was always
* implicitly rejected because casting fails (except with
* `casting="unsafe"` admittedly).
* This is required to ensure that `==` and `!=` can correctly
@@ -381,8 +381,28 @@ PyUFunc_SimpleBinaryComparisonTypeResolver(PyUFuncObject *ufunc,
if (out_dtypes[0] == NULL) {
return -1;
}
- out_dtypes[1] = out_dtypes[0];
- Py_INCREF(out_dtypes[1]);
+ if (PyArray_ISINTEGER(operands[0])
+ && PyArray_ISINTEGER(operands[1])
+ && !PyDataType_ISINTEGER(out_dtypes[0])) {
+ /*
+ * NumPy promotion allows uint+int to go to float, avoid it
+ * (input must have been a mix of signed and unsigned)
+ */
+ if (PyArray_ISSIGNED(operands[0])) {
+ Py_SETREF(out_dtypes[0], PyArray_DescrFromType(NPY_LONGLONG));
+ out_dtypes[1] = PyArray_DescrFromType(NPY_ULONGLONG);
+ Py_INCREF(out_dtypes[1]);
+ }
+ else {
+ Py_SETREF(out_dtypes[0], PyArray_DescrFromType(NPY_ULONGLONG));
+ out_dtypes[1] = PyArray_DescrFromType(NPY_LONGLONG);
+ Py_INCREF(out_dtypes[1]);
+ }
+ }
+ else {
+ out_dtypes[1] = out_dtypes[0];
+ Py_INCREF(out_dtypes[1]);
+ }
}
else {
/* Not doing anything will lead to a loop no found error. */
@@ -398,15 +418,8 @@ PyUFunc_SimpleBinaryComparisonTypeResolver(PyUFuncObject *ufunc,
operands, type_tup, out_dtypes);
}
- /* Output type is always boolean */
+ /* Output type is always boolean (cannot fail for builtins) */
out_dtypes[2] = PyArray_DescrFromType(NPY_BOOL);
- if (out_dtypes[2] == NULL) {
- for (i = 0; i < 2; ++i) {
- Py_DECREF(out_dtypes[i]);
- out_dtypes[i] = NULL;
- }
- return -1;
- }
/* Check against the casting rules */
if (PyUFunc_ValidateCasting(ufunc, casting, operands, out_dtypes) < 0) {
diff --git a/numpy/core/src/umath/wrapping_array_method.c b/numpy/core/src/umath/wrapping_array_method.c
index 9f8f036e8..64fea7aeb 100644
--- a/numpy/core/src/umath/wrapping_array_method.c
+++ b/numpy/core/src/umath/wrapping_array_method.c
@@ -177,6 +177,39 @@ wrapping_method_get_loop(
}
+/*
+ * Wraps the original identity function, needs to translate the descriptors
+ * back to the original ones and provide an "original" context (identically to
+ * `get_loop`).
+ * We assume again that translating the descriptors is quick.
+ */
+static int
+wrapping_method_get_identity_function(
+ PyArrayMethod_Context *context, npy_bool reduction_is_empty,
+ char *item)
+{
+ /* Copy the context, and replace descriptors: */
+ PyArrayMethod_Context orig_context = *context;
+ PyArray_Descr *orig_descrs[NPY_MAXARGS];
+ orig_context.descriptors = orig_descrs;
+ orig_context.method = context->method->wrapped_meth;
+
+ int nin = context->method->nin, nout = context->method->nout;
+ PyArray_DTypeMeta **dtypes = context->method->wrapped_dtypes;
+
+ if (context->method->translate_given_descrs(
+ nin, nout, dtypes, context->descriptors, orig_descrs) < 0) {
+ return -1;
+ }
+ int res = context->method->wrapped_meth->get_reduction_initial(
+ &orig_context, reduction_is_empty, item);
+ for (int i = 0; i < nin + nout; i++) {
+ Py_DECREF(orig_descrs[i]);
+ }
+ return res;
+}
+
+
/**
* Allows creating of a fairly lightweight wrapper around an existing ufunc
* loop. The idea is mainly for units, as this is currently slightly limited
@@ -235,14 +268,17 @@ PyUFunc_AddWrappingLoop(PyObject *ufunc_obj,
break;
}
if (wrapped_meth == NULL) {
- PyErr_SetString(PyExc_TypeError,
- "Did not find the to-be-wrapped loop in the ufunc.");
+ PyErr_Format(PyExc_TypeError,
+ "Did not find the to-be-wrapped loop in the ufunc with given "
+ "DTypes. Received wrapping types: %S", wrapped_dt_tuple);
goto finish;
}
PyType_Slot slots[] = {
{NPY_METH_resolve_descriptors, &wrapping_method_resolve_descriptors},
- {NPY_METH_get_loop, &wrapping_method_get_loop},
+ {_NPY_METH_get_loop, &wrapping_method_get_loop},
+ {NPY_METH_get_reduction_initial,
+ &wrapping_method_get_identity_function},
{0, NULL}
};
diff --git a/numpy/core/tests/test_api.py b/numpy/core/tests/test_api.py
index 5006e77f2..b9f2f8ae9 100644
--- a/numpy/core/tests/test_api.py
+++ b/numpy/core/tests/test_api.py
@@ -102,6 +102,16 @@ def test_array_array():
assert_raises(ValueError, np.array, [nested], dtype=np.float64)
# Try with lists...
+ # float32
+ assert_equal(np.array([None] * 10, dtype=np.float32),
+ np.full((10,), np.nan, dtype=np.float32))
+ assert_equal(np.array([[None]] * 10, dtype=np.float32),
+ np.full((10, 1), np.nan, dtype=np.float32))
+ assert_equal(np.array([[None] * 10], dtype=np.float32),
+ np.full((1, 10), np.nan, dtype=np.float32))
+ assert_equal(np.array([[None] * 10] * 10, dtype=np.float32),
+ np.full((10, 10), np.nan, dtype=np.float32))
+ # float64
assert_equal(np.array([None] * 10, dtype=np.float64),
np.full((10,), np.nan, dtype=np.float64))
assert_equal(np.array([[None]] * 10, dtype=np.float64),
@@ -310,7 +320,7 @@ def test_array_astype_warning(t):
@pytest.mark.parametrize(["dtype", "out_dtype"],
[(np.bytes_, np.bool_),
- (np.unicode_, np.bool_),
+ (np.str_, np.bool_),
(np.dtype("S10,S9"), np.dtype("?,?"))])
def test_string_to_boolean_cast(dtype, out_dtype):
"""
@@ -324,7 +334,7 @@ def test_string_to_boolean_cast(dtype, out_dtype):
@pytest.mark.parametrize(["dtype", "out_dtype"],
[(np.bytes_, np.bool_),
- (np.unicode_, np.bool_),
+ (np.str_, np.bool_),
(np.dtype("S10,S9"), np.dtype("?,?"))])
def test_string_to_boolean_cast_errors(dtype, out_dtype):
"""
diff --git a/numpy/core/tests/test_argparse.py b/numpy/core/tests/test_argparse.py
index 63a01dee4..fae227027 100644
--- a/numpy/core/tests/test_argparse.py
+++ b/numpy/core/tests/test_argparse.py
@@ -53,8 +53,8 @@ def test_multiple_values():
def test_string_fallbacks():
# We can (currently?) use numpy strings to test the "slow" fallbacks
# that should normally not be taken due to string interning.
- arg2 = np.unicode_("arg2")
- missing_arg = np.unicode_("missing_arg")
+ arg2 = np.str_("arg2")
+ missing_arg = np.str_("missing_arg")
func(1, **{arg2: 3})
with pytest.raises(TypeError,
match="got an unexpected keyword argument 'missing_arg'"):
diff --git a/numpy/core/tests/test_array_coercion.py b/numpy/core/tests/test_array_coercion.py
index fade57292..d5373f642 100644
--- a/numpy/core/tests/test_array_coercion.py
+++ b/numpy/core/tests/test_array_coercion.py
@@ -1,6 +1,6 @@
"""
Tests for array coercion, mainly through testing `np.array` results directly.
-Note that other such tests exist e.g. in `test_api.py` and many corner-cases
+Note that other such tests exist, e.g., in `test_api.py` and many corner-cases
are tested (sometimes indirectly) elsewhere.
"""
@@ -20,8 +20,8 @@ from numpy.testing import (
def arraylikes():
"""
Generator for functions converting an array into various array-likes.
- If full is True (default) includes array-likes not capable of handling
- all dtypes
+ If full is True (default) it includes array-likes not capable of handling
+ all dtypes.
"""
# base array:
def ndarray(a):
@@ -39,9 +39,9 @@ def arraylikes():
yield subclass
class _SequenceLike():
- # We are giving a warning that array-like's were also expected to be
- # sequence-like in `np.array([array_like])`, this can be removed
- # when the deprecation exired (started NumPy 1.20)
+ # Older NumPy versions, sometimes cared whether a protocol array was
+ # also _SequenceLike. This shouldn't matter, but keep it for now
+ # for __array__ and not the others.
def __len__(self):
raise TypeError
@@ -62,7 +62,7 @@ def arraylikes():
yield param(memoryview, id="memoryview")
# Array-interface
- class ArrayInterface(_SequenceLike):
+ class ArrayInterface:
def __init__(self, a):
self.a = a # need to hold on to keep interface valid
self.__array_interface__ = a.__array_interface__
@@ -70,7 +70,7 @@ def arraylikes():
yield param(ArrayInterface, id="__array_interface__")
# Array-Struct
- class ArrayStruct(_SequenceLike):
+ class ArrayStruct:
def __init__(self, a):
self.a = a # need to hold on to keep struct valid
self.__array_struct__ = a.__array_struct__
@@ -130,7 +130,7 @@ def scalar_instances(times=True, extended_precision=True, user_dtype=True):
# Strings and unstructured void:
yield param(np.bytes_(b"1234"), id="bytes")
- yield param(np.unicode_("2345"), id="unicode")
+ yield param(np.str_("2345"), id="unicode")
yield param(np.void(b"4321"), id="unstructured_void")
@@ -161,8 +161,12 @@ class TestStringDiscovery:
# A nested array is also discovered correctly
arr = np.array(obj, dtype="O")
assert np.array(arr, dtype="S").dtype == expected
+ # Also if we use the dtype class
+ assert np.array(arr, dtype=type(expected)).dtype == expected
# Check that .astype() behaves identical
assert arr.astype("S").dtype == expected
+ # The DType class is accepted by `.astype()`
+ assert arr.astype(type(np.dtype("S"))).dtype == expected
@pytest.mark.parametrize("obj",
[object(), 1.2, 10**43, None, "string"],
@@ -257,7 +261,7 @@ class TestScalarDiscovery:
@pytest.mark.parametrize("scalar", scalar_instances())
def test_scalar_coercion(self, scalar):
# This tests various scalar coercion paths, mainly for the numerical
- # types. It includes some paths not directly related to `np.array`
+ # types. It includes some paths not directly related to `np.array`.
if isinstance(scalar, np.inexact):
# Ensure we have a full-precision number if available
scalar = type(scalar)((scalar * 2)**0.5)
@@ -292,7 +296,7 @@ class TestScalarDiscovery:
* `np.array(scalar, dtype=dtype)`
* `np.empty((), dtype=dtype)[()] = scalar`
* `np.array(scalar).astype(dtype)`
- should behave the same. The only exceptions are paramteric dtypes
+ should behave the same. The only exceptions are parametric dtypes
(mainly datetime/timedelta without unit) and void without fields.
"""
dtype = cast_to.dtype # use to parametrize only the target dtype
@@ -384,7 +388,7 @@ class TestScalarDiscovery:
"""
dtype = np.dtype(dtype)
- # This is a special case using casting logic. It warns for the NaN
+ # This is a special case using casting logic. It warns for the NaN
# but allows the cast (giving undefined behaviour).
with np.errstate(invalid="ignore"):
coerced = np.array(scalar, dtype=dtype)
@@ -650,6 +654,14 @@ class TestArrayLikes:
res = np.array([obj], dtype=object)
assert res[0] is obj
+ @pytest.mark.parametrize("arraylike", arraylikes())
+ @pytest.mark.parametrize("arr", [np.array(0.), np.arange(4)])
+ def test_object_assignment_special_case(self, arraylike, arr):
+ obj = arraylike(arr)
+ empty = np.arange(1, dtype=object)
+ empty[:] = [obj]
+ assert empty[0] is obj
+
def test_0d_generic_special_case(self):
class ArraySubclass(np.ndarray):
def __float__(self):
@@ -833,3 +845,28 @@ class TestSpecialAttributeLookupFailure:
np.array(self.WeirdArrayLike())
with pytest.raises(RuntimeError):
np.array(self.WeirdArrayInterface())
+
+
+def test_subarray_from_array_construction():
+ # Arrays are more complex, since they "broadcast" on success:
+ arr = np.array([1, 2])
+
+ res = arr.astype("(2)i,")
+ assert_array_equal(res, [[1, 1], [2, 2]])
+
+ res = np.array(arr, dtype="(2)i,")
+
+ assert_array_equal(res, [[1, 1], [2, 2]])
+
+ res = np.array([[(1,), (2,)], arr], dtype="(2)i,")
+ assert_array_equal(res, [[[1, 1], [2, 2]], [[1, 1], [2, 2]]])
+
+ # Also try a multi-dimensional example:
+ arr = np.arange(5 * 2).reshape(5, 2)
+ expected = np.broadcast_to(arr[:, :, np.newaxis, np.newaxis], (5, 2, 2, 2))
+
+ res = arr.astype("(2,2)f")
+ assert_array_equal(res, expected)
+
+ res = np.array(arr, dtype="(2,2)f")
+ assert_array_equal(res, expected)
diff --git a/numpy/core/tests/test_arraymethod.py b/numpy/core/tests/test_arraymethod.py
index 6b75d1921..4fd4d5558 100644
--- a/numpy/core/tests/test_arraymethod.py
+++ b/numpy/core/tests/test_arraymethod.py
@@ -64,7 +64,6 @@ class TestSimpleStridedCall:
self.method._simple_strided_call(*args)
-@pytest.mark.skipif(sys.version_info < (3, 9), reason="Requires python 3.9")
@pytest.mark.parametrize(
"cls", [np.ndarray, np.recarray, np.chararray, np.matrix, np.memmap]
)
@@ -84,10 +83,3 @@ class TestClassGetItem:
match = f"Too {'few' if arg_len == 0 else 'many'} arguments"
with pytest.raises(TypeError, match=match):
cls[arg_tup]
-
-
-@pytest.mark.skipif(sys.version_info >= (3, 9), reason="Requires python 3.8")
-def test_class_getitem_38() -> None:
- match = "Type subscription requires python >= 3.9"
- with pytest.raises(TypeError, match=match):
- np.ndarray[Any, Any]
diff --git a/numpy/core/tests/test_arrayprint.py b/numpy/core/tests/test_arrayprint.py
index f1883f703..6796b4077 100644
--- a/numpy/core/tests/test_arrayprint.py
+++ b/numpy/core/tests/test_arrayprint.py
@@ -9,6 +9,7 @@ from numpy.testing import (
assert_, assert_equal, assert_raises, assert_warns, HAS_REFCOUNT,
assert_raises_regex,
)
+from numpy.core.arrayprint import _typelessdata
import textwrap
class TestArrayRepr:
@@ -257,8 +258,7 @@ class TestArray2String:
assert_(np.array2string(s, formatter={'numpystr':lambda s: s*2}) ==
'[abcabc defdef]')
-
- def test_structure_format(self):
+ def test_structure_format_mixed(self):
dt = np.dtype([('name', np.str_, 16), ('grades', np.float64, (2,))])
x = np.array([('Sarah', (8.0, 7.0)), ('John', (6.0, 7.0))], dtype=dt)
assert_equal(np.array2string(x),
@@ -300,6 +300,7 @@ class TestArray2String:
( 'NaT',) ( 'NaT',) ( 'NaT',)]""")
)
+ def test_structure_format_int(self):
# See #8160
struct_int = np.array([([1, -1],), ([123, 1],)], dtype=[('B', 'i4', 2)])
assert_equal(np.array2string(struct_int),
@@ -309,6 +310,7 @@ class TestArray2String:
assert_equal(np.array2string(struct_2dint),
"[([[ 0, 1], [ 2, 3]],) ([[12, 0], [ 0, 0]],)]")
+ def test_structure_format_float(self):
# See #8172
array_scalar = np.array(
(1., 2.1234567890123456789, 3.), dtype=('f8,f8,f8'))
@@ -352,6 +354,33 @@ class TestArray2String:
' [ 501, 502, 503, ..., 999, 1000, 1001]])'
assert_equal(repr(A), reprA)
+ def test_summarize_structure(self):
+ A = (np.arange(2002, dtype="<i8").reshape(2, 1001)
+ .view([('i', "<i8", (1001,))]))
+ strA = ("[[([ 0, 1, 2, ..., 998, 999, 1000],)]\n"
+ " [([1001, 1002, 1003, ..., 1999, 2000, 2001],)]]")
+ assert_equal(str(A), strA)
+
+ reprA = ("array([[([ 0, 1, 2, ..., 998, 999, 1000],)],\n"
+ " [([1001, 1002, 1003, ..., 1999, 2000, 2001],)]],\n"
+ " dtype=[('i', '<i8', (1001,))])")
+ assert_equal(repr(A), reprA)
+
+ B = np.ones(2002, dtype=">i8").view([('i', ">i8", (2, 1001))])
+ strB = "[([[1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 1]],)]"
+ assert_equal(str(B), strB)
+
+ reprB = (
+ "array([([[1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 1]],)],\n"
+ " dtype=[('i', '>i8', (2, 1001))])"
+ )
+ assert_equal(repr(B), reprB)
+
+ C = (np.arange(22, dtype="<i8").reshape(2, 11)
+ .view([('i1', "<i8"), ('i10', "<i8", (10,))]))
+ strC = "[[( 0, [ 1, ..., 10])]\n [(11, [12, ..., 21])]]"
+ assert_equal(np.array2string(C, threshold=1, edgeitems=1), strC)
+
def test_linewidth(self):
a = np.full(6, 1)
@@ -796,6 +825,47 @@ class TestPrintOptions:
array(['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1'],
dtype='{}')""".format(styp)))
+ @pytest.mark.parametrize(
+ ['native'],
+ [
+ ('bool',),
+ ('uint8',),
+ ('uint16',),
+ ('uint32',),
+ ('uint64',),
+ ('int8',),
+ ('int16',),
+ ('int32',),
+ ('int64',),
+ ('float16',),
+ ('float32',),
+ ('float64',),
+ ('U1',), # 4-byte width string
+ ],
+ )
+ def test_dtype_endianness_repr(self, native):
+ '''
+ there was an issue where
+ repr(array([0], dtype='<u2')) and repr(array([0], dtype='>u2'))
+ both returned the same thing:
+ array([0], dtype=uint16)
+ even though their dtypes have different endianness.
+ '''
+ native_dtype = np.dtype(native)
+ non_native_dtype = native_dtype.newbyteorder()
+ non_native_repr = repr(np.array([1], non_native_dtype))
+ native_repr = repr(np.array([1], native_dtype))
+ # preserve the sensible default of only showing dtype if nonstandard
+ assert ('dtype' in native_repr) ^ (native_dtype in _typelessdata),\
+ ("an array's repr should show dtype if and only if the type "
+ 'of the array is NOT one of the standard types '
+ '(e.g., int32, bool, float64).')
+ if non_native_dtype.itemsize > 1:
+ # if the type is >1 byte, the non-native endian version
+ # must show endianness.
+ assert non_native_repr != native_repr
+ assert f"dtype='{non_native_dtype.byteorder}" in non_native_repr
+
def test_linewidth_repr(self):
a = np.full(7, fill_value=2)
np.set_printoptions(linewidth=17)
@@ -921,6 +991,16 @@ class TestPrintOptions:
[[ 0.]]]])""")
)
+ def test_edgeitems_structured(self):
+ np.set_printoptions(edgeitems=1, threshold=1)
+ A = np.arange(5*2*3, dtype="<i8").view([('i', "<i8", (5, 2, 3))])
+ reprA = (
+ "array([([[[ 0, ..., 2], [ 3, ..., 5]], ..., "
+ "[[24, ..., 26], [27, ..., 29]]],)],\n"
+ " dtype=[('i', '<i8', (5, 2, 3))])"
+ )
+ assert_equal(repr(A), reprA)
+
def test_bad_args(self):
assert_raises(ValueError, np.set_printoptions, threshold=float('nan'))
assert_raises(TypeError, np.set_printoptions, threshold='1')
diff --git a/numpy/core/tests/test_cpu_features.py b/numpy/core/tests/test_cpu_features.py
index 1a76897e2..2fad4dfd9 100644
--- a/numpy/core/tests/test_cpu_features.py
+++ b/numpy/core/tests/test_cpu_features.py
@@ -1,5 +1,14 @@
import sys, platform, re, pytest
-from numpy.core._multiarray_umath import __cpu_features__
+from numpy.core._multiarray_umath import (
+ __cpu_features__,
+ __cpu_baseline__,
+ __cpu_dispatch__,
+)
+import numpy as np
+import subprocess
+import pathlib
+import os
+import re
def assert_features_equal(actual, desired, fname):
__tracebackhide__ = True # Hide traceback for py.test
@@ -8,7 +17,7 @@ def assert_features_equal(actual, desired, fname):
return
detected = str(__cpu_features__).replace("'", "")
try:
- with open("/proc/cpuinfo", "r") as fd:
+ with open("/proc/cpuinfo") as fd:
cpuinfo = fd.read(2048)
except Exception as err:
cpuinfo = str(err)
@@ -48,6 +57,10 @@ def assert_features_equal(actual, desired, fname):
"%s"
) % (fname, actual, desired, error_report))
+def _text_to_list(txt):
+ out = txt.strip("][\n").replace("'", "").split(', ')
+ return None if out[0] == "" else out
+
class AbstractTest:
features = []
features_groups = {}
@@ -92,7 +105,6 @@ class AbstractTest:
return values
def load_flags_auxv(self):
- import subprocess
auxv = subprocess.check_output(['/bin/true'], env=dict(LD_SHOW_AUXV="1"))
for at in auxv.split(b'\n'):
if not at.startswith(b"AT_HWCAP"):
@@ -103,6 +115,208 @@ class AbstractTest:
hwcap_value[1].upper().decode().split()
)
+@pytest.mark.skipif(
+ sys.platform == 'emscripten',
+ reason= (
+ "The subprocess module is not available on WASM platforms and"
+ " therefore this test class cannot be properly executed."
+ ),
+)
+class TestEnvPrivation:
+ cwd = pathlib.Path(__file__).parent.resolve()
+ env = os.environ.copy()
+ _enable = os.environ.pop('NPY_ENABLE_CPU_FEATURES', None)
+ _disable = os.environ.pop('NPY_DISABLE_CPU_FEATURES', None)
+ SUBPROCESS_ARGS = dict(cwd=cwd, capture_output=True, text=True, check=True)
+ unavailable_feats = [
+ feat for feat in __cpu_dispatch__ if not __cpu_features__[feat]
+ ]
+ UNAVAILABLE_FEAT = (
+ None if len(unavailable_feats) == 0
+ else unavailable_feats[0]
+ )
+ BASELINE_FEAT = None if len(__cpu_baseline__) == 0 else __cpu_baseline__[0]
+ SCRIPT = """
+def main():
+ from numpy.core._multiarray_umath import __cpu_features__, __cpu_dispatch__
+
+ detected = [feat for feat in __cpu_dispatch__ if __cpu_features__[feat]]
+ print(detected)
+
+if __name__ == "__main__":
+ main()
+ """
+
+ @pytest.fixture(autouse=True)
+ def setup_class(self, tmp_path_factory):
+ file = tmp_path_factory.mktemp("runtime_test_script")
+ file /= "_runtime_detect.py"
+ file.write_text(self.SCRIPT)
+ self.file = file
+ return
+
+ def _run(self):
+ return subprocess.run(
+ [sys.executable, self.file],
+ env=self.env,
+ **self.SUBPROCESS_ARGS,
+ )
+
+ # Helper function mimicking pytest.raises for subprocess call
+ def _expect_error(
+ self,
+ msg,
+ err_type,
+ no_error_msg="Failed to generate error"
+ ):
+ try:
+ self._run()
+ except subprocess.CalledProcessError as e:
+ assertion_message = f"Expected: {msg}\nGot: {e.stderr}"
+ assert re.search(msg, e.stderr), assertion_message
+
+ assertion_message = (
+ f"Expected error of type: {err_type}; see full "
+ f"error:\n{e.stderr}"
+ )
+ assert re.search(err_type, e.stderr), assertion_message
+ else:
+ assert False, no_error_msg
+
+ def setup_method(self):
+ """Ensure that the environment is reset"""
+ self.env = os.environ.copy()
+ return
+
+ def test_runtime_feature_selection(self):
+ """
+ Ensure that when selecting `NPY_ENABLE_CPU_FEATURES`, only the
+ features exactly specified are dispatched.
+ """
+
+ # Capture runtime-enabled features
+ out = self._run()
+ non_baseline_features = _text_to_list(out.stdout)
+
+ if non_baseline_features is None:
+ pytest.skip(
+ "No dispatchable features outside of baseline detected."
+ )
+ feature = non_baseline_features[0]
+
+ # Capture runtime-enabled features when `NPY_ENABLE_CPU_FEATURES` is
+ # specified
+ self.env['NPY_ENABLE_CPU_FEATURES'] = feature
+ out = self._run()
+ enabled_features = _text_to_list(out.stdout)
+
+ # Ensure that only one feature is enabled, and it is exactly the one
+ # specified by `NPY_ENABLE_CPU_FEATURES`
+ assert set(enabled_features) == {feature}
+
+ if len(non_baseline_features) < 2:
+ pytest.skip("Only one non-baseline feature detected.")
+ # Capture runtime-enabled features when `NPY_ENABLE_CPU_FEATURES` is
+ # specified
+ self.env['NPY_ENABLE_CPU_FEATURES'] = ",".join(non_baseline_features)
+ out = self._run()
+ enabled_features = _text_to_list(out.stdout)
+
+ # Ensure that both features are enabled, and they are exactly the ones
+ # specified by `NPY_ENABLE_CPU_FEATURES`
+ assert set(enabled_features) == set(non_baseline_features)
+ return
+
+ @pytest.mark.parametrize("enabled, disabled",
+ [
+ ("feature", "feature"),
+ ("feature", "same"),
+ ])
+ def test_both_enable_disable_set(self, enabled, disabled):
+ """
+ Ensure that when both environment variables are set then an
+ ImportError is thrown
+ """
+ self.env['NPY_ENABLE_CPU_FEATURES'] = enabled
+ self.env['NPY_DISABLE_CPU_FEATURES'] = disabled
+ msg = "Both NPY_DISABLE_CPU_FEATURES and NPY_ENABLE_CPU_FEATURES"
+ err_type = "ImportError"
+ self._expect_error(msg, err_type)
+
+ @pytest.mark.skipif(
+ not __cpu_dispatch__,
+ reason=(
+ "NPY_*_CPU_FEATURES only parsed if "
+ "`__cpu_dispatch__` is non-empty"
+ )
+ )
+ @pytest.mark.parametrize("action", ["ENABLE", "DISABLE"])
+ def test_variable_too_long(self, action):
+ """
+ Test that an error is thrown if the environment variables are too long
+ to be processed. Current limit is 1024, but this may change later.
+ """
+ MAX_VAR_LENGTH = 1024
+ # Actual length is MAX_VAR_LENGTH + 1 due to null-termination
+ self.env[f'NPY_{action}_CPU_FEATURES'] = "t" * MAX_VAR_LENGTH
+ msg = (
+ f"Length of environment variable 'NPY_{action}_CPU_FEATURES' is "
+ f"{MAX_VAR_LENGTH + 1}, only {MAX_VAR_LENGTH} accepted"
+ )
+ err_type = "RuntimeError"
+ self._expect_error(msg, err_type)
+
+ @pytest.mark.skipif(
+ not __cpu_dispatch__,
+ reason=(
+ "NPY_*_CPU_FEATURES only parsed if "
+ "`__cpu_dispatch__` is non-empty"
+ )
+ )
+ def test_impossible_feature_disable(self):
+ """
+ Test that a RuntimeError is thrown if an impossible feature-disabling
+ request is made. This includes disabling a baseline feature.
+ """
+
+ if self.BASELINE_FEAT is None:
+ pytest.skip("There are no unavailable features to test with")
+ bad_feature = self.BASELINE_FEAT
+ self.env['NPY_DISABLE_CPU_FEATURES'] = bad_feature
+ msg = (
+ f"You cannot disable CPU feature '{bad_feature}', since it is "
+ "part of the baseline optimizations"
+ )
+ err_type = "RuntimeError"
+ self._expect_error(msg, err_type)
+
+ def test_impossible_feature_enable(self):
+ """
+ Test that a RuntimeError is thrown if an impossible feature-enabling
+ request is made. This includes enabling a feature not supported by the
+ machine, or disabling a baseline optimization.
+ """
+
+ if self.UNAVAILABLE_FEAT is None:
+ pytest.skip("There are no unavailable features to test with")
+ bad_feature = self.UNAVAILABLE_FEAT
+ self.env['NPY_ENABLE_CPU_FEATURES'] = bad_feature
+ msg = (
+ f"You cannot enable CPU features \\({bad_feature}\\), since "
+ "they are not supported by your machine."
+ )
+ err_type = "RuntimeError"
+ self._expect_error(msg, err_type)
+
+ # Ensure that only the bad feature gets reported
+ feats = f"{bad_feature}, {self.BASELINE_FEAT}"
+ self.env['NPY_ENABLE_CPU_FEATURES'] = feats
+ msg = (
+ f"You cannot enable CPU features \\({bad_feature}\\), since they "
+ "are not supported by your machine."
+ )
+ self._expect_error(msg, err_type)
+
is_linux = sys.platform.startswith('linux')
is_cygwin = sys.platform.startswith('cygwin')
machine = platform.machine()
@@ -116,7 +330,7 @@ class Test_X86_Features(AbstractTest):
"AVX", "F16C", "XOP", "FMA4", "FMA3", "AVX2", "AVX512F", "AVX512CD",
"AVX512ER", "AVX512PF", "AVX5124FMAPS", "AVX5124VNNIW", "AVX512VPOPCNTDQ",
"AVX512VL", "AVX512BW", "AVX512DQ", "AVX512VNNI", "AVX512IFMA",
- "AVX512VBMI", "AVX512VBMI2", "AVX512BITALG",
+ "AVX512VBMI", "AVX512VBMI2", "AVX512BITALG", "AVX512FP16",
]
features_groups = dict(
AVX512_KNL = ["AVX512F", "AVX512CD", "AVX512ER", "AVX512PF"],
@@ -128,6 +342,10 @@ class Test_X86_Features(AbstractTest):
"AVX512VBMI"],
AVX512_ICL = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ", "AVX512VL", "AVX512IFMA",
"AVX512VBMI", "AVX512VNNI", "AVX512VBMI2", "AVX512BITALG", "AVX512VPOPCNTDQ"],
+ AVX512_SPR = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ",
+ "AVX512VL", "AVX512IFMA", "AVX512VBMI", "AVX512VNNI",
+ "AVX512VBMI2", "AVX512BITALG", "AVX512VPOPCNTDQ",
+ "AVX512FP16"],
)
features_map = dict(
SSE3="PNI", SSE41="SSE4_1", SSE42="SSE4_2", FMA3="FMA",
diff --git a/numpy/core/tests/test_custom_dtypes.py b/numpy/core/tests/test_custom_dtypes.py
index 6bcc45d6b..da6a4bd50 100644
--- a/numpy/core/tests/test_custom_dtypes.py
+++ b/numpy/core/tests/test_custom_dtypes.py
@@ -45,6 +45,9 @@ class TestSFloat:
# Check the repr, mainly to cover the code paths:
assert repr(SF(scaling=1.)) == "_ScaledFloatTestDType(scaling=1.0)"
+ def test_dtype_name(self):
+ assert SF(1.).name == "_ScaledFloatTestDType64"
+
@pytest.mark.parametrize("scaling", [1., -1., 2.])
def test_sfloat_from_float(self, scaling):
a = np.array([1., 2., 3.]).astype(dtype=SF(scaling))
@@ -199,3 +202,52 @@ class TestSFloat:
# The output casting does not match the bool, bool -> bool loop:
with pytest.raises(TypeError):
ufunc(a, a, out=np.empty(a.shape, dtype=int), casting="equiv")
+
+ def test_wrapped_and_wrapped_reductions(self):
+ a = self._get_array(2.)
+ float_equiv = a.astype(float)
+
+ expected = np.hypot(float_equiv, float_equiv)
+ res = np.hypot(a, a)
+ assert res.dtype == a.dtype
+ res_float = res.view(np.float64) * 2
+ assert_array_equal(res_float, expected)
+
+ # Also check reduction (keepdims, due to incorrect getitem)
+ res = np.hypot.reduce(a, keepdims=True)
+ assert res.dtype == a.dtype
+ expected = np.hypot.reduce(float_equiv, keepdims=True)
+ assert res.view(np.float64) * 2 == expected
+
+ def test_astype_class(self):
+ # Very simple test that we accept `.astype()` also on the class.
+ # ScaledFloat always returns the default descriptor, but it does
+ # check the relevant code paths.
+ arr = np.array([1., 2., 3.], dtype=object)
+
+ res = arr.astype(SF) # passing the class
+ expected = arr.astype(SF(1.)) # above will have discovered 1. scaling
+ assert_array_equal(res.view(np.float64), expected.view(np.float64))
+
+ def test_creation_class(self):
+ arr1 = np.array([1., 2., 3.], dtype=SF)
+ assert arr1.dtype == SF(1.)
+ arr2 = np.array([1., 2., 3.], dtype=SF(1.))
+ assert_array_equal(arr1.view(np.float64), arr2.view(np.float64))
+
+
+def test_type_pickle():
+ # can't actually unpickle, but we can pickle (if in namespace)
+ import pickle
+
+ np._ScaledFloatTestDType = SF
+
+ s = pickle.dumps(SF)
+ res = pickle.loads(s)
+ assert res is SF
+
+ del np._ScaledFloatTestDType
+
+
+def test_is_numeric():
+ assert SF._is_numeric
diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py
index 693eed0e2..547ebf9d6 100644
--- a/numpy/core/tests/test_datetime.py
+++ b/numpy/core/tests/test_datetime.py
@@ -719,8 +719,8 @@ class TestDateTime:
assert_equal(uni_a, uni_b)
# Datetime to long string - gh-9712
- assert_equal(str_a, dt_a.astype((np.string_, 128)))
- str_b = np.empty(str_a.shape, dtype=(np.string_, 128))
+ assert_equal(str_a, dt_a.astype((np.bytes_, 128)))
+ str_b = np.empty(str_a.shape, dtype=(np.bytes_, 128))
str_b[...] = dt_a
assert_equal(str_a, str_b)
@@ -1003,12 +1003,11 @@ class TestDateTime:
casting='unsafe'))
# Shouldn't be able to compare datetime and timedelta
- # TODO: Changing to 'same_kind' or 'safe' casting in the ufuncs by
- # default is needed to properly catch this kind of thing...
a = np.array('2012-12-21', dtype='M8[D]')
b = np.array(3, dtype='m8[D]')
- #assert_raises(TypeError, np.less, a, b)
- assert_raises(TypeError, np.less, a, b, casting='same_kind')
+ assert_raises(TypeError, np.less, a, b)
+ # not even if "unsafe"
+ assert_raises(TypeError, np.less, a, b, casting='unsafe')
def test_datetime_like(self):
a = np.array([3], dtype='m8[4D]')
@@ -2331,16 +2330,23 @@ class TestDateTime:
assert_equal(np.busday_count('2011-01-01', dates, busdaycal=bdd),
np.arange(366))
# Returns negative value when reversed
+ # -1 since '2011-01-01' is not a busday
assert_equal(np.busday_count(dates, '2011-01-01', busdaycal=bdd),
- -np.arange(366))
+ -np.arange(366) - 1)
+ # 2011-12-31 is a Saturday
dates = np.busday_offset('2011-12-31', -np.arange(366),
roll='forward', busdaycal=bdd)
+ # only the first generated date is in the future of 2011-12-31
+ expected = np.arange(366)
+ expected[0] = -1
assert_equal(np.busday_count(dates, '2011-12-31', busdaycal=bdd),
- np.arange(366))
+ expected)
# Returns negative value when reversed
+ expected = -np.arange(366)+1
+ expected[0] = 0
assert_equal(np.busday_count('2011-12-31', dates, busdaycal=bdd),
- -np.arange(366))
+ expected)
# Can't supply both a weekmask/holidays and busdaycal
assert_raises(ValueError, np.busday_offset, '2012-01-03', '2012-02-03',
@@ -2353,6 +2359,17 @@ class TestDateTime:
# Returns negative value when reversed
assert_equal(np.busday_count('2011-04', '2011-03', weekmask='Mon'), -4)
+ sunday = np.datetime64('2023-03-05')
+ monday = sunday + 1
+ friday = sunday + 5
+ saturday = sunday + 6
+ assert_equal(np.busday_count(sunday, monday), 0)
+ assert_equal(np.busday_count(monday, sunday), -1)
+
+ assert_equal(np.busday_count(friday, saturday), 1)
+ assert_equal(np.busday_count(saturday, friday), 0)
+
+
def test_datetime_is_busday(self):
holidays = ['2011-01-01', '2011-10-10', '2011-11-11', '2011-11-24',
'2011-12-25', '2011-05-30', '2011-02-21', '2011-01-17',
@@ -2530,3 +2547,23 @@ class TestDateTimeData:
dt = np.datetime64('2000', '5μs')
assert np.datetime_data(dt.dtype) == ('us', 5)
+
+
+def test_comparisons_return_not_implemented():
+ # GH#17017
+
+ class custom:
+ __array_priority__ = 10000
+
+ obj = custom()
+
+ dt = np.datetime64('2000', 'ns')
+ td = dt - dt
+
+ for item in [dt, td]:
+ assert item.__eq__(obj) is NotImplemented
+ assert item.__ne__(obj) is NotImplemented
+ assert item.__le__(obj) is NotImplemented
+ assert item.__lt__(obj) is NotImplemented
+ assert item.__ge__(obj) is NotImplemented
+ assert item.__gt__(obj) is NotImplemented
diff --git a/numpy/core/tests/test_defchararray.py b/numpy/core/tests/test_defchararray.py
index 22296604e..39699f457 100644
--- a/numpy/core/tests/test_defchararray.py
+++ b/numpy/core/tests/test_defchararray.py
@@ -1,3 +1,5 @@
+import pytest
+
import numpy as np
from numpy.core.multiarray import _vec_string
from numpy.testing import (
@@ -29,7 +31,7 @@ class TestBasic:
def test_from_string_array(self):
A = np.array([[b'abc', b'foo'],
[b'long ', b'0123456789']])
- assert_equal(A.dtype.type, np.string_)
+ assert_equal(A.dtype.type, np.bytes_)
B = np.char.array(A)
assert_array_equal(B, A)
assert_equal(B.dtype, A.dtype)
@@ -46,7 +48,7 @@ class TestBasic:
def test_from_unicode_array(self):
A = np.array([['abc', 'Sigma \u03a3'],
['long ', '0123456789']])
- assert_equal(A.dtype.type, np.unicode_)
+ assert_equal(A.dtype.type, np.str_)
B = np.char.array(A)
assert_array_equal(B, A)
assert_equal(B.dtype, A.dtype)
@@ -64,40 +66,40 @@ class TestBasic:
def test_unicode_upconvert(self):
A = np.char.array(['abc'])
B = np.char.array(['\u03a3'])
- assert_(issubclass((A + B).dtype.type, np.unicode_))
+ assert_(issubclass((A + B).dtype.type, np.str_))
def test_from_string(self):
A = np.char.array(b'abc')
assert_equal(len(A), 1)
assert_equal(len(A[0]), 3)
- assert_(issubclass(A.dtype.type, np.string_))
+ assert_(issubclass(A.dtype.type, np.bytes_))
def test_from_unicode(self):
A = np.char.array('\u03a3')
assert_equal(len(A), 1)
assert_equal(len(A[0]), 1)
assert_equal(A.itemsize, 4)
- assert_(issubclass(A.dtype.type, np.unicode_))
+ assert_(issubclass(A.dtype.type, np.str_))
class TestVecString:
def test_non_existent_method(self):
def fail():
- _vec_string('a', np.string_, 'bogus')
+ _vec_string('a', np.bytes_, 'bogus')
assert_raises(AttributeError, fail)
def test_non_string_array(self):
def fail():
- _vec_string(1, np.string_, 'strip')
+ _vec_string(1, np.bytes_, 'strip')
assert_raises(TypeError, fail)
def test_invalid_args_tuple(self):
def fail():
- _vec_string(['a'], np.string_, 'strip', 1)
+ _vec_string(['a'], np.bytes_, 'strip', 1)
assert_raises(TypeError, fail)
@@ -111,7 +113,7 @@ class TestVecString:
def test_invalid_function_args(self):
def fail():
- _vec_string(['a'], np.string_, 'strip', (1,))
+ _vec_string(['a'], np.bytes_, 'strip', (1,))
assert_raises(TypeError, fail)
@@ -190,7 +192,7 @@ class TestComparisonsMixed1(TestComparisons):
def setup_method(self):
TestComparisons.setup_method(self)
self.B = np.array([['efg', '123 '],
- ['051', 'tuv']], np.unicode_).view(np.chararray)
+ ['051', 'tuv']], np.str_).view(np.chararray)
class TestComparisonsMixed2(TestComparisons):
"""Ticket #1276"""
@@ -198,7 +200,7 @@ class TestComparisonsMixed2(TestComparisons):
def setup_method(self):
TestComparisons.setup_method(self)
self.A = np.array([['abc', '123'],
- ['789', 'xyz']], np.unicode_).view(np.chararray)
+ ['789', 'xyz']], np.str_).view(np.chararray)
class TestInformation:
def setup_method(self):
@@ -320,17 +322,17 @@ class TestMethods:
tgt = [[b' abc ', b''],
[b'12345', b'Mixedcase'],
[b'123 \t 345 \0 ', b'Upper']]
- assert_(issubclass(self.A.capitalize().dtype.type, np.string_))
+ assert_(issubclass(self.A.capitalize().dtype.type, np.bytes_))
assert_array_equal(self.A.capitalize(), tgt)
tgt = [[' \u03c3 ', ''],
['12345', 'Mixedcase'],
['123 \t 345 \0 ', 'Upper']]
- assert_(issubclass(self.B.capitalize().dtype.type, np.unicode_))
+ assert_(issubclass(self.B.capitalize().dtype.type, np.str_))
assert_array_equal(self.B.capitalize(), tgt)
def test_center(self):
- assert_(issubclass(self.A.center(10).dtype.type, np.string_))
+ assert_(issubclass(self.A.center(10).dtype.type, np.bytes_))
C = self.A.center([10, 20])
assert_array_equal(np.char.str_len(C), [[10, 20], [10, 20], [12, 20]])
@@ -341,7 +343,7 @@ class TestMethods:
C = np.char.center(b'FOO', [[10, 20], [15, 8]])
tgt = [[b' FOO ', b' FOO '],
[b' FOO ', b' FOO ']]
- assert_(issubclass(C.dtype.type, np.string_))
+ assert_(issubclass(C.dtype.type, np.bytes_))
assert_array_equal(C, tgt)
def test_decode(self):
@@ -362,14 +364,14 @@ class TestMethods:
A0 = self.A.decode('ascii')
A = np.char.join([',', '#'], A0)
- assert_(issubclass(A.dtype.type, np.unicode_))
+ assert_(issubclass(A.dtype.type, np.str_))
tgt = np.array([[' ,a,b,c, ', ''],
['1,2,3,4,5', 'M#i#x#e#d#C#a#s#e'],
['1,2,3, ,\t, ,3,4,5, ,\x00, ', 'U#P#P#E#R']])
assert_array_equal(np.char.join([',', '#'], A0), tgt)
def test_ljust(self):
- assert_(issubclass(self.A.ljust(10).dtype.type, np.string_))
+ assert_(issubclass(self.A.ljust(10).dtype.type, np.bytes_))
C = self.A.ljust([10, 20])
assert_array_equal(np.char.str_len(C), [[10, 20], [10, 20], [12, 20]])
@@ -382,27 +384,27 @@ class TestMethods:
C = np.char.ljust(b'FOO', [[10, 20], [15, 8]])
tgt = [[b'FOO ', b'FOO '],
[b'FOO ', b'FOO ']]
- assert_(issubclass(C.dtype.type, np.string_))
+ assert_(issubclass(C.dtype.type, np.bytes_))
assert_array_equal(C, tgt)
def test_lower(self):
tgt = [[b' abc ', b''],
[b'12345', b'mixedcase'],
[b'123 \t 345 \0 ', b'upper']]
- assert_(issubclass(self.A.lower().dtype.type, np.string_))
+ assert_(issubclass(self.A.lower().dtype.type, np.bytes_))
assert_array_equal(self.A.lower(), tgt)
tgt = [[' \u03c3 ', ''],
['12345', 'mixedcase'],
['123 \t 345 \0 ', 'upper']]
- assert_(issubclass(self.B.lower().dtype.type, np.unicode_))
+ assert_(issubclass(self.B.lower().dtype.type, np.str_))
assert_array_equal(self.B.lower(), tgt)
def test_lstrip(self):
tgt = [[b'abc ', b''],
[b'12345', b'MixedCase'],
[b'123 \t 345 \0 ', b'UPPER']]
- assert_(issubclass(self.A.lstrip().dtype.type, np.string_))
+ assert_(issubclass(self.A.lstrip().dtype.type, np.bytes_))
assert_array_equal(self.A.lstrip(), tgt)
tgt = [[b' abc', b''],
@@ -413,7 +415,7 @@ class TestMethods:
tgt = [['\u03a3 ', ''],
['12345', 'MixedCase'],
['123 \t 345 \0 ', 'UPPER']]
- assert_(issubclass(self.B.lstrip().dtype.type, np.unicode_))
+ assert_(issubclass(self.B.lstrip().dtype.type, np.str_))
assert_array_equal(self.B.lstrip(), tgt)
def test_partition(self):
@@ -421,7 +423,7 @@ class TestMethods:
tgt = [[(b' abc ', b'', b''), (b'', b'', b'')],
[(b'12', b'3', b'45'), (b'', b'M', b'ixedCase')],
[(b'12', b'3', b' \t 345 \0 '), (b'UPPER', b'', b'')]]
- assert_(issubclass(P.dtype.type, np.string_))
+ assert_(issubclass(P.dtype.type, np.bytes_))
assert_array_equal(P, tgt)
def test_replace(self):
@@ -430,11 +432,11 @@ class TestMethods:
tgt = [[b' abc ', b''],
[b'12##########45', b'MixedC@se'],
[b'12########## \t ##########45 \x00', b'UPPER']]
- assert_(issubclass(R.dtype.type, np.string_))
+ assert_(issubclass(R.dtype.type, np.bytes_))
assert_array_equal(R, tgt)
def test_rjust(self):
- assert_(issubclass(self.A.rjust(10).dtype.type, np.string_))
+ assert_(issubclass(self.A.rjust(10).dtype.type, np.bytes_))
C = self.A.rjust([10, 20])
assert_array_equal(np.char.str_len(C), [[10, 20], [10, 20], [12, 20]])
@@ -447,7 +449,7 @@ class TestMethods:
C = np.char.rjust(b'FOO', [[10, 20], [15, 8]])
tgt = [[b' FOO', b' FOO'],
[b' FOO', b' FOO']]
- assert_(issubclass(C.dtype.type, np.string_))
+ assert_(issubclass(C.dtype.type, np.bytes_))
assert_array_equal(C, tgt)
def test_rpartition(self):
@@ -455,7 +457,7 @@ class TestMethods:
tgt = [[(b'', b'', b' abc '), (b'', b'', b'')],
[(b'12', b'3', b'45'), (b'', b'M', b'ixedCase')],
[(b'123 \t ', b'3', b'45 \0 '), (b'', b'', b'UPPER')]]
- assert_(issubclass(P.dtype.type, np.string_))
+ assert_(issubclass(P.dtype.type, np.bytes_))
assert_array_equal(P, tgt)
def test_rsplit(self):
@@ -467,7 +469,7 @@ class TestMethods:
assert_equal(A.tolist(), tgt)
def test_rstrip(self):
- assert_(issubclass(self.A.rstrip().dtype.type, np.string_))
+ assert_(issubclass(self.A.rstrip().dtype.type, np.bytes_))
tgt = [[b' abc', b''],
[b'12345', b'MixedCase'],
@@ -483,14 +485,14 @@ class TestMethods:
tgt = [[' \u03a3', ''],
['12345', 'MixedCase'],
['123 \t 345', 'UPPER']]
- assert_(issubclass(self.B.rstrip().dtype.type, np.unicode_))
+ assert_(issubclass(self.B.rstrip().dtype.type, np.str_))
assert_array_equal(self.B.rstrip(), tgt)
def test_strip(self):
tgt = [[b'abc', b''],
[b'12345', b'MixedCase'],
[b'123 \t 345', b'UPPER']]
- assert_(issubclass(self.A.strip().dtype.type, np.string_))
+ assert_(issubclass(self.A.strip().dtype.type, np.bytes_))
assert_array_equal(self.A.strip(), tgt)
tgt = [[b' abc ', b''],
@@ -501,7 +503,7 @@ class TestMethods:
tgt = [['\u03a3', ''],
['12345', 'MixedCase'],
['123 \t 345', 'UPPER']]
- assert_(issubclass(self.B.strip().dtype.type, np.unicode_))
+ assert_(issubclass(self.B.strip().dtype.type, np.str_))
assert_array_equal(self.B.strip(), tgt)
def test_split(self):
@@ -523,39 +525,39 @@ class TestMethods:
tgt = [[b' ABC ', b''],
[b'12345', b'mIXEDcASE'],
[b'123 \t 345 \0 ', b'upper']]
- assert_(issubclass(self.A.swapcase().dtype.type, np.string_))
+ assert_(issubclass(self.A.swapcase().dtype.type, np.bytes_))
assert_array_equal(self.A.swapcase(), tgt)
tgt = [[' \u03c3 ', ''],
['12345', 'mIXEDcASE'],
['123 \t 345 \0 ', 'upper']]
- assert_(issubclass(self.B.swapcase().dtype.type, np.unicode_))
+ assert_(issubclass(self.B.swapcase().dtype.type, np.str_))
assert_array_equal(self.B.swapcase(), tgt)
def test_title(self):
tgt = [[b' Abc ', b''],
[b'12345', b'Mixedcase'],
[b'123 \t 345 \0 ', b'Upper']]
- assert_(issubclass(self.A.title().dtype.type, np.string_))
+ assert_(issubclass(self.A.title().dtype.type, np.bytes_))
assert_array_equal(self.A.title(), tgt)
tgt = [[' \u03a3 ', ''],
['12345', 'Mixedcase'],
['123 \t 345 \0 ', 'Upper']]
- assert_(issubclass(self.B.title().dtype.type, np.unicode_))
+ assert_(issubclass(self.B.title().dtype.type, np.str_))
assert_array_equal(self.B.title(), tgt)
def test_upper(self):
tgt = [[b' ABC ', b''],
[b'12345', b'MIXEDCASE'],
[b'123 \t 345 \0 ', b'UPPER']]
- assert_(issubclass(self.A.upper().dtype.type, np.string_))
+ assert_(issubclass(self.A.upper().dtype.type, np.bytes_))
assert_array_equal(self.A.upper(), tgt)
tgt = [[' \u03a3 ', ''],
['12345', 'MIXEDCASE'],
['123 \t 345 \0 ', 'UPPER']]
- assert_(issubclass(self.B.upper().dtype.type, np.unicode_))
+ assert_(issubclass(self.B.upper().dtype.type, np.str_))
assert_array_equal(self.B.upper(), tgt)
def test_isnumeric(self):
@@ -670,3 +672,15 @@ def test_empty_indexing():
# empty chararray instead of a chararray with a single empty string in it.
s = np.chararray((4,))
assert_(s[[]].size == 0)
+
+
+@pytest.mark.parametrize(["dt1", "dt2"],
+ [("S", "U"), ("U", "S"), ("S", "O"), ("U", "O"),
+ ("S", "d"), ("S", "V")])
+def test_add_types(dt1, dt2):
+ arr1 = np.array([1234234], dtype=dt1)
+ # If the following fails, e.g. use a number and test "V" explicitly
+ arr2 = np.array([b"423"], dtype=dt2)
+ with pytest.raises(TypeError,
+ match=f".*same dtype kind.*{arr1.dtype}.*{arr2.dtype}"):
+ np.char.add(arr1, arr2)
diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py
index 4ec1f83d4..3ada39e90 100644
--- a/numpy/core/tests/test_deprecations.py
+++ b/numpy/core/tests/test_deprecations.py
@@ -310,12 +310,6 @@ class TestGeneratorSum(_DeprecationTestCase):
self.assert_deprecated(np.sum, args=((i for i in range(5)),))
-class TestPositiveOnNonNumerical(_DeprecationTestCase):
- # 2018-06-28, 1.16.0
- def test_positive_on_non_number(self):
- self.assert_deprecated(operator.pos, args=(np.array('foo'),))
-
-
class TestFromstring(_DeprecationTestCase):
# 2017-10-19, 1.14
def test_fromstring(self):
@@ -544,134 +538,6 @@ class FlatteningConcatenateUnsafeCast(_DeprecationTestCase):
casting="same_kind")
-class TestDeprecateSubarrayDTypeDuringArrayCoercion(_DeprecationTestCase):
- warning_cls = FutureWarning
- message = "(creating|casting) an array (with|to) a subarray dtype"
-
- def test_deprecated_array(self):
- # Arrays are more complex, since they "broadcast" on success:
- arr = np.array([1, 2])
-
- self.assert_deprecated(lambda: arr.astype("(2)i,"))
- with pytest.warns(FutureWarning):
- res = arr.astype("(2)i,")
-
- assert_array_equal(res, [[1, 2], [1, 2]])
-
- self.assert_deprecated(lambda: np.array(arr, dtype="(2)i,"))
- with pytest.warns(FutureWarning):
- res = np.array(arr, dtype="(2)i,")
-
- assert_array_equal(res, [[1, 2], [1, 2]])
-
- with pytest.warns(FutureWarning):
- res = np.array([[(1,), (2,)], arr], dtype="(2)i,")
-
- assert_array_equal(res, [[[1, 1], [2, 2]], [[1, 2], [1, 2]]])
-
- def test_deprecated_and_error(self):
- # These error paths do not give a warning, but will succeed in the
- # future.
- arr = np.arange(5 * 2).reshape(5, 2)
- def check():
- with pytest.raises(ValueError):
- arr.astype("(2,2)f")
-
- self.assert_deprecated(check)
-
- def check():
- with pytest.raises(ValueError):
- np.array(arr, dtype="(2,2)f")
-
- self.assert_deprecated(check)
-
-
-class TestFutureWarningArrayLikeNotIterable(_DeprecationTestCase):
- # Deprecated 2020-12-09, NumPy 1.20
- warning_cls = FutureWarning
- message = "The input object of type.*but not a sequence"
-
- @pytest.mark.parametrize("protocol",
- ["__array__", "__array_interface__", "__array_struct__"])
- def test_deprecated(self, protocol):
- """Test that these objects give a warning since they are not 0-D,
- not coerced at the top level `np.array(obj)`, but nested, and do
- *not* define the sequence protocol.
-
- NOTE: Tests for the versions including __len__ and __getitem__ exist
- in `test_array_coercion.py` and they can be modified or amended
- when this deprecation expired.
- """
- blueprint = np.arange(10)
- MyArr = type("MyArr", (), {protocol: getattr(blueprint, protocol)})
- self.assert_deprecated(lambda: np.array([MyArr()], dtype=object))
-
- @pytest.mark.parametrize("protocol",
- ["__array__", "__array_interface__", "__array_struct__"])
- def test_0d_not_deprecated(self, protocol):
- # 0-D always worked (albeit it would use __float__ or similar for the
- # conversion, which may not happen anymore)
- blueprint = np.array(1.)
- MyArr = type("MyArr", (), {protocol: getattr(blueprint, protocol)})
- myarr = MyArr()
-
- self.assert_not_deprecated(lambda: np.array([myarr], dtype=object))
- res = np.array([myarr], dtype=object)
- expected = np.empty(1, dtype=object)
- expected[0] = myarr
- assert_array_equal(res, expected)
-
- @pytest.mark.parametrize("protocol",
- ["__array__", "__array_interface__", "__array_struct__"])
- def test_unnested_not_deprecated(self, protocol):
- blueprint = np.arange(10)
- MyArr = type("MyArr", (), {protocol: getattr(blueprint, protocol)})
- myarr = MyArr()
-
- self.assert_not_deprecated(lambda: np.array(myarr))
- res = np.array(myarr)
- assert_array_equal(res, blueprint)
-
- @pytest.mark.parametrize("protocol",
- ["__array__", "__array_interface__", "__array_struct__"])
- def test_strange_dtype_handling(self, protocol):
- """The old code would actually use the dtype from the array, but
- then end up not using the array (for dimension discovery)
- """
- blueprint = np.arange(10).astype("f4")
- MyArr = type("MyArr", (), {protocol: getattr(blueprint, protocol),
- "__float__": lambda _: 0.5})
- myarr = MyArr()
-
- # Make sure we warn (and capture the FutureWarning)
- with pytest.warns(FutureWarning, match=self.message):
- res = np.array([[myarr]])
-
- assert res.shape == (1, 1)
- assert res.dtype == "f4"
- assert res[0, 0] == 0.5
-
- @pytest.mark.parametrize("protocol",
- ["__array__", "__array_interface__", "__array_struct__"])
- def test_assignment_not_deprecated(self, protocol):
- # If the result is dtype=object we do not unpack a nested array or
- # array-like, if it is nested at exactly the right depth.
- # NOTE: We actually do still call __array__, etc. but ignore the result
- # in the end. For `dtype=object` we could optimize that away.
- blueprint = np.arange(10).astype("f4")
- MyArr = type("MyArr", (), {protocol: getattr(blueprint, protocol),
- "__float__": lambda _: 0.5})
- myarr = MyArr()
-
- res = np.empty(3, dtype=object)
- def set():
- res[:] = [myarr, myarr, myarr]
- self.assert_not_deprecated(set)
- assert res[0] is myarr
- assert res[1] is myarr
- assert res[2] is myarr
-
-
class TestDeprecatedUnpickleObjectScalar(_DeprecationTestCase):
# Deprecated 2020-11-24, NumPy 1.20
"""
@@ -685,218 +551,6 @@ class TestDeprecatedUnpickleObjectScalar(_DeprecationTestCase):
ctor = np.core.multiarray.scalar
self.assert_deprecated(lambda: ctor(np.dtype("O"), 1))
-try:
- with warnings.catch_warnings():
- warnings.simplefilter("always")
- import nose # noqa: F401
-except ImportError:
- HAVE_NOSE = False
-else:
- HAVE_NOSE = True
-
-
-@pytest.mark.skipif(not HAVE_NOSE, reason="Needs nose")
-class TestNoseDecoratorsDeprecated(_DeprecationTestCase):
- class DidntSkipException(Exception):
- pass
-
- def test_slow(self):
- def _test_slow():
- @np.testing.dec.slow
- def slow_func(x, y, z):
- pass
-
- assert_(slow_func.slow)
- self.assert_deprecated(_test_slow)
-
- def test_setastest(self):
- def _test_setastest():
- @np.testing.dec.setastest()
- def f_default(a):
- pass
-
- @np.testing.dec.setastest(True)
- def f_istest(a):
- pass
-
- @np.testing.dec.setastest(False)
- def f_isnottest(a):
- pass
-
- assert_(f_default.__test__)
- assert_(f_istest.__test__)
- assert_(not f_isnottest.__test__)
- self.assert_deprecated(_test_setastest, num=3)
-
- def test_skip_functions_hardcoded(self):
- def _test_skip_functions_hardcoded():
- @np.testing.dec.skipif(True)
- def f1(x):
- raise self.DidntSkipException
-
- try:
- f1('a')
- except self.DidntSkipException:
- raise Exception('Failed to skip')
- except SkipTest().__class__:
- pass
-
- @np.testing.dec.skipif(False)
- def f2(x):
- raise self.DidntSkipException
-
- try:
- f2('a')
- except self.DidntSkipException:
- pass
- except SkipTest().__class__:
- raise Exception('Skipped when not expected to')
- self.assert_deprecated(_test_skip_functions_hardcoded, num=2)
-
- def test_skip_functions_callable(self):
- def _test_skip_functions_callable():
- def skip_tester():
- return skip_flag == 'skip me!'
-
- @np.testing.dec.skipif(skip_tester)
- def f1(x):
- raise self.DidntSkipException
-
- try:
- skip_flag = 'skip me!'
- f1('a')
- except self.DidntSkipException:
- raise Exception('Failed to skip')
- except SkipTest().__class__:
- pass
-
- @np.testing.dec.skipif(skip_tester)
- def f2(x):
- raise self.DidntSkipException
-
- try:
- skip_flag = 'five is right out!'
- f2('a')
- except self.DidntSkipException:
- pass
- except SkipTest().__class__:
- raise Exception('Skipped when not expected to')
- self.assert_deprecated(_test_skip_functions_callable, num=2)
-
- def test_skip_generators_hardcoded(self):
- def _test_skip_generators_hardcoded():
- @np.testing.dec.knownfailureif(True, "This test is known to fail")
- def g1(x):
- yield from range(x)
-
- try:
- for j in g1(10):
- pass
- except KnownFailureException().__class__:
- pass
- else:
- raise Exception('Failed to mark as known failure')
-
- @np.testing.dec.knownfailureif(False, "This test is NOT known to fail")
- def g2(x):
- yield from range(x)
- raise self.DidntSkipException('FAIL')
-
- try:
- for j in g2(10):
- pass
- except KnownFailureException().__class__:
- raise Exception('Marked incorrectly as known failure')
- except self.DidntSkipException:
- pass
- self.assert_deprecated(_test_skip_generators_hardcoded, num=2)
-
- def test_skip_generators_callable(self):
- def _test_skip_generators_callable():
- def skip_tester():
- return skip_flag == 'skip me!'
-
- @np.testing.dec.knownfailureif(skip_tester, "This test is known to fail")
- def g1(x):
- yield from range(x)
-
- try:
- skip_flag = 'skip me!'
- for j in g1(10):
- pass
- except KnownFailureException().__class__:
- pass
- else:
- raise Exception('Failed to mark as known failure')
-
- @np.testing.dec.knownfailureif(skip_tester, "This test is NOT known to fail")
- def g2(x):
- yield from range(x)
- raise self.DidntSkipException('FAIL')
-
- try:
- skip_flag = 'do not skip'
- for j in g2(10):
- pass
- except KnownFailureException().__class__:
- raise Exception('Marked incorrectly as known failure')
- except self.DidntSkipException:
- pass
- self.assert_deprecated(_test_skip_generators_callable, num=2)
-
- def test_deprecated(self):
- def _test_deprecated():
- @np.testing.dec.deprecated(True)
- def non_deprecated_func():
- pass
-
- @np.testing.dec.deprecated()
- def deprecated_func():
- import warnings
- warnings.warn("TEST: deprecated func", DeprecationWarning, stacklevel=1)
-
- @np.testing.dec.deprecated()
- def deprecated_func2():
- import warnings
- warnings.warn("AHHHH", stacklevel=1)
- raise ValueError
-
- @np.testing.dec.deprecated()
- def deprecated_func3():
- import warnings
- warnings.warn("AHHHH", stacklevel=1)
-
- # marked as deprecated, but does not raise DeprecationWarning
- assert_raises(AssertionError, non_deprecated_func)
- # should be silent
- deprecated_func()
- with warnings.catch_warnings(record=True):
- warnings.simplefilter("always") # do not propagate unrelated warnings
- # fails if deprecated decorator just disables test. See #1453.
- assert_raises(ValueError, deprecated_func2)
- # warning is not a DeprecationWarning
- assert_raises(AssertionError, deprecated_func3)
- self.assert_deprecated(_test_deprecated, num=4)
-
- def test_parametrize(self):
- def _test_parametrize():
- # dec.parametrize assumes that it is being run by nose. Because
- # we are running under pytest, we need to explicitly check the
- # results.
- @np.testing.dec.parametrize('base, power, expected',
- [(1, 1, 1),
- (2, 1, 2),
- (2, 2, 4)])
- def check_parametrize(base, power, expected):
- assert_(base**power == expected)
-
- count = 0
- for test in check_parametrize():
- test[0](*test[1:])
- count += 1
- assert_(count == 3)
- self.assert_deprecated(_test_parametrize)
-
class TestSingleElementSignature(_DeprecationTestCase):
# Deprecated 2021-04-01, NumPy 1.21
@@ -1040,6 +694,18 @@ class TestLoadtxtParseIntsViaFloat(_DeprecationTestCase):
assert isinstance(e.__cause__, DeprecationWarning)
+class TestScalarConversion(_DeprecationTestCase):
+ # 2023-01-02, 1.25.0
+ def test_float_conversion(self):
+ self.assert_deprecated(float, args=(np.array([3.14]),))
+
+ def test_behaviour(self):
+ b = np.array([[3.14]])
+ c = np.zeros(5)
+ with pytest.warns(DeprecationWarning):
+ c[0] = b
+
+
class TestPyIntConversion(_DeprecationTestCase):
message = r".*stop allowing conversion of out-of-bound.*"
@@ -1098,3 +764,54 @@ def test_future_scalar_attributes(name):
# Unfortunately, they are currently still valid via `np.dtype()`
np.dtype(name)
name in np.sctypeDict
+
+
+# Ignore the above future attribute warning for this test.
+@pytest.mark.filterwarnings("ignore:In the future:FutureWarning")
+class TestRemovedGlobals:
+ # Removed 2023-01-12, NumPy 1.24.0
+ # Not a deprecation, but the large error was added to aid those who missed
+ # the previous deprecation, and should be removed similarly to one
+ # (or faster).
+ @pytest.mark.parametrize("name",
+ ["object", "bool", "float", "complex", "str", "int"])
+ def test_attributeerror_includes_info(self, name):
+ msg = f".*\n`np.{name}` was a deprecated alias for the builtin"
+ with pytest.raises(AttributeError, match=msg):
+ getattr(np, name)
+
+
+class TestDeprecatedFinfo(_DeprecationTestCase):
+ # Deprecated in NumPy 1.25, 2023-01-16
+ def test_deprecated_none(self):
+ self.assert_deprecated(np.finfo, args=(None,))
+
+class TestFromnumeric(_DeprecationTestCase):
+ # 2023-02-28, 1.25.0
+ def test_round_(self):
+ self.assert_deprecated(lambda: np.round_(np.array([1.5, 2.5, 3.5])))
+
+ # 2023-03-02, 1.25.0
+ def test_cumproduct(self):
+ self.assert_deprecated(lambda: np.cumproduct(np.array([1, 2, 3])))
+
+ # 2023-03-02, 1.25.0
+ def test_product(self):
+ self.assert_deprecated(lambda: np.product(np.array([1, 2, 3])))
+
+ # 2023-03-02, 1.25.0
+ def test_sometrue(self):
+ self.assert_deprecated(lambda: np.sometrue(np.array([True, False])))
+
+ # 2023-03-02, 1.25.0
+ def test_alltrue(self):
+ self.assert_deprecated(lambda: np.alltrue(np.array([True, False])))
+
+
+class TestMathAlias(_DeprecationTestCase):
+ # Deprecated in Numpy 1.25, 2023-04-06
+ def test_deprecated_np_math(self):
+ self.assert_deprecated(lambda: np.math)
+
+ def test_deprecated_np_lib_math(self):
+ self.assert_deprecated(lambda: np.lib.math)
diff --git a/numpy/core/tests/test_dlpack.py b/numpy/core/tests/test_dlpack.py
index 278bdd12d..49249bc6a 100644
--- a/numpy/core/tests/test_dlpack.py
+++ b/numpy/core/tests/test_dlpack.py
@@ -38,13 +38,14 @@ class TestDLPack:
assert sys.getrefcount(x) == 2
@pytest.mark.parametrize("dtype", [
+ np.bool_,
np.int8, np.int16, np.int32, np.int64,
np.uint8, np.uint16, np.uint32, np.uint64,
np.float16, np.float32, np.float64,
np.complex64, np.complex128
])
def test_dtype_passthrough(self, dtype):
- x = np.arange(5, dtype=dtype)
+ x = np.arange(5).astype(dtype)
y = np.from_dlpack(x)
assert y.dtype == x.dtype
diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py
index 0a56483d6..57831f46f 100644
--- a/numpy/core/tests/test_dtype.py
+++ b/numpy/core/tests/test_dtype.py
@@ -7,6 +7,7 @@ import types
from typing import Any
import numpy as np
+import numpy.dtypes
from numpy.core._rational_tests import rational
from numpy.core._multiarray_tests import create_custom_field_dtype
from numpy.testing import (
@@ -195,6 +196,34 @@ class TestBuiltin:
# This is an safe cast (not equiv) due to the different names:
assert np.can_cast(x, y, casting="safe")
+ @pytest.mark.parametrize(
+ ["type_char", "char_size", "scalar_type"],
+ [["U", 4, np.str_],
+ ["S", 1, np.bytes_]])
+ def test_create_string_dtypes_directly(
+ self, type_char, char_size, scalar_type):
+ dtype_class = type(np.dtype(type_char))
+
+ dtype = dtype_class(8)
+ assert dtype.type is scalar_type
+ assert dtype.itemsize == 8*char_size
+
+ def test_create_invalid_string_errors(self):
+ one_too_big = np.iinfo(np.intc).max + 1
+ with pytest.raises(TypeError):
+ type(np.dtype("U"))(one_too_big // 4)
+
+ with pytest.raises(TypeError):
+ # Code coverage for very large numbers:
+ type(np.dtype("U"))(np.iinfo(np.intp).max // 4 + 1)
+
+ if one_too_big < sys.maxsize:
+ with pytest.raises(TypeError):
+ type(np.dtype("S"))(one_too_big)
+
+ with pytest.raises(ValueError):
+ type(np.dtype("U"))(-1)
+
class TestRecord:
def test_equivalent_record(self):
@@ -523,6 +552,14 @@ class TestRecord:
assert_equal(np.zeros((1, 2), dtype=[]) == a,
np.ones((1, 2), dtype=bool))
+ def test_nonstructured_with_object(self):
+ # See gh-23277, the dtype here thinks it contain objects, if the
+ # assert about that fails, the test becomes meaningless (which is OK)
+ arr = np.recarray((0,), dtype="O")
+ assert arr.dtype.names is None # no fields
+ assert arr.dtype.hasobject # but claims to contain objects
+ del arr # the deletion failed previously.
+
class TestSubarray:
def test_single_subarray(self):
@@ -1527,8 +1564,21 @@ class TestDTypeClasses:
dtype = np.dtype(dtype)
assert isinstance(dtype, np.dtype)
assert type(dtype) is not np.dtype
- assert type(dtype).__name__ == f"dtype[{dtype.type.__name__}]"
- assert type(dtype).__module__ == "numpy"
+ if dtype.type.__name__ != "rational":
+ dt_name = type(dtype).__name__.lower().removesuffix("dtype")
+ if dt_name == "uint" or dt_name == "int":
+ # The scalar names has a `c` attached because "int" is Python
+ # int and that is long...
+ dt_name += "c"
+ sc_name = dtype.type.__name__
+ assert dt_name == sc_name.strip("_")
+ assert type(dtype).__module__ == "numpy.dtypes"
+
+ assert getattr(numpy.dtypes, type(dtype).__name__) is type(dtype)
+ else:
+ assert type(dtype).__name__ == "dtype[rational]"
+ assert type(dtype).__module__ == "numpy"
+
assert not type(dtype)._abstract
# the flexible dtypes and datetime/timedelta have additional parameters
@@ -1551,6 +1601,32 @@ class TestDTypeClasses:
assert type(np.dtype).__module__ == "numpy"
assert np.dtype._abstract
+ def test_is_numeric(self):
+ all_codes = set(np.typecodes['All'])
+ numeric_codes = set(np.typecodes['AllInteger'] +
+ np.typecodes['AllFloat'] + '?')
+ non_numeric_codes = all_codes - numeric_codes
+
+ for code in numeric_codes:
+ assert type(np.dtype(code))._is_numeric
+
+ for code in non_numeric_codes:
+ assert not type(np.dtype(code))._is_numeric
+
+ @pytest.mark.parametrize("int_", ["UInt", "Int"])
+ @pytest.mark.parametrize("size", [8, 16, 32, 64])
+ def test_integer_alias_names(self, int_, size):
+ DType = getattr(numpy.dtypes, f"{int_}{size}DType")
+ sctype = getattr(numpy, f"{int_.lower()}{size}")
+ assert DType.type is sctype
+ assert DType.__name__.lower().removesuffix("dtype") == sctype.__name__
+
+ @pytest.mark.parametrize("name",
+ ["Half", "Float", "Double", "CFloat", "CDouble"])
+ def test_float_alias_names(self, name):
+ with pytest.raises(AttributeError):
+ getattr(numpy.dtypes, name + "DType") is numpy.dtypes.Float16DType
+
class TestFromCTypes:
@@ -1785,7 +1861,6 @@ class TestUserDType:
create_custom_field_dtype(blueprint, mytype, 2)
-@pytest.mark.skipif(sys.version_info < (3, 9), reason="Requires python 3.9")
class TestClassGetItem:
def test_dtype(self) -> None:
alias = np.dtype[Any]
@@ -1818,10 +1893,3 @@ def test_result_type_integers_and_unitless_timedelta64():
td = np.timedelta64(4)
result = np.result_type(0, td)
assert_dtype_equal(result, td.dtype)
-
-
-@pytest.mark.skipif(sys.version_info >= (3, 9), reason="Requires python 3.8")
-def test_class_getitem_38() -> None:
- match = "Type subscription requires python >= 3.9"
- with pytest.raises(TypeError, match=match):
- np.dtype[Any]
diff --git a/numpy/core/tests/test_einsum.py b/numpy/core/tests/test_einsum.py
index 7c0e8d97c..3a06d119f 100644
--- a/numpy/core/tests/test_einsum.py
+++ b/numpy/core/tests/test_einsum.py
@@ -100,6 +100,69 @@ class TestEinsum:
assert_raises(ValueError, np.einsum, "i->i", np.arange(6).reshape(-1, 1),
optimize=do_opt, order='d')
+ def test_einsum_object_errors(self):
+ # Exceptions created by object arithmetic should
+ # successfully propogate
+
+ class CustomException(Exception):
+ pass
+
+ class DestructoBox:
+
+ def __init__(self, value, destruct):
+ self._val = value
+ self._destruct = destruct
+
+ def __add__(self, other):
+ tmp = self._val + other._val
+ if tmp >= self._destruct:
+ raise CustomException
+ else:
+ self._val = tmp
+ return self
+
+ def __radd__(self, other):
+ if other == 0:
+ return self
+ else:
+ return self.__add__(other)
+
+ def __mul__(self, other):
+ tmp = self._val * other._val
+ if tmp >= self._destruct:
+ raise CustomException
+ else:
+ self._val = tmp
+ return self
+
+ def __rmul__(self, other):
+ if other == 0:
+ return self
+ else:
+ return self.__mul__(other)
+
+ a = np.array([DestructoBox(i, 5) for i in range(1, 10)],
+ dtype='object').reshape(3, 3)
+
+ # raised from unbuffered_loop_nop1_ndim2
+ assert_raises(CustomException, np.einsum, "ij->i", a)
+
+ # raised from unbuffered_loop_nop1_ndim3
+ b = np.array([DestructoBox(i, 100) for i in range(0, 27)],
+ dtype='object').reshape(3, 3, 3)
+ assert_raises(CustomException, np.einsum, "i...k->...", b)
+
+ # raised from unbuffered_loop_nop2_ndim2
+ b = np.array([DestructoBox(i, 55) for i in range(1, 4)],
+ dtype='object')
+ assert_raises(CustomException, np.einsum, "ij, j", a, b)
+
+ # raised from unbuffered_loop_nop2_ndim3
+ assert_raises(CustomException, np.einsum, "ij, jh", a, a)
+
+ # raised from PyArray_EinsteinSum
+ assert_raises(CustomException, np.einsum, "ij->", a)
+
def test_einsum_views(self):
# pass-through
for do_opt in [True, False]:
@@ -247,47 +310,50 @@ class TestEinsum:
# sum(a, axis=-1)
for n in range(1, 17):
a = np.arange(n, dtype=dtype)
- assert_equal(np.einsum("i->", a, optimize=do_opt),
- np.sum(a, axis=-1).astype(dtype))
- assert_equal(np.einsum(a, [0], [], optimize=do_opt),
- np.sum(a, axis=-1).astype(dtype))
+ b = np.sum(a, axis=-1)
+ if hasattr(b, 'astype'):
+ b = b.astype(dtype)
+ assert_equal(np.einsum("i->", a, optimize=do_opt), b)
+ assert_equal(np.einsum(a, [0], [], optimize=do_opt), b)
for n in range(1, 17):
a = np.arange(2*3*n, dtype=dtype).reshape(2, 3, n)
- assert_equal(np.einsum("...i->...", a, optimize=do_opt),
- np.sum(a, axis=-1).astype(dtype))
- assert_equal(np.einsum(a, [Ellipsis, 0], [Ellipsis], optimize=do_opt),
- np.sum(a, axis=-1).astype(dtype))
+ b = np.sum(a, axis=-1)
+ if hasattr(b, 'astype'):
+ b = b.astype(dtype)
+ assert_equal(np.einsum("...i->...", a, optimize=do_opt), b)
+ assert_equal(np.einsum(a, [Ellipsis, 0], [Ellipsis], optimize=do_opt), b)
# sum(a, axis=0)
for n in range(1, 17):
a = np.arange(2*n, dtype=dtype).reshape(2, n)
- assert_equal(np.einsum("i...->...", a, optimize=do_opt),
- np.sum(a, axis=0).astype(dtype))
- assert_equal(np.einsum(a, [0, Ellipsis], [Ellipsis], optimize=do_opt),
- np.sum(a, axis=0).astype(dtype))
+ b = np.sum(a, axis=0)
+ if hasattr(b, 'astype'):
+ b = b.astype(dtype)
+ assert_equal(np.einsum("i...->...", a, optimize=do_opt), b)
+ assert_equal(np.einsum(a, [0, Ellipsis], [Ellipsis], optimize=do_opt), b)
for n in range(1, 17):
a = np.arange(2*3*n, dtype=dtype).reshape(2, 3, n)
- assert_equal(np.einsum("i...->...", a, optimize=do_opt),
- np.sum(a, axis=0).astype(dtype))
- assert_equal(np.einsum(a, [0, Ellipsis], [Ellipsis], optimize=do_opt),
- np.sum(a, axis=0).astype(dtype))
+ b = np.sum(a, axis=0)
+ if hasattr(b, 'astype'):
+ b = b.astype(dtype)
+ assert_equal(np.einsum("i...->...", a, optimize=do_opt), b)
+ assert_equal(np.einsum(a, [0, Ellipsis], [Ellipsis], optimize=do_opt), b)
# trace(a)
for n in range(1, 17):
a = np.arange(n*n, dtype=dtype).reshape(n, n)
- assert_equal(np.einsum("ii", a, optimize=do_opt),
- np.trace(a).astype(dtype))
- assert_equal(np.einsum(a, [0, 0], optimize=do_opt),
- np.trace(a).astype(dtype))
+ b = np.trace(a)
+ if hasattr(b, 'astype'):
+ b = b.astype(dtype)
+ assert_equal(np.einsum("ii", a, optimize=do_opt), b)
+ assert_equal(np.einsum(a, [0, 0], optimize=do_opt), b)
# gh-15961: should accept numpy int64 type in subscript list
np_array = np.asarray([0, 0])
- assert_equal(np.einsum(a, np_array, optimize=do_opt),
- np.trace(a).astype(dtype))
- assert_equal(np.einsum(a, list(np_array), optimize=do_opt),
- np.trace(a).astype(dtype))
+ assert_equal(np.einsum(a, np_array, optimize=do_opt), b)
+ assert_equal(np.einsum(a, list(np_array), optimize=do_opt), b)
# multiply(a, b)
assert_equal(np.einsum("..., ...", 3, 4), 12) # scalar case
@@ -489,11 +555,15 @@ class TestEinsum:
b = np.einsum("i->", a, dtype=dtype, casting='unsafe')
assert_equal(b, np.sum(a))
- assert_equal(b.dtype, np.dtype(dtype))
+ if hasattr(b, "dtype"):
+ # Can be a python object when dtype is object
+ assert_equal(b.dtype, np.dtype(dtype))
b = np.einsum(a, [0], [], dtype=dtype, casting='unsafe')
assert_equal(b, np.sum(a))
- assert_equal(b.dtype, np.dtype(dtype))
+ if hasattr(b, "dtype"):
+ # Can be a python object when dtype is object
+ assert_equal(b.dtype, np.dtype(dtype))
# A case which was failing (ticket #1885)
p = np.arange(2) + 1
@@ -587,6 +657,10 @@ class TestEinsum:
def test_einsum_sums_clongdouble(self):
self.check_einsum_sums(np.clongdouble)
+ def test_einsum_sums_object(self):
+ self.check_einsum_sums('object')
+ self.check_einsum_sums('object', True)
+
def test_einsum_misc(self):
# This call used to crash because of a bug in
# PyArray_AssignZero
@@ -625,6 +699,21 @@ class TestEinsum:
# see issue gh-15776 and issue gh-15256
assert_equal(np.einsum('i,j', [1], [2], out=None), [[2]])
+ def test_object_loop(self):
+
+ class Mult:
+ def __mul__(self, other):
+ return 42
+
+ objMult = np.array([Mult()])
+ objNULL = np.ndarray(buffer = b'\0' * np.intp(0).itemsize, shape=1, dtype=object)
+
+ with pytest.raises(TypeError):
+ np.einsum("i,j", [1], objNULL)
+ with pytest.raises(TypeError):
+ np.einsum("i,j", objNULL, [1])
+ assert np.einsum("i,j", objMult, objMult) == 42
+
def test_subscript_range(self):
# Issue #7741, make sure that all letters of Latin alphabet (both uppercase & lowercase) can be used
# when creating a subscript from arrays
@@ -755,7 +844,7 @@ class TestEinsum:
# Test originally added to cover broken float16 path: gh-20305
# Likely most are covered elsewhere, at least partially.
dtype = np.dtype(dtype)
- # Simple test, designed to excersize most specialized code paths,
+ # Simple test, designed to exercise most specialized code paths,
# note the +0.5 for floats. This makes sure we use a float value
# where the results must be exact.
arr = (np.arange(7) + 0.5).astype(dtype)
diff --git a/numpy/core/tests/test_function_base.py b/numpy/core/tests/test_function_base.py
index dad7a5883..79f1ecfc9 100644
--- a/numpy/core/tests/test_function_base.py
+++ b/numpy/core/tests/test_function_base.py
@@ -1,3 +1,4 @@
+import pytest
from numpy import (
logspace, linspace, geomspace, dtype, array, sctypes, arange, isnan,
ndarray, sqrt, nextafter, stack, errstate
@@ -65,6 +66,33 @@ class TestLogspace:
t5 = logspace(start, stop, 6, axis=-1)
assert_equal(t5, t2.T)
+ @pytest.mark.parametrize("axis", [0, 1, -1])
+ def test_base_array(self, axis: int):
+ start = 1
+ stop = 2
+ num = 6
+ base = array([1, 2])
+ t1 = logspace(start, stop, num=num, base=base, axis=axis)
+ t2 = stack(
+ [logspace(start, stop, num=num, base=_base) for _base in base],
+ axis=(axis + 1) % t1.ndim,
+ )
+ assert_equal(t1, t2)
+
+ @pytest.mark.parametrize("axis", [0, 1, -1])
+ def test_stop_base_array(self, axis: int):
+ start = 1
+ stop = array([2, 3])
+ num = 6
+ base = array([1, 2])
+ t1 = logspace(start, stop, num=num, base=base, axis=axis)
+ t2 = stack(
+ [logspace(start, _stop, num=num, base=_base)
+ for _stop, _base in zip(stop, base)],
+ axis=(axis + 1) % t1.ndim,
+ )
+ assert_equal(t1, t2)
+
def test_dtype(self):
y = logspace(0, 6, dtype='float32')
assert_equal(y.dtype, dtype('float32'))
@@ -407,3 +435,12 @@ class TestLinspace:
y = linspace(-1, 3, num=8, dtype=int)
t = array([-1, -1, 0, 0, 1, 1, 2, 3], dtype=int)
assert_array_equal(y, t)
+
+ def test_any_step_zero_and_not_mult_inplace(self):
+ # any_step_zero is True, _mult_inplace is False
+ start = array([0.0, 1.0])
+ stop = array([2.0, 1.0])
+ y = linspace(start, stop, 3)
+ assert_array_equal(y, array([[0.0, 1.0], [1.0, 1.0], [2.0, 1.0]]))
+
+
diff --git a/numpy/core/tests/test_getlimits.py b/numpy/core/tests/test_getlimits.py
index b8aaba386..63217c38c 100644
--- a/numpy/core/tests/test_getlimits.py
+++ b/numpy/core/tests/test_getlimits.py
@@ -3,6 +3,7 @@
"""
import warnings
import numpy as np
+import pytest
from numpy.core import finfo, iinfo
from numpy import half, single, double, longdouble
from numpy.testing import assert_equal, assert_, assert_raises
@@ -40,20 +41,38 @@ class TestLongdouble:
ftype2 = finfo(longdouble)
assert_equal(id(ftype), id(ftype2))
+def assert_finfo_equal(f1, f2):
+ # assert two finfo instances have the same attributes
+ for attr in ('bits', 'eps', 'epsneg', 'iexp', 'machep',
+ 'max', 'maxexp', 'min', 'minexp', 'negep', 'nexp',
+ 'nmant', 'precision', 'resolution', 'tiny',
+ 'smallest_normal', 'smallest_subnormal'):
+ assert_equal(getattr(f1, attr), getattr(f2, attr),
+ f'finfo instances {f1} and {f2} differ on {attr}')
+
+def assert_iinfo_equal(i1, i2):
+ # assert two iinfo instances have the same attributes
+ for attr in ('bits', 'min', 'max'):
+ assert_equal(getattr(i1, attr), getattr(i2, attr),
+ f'iinfo instances {i1} and {i2} differ on {attr}')
+
class TestFinfo:
def test_basic(self):
dts = list(zip(['f2', 'f4', 'f8', 'c8', 'c16'],
[np.float16, np.float32, np.float64, np.complex64,
np.complex128]))
for dt1, dt2 in dts:
- for attr in ('bits', 'eps', 'epsneg', 'iexp', 'machep',
- 'max', 'maxexp', 'min', 'minexp', 'negep', 'nexp',
- 'nmant', 'precision', 'resolution', 'tiny',
- 'smallest_normal', 'smallest_subnormal'):
- assert_equal(getattr(finfo(dt1), attr),
- getattr(finfo(dt2), attr), attr)
+ assert_finfo_equal(finfo(dt1), finfo(dt2))
+
assert_raises(ValueError, finfo, 'i4')
+ def test_regression_gh23108(self):
+ # np.float32(1.0) and np.float64(1.0) have the same hash and are
+ # equal under the == operator
+ f1 = np.finfo(np.float32(1.0))
+ f2 = np.finfo(np.float64(1.0))
+ assert f1 != f2
+
class TestIinfo:
def test_basic(self):
dts = list(zip(['i1', 'i2', 'i4', 'i8',
@@ -61,9 +80,8 @@ class TestIinfo:
[np.int8, np.int16, np.int32, np.int64,
np.uint8, np.uint16, np.uint32, np.uint64]))
for dt1, dt2 in dts:
- for attr in ('bits', 'min', 'max'):
- assert_equal(getattr(iinfo(dt1), attr),
- getattr(iinfo(dt2), attr), attr)
+ assert_iinfo_equal(iinfo(dt1), iinfo(dt2))
+
assert_raises(ValueError, iinfo, 'f4')
def test_unsigned_max(self):
@@ -85,8 +103,28 @@ class TestRepr:
def test_instances():
- iinfo(10)
- finfo(3.0)
+ # Test that the finfo and iinfo results on numeric instances agree with
+ # the results on the corresponding types
+
+ for c in [int, np.int16, np.int32, np.int64]:
+ class_iinfo = iinfo(c)
+ instance_iinfo = iinfo(c(12))
+
+ assert_iinfo_equal(class_iinfo, instance_iinfo)
+
+ for c in [float, np.float16, np.float32, np.float64]:
+ class_finfo = finfo(c)
+ instance_finfo = finfo(c(1.2))
+ assert_finfo_equal(class_finfo, instance_finfo)
+
+ with pytest.raises(ValueError):
+ iinfo(10.)
+
+ with pytest.raises(ValueError):
+ iinfo('hi')
+
+ with pytest.raises(ValueError):
+ finfo(np.int64(1))
def assert_ma_equal(discovered, ma_like):
diff --git a/numpy/core/tests/test_indexing.py b/numpy/core/tests/test_indexing.py
index 74075639c..042936702 100644
--- a/numpy/core/tests/test_indexing.py
+++ b/numpy/core/tests/test_indexing.py
@@ -1062,7 +1062,7 @@ class TestMultiIndexingAutomated:
if np.any(_indx >= _size) or np.any(_indx < -_size):
raise IndexError
if len(indx[1:]) == len(orig_slice):
- if np.product(orig_slice) == 0:
+ if np.prod(orig_slice) == 0:
# Work around for a crash or IndexError with 'wrap'
# in some 0-sized cases.
try:
diff --git a/numpy/core/tests/test_item_selection.py b/numpy/core/tests/test_item_selection.py
index 3c35245a3..5660ef583 100644
--- a/numpy/core/tests/test_item_selection.py
+++ b/numpy/core/tests/test_item_selection.py
@@ -1,5 +1,7 @@
import sys
+import pytest
+
import numpy as np
from numpy.testing import (
assert_, assert_raises, assert_array_equal, HAS_REFCOUNT
@@ -84,3 +86,80 @@ class TestTake:
b = np.array([0, 1, 2, 3, 4, 5])
assert_array_equal(a, b)
+
+
+class TestPutMask:
+ @pytest.mark.parametrize("dtype", list(np.typecodes["All"]) + ["i,O"])
+ def test_simple(self, dtype):
+ if dtype.lower() == "m":
+ dtype += "8[ns]"
+
+ # putmask is weird and doesn't care about value length (even shorter)
+ vals = np.arange(1001).astype(dtype=dtype)
+
+ mask = np.random.randint(2, size=1000).astype(bool)
+ # Use vals.dtype in case of flexible dtype (i.e. string)
+ arr = np.zeros(1000, dtype=vals.dtype)
+ zeros = arr.copy()
+
+ np.putmask(arr, mask, vals)
+ assert_array_equal(arr[mask], vals[:len(mask)][mask])
+ assert_array_equal(arr[~mask], zeros[~mask])
+
+ @pytest.mark.parametrize("dtype", list(np.typecodes["All"])[1:] + ["i,O"])
+ @pytest.mark.parametrize("mode", ["raise", "wrap", "clip"])
+ def test_empty(self, dtype, mode):
+ arr = np.zeros(1000, dtype=dtype)
+ arr_copy = arr.copy()
+ mask = np.random.randint(2, size=1000).astype(bool)
+
+ # Allowing empty values like this is weird...
+ np.put(arr, mask, [])
+ assert_array_equal(arr, arr_copy)
+
+
+class TestPut:
+ @pytest.mark.parametrize("dtype", list(np.typecodes["All"])[1:] + ["i,O"])
+ @pytest.mark.parametrize("mode", ["raise", "wrap", "clip"])
+ def test_simple(self, dtype, mode):
+ if dtype.lower() == "m":
+ dtype += "8[ns]"
+
+ # put is weird and doesn't care about value length (even shorter)
+ vals = np.arange(1001).astype(dtype=dtype)
+
+ # Use vals.dtype in case of flexible dtype (i.e. string)
+ arr = np.zeros(1000, dtype=vals.dtype)
+ zeros = arr.copy()
+
+ if mode == "clip":
+ # Special because 0 and -1 value are "reserved" for clip test
+ indx = np.random.permutation(len(arr) - 2)[:-500] + 1
+
+ indx[-1] = 0
+ indx[-2] = len(arr) - 1
+ indx_put = indx.copy()
+ indx_put[-1] = -1389
+ indx_put[-2] = 1321
+ else:
+ # Avoid duplicates (for simplicity) and fill half only
+ indx = np.random.permutation(len(arr) - 3)[:-500]
+ indx_put = indx
+ if mode == "wrap":
+ indx_put = indx_put + len(arr)
+
+ np.put(arr, indx_put, vals, mode=mode)
+ assert_array_equal(arr[indx], vals[:len(indx)])
+ untouched = np.ones(len(arr), dtype=bool)
+ untouched[indx] = False
+ assert_array_equal(arr[untouched], zeros[:untouched.sum()])
+
+ @pytest.mark.parametrize("dtype", list(np.typecodes["All"])[1:] + ["i,O"])
+ @pytest.mark.parametrize("mode", ["raise", "wrap", "clip"])
+ def test_empty(self, dtype, mode):
+ arr = np.zeros(1000, dtype=dtype)
+ arr_copy = arr.copy()
+
+ # Allowing empty values like this is weird...
+ np.put(arr, [1, 2, 3], [])
+ assert_array_equal(arr, arr_copy)
diff --git a/numpy/core/tests/test_longdouble.py b/numpy/core/tests/test_longdouble.py
index 1a54e62d8..45721950c 100644
--- a/numpy/core/tests/test_longdouble.py
+++ b/numpy/core/tests/test_longdouble.py
@@ -1,10 +1,11 @@
import warnings
+import platform
import pytest
import numpy as np
from numpy.testing import (
assert_, assert_equal, assert_raises, assert_warns, assert_array_equal,
- temppath,
+ temppath, IS_MUSL
)
from numpy.core.tests._locales import CommaDecimalPointLocale
@@ -30,6 +31,10 @@ def test_scalar_extraction():
# 0.1 not exactly representable in base 2 floating point.
repr_precision = len(repr(np.longdouble(0.1)))
# +2 from macro block starting around line 842 in scalartypes.c.src.
+
+
+@pytest.mark.skipif(IS_MUSL,
+ reason="test flaky on musllinux")
@pytest.mark.skipif(LD_INFO.precision + 2 >= repr_precision,
reason="repr precision not enough to show eps")
def test_repr_roundtrip():
@@ -142,7 +147,7 @@ class TestFileBased:
def test_fromfile_bogus(self):
with temppath() as path:
- with open(path, 'wt') as f:
+ with open(path, 'w') as f:
f.write("1. 2. 3. flop 4.\n")
with assert_warns(DeprecationWarning):
@@ -153,7 +158,7 @@ class TestFileBased:
for ctype in ["complex", "cdouble", "cfloat"]:
# Check spacing between separator and only real component specified
with temppath() as path:
- with open(path, 'wt') as f:
+ with open(path, 'w') as f:
f.write("1, 2 , 3 ,4\n")
res = np.fromfile(path, dtype=ctype, sep=",")
@@ -161,7 +166,7 @@ class TestFileBased:
# Real component not specified
with temppath() as path:
- with open(path, 'wt') as f:
+ with open(path, 'w') as f:
f.write("1j, -2j, 3j, 4e1j\n")
res = np.fromfile(path, dtype=ctype, sep=",")
@@ -169,7 +174,7 @@ class TestFileBased:
# Both components specified
with temppath() as path:
- with open(path, 'wt') as f:
+ with open(path, 'w') as f:
f.write("1+1j,2-2j, -3+3j, -4e1+4j\n")
res = np.fromfile(path, dtype=ctype, sep=",")
@@ -177,7 +182,7 @@ class TestFileBased:
# Spaces at wrong places
with temppath() as path:
- with open(path, 'wt') as f:
+ with open(path, 'w') as f:
f.write("1+2 j,3\n")
with assert_warns(DeprecationWarning):
@@ -186,7 +191,7 @@ class TestFileBased:
# Spaces at wrong places
with temppath() as path:
- with open(path, 'wt') as f:
+ with open(path, 'w') as f:
f.write("1+ 2j,3\n")
with assert_warns(DeprecationWarning):
@@ -195,7 +200,7 @@ class TestFileBased:
# Spaces at wrong places
with temppath() as path:
- with open(path, 'wt') as f:
+ with open(path, 'w') as f:
f.write("1 +2j,3\n")
with assert_warns(DeprecationWarning):
@@ -204,7 +209,7 @@ class TestFileBased:
# Spaces at wrong places
with temppath() as path:
- with open(path, 'wt') as f:
+ with open(path, 'w') as f:
f.write("1+j\n")
with assert_warns(DeprecationWarning):
@@ -213,7 +218,7 @@ class TestFileBased:
# Spaces at wrong places
with temppath() as path:
- with open(path, 'wt') as f:
+ with open(path, 'w') as f:
f.write("1+\n")
with assert_warns(DeprecationWarning):
@@ -222,7 +227,7 @@ class TestFileBased:
# Spaces at wrong places
with temppath() as path:
- with open(path, 'wt') as f:
+ with open(path, 'w') as f:
f.write("1j+1\n")
with assert_warns(DeprecationWarning):
@@ -235,7 +240,7 @@ class TestFileBased:
reason="Need strtold_l")
def test_fromfile(self):
with temppath() as path:
- with open(path, 'wt') as f:
+ with open(path, 'w') as f:
f.write(self.out)
res = np.fromfile(path, dtype=np.longdouble, sep="\n")
assert_equal(res, self.tgt)
@@ -244,7 +249,7 @@ class TestFileBased:
reason="Need strtold_l")
def test_genfromtxt(self):
with temppath() as path:
- with open(path, 'wt') as f:
+ with open(path, 'w') as f:
f.write(self.out)
res = np.genfromtxt(path, dtype=np.longdouble)
assert_equal(res, self.tgt)
@@ -253,7 +258,7 @@ class TestFileBased:
reason="Need strtold_l")
def test_loadtxt(self):
with temppath() as path:
- with open(path, 'wt') as f:
+ with open(path, 'w') as f:
f.write(self.out)
res = np.loadtxt(path, dtype=np.longdouble)
assert_equal(res, self.tgt)
@@ -368,3 +373,23 @@ def test_longdouble_from_int(int_val):
True, False])
def test_longdouble_from_bool(bool_val):
assert np.longdouble(bool_val) == np.longdouble(int(bool_val))
+
+
+@pytest.mark.skipif(
+ not (IS_MUSL and platform.machine() == "x86_64"),
+ reason="only need to run on musllinux_x86_64"
+)
+def test_musllinux_x86_64_signature():
+ # this test may fail if you're emulating musllinux_x86_64 on a different
+ # architecture, but should pass natively.
+ known_sigs = [b'\xcd\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xfb\xbf']
+ sig = (np.longdouble(-1.0) / np.longdouble(10.0)
+ ).newbyteorder('<').tobytes()[:10]
+ assert sig in known_sigs
+
+
+def test_eps_positive():
+ # np.finfo('g').eps should be positive on all platforms. If this isn't true
+ # then something may have gone wrong with the MachArLike, e.g. if
+ # np.core.getlimits._discovered_machar didn't work properly
+ assert np.finfo(np.longdouble).eps > 0.
diff --git a/numpy/core/tests/test_mem_overlap.py b/numpy/core/tests/test_mem_overlap.py
index d66decfda..1fd4c4d41 100644
--- a/numpy/core/tests/test_mem_overlap.py
+++ b/numpy/core/tests/test_mem_overlap.py
@@ -55,7 +55,7 @@ def _indices(ndims):
def _check_assignment(srcidx, dstidx):
"""Check assignment arr[dstidx] = arr[srcidx] works."""
- arr = np.arange(np.product(shape)).reshape(shape)
+ arr = np.arange(np.prod(shape)).reshape(shape)
cpy = arr.copy()
diff --git a/numpy/core/tests/test_mem_policy.py b/numpy/core/tests/test_mem_policy.py
index d5dfbc38b..479f702ea 100644
--- a/numpy/core/tests/test_mem_policy.py
+++ b/numpy/core/tests/test_mem_policy.py
@@ -89,6 +89,7 @@ def get_module(tmp_path):
"""),
]
prologue = '''
+ #define NPY_TARGET_VERSION NPY_1_22_API_VERSION
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/arrayobject.h>
/*
diff --git a/numpy/core/tests/test_memmap.py b/numpy/core/tests/test_memmap.py
index 914f86f14..ad074b312 100644
--- a/numpy/core/tests/test_memmap.py
+++ b/numpy/core/tests/test_memmap.py
@@ -6,7 +6,7 @@ from pathlib import Path
from tempfile import NamedTemporaryFile, TemporaryFile
from numpy import (
- memmap, sum, average, product, ndarray, isscalar, add, subtract, multiply)
+ memmap, sum, average, prod, ndarray, isscalar, add, subtract, multiply)
from numpy import arange, allclose, asarray
from numpy.testing import (
@@ -153,7 +153,7 @@ class TestMemmap:
with suppress_warnings() as sup:
sup.filter(FutureWarning, "np.average currently does not preserve")
- for unary_op in [sum, average, product]:
+ for unary_op in [sum, average, prod]:
result = unary_op(fp)
assert_(isscalar(result))
assert_(result.__class__ is self.data[0, 0].__class__)
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 63ac32f20..196c2dc13 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
import collections.abc
import tempfile
import sys
@@ -28,11 +30,12 @@ from numpy.testing import (
assert_, assert_raises, assert_warns, assert_equal, assert_almost_equal,
assert_array_equal, assert_raises_regex, assert_array_almost_equal,
assert_allclose, IS_PYPY, IS_PYSTON, HAS_REFCOUNT, assert_array_less,
- runstring, temppath, suppress_warnings, break_cycles,
+ runstring, temppath, suppress_warnings, break_cycles, _SUPPORTS_SVE,
)
from numpy.testing._private.utils import requires_memory, _no_tracing
from numpy.core.tests._locales import CommaDecimalPointLocale
from numpy.lib.recfunctions import repack_fields
+from numpy.core.multiarray import _get_ndarray_c_version
# Need to test an object that does not fully implement math interface
from datetime import timedelta, datetime
@@ -403,6 +406,13 @@ class TestAttributes:
assert_array_equal(x['a'], [3.5, 3.5])
assert_array_equal(x['b'], [-2, -2])
+ def test_fill_readonly(self):
+ # gh-22922
+ a = np.zeros(11)
+ a.setflags(write=False)
+ with pytest.raises(ValueError, match=".*read-only"):
+ a.fill(0)
+
class TestArrayConstruction:
def test_array(self):
@@ -1186,6 +1196,17 @@ class TestCreation:
expected = expected * (arr.nbytes // len(expected))
assert arr.tobytes() == expected
+ @pytest.mark.parametrize("func", [
+ np.array, np.asarray, np.asanyarray, np.ascontiguousarray,
+ np.asfortranarray])
+ def test_creation_from_dtypemeta(self, func):
+ dtype = np.dtype('i')
+ arr1 = func([1, 2, 3], dtype=dtype)
+ arr2 = func([1, 2, 3], dtype=type(dtype))
+ assert_array_equal(arr1, arr2)
+ assert arr2.dtype == dtype
+
+
class TestStructured:
def test_subarray_field_access(self):
a = np.zeros((3, 5), dtype=[('a', ('i4', (2, 2)))])
@@ -1698,7 +1719,7 @@ class TestBool:
@pytest.mark.xfail(reason="See gh-9847")
def test_cast_from_unicode(self):
- self._test_cast_from_flexible(np.unicode_)
+ self._test_cast_from_flexible(np.str_)
@pytest.mark.xfail(reason="See gh-9847")
def test_cast_from_bytes(self):
@@ -1913,8 +1934,9 @@ class TestMethods:
assert_array_equal(a2.prod(axis=-1),
np.array([24, 1890, 600], ctype))
- def test_repeat(self):
- m = np.array([1, 2, 3, 4, 5, 6])
+ @pytest.mark.parametrize('dtype', [None, object])
+ def test_repeat(self, dtype):
+ m = np.array([1, 2, 3, 4, 5, 6], dtype=dtype)
m_rect = m.reshape((2, 3))
A = m.repeat([1, 3, 2, 1, 1, 2])
@@ -2080,7 +2102,7 @@ class TestMethods:
msg = 'byte-swapped complex sort, dtype={0}'.format(dt)
assert_equal(c, arr, msg)
- @pytest.mark.parametrize('dtype', [np.bytes_, np.unicode_])
+ @pytest.mark.parametrize('dtype', [np.bytes_, np.str_])
def test_sort_string(self, dtype):
# np.array will perform the encoding to bytes for us in the bytes test
a = np.array(['aaaaaaaa' + chr(i) for i in range(101)], dtype=dtype)
@@ -2108,19 +2130,26 @@ class TestMethods:
c.sort(kind=kind)
assert_equal(c, a, msg)
- def test_sort_structured(self):
+ @pytest.mark.parametrize("dt", [
+ np.dtype([('f', float), ('i', int)]),
+ np.dtype([('f', float), ('i', object)])])
+ @pytest.mark.parametrize("step", [1, 2])
+ def test_sort_structured(self, dt, step):
# test record array sorts.
- dt = np.dtype([('f', float), ('i', int)])
- a = np.array([(i, i) for i in range(101)], dtype=dt)
+ a = np.array([(i, i) for i in range(101*step)], dtype=dt)
b = a[::-1]
for kind in ['q', 'h', 'm']:
msg = "kind=%s" % kind
- c = a.copy()
+ c = a.copy()[::step]
+ indx = c.argsort(kind=kind)
c.sort(kind=kind)
- assert_equal(c, a, msg)
- c = b.copy()
+ assert_equal(c, a[::step], msg)
+ assert_equal(a[::step][indx], a[::step], msg)
+ c = b.copy()[::step]
+ indx = c.argsort(kind=kind)
c.sort(kind=kind)
- assert_equal(c, a, msg)
+ assert_equal(c, a[step-1::step], msg)
+ assert_equal(b[::step][indx], a[step-1::step], msg)
@pytest.mark.parametrize('dtype', ['datetime64[D]', 'timedelta64[D]'])
def test_sort_time(self, dtype):
@@ -2354,7 +2383,7 @@ class TestMethods:
# test unicode argsorts.
s = 'aaaaaaaa'
- a = np.array([s + chr(i) for i in range(101)], dtype=np.unicode_)
+ a = np.array([s + chr(i) for i in range(101)], dtype=np.str_)
b = a[::-1]
r = np.arange(101)
rr = r[::-1]
@@ -2437,7 +2466,7 @@ class TestMethods:
a = np.array(['aaaaaaaaa' for i in range(100)])
assert_equal(a.argsort(kind='m'), r)
# unicode
- a = np.array(['aaaaaaaaa' for i in range(100)], dtype=np.unicode_)
+ a = np.array(['aaaaaaaaa' for i in range(100)], dtype=np.str_)
assert_equal(a.argsort(kind='m'), r)
def test_sort_unicode_kind(self):
@@ -2581,7 +2610,7 @@ class TestMethods:
'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100197_1',
'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100198_1',
'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100199_1'],
- dtype=np.unicode_)
+ dtype=np.str_)
ind = np.arange(len(a))
assert_equal([a.searchsorted(v, 'left') for v in a], ind)
assert_equal([a.searchsorted(v, 'right') for v in a], ind + 1)
@@ -3616,9 +3645,13 @@ class TestMethods:
msg = 'dtype: {0}'.format(dt)
ap = complex(a)
assert_equal(ap, a, msg)
- bp = complex(b)
+
+ with assert_warns(DeprecationWarning):
+ bp = complex(b)
assert_equal(bp, b, msg)
- cp = complex(c)
+
+ with assert_warns(DeprecationWarning):
+ cp = complex(c)
assert_equal(cp, c, msg)
def test__complex__should_not_work(self):
@@ -3641,7 +3674,8 @@ class TestMethods:
assert_raises(TypeError, complex, d)
e = np.array(['1+1j'], 'U')
- assert_raises(TypeError, complex, e)
+ with assert_warns(DeprecationWarning):
+ assert_raises(TypeError, complex, e)
class TestCequenceMethods:
def test_array_contains(self):
@@ -3703,7 +3737,7 @@ class TestBinop:
'and': (np.bitwise_and, True, int),
'xor': (np.bitwise_xor, True, int),
'or': (np.bitwise_or, True, int),
- 'matmul': (np.matmul, False, float),
+ 'matmul': (np.matmul, True, float),
# 'ge': (np.less_equal, False),
# 'gt': (np.less, False),
# 'le': (np.greater_equal, False),
@@ -4721,23 +4755,23 @@ class TestArgmax:
a = np.array([1, 2**7 - 1, -2**7], dtype=np.int8)
assert_equal(np.argmax(a), 1)
- a.repeat(129)
- assert_equal(np.argmax(a), 1)
+ a = a.repeat(129)
+ assert_equal(np.argmax(a), 129)
a = np.array([1, 2**15 - 1, -2**15], dtype=np.int16)
assert_equal(np.argmax(a), 1)
- a.repeat(129)
- assert_equal(np.argmax(a), 1)
+ a = a.repeat(129)
+ assert_equal(np.argmax(a), 129)
a = np.array([1, 2**31 - 1, -2**31], dtype=np.int32)
assert_equal(np.argmax(a), 1)
- a.repeat(129)
- assert_equal(np.argmax(a), 1)
+ a = a.repeat(129)
+ assert_equal(np.argmax(a), 129)
a = np.array([1, 2**63 - 1, -2**63], dtype=np.int64)
assert_equal(np.argmax(a), 1)
- a.repeat(129)
- assert_equal(np.argmax(a), 1)
+ a = a.repeat(129)
+ assert_equal(np.argmax(a), 129)
class TestArgmin:
usg_data = [
@@ -4863,23 +4897,23 @@ class TestArgmin:
a = np.array([1, -2**7, -2**7 + 1, 2**7 - 1], dtype=np.int8)
assert_equal(np.argmin(a), 1)
- a.repeat(129)
- assert_equal(np.argmin(a), 1)
+ a = a.repeat(129)
+ assert_equal(np.argmin(a), 129)
a = np.array([1, -2**15, -2**15 + 1, 2**15 - 1], dtype=np.int16)
assert_equal(np.argmin(a), 1)
- a.repeat(129)
- assert_equal(np.argmin(a), 1)
+ a = a.repeat(129)
+ assert_equal(np.argmin(a), 129)
a = np.array([1, -2**31, -2**31 + 1, 2**31 - 1], dtype=np.int32)
assert_equal(np.argmin(a), 1)
- a.repeat(129)
- assert_equal(np.argmin(a), 1)
+ a = a.repeat(129)
+ assert_equal(np.argmin(a), 129)
a = np.array([1, -2**63, -2**63 + 1, 2**63 - 1], dtype=np.int64)
assert_equal(np.argmin(a), 1)
- a.repeat(129)
- assert_equal(np.argmin(a), 1)
+ a = a.repeat(129)
+ assert_equal(np.argmin(a), 129)
class TestMinMax:
@@ -5077,6 +5111,22 @@ class TestPutmask:
with pytest.raises(ValueError):
np.putmask(a, a >= 2, 3)
+ def test_kwargs(self):
+ x = np.array([0, 0])
+ np.putmask(x, [0, 1], [-1, -2])
+ assert_array_equal(x, [0, -2])
+
+ x = np.array([0, 0])
+ np.putmask(x, mask=[0, 1], values=[-1, -2])
+ assert_array_equal(x, [0, -2])
+
+ x = np.array([0, 0])
+ np.putmask(x, values=[-1, -2], mask=[0, 1])
+ assert_array_equal(x, [0, -2])
+
+ with pytest.raises(TypeError):
+ np.putmask(a=x, values=[-1, -2], mask=[0, 1])
+
class TestTake:
def tst_basic(self, x):
@@ -5528,33 +5578,6 @@ class TestIO:
tmp_filename,
dtype='<f4')
- @pytest.mark.slow # takes > 1 minute on mechanical hard drive
- def test_big_binary(self):
- """Test workarounds for 32-bit limit for MSVC fwrite, fseek, and ftell
-
- These normally would hang doing something like this.
- See : https://github.com/numpy/numpy/issues/2256
- """
- if sys.platform != 'win32' or '[GCC ' in sys.version:
- return
- try:
- # before workarounds, only up to 2**32-1 worked
- fourgbplus = 2**32 + 2**16
- testbytes = np.arange(8, dtype=np.int8)
- n = len(testbytes)
- flike = tempfile.NamedTemporaryFile()
- f = flike.file
- np.tile(testbytes, fourgbplus // testbytes.nbytes).tofile(f)
- flike.seek(0)
- a = np.fromfile(f, dtype=np.int8)
- flike.close()
- assert_(len(a) == fourgbplus)
- # check only start and end for speed:
- assert_((a[:n] == testbytes).all())
- assert_((a[-n:] == testbytes).all())
- except (MemoryError, ValueError):
- pass
-
def test_string(self, tmp_filename):
self._check_from(b'1,2,3,4', [1., 2., 3., 4.], tmp_filename, sep=',')
@@ -6658,6 +6681,22 @@ class TestDot:
r = np.empty((1024, 32), dtype=int)
assert_raises(ValueError, dot, f, v, r)
+ def test_dot_out_result(self):
+ x = np.ones((), dtype=np.float16)
+ y = np.ones((5,), dtype=np.float16)
+ z = np.zeros((5,), dtype=np.float16)
+ res = x.dot(y, out=z)
+ assert np.array_equal(res, y)
+ assert np.array_equal(z, y)
+
+ def test_dot_out_aliasing(self):
+ x = np.ones((), dtype=np.float16)
+ y = np.ones((5,), dtype=np.float16)
+ z = np.zeros((5,), dtype=np.float16)
+ res = x.dot(y, out=z)
+ z[0] = 2
+ assert np.array_equal(res, z)
+
def test_dot_array_order(self):
a = np.array([[1, 2], [3, 4]], order='C')
b = np.array([[1, 2], [3, 4]], order='F')
@@ -7165,16 +7204,69 @@ class TestMatmulOperator(MatmulCommon):
assert_raises(TypeError, self.matmul, np.void(b'abc'), np.void(b'abc'))
assert_raises(TypeError, self.matmul, np.arange(10), np.void(b'abc'))
-def test_matmul_inplace():
- # It would be nice to support in-place matmul eventually, but for now
- # we don't have a working implementation, so better just to error out
- # and nudge people to writing "a = a @ b".
- a = np.eye(3)
- b = np.eye(3)
- assert_raises(TypeError, a.__imatmul__, b)
- import operator
- assert_raises(TypeError, operator.imatmul, a, b)
- assert_raises(TypeError, exec, "a @= b", globals(), locals())
+
+class TestMatmulInplace:
+ DTYPES = {}
+ for i in MatmulCommon.types:
+ for j in MatmulCommon.types:
+ if np.can_cast(j, i):
+ DTYPES[f"{i}-{j}"] = (np.dtype(i), np.dtype(j))
+
+ @pytest.mark.parametrize("dtype1,dtype2", DTYPES.values(), ids=DTYPES)
+ def test_basic(self, dtype1: np.dtype, dtype2: np.dtype) -> None:
+ a = np.arange(10).reshape(5, 2).astype(dtype1)
+ a_id = id(a)
+ b = np.ones((2, 2), dtype=dtype2)
+
+ ref = a @ b
+ a @= b
+
+ assert id(a) == a_id
+ assert a.dtype == dtype1
+ assert a.shape == (5, 2)
+ if dtype1.kind in "fc":
+ np.testing.assert_allclose(a, ref)
+ else:
+ np.testing.assert_array_equal(a, ref)
+
+ SHAPES = {
+ "2d_large": ((10**5, 10), (10, 10)),
+ "3d_large": ((10**4, 10, 10), (1, 10, 10)),
+ "1d": ((3,), (3,)),
+ "2d_1d": ((3, 3), (3,)),
+ "1d_2d": ((3,), (3, 3)),
+ "2d_broadcast": ((3, 3), (3, 1)),
+ "2d_broadcast_reverse": ((1, 3), (3, 3)),
+ "3d_broadcast1": ((3, 3, 3), (1, 3, 1)),
+ "3d_broadcast2": ((3, 3, 3), (1, 3, 3)),
+ "3d_broadcast3": ((3, 3, 3), (3, 3, 1)),
+ "3d_broadcast_reverse1": ((1, 3, 3), (3, 3, 3)),
+ "3d_broadcast_reverse2": ((3, 1, 3), (3, 3, 3)),
+ "3d_broadcast_reverse3": ((1, 1, 3), (3, 3, 3)),
+ }
+
+ @pytest.mark.parametrize("a_shape,b_shape", SHAPES.values(), ids=SHAPES)
+ def test_shapes(self, a_shape: tuple[int, ...], b_shape: tuple[int, ...]):
+ a_size = np.prod(a_shape)
+ a = np.arange(a_size).reshape(a_shape).astype(np.float64)
+ a_id = id(a)
+
+ b_size = np.prod(b_shape)
+ b = np.arange(b_size).reshape(b_shape)
+
+ ref = a @ b
+ if ref.shape != a_shape:
+ with pytest.raises(ValueError):
+ a @= b
+ return
+ else:
+ a @= b
+
+ assert id(a) == a_id
+ assert a.dtype.type == np.float64
+ assert a.shape == a_shape
+ np.testing.assert_allclose(a, ref)
+
def test_matmul_axes():
a = np.arange(3*4*5).reshape(3, 4, 5)
@@ -8670,14 +8762,16 @@ class TestConversion:
int_funcs = (int, lambda x: x.__int__())
for int_func in int_funcs:
assert_equal(int_func(np.array(0)), 0)
- assert_equal(int_func(np.array([1])), 1)
- assert_equal(int_func(np.array([[42]])), 42)
+ with assert_warns(DeprecationWarning):
+ assert_equal(int_func(np.array([1])), 1)
+ with assert_warns(DeprecationWarning):
+ assert_equal(int_func(np.array([[42]])), 42)
assert_raises(TypeError, int_func, np.array([1, 2]))
# gh-9972
assert_equal(4, int_func(np.array('4')))
assert_equal(5, int_func(np.bytes_(b'5')))
- assert_equal(6, int_func(np.unicode_('6')))
+ assert_equal(6, int_func(np.str_('6')))
# The delegation of int() to __trunc__ was deprecated in
# Python 3.11.
@@ -8686,7 +8780,8 @@ class TestConversion:
def __trunc__(self):
return 3
assert_equal(3, int_func(np.array(HasTrunc())))
- assert_equal(3, int_func(np.array([HasTrunc()])))
+ with assert_warns(DeprecationWarning):
+ assert_equal(3, int_func(np.array([HasTrunc()])))
else:
pass
@@ -8695,8 +8790,9 @@ class TestConversion:
raise NotImplementedError
assert_raises(NotImplementedError,
int_func, np.array(NotConvertible()))
- assert_raises(NotImplementedError,
- int_func, np.array([NotConvertible()]))
+ with assert_warns(DeprecationWarning):
+ assert_raises(NotImplementedError,
+ int_func, np.array([NotConvertible()]))
class TestWhere:
@@ -8863,6 +8959,11 @@ class TestWhere:
result = array.nonzero()
assert_array_equal(benchmark, result)
+ def test_kwargs(self):
+ a = np.zeros(1)
+ with assert_raises(TypeError):
+ np.where(a, x=a, y=a)
+
if not IS_PYPY:
# sys.getsizeof() is not valid on PyPy
@@ -9044,33 +9145,33 @@ class TestUnicodeEncoding:
def test_assign_scalar(self):
# gh-3258
l = np.array(['aa', 'bb'])
- l[:] = np.unicode_('cc')
+ l[:] = np.str_('cc')
assert_equal(l, ['cc', 'cc'])
def test_fill_scalar(self):
# gh-7227
l = np.array(['aa', 'bb'])
- l.fill(np.unicode_('cc'))
+ l.fill(np.str_('cc'))
assert_equal(l, ['cc', 'cc'])
class TestUnicodeArrayNonzero:
def test_empty_ustring_array_is_falsey(self):
- assert_(not np.array([''], dtype=np.unicode_))
+ assert_(not np.array([''], dtype=np.str_))
def test_whitespace_ustring_array_is_falsey(self):
- a = np.array(['eggs'], dtype=np.unicode_)
+ a = np.array(['eggs'], dtype=np.str_)
a[0] = ' \0\0'
assert_(not a)
def test_all_null_ustring_array_is_falsey(self):
- a = np.array(['eggs'], dtype=np.unicode_)
+ a = np.array(['eggs'], dtype=np.str_)
a[0] = '\0\0\0\0'
assert_(not a)
def test_null_inside_ustring_array_is_truthy(self):
- a = np.array(['eggs'], dtype=np.unicode_)
+ a = np.array(['eggs'], dtype=np.str_)
a[0] = ' \0 \0'
assert_(a)
@@ -9505,7 +9606,7 @@ def test_equal_override():
@pytest.mark.parametrize("op", [operator.eq, operator.ne])
@pytest.mark.parametrize(["dt1", "dt2"], [
- ([("f", "i")], [("f", "i")]), # structured comparison (successfull)
+ ([("f", "i")], [("f", "i")]), # structured comparison (successful)
("M8", "d"), # impossible comparison: result is all True or False
("d", "d"), # valid comparison
])
@@ -9828,40 +9929,50 @@ class TestViewDtype:
assert_array_equal(x.view('<i2'), expected)
+@pytest.mark.xfail(_SUPPORTS_SVE, reason="gh-22982")
# Test various array sizes that hit different code paths in quicksort-avx512
-@pytest.mark.parametrize("N", [8, 16, 24, 32, 48, 64, 96, 128, 151, 191,
- 256, 383, 512, 1023, 2047])
-def test_sort_float(N):
+@pytest.mark.parametrize("N", np.arange(1, 512))
+@pytest.mark.parametrize("dtype", ['e', 'f', 'd'])
+def test_sort_float(N, dtype):
# Regular data with nan sprinkled
np.random.seed(42)
- arr = -0.5 + np.random.sample(N).astype('f')
+ arr = -0.5 + np.random.sample(N).astype(dtype)
arr[np.random.choice(arr.shape[0], 3)] = np.nan
assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap'))
# (2) with +INF
- infarr = np.inf*np.ones(N, dtype='f')
+ infarr = np.inf*np.ones(N, dtype=dtype)
infarr[np.random.choice(infarr.shape[0], 5)] = -1.0
assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap'))
# (3) with -INF
- neginfarr = -np.inf*np.ones(N, dtype='f')
+ neginfarr = -np.inf*np.ones(N, dtype=dtype)
neginfarr[np.random.choice(neginfarr.shape[0], 5)] = 1.0
assert_equal(np.sort(neginfarr, kind='quick'),
np.sort(neginfarr, kind='heap'))
# (4) with +/-INF
- infarr = np.inf*np.ones(N, dtype='f')
+ infarr = np.inf*np.ones(N, dtype=dtype)
infarr[np.random.choice(infarr.shape[0], (int)(N/2))] = -np.inf
assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap'))
-
-def test_sort_int():
- # Random data with NPY_MAX_INT32 and NPY_MIN_INT32 sprinkled
- rng = np.random.default_rng(42)
- N = 2047
- minv = np.iinfo(np.int32).min
- maxv = np.iinfo(np.int32).max
- arr = rng.integers(low=minv, high=maxv, size=N).astype('int32')
+def test_sort_float16():
+ arr = np.arange(65536, dtype=np.int16)
+ temp = np.frombuffer(arr.tobytes(), dtype=np.float16)
+ data = np.copy(temp)
+ np.random.shuffle(data)
+ data_backup = data
+ assert_equal(np.sort(data, kind='quick'),
+ np.sort(data_backup, kind='heap'))
+
+
+@pytest.mark.parametrize("N", np.arange(1, 512))
+@pytest.mark.parametrize("dtype", ['h', 'H', 'i', 'I', 'l', 'L'])
+def test_sort_int(N, dtype):
+ # Random data with MAX and MIN sprinkled
+ minv = np.iinfo(dtype).min
+ maxv = np.iinfo(dtype).max
+ arr = np.random.randint(low=minv, high=maxv-1, size=N, dtype=dtype)
arr[np.random.choice(arr.shape[0], 10)] = minv
arr[np.random.choice(arr.shape[0], 10)] = maxv
assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap'))
@@ -9875,3 +9986,6 @@ def test_sort_uint():
arr = rng.integers(low=0, high=maxv, size=N).astype('uint32')
arr[np.random.choice(arr.shape[0], 10)] = maxv
assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap'))
+
+def test_private_get_ndarray_c_version():
+ assert isinstance(_get_ndarray_c_version(), int)
diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py
index b88afdfa1..9f639c4c1 100644
--- a/numpy/core/tests/test_nditer.py
+++ b/numpy/core/tests/test_nditer.py
@@ -1457,6 +1457,38 @@ def test_iter_copy_casts_structured():
assert_array_equal(res2["b"][field], expected)
+def test_iter_copy_casts_structured2():
+ # Similar to the above, this is a fairly arcane test to cover internals
+ in_dtype = np.dtype([("a", np.dtype("O,O")),
+ ("b", np.dtype("(5)O,(3)O,(1,)O,(1,)i,(1,)O"))])
+ out_dtype = np.dtype([("a", np.dtype("O")),
+ ("b", np.dtype("O,(3)i,(4)O,(4)O,(4)i"))])
+
+ arr = np.ones(1, dtype=in_dtype)
+ it = np.nditer((arr,), ["buffered", "external_loop", "refs_ok"],
+ op_dtypes=[out_dtype], casting="unsafe")
+ it_copy = it.copy()
+
+ res1 = next(it)
+ del it
+ res2 = next(it_copy)
+ del it_copy
+
+ # Array of two structured scalars:
+ for res in res1, res2:
+ # Cast to tuple by getitem, which may be weird and changeable?:
+ assert type(res["a"][0]) == tuple
+ assert res["a"][0] == (1, 1)
+
+ for res in res1, res2:
+ assert_array_equal(res["b"]["f0"][0], np.ones(5, dtype=object))
+ assert_array_equal(res["b"]["f1"], np.ones((1, 3), dtype="i"))
+ assert res["b"]["f2"].shape == (1, 4)
+ assert_array_equal(res["b"]["f2"][0], np.ones(4, dtype=object))
+ assert_array_equal(res["b"]["f3"][0], np.ones(4, dtype=object))
+ assert_array_equal(res["b"]["f3"][0], np.ones(4, dtype="i"))
+
+
def test_iter_allocate_output_simple():
# Check that the iterator will properly allocate outputs
@@ -2257,7 +2289,7 @@ def test_iter_buffering_string():
assert_equal(i[0], b'abc')
assert_equal(i[0].dtype, np.dtype('S6'))
- a = np.array(['abc', 'a', 'abcd'], dtype=np.unicode_)
+ a = np.array(['abc', 'a', 'abcd'], dtype=np.str_)
assert_equal(a.dtype, np.dtype('U4'))
assert_raises(TypeError, nditer, a, ['buffered'], ['readonly'],
op_dtypes='U2')
diff --git a/numpy/core/tests/test_nep50_promotions.py b/numpy/core/tests/test_nep50_promotions.py
index 3c0316960..7d52c5089 100644
--- a/numpy/core/tests/test_nep50_promotions.py
+++ b/numpy/core/tests/test_nep50_promotions.py
@@ -131,7 +131,7 @@ def test_nep50_weak_integers_with_inexact(dtype):
@pytest.mark.parametrize("op", [operator.add, operator.pow, operator.eq])
def test_weak_promotion_scalar_path(op):
- # Some additional paths excercising the weak scalars.
+ # Some additional paths exercising the weak scalars.
np._set_promotion_state("weak")
# Integer path:
@@ -180,3 +180,55 @@ def test_nep50_integer_regression():
arr = np.array(1)
assert (arr + 2**63).dtype == np.float64
assert (arr[()] + 2**63).dtype == np.float64
+
+
+def test_nep50_with_axisconcatenator():
+ # I promised that this will be an error in the future in the 1.25
+ # release notes; test this (NEP 50 opt-in makes the deprecation an error).
+ np._set_promotion_state("weak")
+
+ with pytest.raises(OverflowError):
+ np.r_[np.arange(5, dtype=np.int8), 255]
+
+
+@pytest.mark.parametrize("ufunc", [np.add, np.power])
+@pytest.mark.parametrize("state", ["weak", "weak_and_warn"])
+def test_nep50_huge_integers(ufunc, state):
+ # Very large integers are complicated, because they go to uint64 or
+ # object dtype. This test covers a few possible paths (some of which
+ # cannot give the NEP 50 warnings).
+ np._set_promotion_state(state)
+
+ with pytest.raises(OverflowError):
+ ufunc(np.int64(0), 2**63) # 2**63 too large for int64
+
+ if state == "weak_and_warn":
+ with pytest.warns(UserWarning,
+ match="result dtype changed.*float64.*uint64"):
+ with pytest.raises(OverflowError):
+ ufunc(np.uint64(0), 2**64)
+ else:
+ with pytest.raises(OverflowError):
+ ufunc(np.uint64(0), 2**64) # 2**64 cannot be represented by uint64
+
+ # However, 2**63 can be represented by the uint64 (and that is used):
+ if state == "weak_and_warn":
+ with pytest.warns(UserWarning,
+ match="result dtype changed.*float64.*uint64"):
+ res = ufunc(np.uint64(1), 2**63)
+ else:
+ res = ufunc(np.uint64(1), 2**63)
+
+ assert res.dtype == np.uint64
+ assert res == ufunc(1, 2**63, dtype=object)
+
+ # The following paths fail to warn correctly about the change:
+ with pytest.raises(OverflowError):
+ ufunc(np.int64(1), 2**63) # np.array(2**63) would go to uint
+
+ with pytest.raises(OverflowError):
+ ufunc(np.int64(1), 2**100) # np.array(2**100) would go to object
+
+ # This would go to object and thus a Python float, not a NumPy one:
+ res = ufunc(1.0, 2**100)
+ assert isinstance(res, np.float64)
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 3cc168b34..832a47c92 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -114,7 +114,9 @@ class TestNonarrayArgs:
def test_cumproduct(self):
A = [[1, 2, 3], [4, 5, 6]]
- assert_(np.all(np.cumproduct(A) == np.array([1, 2, 6, 24, 120, 720])))
+ with assert_warns(DeprecationWarning):
+ expected = np.array([1, 2, 6, 24, 120, 720])
+ assert_(np.all(np.cumproduct(A) == expected))
def test_diagonal(self):
a = [[0, 1, 2, 3],
@@ -1193,8 +1195,8 @@ class TestFromiter:
expected = np.array(list(self.makegen()))
a = np.fromiter(self.makegen(), int)
a20 = np.fromiter(self.makegen(), int, 20)
- assert_(np.alltrue(a == expected, axis=0))
- assert_(np.alltrue(a20 == expected[:20], axis=0))
+ assert_(np.all(a == expected, axis=0))
+ assert_(np.all(a20 == expected[:20], axis=0))
def load_data(self, n, eindex):
# Utility method for the issue 2592 tests.
@@ -1822,20 +1824,11 @@ class TestClip:
self.nr = 5
self.nc = 3
- def fastclip(self, a, m, M, out=None, casting=None):
- if out is None:
- if casting is None:
- return a.clip(m, M)
- else:
- return a.clip(m, M, casting=casting)
- else:
- if casting is None:
- return a.clip(m, M, out)
- else:
- return a.clip(m, M, out, casting=casting)
+ def fastclip(self, a, m, M, out=None, **kwargs):
+ return a.clip(m, M, out=out, **kwargs)
def clip(self, a, m, M, out=None):
- # use slow-clip
+ # use a.choose to verify fastclip result
selector = np.less(a, m) + 2*np.greater(a, M)
return selector.choose((a, m, M), out=out)
@@ -1991,14 +1984,13 @@ class TestClip:
ac = np.zeros(a.shape, dtype=np.int32)
act = ac.copy()
if casting is None:
- with assert_warns(DeprecationWarning):
- # NumPy 1.17.0, 2018-02-24 - casting is unsafe
+ with pytest.raises(TypeError):
self.fastclip(a, m, M, ac, casting=casting)
else:
# explicitly passing "unsafe" will silence warning
self.fastclip(a, m, M, ac, casting=casting)
- self.clip(a, m, M, act)
- assert_array_strict_equal(ac, act)
+ self.clip(a, m, M, act)
+ assert_array_strict_equal(ac, act)
def test_simple_int64_out(self):
# Test native int32 input with int32 scalar min/max and int64 out.
@@ -2018,9 +2010,7 @@ class TestClip:
M = np.float64(1)
ac = np.zeros(a.shape, dtype=np.int32)
act = ac.copy()
- with assert_warns(DeprecationWarning):
- # NumPy 1.17.0, 2018-02-24 - casting is unsafe
- self.fastclip(a, m, M, ac)
+ self.fastclip(a, m, M, out=ac, casting="unsafe")
self.clip(a, m, M, act)
assert_array_strict_equal(ac, act)
@@ -2031,9 +2021,7 @@ class TestClip:
M = 2.0
ac = np.zeros(a.shape, dtype=np.int32)
act = ac.copy()
- with assert_warns(DeprecationWarning):
- # NumPy 1.17.0, 2018-02-24 - casting is unsafe
- self.fastclip(a, m, M, ac)
+ self.fastclip(a, m, M, out=ac, casting="unsafe")
self.clip(a, m, M, act)
assert_array_strict_equal(ac, act)
@@ -2209,9 +2197,7 @@ class TestClip:
M = np.float64(2)
ac = np.zeros(a.shape, dtype=np.int32)
act = ac.copy()
- with assert_warns(DeprecationWarning):
- # NumPy 1.17.0, 2018-02-24 - casting is unsafe
- self.fastclip(a, m, M, ac)
+ self.fastclip(a, m, M, out=ac, casting="unsafe")
self.clip(a, m, M, act)
assert_array_strict_equal(ac, act)
@@ -2233,9 +2219,7 @@ class TestClip:
M = np.float64(1)
ac = np.zeros(a.shape, dtype=np.int32)
act = ac.copy()
- with assert_warns(DeprecationWarning):
- # NumPy 1.17.0, 2018-02-24 - casting is unsafe
- self.fastclip(a, m, M, ac)
+ self.fastclip(a, m, M, out=ac, casting="unsafe")
self.clip(a, m, M, act)
assert_array_strict_equal(ac, act)
@@ -2246,9 +2230,7 @@ class TestClip:
M = 2.0
ac = np.zeros(a.shape, dtype=np.int32)
act = ac.copy()
- with assert_warns(DeprecationWarning):
- # NumPy 1.17.0, 2018-02-24 - casting is unsafe
- self.fastclip(a, m, M, ac)
+ self.fastclip(a, m, M, out=ac, casting="unsafe")
self.clip(a, m, M, act)
assert_array_strict_equal(ac, act)
@@ -2301,16 +2283,11 @@ class TestClip:
def test_clip_nan(self):
d = np.arange(7.)
- with assert_warns(DeprecationWarning):
- assert_equal(d.clip(min=np.nan), d)
- with assert_warns(DeprecationWarning):
- assert_equal(d.clip(max=np.nan), d)
- with assert_warns(DeprecationWarning):
- assert_equal(d.clip(min=np.nan, max=np.nan), d)
- with assert_warns(DeprecationWarning):
- assert_equal(d.clip(min=-2, max=np.nan), d)
- with assert_warns(DeprecationWarning):
- assert_equal(d.clip(min=np.nan, max=10), d)
+ assert_equal(d.clip(min=np.nan), np.nan)
+ assert_equal(d.clip(max=np.nan), np.nan)
+ assert_equal(d.clip(min=np.nan, max=np.nan), np.nan)
+ assert_equal(d.clip(min=-2, max=np.nan), np.nan)
+ assert_equal(d.clip(min=np.nan, max=10), np.nan)
def test_object_clip(self):
a = np.arange(10, dtype=object)
@@ -2362,16 +2339,12 @@ class TestClip:
actual = np.clip(arr, amin, amax)
assert_equal(actual, exp)
- @pytest.mark.xfail(reason="no scalar nan propagation yet",
- raises=AssertionError,
- strict=True)
@pytest.mark.parametrize("arr, amin, amax", [
# problematic scalar nan case from hypothesis
(np.zeros(10, dtype=np.int64),
np.array(np.nan),
np.zeros(10, dtype=np.int32)),
])
- @pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_clip_scalar_nan_propagation(self, arr, amin, amax):
# enforcement of scalar nan propagation for comparisons
# called through clip()
diff --git a/numpy/core/tests/test_numerictypes.py b/numpy/core/tests/test_numerictypes.py
index 072cd65fe..bab5bf246 100644
--- a/numpy/core/tests/test_numerictypes.py
+++ b/numpy/core/tests/test_numerictypes.py
@@ -339,23 +339,28 @@ class TestEmptyField:
class TestCommonType:
def test_scalar_loses1(self):
- res = np.find_common_type(['f4', 'f4', 'i2'], ['f8'])
+ with pytest.warns(DeprecationWarning, match="np.find_common_type"):
+ res = np.find_common_type(['f4', 'f4', 'i2'], ['f8'])
assert_(res == 'f4')
def test_scalar_loses2(self):
- res = np.find_common_type(['f4', 'f4'], ['i8'])
+ with pytest.warns(DeprecationWarning, match="np.find_common_type"):
+ res = np.find_common_type(['f4', 'f4'], ['i8'])
assert_(res == 'f4')
def test_scalar_wins(self):
- res = np.find_common_type(['f4', 'f4', 'i2'], ['c8'])
+ with pytest.warns(DeprecationWarning, match="np.find_common_type"):
+ res = np.find_common_type(['f4', 'f4', 'i2'], ['c8'])
assert_(res == 'c8')
def test_scalar_wins2(self):
- res = np.find_common_type(['u4', 'i4', 'i4'], ['f4'])
+ with pytest.warns(DeprecationWarning, match="np.find_common_type"):
+ res = np.find_common_type(['u4', 'i4', 'i4'], ['f4'])
assert_(res == 'f8')
def test_scalar_wins3(self): # doesn't go up to 'f16' on purpose
- res = np.find_common_type(['u8', 'i8', 'i8'], ['f8'])
+ with pytest.warns(DeprecationWarning, match="np.find_common_type"):
+ res = np.find_common_type(['u8', 'i8', 'i8'], ['f8'])
assert_(res == 'f8')
class TestMultipleFields:
@@ -473,7 +478,8 @@ class TestMaximumSctype:
def test_complex(self, t):
assert_equal(np.maximum_sctype(t), np.sctypes['complex'][-1])
- @pytest.mark.parametrize('t', [np.bool_, np.object_, np.unicode_, np.bytes_, np.void])
+ @pytest.mark.parametrize('t', [np.bool_, np.object_, np.str_, np.bytes_,
+ np.void])
def test_other(self, t):
assert_equal(np.maximum_sctype(t), t)
@@ -485,7 +491,7 @@ class Test_sctype2char:
def test_scalar_type(self):
assert_equal(np.sctype2char(np.double), 'd')
assert_equal(np.sctype2char(np.int_), 'l')
- assert_equal(np.sctype2char(np.unicode_), 'U')
+ assert_equal(np.sctype2char(np.str_), 'U')
assert_equal(np.sctype2char(np.bytes_), 'S')
def test_other_type(self):
diff --git a/numpy/core/tests/test_overrides.py b/numpy/core/tests/test_overrides.py
index 63432ffa7..5924358ea 100644
--- a/numpy/core/tests/test_overrides.py
+++ b/numpy/core/tests/test_overrides.py
@@ -10,16 +10,11 @@ from numpy.testing import (
assert_, assert_equal, assert_raises, assert_raises_regex)
from numpy.core.overrides import (
_get_implementing_args, array_function_dispatch,
- verify_matching_signatures, ARRAY_FUNCTION_ENABLED)
+ verify_matching_signatures)
from numpy.compat import pickle
import pytest
-requires_array_function = pytest.mark.skipif(
- not ARRAY_FUNCTION_ENABLED,
- reason="__array_function__ dispatch not enabled.")
-
-
def _return_not_implemented(self, *args, **kwargs):
return NotImplemented
@@ -150,7 +145,6 @@ class TestGetImplementingArgs:
class TestNDArrayArrayFunction:
- @requires_array_function
def test_method(self):
class Other:
@@ -209,7 +203,6 @@ class TestNDArrayArrayFunction:
args=(array,), kwargs={})
-@requires_array_function
class TestArrayFunctionDispatch:
def test_pickle(self):
@@ -248,8 +241,20 @@ class TestArrayFunctionDispatch:
with assert_raises_regex(TypeError, 'no implementation found'):
dispatched_one_arg(array)
+ def test_where_dispatch(self):
+
+ class DuckArray:
+ def __array_function__(self, ufunc, method, *inputs, **kwargs):
+ return "overridden"
+
+ array = np.array(1)
+ duck_array = DuckArray()
+
+ result = np.std(array, where=duck_array)
+
+ assert_equal(result, "overridden")
+
-@requires_array_function
class TestVerifyMatchingSignatures:
def test_verify_matching_signatures(self):
@@ -302,7 +307,6 @@ def _new_duck_type_and_implements():
return (MyArray, implements)
-@requires_array_function
class TestArrayFunctionImplementation:
def test_one_arg(self):
@@ -355,6 +359,17 @@ class TestArrayFunctionImplementation:
TypeError, "no implementation found for 'my.func'"):
func(MyArray())
+ @pytest.mark.parametrize("name", ["concatenate", "mean", "asarray"])
+ def test_signature_error_message_simple(self, name):
+ func = getattr(np, name)
+ try:
+ # all of these functions need an argument:
+ func()
+ except TypeError as e:
+ exc = e
+
+ assert exc.args[0].startswith(f"{name}()")
+
def test_signature_error_message(self):
# The lambda function will be named "<lambda>", but the TypeError
# should show the name as "func"
@@ -366,7 +381,7 @@ class TestArrayFunctionImplementation:
pass
try:
- func(bad_arg=3)
+ func._implementation(bad_arg=3)
except TypeError as e:
expected_exception = e
@@ -374,6 +389,12 @@ class TestArrayFunctionImplementation:
func(bad_arg=3)
raise AssertionError("must fail")
except TypeError as exc:
+ if exc.args[0].startswith("_dispatcher"):
+ # We replace the qualname currently, but it used `__name__`
+ # (relevant functions have the same name and qualname anyway)
+ pytest.skip("Python version is not using __qualname__ for "
+ "TypeError formatting.")
+
assert exc.args == expected_exception.args
@pytest.mark.parametrize("value", [234, "this func is not replaced"])
@@ -394,6 +415,56 @@ class TestArrayFunctionImplementation:
except TypeError as exc:
assert exc is error # unmodified exception
+ def test_properties(self):
+ # Check that str and repr are sensible
+ func = dispatched_two_arg
+ assert str(func) == str(func._implementation)
+ repr_no_id = repr(func).split("at ")[0]
+ repr_no_id_impl = repr(func._implementation).split("at ")[0]
+ assert repr_no_id == repr_no_id_impl
+
+ @pytest.mark.parametrize("func", [
+ lambda x, y: 0, # no like argument
+ lambda like=None: 0, # not keyword only
+ lambda *, like=None, a=3: 0, # not last (not that it matters)
+ ])
+ def test_bad_like_sig(self, func):
+ # We sanity check the signature, and these should fail.
+ with pytest.raises(RuntimeError):
+ array_function_dispatch()(func)
+
+ def test_bad_like_passing(self):
+ # Cover internal sanity check for passing like as first positional arg
+ def func(*, like=None):
+ pass
+
+ func_with_like = array_function_dispatch()(func)
+ with pytest.raises(TypeError):
+ func_with_like()
+ with pytest.raises(TypeError):
+ func_with_like(like=234)
+
+ def test_too_many_args(self):
+ # Mainly a unit-test to increase coverage
+ objs = []
+ for i in range(40):
+ class MyArr:
+ def __array_function__(self, *args, **kwargs):
+ return NotImplemented
+
+ objs.append(MyArr())
+
+ def _dispatch(*args):
+ return args
+
+ @array_function_dispatch(_dispatch)
+ def func(*args):
+ pass
+
+ with pytest.raises(TypeError, match="maximum number"):
+ func(*objs)
+
+
class TestNDArrayMethods:
@@ -422,7 +493,6 @@ class TestNumPyFunctions:
signature = inspect.signature(np.sum)
assert_('axis' in signature.parameters)
- @requires_array_function
def test_override_sum(self):
MyArray, implements = _new_duck_type_and_implements()
@@ -432,7 +502,6 @@ class TestNumPyFunctions:
assert_equal(np.sum(MyArray()), 'yes')
- @requires_array_function
def test_sum_on_mock_array(self):
# We need a proxy for mocks because __array_function__ is only looked
@@ -453,7 +522,6 @@ class TestNumPyFunctions:
np.sum, (ArrayProxy,), (proxy,), {})
proxy.value.__array__.assert_not_called()
- @requires_array_function
def test_sum_forwarding_implementation(self):
class MyArray(np.ndarray):
@@ -505,7 +573,6 @@ class TestArrayLike:
def func_args(*args, **kwargs):
return args, kwargs
- @requires_array_function
def test_array_like_not_implemented(self):
self.add_method('array', self.MyArray)
@@ -538,7 +605,6 @@ class TestArrayLike:
@pytest.mark.parametrize('function, args, kwargs', _array_tests)
@pytest.mark.parametrize('numpy_ref', [True, False])
- @requires_array_function
def test_array_like(self, function, args, kwargs, numpy_ref):
self.add_method('array', self.MyArray)
self.add_method(function, self.MyArray)
@@ -571,7 +637,6 @@ class TestArrayLike:
@pytest.mark.parametrize('function, args, kwargs', _array_tests)
@pytest.mark.parametrize('ref', [1, [1], "MyNoArrayFunctionArray"])
- @requires_array_function
def test_no_array_function_like(self, function, args, kwargs, ref):
self.add_method('array', self.MyNoArrayFunctionArray)
self.add_method(function, self.MyNoArrayFunctionArray)
@@ -613,7 +678,6 @@ class TestArrayLike:
assert type(array_like) is self.MyArray
assert array_like.function is self.MyArray.fromfile
- @requires_array_function
def test_exception_handling(self):
self.add_method('array', self.MyArray, enable_value_error=True)
@@ -640,3 +704,56 @@ class TestArrayLike:
array_like.fill(1)
expected.fill(1)
assert_equal(array_like, expected)
+
+
+def test_function_like():
+ # We provide a `__get__` implementation, make sure it works
+ assert type(np.mean) is np.core._multiarray_umath._ArrayFunctionDispatcher
+
+ class MyClass:
+ def __array__(self):
+ # valid argument to mean:
+ return np.arange(3)
+
+ func1 = staticmethod(np.mean)
+ func2 = np.mean
+ func3 = classmethod(np.mean)
+
+ m = MyClass()
+ assert m.func1([10]) == 10
+ assert m.func2() == 1 # mean of the arange
+ with pytest.raises(TypeError, match="unsupported operand type"):
+ # Tries to operate on the class
+ m.func3()
+
+ # Manual binding also works (the above may shortcut):
+ bound = np.mean.__get__(m, MyClass)
+ assert bound() == 1
+
+ bound = np.mean.__get__(None, MyClass) # unbound actually
+ assert bound([10]) == 10
+
+ bound = np.mean.__get__(MyClass) # classmethod
+ with pytest.raises(TypeError, match="unsupported operand type"):
+ bound()
+
+
+def test_scipy_trapz_support_shim():
+ # SciPy 1.10 and earlier "clone" trapz in this way, so we have a
+ # support shim in place: https://github.com/scipy/scipy/issues/17811
+ # That should be removed eventually. This test copies what SciPy does.
+ # Hopefully removable 1 year after SciPy 1.11; shim added to NumPy 1.25.
+ import types
+ import functools
+
+ def _copy_func(f):
+ # Based on http://stackoverflow.com/a/6528148/190597 (Glenn Maynard)
+ g = types.FunctionType(f.__code__, f.__globals__, name=f.__name__,
+ argdefs=f.__defaults__, closure=f.__closure__)
+ g = functools.update_wrapper(g, f)
+ g.__kwdefaults__ = f.__kwdefaults__
+ return g
+
+ trapezoid = _copy_func(np.trapz)
+
+ assert np.trapz([1, 2]) == trapezoid([1, 2])
diff --git a/numpy/core/tests/test_print.py b/numpy/core/tests/test_print.py
index 89a8b48bf..162686ee0 100644
--- a/numpy/core/tests/test_print.py
+++ b/numpy/core/tests/test_print.py
@@ -3,7 +3,7 @@ import sys
import pytest
import numpy as np
-from numpy.testing import assert_, assert_equal
+from numpy.testing import assert_, assert_equal, IS_MUSL
from numpy.core.tests._locales import CommaDecimalPointLocale
@@ -196,5 +196,7 @@ class TestCommaDecimalPointLocale(CommaDecimalPointLocale):
def test_locale_double(self):
assert_equal(str(np.double(1.2)), str(float(1.2)))
+ @pytest.mark.skipif(IS_MUSL,
+ reason="test flaky on musllinux")
def test_locale_longdouble(self):
assert_equal(str(np.longdouble('1.2')), str(float(1.2)))
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index f638284de..841144790 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -292,7 +292,7 @@ class TestRegression:
def test_unicode_string_comparison(self):
# Ticket #190
- a = np.array('hello', np.unicode_)
+ a = np.array('hello', np.str_)
b = np.array('world')
a == b
@@ -455,7 +455,7 @@ class TestRegression:
test_data = [
# (original, py2_pickle)
- (np.unicode_('\u6f2c'),
+ (np.str_('\u6f2c'),
b"cnumpy.core.multiarray\nscalar\np0\n(cnumpy\ndtype\np1\n"
b"(S'U1'\np2\nI0\nI1\ntp3\nRp4\n(I3\nS'<'\np5\nNNNI4\nI4\n"
b"I0\ntp6\nbS',o\\x00\\x00'\np7\ntp8\nRp9\n."),
@@ -516,22 +516,15 @@ class TestRegression:
def test_method_args(self):
# Make sure methods and functions have same default axis
# keyword and arguments
- funcs1 = ['argmax', 'argmin', 'sum', ('product', 'prod'),
- ('sometrue', 'any'),
- ('alltrue', 'all'), 'cumsum', ('cumproduct', 'cumprod'),
- 'ptp', 'cumprod', 'prod', 'std', 'var', 'mean',
- 'round', 'min', 'max', 'argsort', 'sort']
+ funcs1 = ['argmax', 'argmin', 'sum', 'any', 'all', 'cumsum',
+ 'ptp', 'cumprod', 'prod', 'std', 'var', 'mean',
+ 'round', 'min', 'max', 'argsort', 'sort']
funcs2 = ['compress', 'take', 'repeat']
for func in funcs1:
arr = np.random.rand(8, 7)
arr2 = arr.copy()
- if isinstance(func, tuple):
- func_meth = func[1]
- func = func[0]
- else:
- func_meth = func
- res1 = getattr(arr, func_meth)()
+ res1 = getattr(arr, func)()
res2 = getattr(np, func)(arr2)
if res1 is None:
res1 = arr
@@ -1336,8 +1329,8 @@ class TestRegression:
# Ticket #1058
a = np.fromiter(list(range(10)), dtype='b')
b = np.fromiter(list(range(10)), dtype='B')
- assert_(np.alltrue(a == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
- assert_(np.alltrue(b == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
+ assert_(np.all(a == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
+ assert_(np.all(b == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
def test_array_from_sequence_scalar_array(self):
# Ticket #1078: segfaults when creating an array with a sequence of
@@ -1515,8 +1508,8 @@ class TestRegression:
def test_fromiter_comparison(self):
a = np.fromiter(list(range(10)), dtype='b')
b = np.fromiter(list(range(10)), dtype='B')
- assert_(np.alltrue(a == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
- assert_(np.alltrue(b == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
+ assert_(np.all(a == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
+ assert_(np.all(b == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
def test_fromstring_crash(self):
# Ticket #1345: the following should not cause a crash
@@ -1535,9 +1528,12 @@ class TestRegression:
for y in dtypes:
c = a.astype(y)
try:
- np.dot(b, c)
+ d = np.dot(b, c)
except TypeError:
failures.append((x, y))
+ else:
+ if d != 0:
+ failures.append((x, y))
if failures:
raise AssertionError("Failures: %r" % failures)
@@ -1671,7 +1667,9 @@ class TestRegression:
def test_find_common_type_boolean(self):
# Ticket #1695
- assert_(np.find_common_type([], ['?', '?']) == '?')
+ with pytest.warns(DeprecationWarning, match="np.find_common_type"):
+ res = np.find_common_type([], ['?', '?'])
+ assert res == '?'
def test_empty_mul(self):
a = np.array([1.])
@@ -1685,7 +1683,7 @@ class TestRegression:
# number 2, and the exception hung around until something checked
# PyErr_Occurred() and returned an error.
assert_equal(np.dtype('S10').itemsize, 10)
- np.array([['abc', 2], ['long ', '0123456789']], dtype=np.string_)
+ np.array([['abc', 2], ['long ', '0123456789']], dtype=np.bytes_)
assert_equal(np.dtype('S10').itemsize, 10)
def test_any_float(self):
@@ -1954,7 +1952,7 @@ class TestRegression:
# Python2 output for pickle.dumps(...)
datas = [
# (original, python2_pickle, koi8r_validity)
- (np.unicode_('\u6bd2'),
+ (np.str_('\u6bd2'),
(b"cnumpy.core.multiarray\nscalar\np0\n(cnumpy\ndtype\np1\n"
b"(S'U1'\np2\nI0\nI1\ntp3\nRp4\n(I3\nS'<'\np5\nNNNI4\nI4\nI0\n"
b"tp6\nbS'\\xd2k\\x00\\x00'\np7\ntp8\nRp9\n."),
@@ -2078,7 +2076,7 @@ class TestRegression:
# Ticket #1578, the mismatch only showed up when running
# python-debug for python versions >= 2.7, and then as
# a core dump and error message.
- a = np.array(['abc'], dtype=np.unicode_)[0]
+ a = np.array(['abc'], dtype=np.str_)[0]
del a
def test_refcount_error_in_clip(self):
@@ -2225,7 +2223,7 @@ class TestRegression:
def test_pickle_empty_string(self):
# gh-3926
for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
- test_string = np.string_('')
+ test_string = np.bytes_('')
assert_equal(pickle.loads(
pickle.dumps(test_string, protocol=proto)), test_string)
@@ -2346,7 +2344,7 @@ class TestRegression:
values = {
np.void: b"a",
np.bytes_: b"a",
- np.unicode_: "a",
+ np.str_: "a",
np.datetime64: "2017-08-25",
}
for sctype in scalar_types:
@@ -2553,3 +2551,14 @@ class TestRegression:
f"Unexpected types order of ufunc in {operation}"
f"for {order}. Possible fix: Use signed before unsigned"
"in generate_umath.py")
+
+ def test_nonbool_logical(self):
+ # gh-22845
+ # create two arrays with bit patterns that do not overlap.
+ # needs to be large enough to test both SIMD and scalar paths
+ size = 100
+ a = np.frombuffer(b'\x01' * size, dtype=np.bool_)
+ b = np.frombuffer(b'\x80' * size, dtype=np.bool_)
+ expected = np.ones(size, dtype=np.bool_)
+ assert_array_equal(np.logical_and(a, b), expected)
+
diff --git a/numpy/core/tests/test_scalar_methods.py b/numpy/core/tests/test_scalar_methods.py
index a53e47b19..18a7bc828 100644
--- a/numpy/core/tests/test_scalar_methods.py
+++ b/numpy/core/tests/test_scalar_methods.py
@@ -1,7 +1,6 @@
"""
Test the scalar constructors, which also do type-coercion
"""
-import sys
import fractions
import platform
import types
@@ -10,7 +9,7 @@ from typing import Any, Type
import pytest
import numpy as np
-from numpy.testing import assert_equal, assert_raises
+from numpy.testing import assert_equal, assert_raises, IS_MUSL
class TestAsIntegerRatio:
@@ -99,6 +98,8 @@ class TestAsIntegerRatio:
try:
nf = np.longdouble(n)
df = np.longdouble(d)
+ if not np.isfinite(df):
+ raise OverflowError
except (OverflowError, RuntimeWarning):
# the values may not fit in any float type
pytest.skip("longdouble too small on this platform")
@@ -132,7 +133,6 @@ class TestIsInteger:
assert not value.is_integer()
-@pytest.mark.skipif(sys.version_info < (3, 9), reason="Requires python 3.9")
class TestClassGetItem:
@pytest.mark.parametrize("cls", [
np.number,
@@ -186,14 +186,6 @@ class TestClassGetItem:
assert np.number[Any]
-@pytest.mark.skipif(sys.version_info >= (3, 9), reason="Requires python 3.8")
-@pytest.mark.parametrize("cls", [np.number, np.complexfloating, np.int64])
-def test_class_getitem_38(cls: Type[np.number]) -> None:
- match = "Type subscription requires python >= 3.9"
- with pytest.raises(TypeError, match=match):
- cls[Any]
-
-
class TestBitCount:
# derived in part from the cpython test "test_bit_count"
diff --git a/numpy/core/tests/test_scalarbuffer.py b/numpy/core/tests/test_scalarbuffer.py
index 0e6ab1015..31b0494cf 100644
--- a/numpy/core/tests/test_scalarbuffer.py
+++ b/numpy/core/tests/test_scalarbuffer.py
@@ -68,11 +68,11 @@ class TestScalarPEP3118:
get_buffer_info(x, ["WRITABLE"])
def test_void_scalar_structured_data(self):
- dt = np.dtype([('name', np.unicode_, 16), ('grades', np.float64, (2,))])
+ dt = np.dtype([('name', np.str_, 16), ('grades', np.float64, (2,))])
x = np.array(('ndarray_scalar', (1.2, 3.0)), dtype=dt)[()]
assert_(isinstance(x, np.void))
mv_x = memoryview(x)
- expected_size = 16 * np.dtype((np.unicode_, 1)).itemsize
+ expected_size = 16 * np.dtype((np.str_, 1)).itemsize
expected_size += 2 * np.dtype(np.float64).itemsize
assert_equal(mv_x.itemsize, expected_size)
assert_equal(mv_x.ndim, 0)
diff --git a/numpy/core/tests/test_scalarinherit.py b/numpy/core/tests/test_scalarinherit.py
index f697c3c81..f9c574d57 100644
--- a/numpy/core/tests/test_scalarinherit.py
+++ b/numpy/core/tests/test_scalarinherit.py
@@ -58,8 +58,8 @@ class TestInherit:
class TestCharacter:
def test_char_radd(self):
# GH issue 9620, reached gentype_add and raise TypeError
- np_s = np.string_('abc')
- np_u = np.unicode_('abc')
+ np_s = np.bytes_('abc')
+ np_u = np.str_('abc')
s = b'def'
u = 'def'
assert_(np_s.__radd__(np_s) is NotImplemented)
@@ -90,8 +90,8 @@ class TestCharacter:
assert ret == b"defabc"
def test_char_repeat(self):
- np_s = np.string_('abc')
- np_u = np.unicode_('abc')
+ np_s = np.bytes_('abc')
+ np_u = np.str_('abc')
res_s = b'abc' * 5
res_u = 'abc' * 5
assert_(np_s * 5 == res_s)
diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py
index 8de821340..c737099c1 100644
--- a/numpy/core/tests/test_scalarmath.py
+++ b/numpy/core/tests/test_scalarmath.py
@@ -14,7 +14,7 @@ import numpy as np
from numpy.testing import (
assert_, assert_equal, assert_raises, assert_almost_equal,
assert_array_equal, IS_PYPY, suppress_warnings, _gen_alignment_data,
- assert_warns,
+ assert_warns, _SUPPORTS_SVE,
)
types = [np.bool_, np.byte, np.ubyte, np.short, np.ushort, np.intc, np.uintc,
@@ -75,17 +75,7 @@ class TestTypes:
np.add(1, 1)
-@pytest.mark.slow
-@settings(max_examples=10000, deadline=2000)
-@given(sampled_from(reasonable_operators_for_scalars),
- hynp.arrays(dtype=hynp.scalar_dtypes(), shape=()),
- hynp.arrays(dtype=hynp.scalar_dtypes(), shape=()))
-def test_array_scalar_ufunc_equivalence(op, arr1, arr2):
- """
- This is a thorough test attempting to cover important promotion paths
- and ensuring that arrays and scalars stay as aligned as possible.
- However, if it creates troubles, it should maybe just be removed.
- """
+def check_ufunc_scalar_equivalence(op, arr1, arr2):
scalar1 = arr1[()]
scalar2 = arr2[()]
assert isinstance(scalar1, np.generic)
@@ -95,6 +85,11 @@ def test_array_scalar_ufunc_equivalence(op, arr1, arr2):
comp_ops = {operator.ge, operator.gt, operator.le, operator.lt}
if op in comp_ops and (np.isnan(scalar1) or np.isnan(scalar2)):
pytest.xfail("complex comp ufuncs use sort-order, scalars do not.")
+ if op == operator.pow and arr2.item() in [-1, 0, 0.5, 1, 2]:
+ # array**scalar special case can have different result dtype
+ # (Other powers may have issues also, but are not hit here.)
+ # TODO: It would be nice to resolve this issue.
+ pytest.skip("array**2 can have incorrect/weird result dtype")
# ignore fpe's since they may just mismatch for integers anyway.
with warnings.catch_warnings(), np.errstate(all="ignore"):
@@ -107,10 +102,51 @@ def test_array_scalar_ufunc_equivalence(op, arr1, arr2):
op(scalar1, scalar2)
else:
scalar_res = op(scalar1, scalar2)
- assert_array_equal(scalar_res, res)
+ assert_array_equal(scalar_res, res, strict=True)
+
+
+@pytest.mark.slow
+@settings(max_examples=10000, deadline=2000)
+@given(sampled_from(reasonable_operators_for_scalars),
+ hynp.arrays(dtype=hynp.scalar_dtypes(), shape=()),
+ hynp.arrays(dtype=hynp.scalar_dtypes(), shape=()))
+def test_array_scalar_ufunc_equivalence(op, arr1, arr2):
+ """
+ This is a thorough test attempting to cover important promotion paths
+ and ensuring that arrays and scalars stay as aligned as possible.
+ However, if it creates troubles, it should maybe just be removed.
+ """
+ check_ufunc_scalar_equivalence(op, arr1, arr2)
+
+
+@pytest.mark.slow
+@given(sampled_from(reasonable_operators_for_scalars),
+ hynp.scalar_dtypes(), hynp.scalar_dtypes())
+def test_array_scalar_ufunc_dtypes(op, dt1, dt2):
+ # Same as above, but don't worry about sampling weird values so that we
+ # do not have to sample as much
+ arr1 = np.array(2, dtype=dt1)
+ arr2 = np.array(3, dtype=dt2) # some powers do weird things.
+
+ check_ufunc_scalar_equivalence(op, arr1, arr2)
+
+
+@pytest.mark.parametrize("fscalar", [np.float16, np.float32])
+def test_int_float_promotion_truediv(fscalar):
+ # Promotion for mixed int and float32/float16 must not go to float64
+ i = np.int8(1)
+ f = fscalar(1)
+ expected = np.result_type(i, f)
+ assert (i / f).dtype == expected
+ assert (f / i).dtype == expected
+ # But normal int / int true division goes to float64:
+ assert (i / i).dtype == np.dtype("float64")
+ # For int16, result has to be at least float32 (takes ufunc path):
+ assert (np.int16(1) / f).dtype == np.dtype("float32")
class TestBaseMath:
+ @pytest.mark.xfail(_SUPPORTS_SVE, reason="gh-22982")
def test_blocked(self):
# test alignments offsets for simd instructions
# alignments for vz + 2 * (vs - 1) + 1
@@ -860,27 +896,29 @@ def test_operator_scalars(op, type1, type2):
@pytest.mark.parametrize("op", reasonable_operators_for_scalars)
-def test_longdouble_inf_loop(op):
+@pytest.mark.parametrize("val", [None, 2**64])
+def test_longdouble_inf_loop(op, val):
+ # Note: The 2**64 value will pass once NEP 50 is adopted.
try:
- op(np.longdouble(3), None)
+ op(np.longdouble(3), val)
except TypeError:
pass
try:
- op(None, np.longdouble(3))
+ op(val, np.longdouble(3))
except TypeError:
pass
@pytest.mark.parametrize("op", reasonable_operators_for_scalars)
-def test_clongdouble_inf_loop(op):
- if op in {operator.mod} and False:
- pytest.xfail("The modulo operator is known to be broken")
+@pytest.mark.parametrize("val", [None, 2**64])
+def test_clongdouble_inf_loop(op, val):
+ # Note: The 2**64 value will pass once NEP 50 is adopted.
try:
- op(np.clongdouble(3), None)
+ op(np.clongdouble(3), val)
except TypeError:
pass
try:
- op(None, np.longdouble(3))
+ op(val, np.longdouble(3))
except TypeError:
pass
diff --git a/numpy/core/tests/test_scalarprint.py b/numpy/core/tests/test_scalarprint.py
index 4deb5a0a4..98d1f4aa1 100644
--- a/numpy/core/tests/test_scalarprint.py
+++ b/numpy/core/tests/test_scalarprint.py
@@ -8,7 +8,7 @@ import sys
from tempfile import TemporaryFile
import numpy as np
-from numpy.testing import assert_, assert_equal, assert_raises
+from numpy.testing import assert_, assert_equal, assert_raises, IS_MUSL
class TestRealScalars:
def test_str(self):
@@ -260,10 +260,10 @@ class TestRealScalars:
assert_equal(fpos64('324', unique=False, precision=5,
fractional=False), "324.00")
-
def test_dragon4_interface(self):
tps = [np.float16, np.float32, np.float64]
- if hasattr(np, 'float128'):
+ # test is flaky for musllinux on np.float128
+ if hasattr(np, 'float128') and not IS_MUSL:
tps.append(np.float128)
fpos = np.format_float_positional
diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py
index 570d006f5..0428b95a9 100644
--- a/numpy/core/tests/test_shape_base.py
+++ b/numpy/core/tests/test_shape_base.py
@@ -152,9 +152,9 @@ class TestHstack:
assert_array_equal(res, desired)
def test_generator(self):
- with assert_warns(FutureWarning):
+ with pytest.raises(TypeError, match="arrays to stack must be"):
hstack((np.arange(3) for _ in range(2)))
- with assert_warns(FutureWarning):
+ with pytest.raises(TypeError, match="arrays to stack must be"):
hstack(map(lambda x: x, np.ones((3, 2))))
def test_casting_and_dtype(self):
@@ -207,7 +207,7 @@ class TestVstack:
assert_array_equal(res, desired)
def test_generator(self):
- with assert_warns(FutureWarning):
+ with pytest.raises(TypeError, match="arrays to stack must be"):
vstack((np.arange(3) for _ in range(2)))
def test_casting_and_dtype(self):
@@ -472,10 +472,11 @@ def test_stack():
stack, [np.zeros((3, 3)), np.zeros(3)], axis=1)
assert_raises_regex(ValueError, 'must have the same shape',
stack, [np.arange(2), np.arange(3)])
- # generator is deprecated
- with assert_warns(FutureWarning):
- result = stack((x for x in range(3)))
- assert_array_equal(result, np.array([0, 1, 2]))
+
+ # do not accept generators
+ with pytest.raises(TypeError, match="arrays to stack must be"):
+ stack((x for x in range(3)))
+
#casting and dtype test
a = np.array([1, 2, 3])
b = np.array([2.5, 3.5, 4.5])
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 264300621..92b567446 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -2,9 +2,24 @@
# may be involved in their functionality.
import pytest, math, re
import itertools
-from numpy.core._simd import targets
+import operator
+from numpy.core._simd import targets, clear_floatstatus, get_floatstatus
from numpy.core._multiarray_umath import __cpu_baseline__
+def check_floatstatus(divbyzero=False, overflow=False,
+ underflow=False, invalid=False,
+ all=False):
+ #define NPY_FPE_DIVIDEBYZERO 1
+ #define NPY_FPE_OVERFLOW 2
+ #define NPY_FPE_UNDERFLOW 4
+ #define NPY_FPE_INVALID 8
+ err = get_floatstatus()
+ ret = (all or divbyzero) and (err & 1) != 0
+ ret |= (all or overflow) and (err & 2) != 0
+ ret |= (all or underflow) and (err & 4) != 0
+ ret |= (all or invalid) and (err & 8) != 0
+ return ret
+
class _Test_Utility:
# submodule of the desired SIMD extension, e.g. targets["AVX512F"]
npyv = None
@@ -20,6 +35,9 @@ class _Test_Utility:
"""
return getattr(self.npyv, attr + "_" + self.sfx)
+ def _x2(self, intrin_name):
+ return getattr(self.npyv, f"{intrin_name}_{self.sfx}x2")
+
def _data(self, start=None, count=None, reverse=False):
"""
Create list of consecutive numbers according to number of vector's lanes.
@@ -365,6 +383,11 @@ class _SIMD_FP(_Test_Utility):
nfms = self.nmulsub(vdata_a, vdata_b, vdata_c)
data_nfms = self.mul(data_fma, self.setall(-1))
assert nfms == data_nfms
+ # multiply, add for odd elements and subtract even elements.
+ # (a * b) -+ c
+ fmas = list(self.muladdsub(vdata_a, vdata_b, vdata_c))
+ assert fmas[0::2] == list(data_fms)[0::2]
+ assert fmas[1::2] == list(data_fma)[1::2]
def test_abs(self):
pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
@@ -553,7 +576,16 @@ class _SIMD_FP(_Test_Utility):
nnan = self.notnan(self.setall(self._nan()))
assert nnan == [0]*self.nlanes
- import operator
+ @pytest.mark.parametrize("intrin_name", [
+ "rint", "trunc", "ceil", "floor"
+ ])
+ def test_unary_invalid_fpexception(self, intrin_name):
+ intrin = getattr(self, intrin_name)
+ for d in [float("nan"), float("inf"), -float("inf")]:
+ v = self.setall(d)
+ clear_floatstatus()
+ intrin(v)
+ assert check_floatstatus(invalid=True) == False
@pytest.mark.parametrize('py_comp,np_comp', [
(operator.lt, "cmplt"),
@@ -571,7 +603,8 @@ class _SIMD_FP(_Test_Utility):
return [lane == mask_true for lane in vector]
intrin = getattr(self, np_comp)
- cmp_cases = ((0, nan), (nan, 0), (nan, nan), (pinf, nan), (ninf, nan))
+ cmp_cases = ((0, nan), (nan, 0), (nan, nan), (pinf, nan),
+ (ninf, nan), (-0.0, +0.0))
for case_operand1, case_operand2 in cmp_cases:
data_a = [case_operand1]*self.nlanes
data_b = [case_operand2]*self.nlanes
@@ -653,25 +686,34 @@ class _SIMD_ALL(_Test_Utility):
assert store_h[:self.nlanes//2] == data[self.nlanes//2:]
assert store_h != vdata # detect overflow
- def test_memory_partial_load(self):
- if self.sfx in ("u8", "s8", "u16", "s16"):
+ @pytest.mark.parametrize("intrin, elsizes, scale, fill", [
+ ("self.load_tillz, self.load_till", (32, 64), 1, [0xffff]),
+ ("self.load2_tillz, self.load2_till", (32, 64), 2, [0xffff, 0x7fff]),
+ ])
+ def test_memory_partial_load(self, intrin, elsizes, scale, fill):
+ if self._scalar_size() not in elsizes:
return
-
+ npyv_load_tillz, npyv_load_till = eval(intrin)
data = self._data()
lanes = list(range(1, self.nlanes + 1))
lanes += [self.nlanes**2, self.nlanes**4] # test out of range
for n in lanes:
- load_till = self.load_till(data, n, 15)
- data_till = data[:n] + [15] * (self.nlanes-n)
+ load_till = npyv_load_till(data, n, *fill)
+ load_tillz = npyv_load_tillz(data, n)
+ n *= scale
+ data_till = data[:n] + fill * ((self.nlanes-n) // scale)
assert load_till == data_till
- load_tillz = self.load_tillz(data, n)
data_tillz = data[:n] + [0] * (self.nlanes-n)
assert load_tillz == data_tillz
- def test_memory_partial_store(self):
- if self.sfx in ("u8", "s8", "u16", "s16"):
+ @pytest.mark.parametrize("intrin, elsizes, scale", [
+ ("self.store_till", (32, 64), 1),
+ ("self.store2_till", (32, 64), 2),
+ ])
+ def test_memory_partial_store(self, intrin, elsizes, scale):
+ if self._scalar_size() not in elsizes:
return
-
+ npyv_store_till = eval(intrin)
data = self._data()
data_rev = self._data(reverse=True)
vdata = self.load(data)
@@ -679,105 +721,159 @@ class _SIMD_ALL(_Test_Utility):
lanes += [self.nlanes**2, self.nlanes**4]
for n in lanes:
data_till = data_rev.copy()
- data_till[:n] = data[:n]
+ data_till[:n*scale] = data[:n*scale]
store_till = self._data(reverse=True)
- self.store_till(store_till, n, vdata)
+ npyv_store_till(store_till, n, vdata)
assert store_till == data_till
- def test_memory_noncont_load(self):
- if self.sfx in ("u8", "s8", "u16", "s16"):
+ @pytest.mark.parametrize("intrin, elsizes, scale", [
+ ("self.loadn", (32, 64), 1),
+ ("self.loadn2", (32, 64), 2),
+ ])
+ def test_memory_noncont_load(self, intrin, elsizes, scale):
+ if self._scalar_size() not in elsizes:
return
-
- for stride in range(1, 64):
- data = self._data(count=stride*self.nlanes)
- data_stride = data[::stride]
- loadn = self.loadn(data, stride)
- assert loadn == data_stride
-
- for stride in range(-64, 0):
- data = self._data(stride, -stride*self.nlanes)
- data_stride = self.load(data[::stride]) # cast unsigned
- loadn = self.loadn(data, stride)
+ npyv_loadn = eval(intrin)
+ for stride in range(-64, 64):
+ if stride < 0:
+ data = self._data(stride, -stride*self.nlanes)
+ data_stride = list(itertools.chain(
+ *zip(*[data[-i::stride] for i in range(scale, 0, -1)])
+ ))
+ elif stride == 0:
+ data = self._data()
+ data_stride = data[0:scale] * (self.nlanes//scale)
+ else:
+ data = self._data(count=stride*self.nlanes)
+ data_stride = list(itertools.chain(
+ *zip(*[data[i::stride] for i in range(scale)]))
+ )
+ data_stride = self.load(data_stride) # cast unsigned
+ loadn = npyv_loadn(data, stride)
assert loadn == data_stride
- def test_memory_noncont_partial_load(self):
- if self.sfx in ("u8", "s8", "u16", "s16"):
+ @pytest.mark.parametrize("intrin, elsizes, scale, fill", [
+ ("self.loadn_tillz, self.loadn_till", (32, 64), 1, [0xffff]),
+ ("self.loadn2_tillz, self.loadn2_till", (32, 64), 2, [0xffff, 0x7fff]),
+ ])
+ def test_memory_noncont_partial_load(self, intrin, elsizes, scale, fill):
+ if self._scalar_size() not in elsizes:
return
-
+ npyv_loadn_tillz, npyv_loadn_till = eval(intrin)
lanes = list(range(1, self.nlanes + 1))
lanes += [self.nlanes**2, self.nlanes**4]
- for stride in range(1, 64):
- data = self._data(count=stride*self.nlanes)
- data_stride = data[::stride]
- for n in lanes:
- data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)
- loadn_till = self.loadn_till(data, stride, n, 15)
- assert loadn_till == data_stride_till
- data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)
- loadn_tillz = self.loadn_tillz(data, stride, n)
- assert loadn_tillz == data_stride_tillz
-
- for stride in range(-64, 0):
- data = self._data(stride, -stride*self.nlanes)
- data_stride = list(self.load(data[::stride])) # cast unsigned
+ for stride in range(-64, 64):
+ if stride < 0:
+ data = self._data(stride, -stride*self.nlanes)
+ data_stride = list(itertools.chain(
+ *zip(*[data[-i::stride] for i in range(scale, 0, -1)])
+ ))
+ elif stride == 0:
+ data = self._data()
+ data_stride = data[0:scale] * (self.nlanes//scale)
+ else:
+ data = self._data(count=stride*self.nlanes)
+ data_stride = list(itertools.chain(
+ *zip(*[data[i::stride] for i in range(scale)])
+ ))
+ data_stride = list(self.load(data_stride)) # cast unsigned
for n in lanes:
- data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)
- loadn_till = self.loadn_till(data, stride, n, 15)
+ nscale = n * scale
+ llanes = self.nlanes - nscale
+ data_stride_till = (
+ data_stride[:nscale] + fill * (llanes//scale)
+ )
+ loadn_till = npyv_loadn_till(data, stride, n, *fill)
assert loadn_till == data_stride_till
- data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)
- loadn_tillz = self.loadn_tillz(data, stride, n)
+ data_stride_tillz = data_stride[:nscale] + [0] * llanes
+ loadn_tillz = npyv_loadn_tillz(data, stride, n)
assert loadn_tillz == data_stride_tillz
- def test_memory_noncont_store(self):
- if self.sfx in ("u8", "s8", "u16", "s16"):
+ @pytest.mark.parametrize("intrin, elsizes, scale", [
+ ("self.storen", (32, 64), 1),
+ ("self.storen2", (32, 64), 2),
+ ])
+ def test_memory_noncont_store(self, intrin, elsizes, scale):
+ if self._scalar_size() not in elsizes:
return
-
- vdata = self.load(self._data())
+ npyv_storen = eval(intrin)
+ data = self._data()
+ vdata = self.load(data)
+ hlanes = self.nlanes // scale
for stride in range(1, 64):
- data = [15] * stride * self.nlanes
- data[::stride] = vdata
- storen = [15] * stride * self.nlanes
- storen += [127]*64
- self.storen(storen, stride, vdata)
- assert storen[:-64] == data
- assert storen[-64:] == [127]*64 # detect overflow
+ data_storen = [0xff] * stride * self.nlanes
+ for s in range(0, hlanes*stride, stride):
+ i = (s//stride)*scale
+ data_storen[s:s+scale] = data[i:i+scale]
+ storen = [0xff] * stride * self.nlanes
+ storen += [0x7f]*64
+ npyv_storen(storen, stride, vdata)
+ assert storen[:-64] == data_storen
+ assert storen[-64:] == [0x7f]*64 # detect overflow
for stride in range(-64, 0):
- data = [15] * -stride * self.nlanes
- data[::stride] = vdata
- storen = [127]*64
- storen += [15] * -stride * self.nlanes
- self.storen(storen, stride, vdata)
- assert storen[64:] == data
- assert storen[:64] == [127]*64 # detect overflow
-
- def test_memory_noncont_partial_store(self):
- if self.sfx in ("u8", "s8", "u16", "s16"):
+ data_storen = [0xff] * -stride * self.nlanes
+ for s in range(0, hlanes*stride, stride):
+ i = (s//stride)*scale
+ data_storen[s-scale:s or None] = data[i:i+scale]
+ storen = [0x7f]*64
+ storen += [0xff] * -stride * self.nlanes
+ npyv_storen(storen, stride, vdata)
+ assert storen[64:] == data_storen
+ assert storen[:64] == [0x7f]*64 # detect overflow
+ # stride 0
+ data_storen = [0x7f] * self.nlanes
+ storen = data_storen.copy()
+ data_storen[0:scale] = data[-scale:]
+ npyv_storen(storen, 0, vdata)
+ assert storen == data_storen
+
+ @pytest.mark.parametrize("intrin, elsizes, scale", [
+ ("self.storen_till", (32, 64), 1),
+ ("self.storen2_till", (32, 64), 2),
+ ])
+ def test_memory_noncont_partial_store(self, intrin, elsizes, scale):
+ if self._scalar_size() not in elsizes:
return
-
+ npyv_storen_till = eval(intrin)
data = self._data()
vdata = self.load(data)
lanes = list(range(1, self.nlanes + 1))
lanes += [self.nlanes**2, self.nlanes**4]
+ hlanes = self.nlanes // scale
for stride in range(1, 64):
for n in lanes:
- data_till = [15] * stride * self.nlanes
- data_till[::stride] = data[:n] + [15] * (self.nlanes-n)
- storen_till = [15] * stride * self.nlanes
- storen_till += [127]*64
- self.storen_till(storen_till, stride, n, vdata)
+ data_till = [0xff] * stride * self.nlanes
+ tdata = data[:n*scale] + [0xff] * (self.nlanes-n*scale)
+ for s in range(0, hlanes*stride, stride)[:n]:
+ i = (s//stride)*scale
+ data_till[s:s+scale] = tdata[i:i+scale]
+ storen_till = [0xff] * stride * self.nlanes
+ storen_till += [0x7f]*64
+ npyv_storen_till(storen_till, stride, n, vdata)
assert storen_till[:-64] == data_till
- assert storen_till[-64:] == [127]*64 # detect overflow
+ assert storen_till[-64:] == [0x7f]*64 # detect overflow
for stride in range(-64, 0):
for n in lanes:
- data_till = [15] * -stride * self.nlanes
- data_till[::stride] = data[:n] + [15] * (self.nlanes-n)
- storen_till = [127]*64
- storen_till += [15] * -stride * self.nlanes
- self.storen_till(storen_till, stride, n, vdata)
+ data_till = [0xff] * -stride * self.nlanes
+ tdata = data[:n*scale] + [0xff] * (self.nlanes-n*scale)
+ for s in range(0, hlanes*stride, stride)[:n]:
+ i = (s//stride)*scale
+ data_till[s-scale:s or None] = tdata[i:i+scale]
+ storen_till = [0x7f]*64
+ storen_till += [0xff] * -stride * self.nlanes
+ npyv_storen_till(storen_till, stride, n, vdata)
assert storen_till[64:] == data_till
- assert storen_till[:64] == [127]*64 # detect overflow
+ assert storen_till[:64] == [0x7f]*64 # detect overflow
+
+ # stride 0
+ for n in lanes:
+ data_till = [0x7f] * self.nlanes
+ storen_till = data_till.copy()
+ data_till[0:scale] = data[:n*scale][-scale:]
+ npyv_storen_till(storen_till, 0, n, vdata)
+ assert storen_till == data_till
@pytest.mark.parametrize("intrin, table_size, elsize", [
("self.lut32", 32, 32),
@@ -861,13 +957,27 @@ class _SIMD_ALL(_Test_Utility):
combineh = self.combineh(vdata_a, vdata_b)
assert combineh == data_a_hi + data_b_hi
# combine x2
- combine = self.combine(vdata_a, vdata_b)
+ combine = self.combine(vdata_a, vdata_b)
assert combine == (data_a_lo + data_b_lo, data_a_hi + data_b_hi)
+
# zip(interleave)
- data_zipl = [v for p in zip(data_a_lo, data_b_lo) for v in p]
- data_ziph = [v for p in zip(data_a_hi, data_b_hi) for v in p]
- vzip = self.zip(vdata_a, vdata_b)
+ data_zipl = self.load([
+ v for p in zip(data_a_lo, data_b_lo) for v in p
+ ])
+ data_ziph = self.load([
+ v for p in zip(data_a_hi, data_b_hi) for v in p
+ ])
+ vzip = self.zip(vdata_a, vdata_b)
assert vzip == (data_zipl, data_ziph)
+ vzip = [0]*self.nlanes*2
+ self._x2("store")(vzip, (vdata_a, vdata_b))
+ assert vzip == list(data_zipl) + list(data_ziph)
+
+ # unzip(deinterleave)
+ unzip = self.unzip(data_zipl, data_ziph)
+ assert unzip == (data_a, data_b)
+ unzip = self._x2("load")(list(data_zipl) + list(data_ziph))
+ assert unzip == (data_a, data_b)
def test_reorder_rev64(self):
# Reverse elements of each 64-bit lane
@@ -881,41 +991,51 @@ class _SIMD_ALL(_Test_Utility):
rev64 = self.rev64(self.load(range(self.nlanes)))
assert rev64 == data_rev64
- def test_operators_comparison(self):
+ def test_reorder_permi128(self):
+ """
+ Test permuting elements for each 128-bit lane.
+ npyv_permi128_##sfx
+ """
+ ssize = self._scalar_size()
+ if ssize < 32:
+ return
+ data = self.load(self._data())
+ permn = 128//ssize
+ permd = permn-1
+ nlane128 = self.nlanes//permn
+ shfl = [0, 1] if ssize == 64 else [0, 2, 4, 6]
+ for i in range(permn):
+ indices = [(i >> shf) & permd for shf in shfl]
+ vperm = self.permi128(data, *indices)
+ data_vperm = [
+ data[j + (e & -permn)]
+ for e, j in enumerate(indices*nlane128)
+ ]
+ assert vperm == data_vperm
+
+ @pytest.mark.parametrize('func, intrin', [
+ (operator.lt, "cmplt"),
+ (operator.le, "cmple"),
+ (operator.gt, "cmpgt"),
+ (operator.ge, "cmpge"),
+ (operator.eq, "cmpeq")
+ ])
+ def test_operators_comparison(self, func, intrin):
if self._is_fp():
data_a = self._data()
else:
data_a = self._data(self._int_max() - self.nlanes)
data_b = self._data(self._int_min(), reverse=True)
vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+ intrin = getattr(self, intrin)
mask_true = self._true_mask()
def to_bool(vector):
return [lane == mask_true for lane in vector]
- # equal
- data_eq = [a == b for a, b in zip(data_a, data_b)]
- cmpeq = to_bool(self.cmpeq(vdata_a, vdata_b))
- assert cmpeq == data_eq
- # not equal
- data_neq = [a != b for a, b in zip(data_a, data_b)]
- cmpneq = to_bool(self.cmpneq(vdata_a, vdata_b))
- assert cmpneq == data_neq
- # greater than
- data_gt = [a > b for a, b in zip(data_a, data_b)]
- cmpgt = to_bool(self.cmpgt(vdata_a, vdata_b))
- assert cmpgt == data_gt
- # greater than and equal
- data_ge = [a >= b for a, b in zip(data_a, data_b)]
- cmpge = to_bool(self.cmpge(vdata_a, vdata_b))
- assert cmpge == data_ge
- # less than
- data_lt = [a < b for a, b in zip(data_a, data_b)]
- cmplt = to_bool(self.cmplt(vdata_a, vdata_b))
- assert cmplt == data_lt
- # less than and equal
- data_le = [a <= b for a, b in zip(data_a, data_b)]
- cmple = to_bool(self.cmple(vdata_a, vdata_b))
- assert cmple == data_le
+
+ data_cmp = [func(a, b) for a, b in zip(data_a, data_b)]
+ cmp = to_bool(intrin(vdata_a, vdata_b))
+ assert cmp == data_cmp
def test_operators_logical(self):
if self._is_fp():
@@ -1155,6 +1275,18 @@ class _SIMD_ALL(_Test_Utility):
ifadd = self.ifadd(false_mask, vdata_a, vdata_b, vdata_b)
assert ifadd == vdata_b
+ if not self._is_fp():
+ return
+ data_div = self.div(vdata_b, vdata_a)
+ ifdiv = self.ifdiv(true_mask, vdata_b, vdata_a, vdata_b)
+ assert ifdiv == data_div
+ ifdivz = self.ifdivz(true_mask, vdata_b, vdata_a)
+ assert ifdivz == data_div
+ ifdiv = self.ifdiv(false_mask, vdata_a, vdata_b, vdata_b)
+ assert ifdiv == vdata_b
+ ifdivz = self.ifdivz(false_mask, vdata_a, vdata_b)
+ assert ifdivz == self.zero()
+
bool_sfx = ("b8", "b16", "b32", "b64")
int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64")
fp_sfx = ("f32", "f64")
diff --git a/numpy/core/tests/test_strings.py b/numpy/core/tests/test_strings.py
index 2b87ed654..42f775e85 100644
--- a/numpy/core/tests/test_strings.py
+++ b/numpy/core/tests/test_strings.py
@@ -83,3 +83,17 @@ def test_string_comparisons_empty(op, ufunc, sym, dtypes):
assert_array_equal(op(arr, arr2), expected)
assert_array_equal(ufunc(arr, arr2), expected)
assert_array_equal(np.compare_chararrays(arr, arr2, sym, False), expected)
+
+
+@pytest.mark.parametrize("str_dt", ["S", "U"])
+@pytest.mark.parametrize("float_dt", np.typecodes["AllFloat"])
+def test_float_to_string_cast(str_dt, float_dt):
+ float_dt = np.dtype(float_dt)
+ fi = np.finfo(float_dt)
+ arr = np.array([np.nan, np.inf, -np.inf, fi.max, fi.min], dtype=float_dt)
+ expected = ["nan", "inf", "-inf", repr(fi.max), repr(fi.min)]
+ if float_dt.kind == 'c':
+ expected = [f"({r}+0j)" for r in expected]
+
+ res = arr.astype(str_dt)
+ assert_array_equal(res, np.array(expected, dtype=str_dt))
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index 00aa48647..f716e2104 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -1650,6 +1650,19 @@ class TestUfunc:
a = a[1:, 1:, 1:]
self.check_identityless_reduction(a)
+ def test_reduce_identity_depends_on_loop(self):
+ """
+ The type of the result should always depend on the selected loop, not
+ necessarily the output (only relevant for object arrays).
+ """
+ # For an object loop, the default value 0 with type int is used:
+ assert type(np.add.reduce([], dtype=object)) is int
+ out = np.array(None, dtype=object)
+ # When the loop is float64 but `out` is object this does not happen,
+ # the result is float64 cast to object (which gives Python `float`).
+ np.add.reduce([], out=out, dtype=np.float64)
+ assert type(out[()]) is float
+
def test_initial_reduction(self):
# np.minimum.reduce is an identityless reduction
@@ -1670,6 +1683,9 @@ class TestUfunc:
# Check initial=None raises ValueError for both types of ufunc reductions
assert_raises(ValueError, np.minimum.reduce, [], initial=None)
assert_raises(ValueError, np.add.reduce, [], initial=None)
+ # Also in the somewhat special object case:
+ with pytest.raises(ValueError):
+ np.add.reduce([], initial=None, dtype=object)
# Check that np._NoValue gives default behavior.
assert_equal(np.add.reduce([], initial=np._NoValue), 0)
@@ -1679,6 +1695,23 @@ class TestUfunc:
res = np.add.reduce(a, initial=5)
assert_equal(res, 15)
+ def test_empty_reduction_and_idenity(self):
+ arr = np.zeros((0, 5))
+ # OK, since the reduction itself is *not* empty, the result is
+ assert np.true_divide.reduce(arr, axis=1).shape == (0,)
+ # Not OK, the reduction itself is empty and we have no identity
+ with pytest.raises(ValueError):
+ np.true_divide.reduce(arr, axis=0)
+
+ # Test that an empty reduction fails also if the result is empty
+ arr = np.zeros((0, 0, 5))
+ with pytest.raises(ValueError):
+ np.true_divide.reduce(arr, axis=1)
+
+ # Division reduction makes sense with `initial=1` (empty or not):
+ res = np.true_divide.reduce(arr, axis=1, initial=1)
+ assert_array_equal(res, np.ones((0, 5)))
+
@pytest.mark.parametrize('axis', (0, 1, None))
@pytest.mark.parametrize('where', (np.array([False, True, True]),
np.array([[True], [False], [True]]),
@@ -1923,17 +1956,133 @@ class TestUfunc:
assert_(MyThing.rmul_count == 1, MyThing.rmul_count)
assert_(MyThing.getitem_count <= 2, MyThing.getitem_count)
- def test_inplace_fancy_indexing(self):
+ @pytest.mark.parametrize("a", (
+ np.arange(10, dtype=int),
+ np.arange(10, dtype=_rational_tests.rational),
+ ))
+ def test_ufunc_at_basic(self, a):
- a = np.arange(10)
- np.add.at(a, [2, 5, 2], 1)
- assert_equal(a, [0, 1, 4, 3, 4, 6, 6, 7, 8, 9])
+ aa = a.copy()
+ np.add.at(aa, [2, 5, 2], 1)
+ assert_equal(aa, [0, 1, 4, 3, 4, 6, 6, 7, 8, 9])
- a = np.arange(10)
+ with pytest.raises(ValueError):
+ # missing second operand
+ np.add.at(aa, [2, 5, 3])
+
+ aa = a.copy()
+ np.negative.at(aa, [2, 5, 3])
+ assert_equal(aa, [0, 1, -2, -3, 4, -5, 6, 7, 8, 9])
+
+ aa = a.copy()
b = np.array([100, 100, 100])
- np.add.at(a, [2, 5, 2], b)
- assert_equal(a, [0, 1, 202, 3, 4, 105, 6, 7, 8, 9])
+ np.add.at(aa, [2, 5, 2], b)
+ assert_equal(aa, [0, 1, 202, 3, 4, 105, 6, 7, 8, 9])
+
+ with pytest.raises(ValueError):
+ # extraneous second operand
+ np.negative.at(a, [2, 5, 3], [1, 2, 3])
+ with pytest.raises(ValueError):
+ # second operand cannot be converted to an array
+ np.add.at(a, [2, 5, 3], [[1, 2], 1])
+
+ # ufuncs with indexed loops for performance in ufunc.at
+ indexed_ufuncs = [np.add, np.subtract, np.multiply, np.floor_divide,
+ np.maximum, np.minimum, np.fmax, np.fmin]
+
+ @pytest.mark.parametrize(
+ "typecode", np.typecodes['AllInteger'] + np.typecodes['Float'])
+ @pytest.mark.parametrize("ufunc", indexed_ufuncs)
+ def test_ufunc_at_inner_loops(self, typecode, ufunc):
+ if ufunc is np.divide and typecode in np.typecodes['AllInteger']:
+ # Avoid divide-by-zero and inf for integer divide
+ a = np.ones(100, dtype=typecode)
+ indx = np.random.randint(100, size=30, dtype=np.intp)
+ vals = np.arange(1, 31, dtype=typecode)
+ else:
+ a = np.ones(1000, dtype=typecode)
+ indx = np.random.randint(1000, size=3000, dtype=np.intp)
+ vals = np.arange(3000, dtype=typecode)
+ atag = a.copy()
+ # Do the calculation twice and compare the answers
+ with warnings.catch_warnings(record=True) as w_at:
+ warnings.simplefilter('always')
+ ufunc.at(a, indx, vals)
+ with warnings.catch_warnings(record=True) as w_loop:
+ warnings.simplefilter('always')
+ for i, v in zip(indx, vals):
+ # Make sure all the work happens inside the ufunc
+ # in order to duplicate error/warning handling
+ ufunc(atag[i], v, out=atag[i:i+1], casting="unsafe")
+ assert_equal(atag, a)
+ # If w_loop warned, make sure w_at warned as well
+ if len(w_loop) > 0:
+ #
+ assert len(w_at) > 0
+ assert w_at[0].category == w_loop[0].category
+ assert str(w_at[0].message)[:10] == str(w_loop[0].message)[:10]
+
+ @pytest.mark.parametrize("typecode", np.typecodes['Complex'])
+ @pytest.mark.parametrize("ufunc", [np.add, np.subtract, np.multiply])
+ def test_ufunc_at_inner_loops_complex(self, typecode, ufunc):
+ a = np.ones(10, dtype=typecode)
+ indx = np.concatenate([np.ones(6, dtype=np.intp),
+ np.full(18, 4, dtype=np.intp)])
+ value = a.dtype.type(1j)
+ ufunc.at(a, indx, value)
+ expected = np.ones_like(a)
+ if ufunc is np.multiply:
+ expected[1] = expected[4] = -1
+ else:
+ expected[1] += 6 * (value if ufunc is np.add else -value)
+ expected[4] += 18 * (value if ufunc is np.add else -value)
+
+ assert_array_equal(a, expected)
+
+ def test_ufunc_at_ellipsis(self):
+ # Make sure the indexed loop check does not choke on iters
+ # with subspaces
+ arr = np.zeros(5)
+ np.add.at(arr, slice(None), np.ones(5))
+ assert_array_equal(arr, np.ones(5))
+
+ def test_ufunc_at_negative(self):
+ arr = np.ones(5, dtype=np.int32)
+ indx = np.arange(5)
+ umt.indexed_negative.at(arr, indx)
+ # If it is [-1, -1, -1, -100, 0] then the regular strided loop was used
+ assert np.all(arr == [-1, -1, -1, -200, -1])
+
+ def test_ufunc_at_large(self):
+ # issue gh-23457
+ indices = np.zeros(8195, dtype=np.int16)
+ b = np.zeros(8195, dtype=float)
+ b[0] = 10
+ b[1] = 5
+ b[8192:] = 100
+ a = np.zeros(1, dtype=float)
+ np.add.at(a, indices, b)
+ assert a[0] == b.sum()
+
+ def test_cast_index_fastpath(self):
+ arr = np.zeros(10)
+ values = np.ones(100000)
+ # index must be cast, which may be buffered in chunks:
+ index = np.zeros(len(values), dtype=np.uint8)
+ np.add.at(arr, index, values)
+ assert arr[0] == len(values)
+
+ @pytest.mark.parametrize("value", [
+ np.ones(1), np.ones(()), np.float64(1.), 1.])
+ def test_ufunc_at_scalar_value_fastpath(self, value):
+ arr = np.zeros(1000)
+ # index must be cast, which may be buffered in chunks:
+ index = np.repeat(np.arange(1000), 2)
+ np.add.at(arr, index, value)
+ assert_array_equal(arr, np.full_like(arr, 2 * value))
+
+ def test_ufunc_at_multiD(self):
a = np.arange(9).reshape(3, 3)
b = np.array([[100, 100, 100], [200, 200, 200], [300, 300, 300]])
np.add.at(a, (slice(None), [1, 2, 1]), b)
@@ -2013,11 +2162,7 @@ class TestUfunc:
[121, 222, 323],
[124, 225, 326]]])
- a = np.arange(10)
- np.negative.at(a, [2, 5, 2])
- assert_equal(a, [0, 1, 2, 3, 4, -5, 6, 7, 8, 9])
-
- # Test 0-dim array
+ def test_ufunc_at_0D(self):
a = np.array(0)
np.add.at(a, (), 1)
assert_equal(a, 1)
@@ -2025,11 +2170,13 @@ class TestUfunc:
assert_raises(IndexError, np.add.at, a, 0, 1)
assert_raises(IndexError, np.add.at, a, [], 1)
+ def test_ufunc_at_dtypes(self):
# Test mixed dtypes
a = np.arange(10)
np.power.at(a, [1, 2, 3, 2], 3.5)
assert_equal(a, np.array([0, 1, 4414, 46, 4, 5, 6, 7, 8, 9]))
+ def test_ufunc_at_boolean(self):
# Test boolean indexing and boolean ufuncs
a = np.arange(10)
index = a % 2 == 0
@@ -2041,6 +2188,7 @@ class TestUfunc:
np.invert.at(a, [2, 5, 2])
assert_equal(a, [0, 1, 2, 3, 4, 5 ^ 0xffffffff, 6, 7, 8, 9])
+ def test_ufunc_at_advanced(self):
# Test empty subspace
orig = np.arange(4)
a = orig[:, None][:, 0:0]
@@ -2064,7 +2212,7 @@ class TestUfunc:
# Test maximum
a = np.array([1, 2, 3])
np.maximum.at(a, [0], 0)
- assert_equal(np.array([1, 2, 3]), a)
+ assert_equal(a, np.array([1, 2, 3]))
def test_at_not_none_signature(self):
# Test ufuncs with non-trivial signature raise a TypeError
@@ -2075,6 +2223,23 @@ class TestUfunc:
a = np.array([[[1, 2], [3, 4]]])
assert_raises(TypeError, np.linalg._umath_linalg.det.at, a, [0])
+ def test_at_no_loop_for_op(self):
+ # str dtype does not have a ufunc loop for np.add
+ arr = np.ones(10, dtype=str)
+ with pytest.raises(np.core._exceptions._UFuncNoLoopError):
+ np.add.at(arr, [0, 1], [0, 1])
+
+ def test_at_output_casting(self):
+ arr = np.array([-1])
+ np.equal.at(arr, [0], [0])
+ assert arr[0] == 0
+
+ def test_at_broadcast_failure(self):
+ arr = np.arange(5)
+ with pytest.raises(ValueError):
+ np.add.at(arr, [0, 1], [1, 2, 3])
+
+
def test_reduce_arguments(self):
f = np.add.reduce
d = np.ones((5,2), dtype=int)
@@ -2580,7 +2745,9 @@ def test_reduce_casterrors(offset):
with pytest.raises(ValueError, match="invalid literal"):
# This is an unsafe cast, but we currently always allow that.
# Note that the double loop is picked, but the cast fails.
- np.add.reduce(arr, dtype=np.intp, out=out)
+ # `initial=None` disables the use of an identity here to test failures
+ # while copying the first values path (not used when identity exists).
+ np.add.reduce(arr, dtype=np.intp, out=out, initial=None)
assert count == sys.getrefcount(value)
# If an error occurred during casting, the operation is done at most until
# the error occurs (the result of which would be `value * offset`) and -1
@@ -2589,6 +2756,15 @@ def test_reduce_casterrors(offset):
assert out[()] < value * offset
+def test_object_reduce_cleanup_on_failure():
+ # Test cleanup, including of the initial value (manually provided or not)
+ with pytest.raises(TypeError):
+ np.add.reduce([1, 2, None], initial=4)
+
+ with pytest.raises(TypeError):
+ np.add.reduce([1, 2, None])
+
+
@pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
@pytest.mark.parametrize("method",
[np.add.accumulate, np.add.reduce,
@@ -2689,10 +2865,10 @@ class TestLowlevelAPIAccess:
r = np.equal.resolve_dtypes((S0, S0, None))
assert r == (S0, S0, np.dtype(bool))
- # Subarray dtypes are weird and only really exist nested, they need
- # the shift to full NEP 50 to be implemented nicely:
+ # Subarray dtypes are weird and may not work fully, we preserve them
+ # leading to a TypeError (currently no equal loop for void/structured)
dts = np.dtype("10i")
- with pytest.raises(NotImplementedError):
+ with pytest.raises(TypeError):
np.equal.resolve_dtypes((dts, dts, None))
def test_resolve_dtypes_reduction(self):
@@ -2791,3 +2967,10 @@ class TestLowlevelAPIAccess:
with pytest.raises(TypeError):
# cannot call it a second time:
np.negative._get_strided_loop(call_info)
+
+ def test_long_arrays(self):
+ t = np.zeros((1029, 917), dtype=np.single)
+ t[0][0] = 1
+ t[28][414] = 1
+ tc = np.cos(t)
+ assert_equal(tc[0][0], tc[28][414])
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 88ab7e014..9e3fe387b 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -17,10 +17,26 @@ from numpy.testing import (
assert_, assert_equal, assert_raises, assert_raises_regex,
assert_array_equal, assert_almost_equal, assert_array_almost_equal,
assert_array_max_ulp, assert_allclose, assert_no_warnings, suppress_warnings,
- _gen_alignment_data, assert_array_almost_equal_nulp, IS_WASM
+ _gen_alignment_data, assert_array_almost_equal_nulp, IS_WASM, IS_MUSL
)
from numpy.testing._private.utils import _glibc_older_than
+UFUNCS = [obj for obj in np.core.umath.__dict__.values()
+ if isinstance(obj, np.ufunc)]
+
+UFUNCS_UNARY = [
+ uf for uf in UFUNCS if uf.nin == 1
+]
+UFUNCS_UNARY_FP = [
+ uf for uf in UFUNCS_UNARY if 'f->f' in uf.types
+]
+
+UFUNCS_BINARY = [
+ uf for uf in UFUNCS if uf.nin == 2
+]
+UFUNCS_BINARY_ACC = [
+ uf for uf in UFUNCS_BINARY if hasattr(uf, "accumulate") and uf.nout == 1
+]
def interesting_binop_operands(val1, val2, dtype):
"""
@@ -275,16 +291,16 @@ class TestComparisons:
b_lst = b.tolist()
# (Binary) Comparison (x1=array, x2=array)
- comp_b = np_comp(a, b)
- comp_b_list = [py_comp(x, y) for x, y in zip(a_lst, b_lst)]
+ comp_b = np_comp(a, b).view(np.uint8)
+ comp_b_list = [int(py_comp(x, y)) for x, y in zip(a_lst, b_lst)]
# (Scalar1) Comparison (x1=scalar, x2=array)
- comp_s1 = np_comp(np_scalar, b)
- comp_s1_list = [py_comp(scalar, x) for x in b_lst]
+ comp_s1 = np_comp(np_scalar, b).view(np.uint8)
+ comp_s1_list = [int(py_comp(scalar, x)) for x in b_lst]
# (Scalar2) Comparison (x1=array, x2=scalar)
- comp_s2 = np_comp(a, np_scalar)
- comp_s2_list = [py_comp(x, scalar) for x in a_lst]
+ comp_s2 = np_comp(a, np_scalar).view(np.uint8)
+ comp_s2_list = [int(py_comp(x, scalar)) for x in a_lst]
# Sequence: Binary, Scalar1 and Scalar2
assert_(comp_b.tolist() == comp_b_list,
@@ -353,6 +369,64 @@ class TestComparisons:
with pytest.raises(TypeError, match="No loop matching"):
np.equal(1, 1, sig=(None, None, "l"))
+ @pytest.mark.parametrize("dtypes", ["qQ", "Qq"])
+ @pytest.mark.parametrize('py_comp, np_comp', [
+ (operator.lt, np.less),
+ (operator.le, np.less_equal),
+ (operator.gt, np.greater),
+ (operator.ge, np.greater_equal),
+ (operator.eq, np.equal),
+ (operator.ne, np.not_equal)
+ ])
+ @pytest.mark.parametrize("vals", [(2**60, 2**60+1), (2**60+1, 2**60)])
+ def test_large_integer_direct_comparison(
+ self, dtypes, py_comp, np_comp, vals):
+ # Note that float(2**60) + 1 == float(2**60).
+ a1 = np.array([2**60], dtype=dtypes[0])
+ a2 = np.array([2**60 + 1], dtype=dtypes[1])
+ expected = py_comp(2**60, 2**60+1)
+
+ assert py_comp(a1, a2) == expected
+ assert np_comp(a1, a2) == expected
+ # Also check the scalars:
+ s1 = a1[0]
+ s2 = a2[0]
+ assert isinstance(s1, np.integer)
+ assert isinstance(s2, np.integer)
+ # The Python operator here is mainly interesting:
+ assert py_comp(s1, s2) == expected
+ assert np_comp(s1, s2) == expected
+
+ @pytest.mark.parametrize("dtype", np.typecodes['UnsignedInteger'])
+ @pytest.mark.parametrize('py_comp_func, np_comp_func', [
+ (operator.lt, np.less),
+ (operator.le, np.less_equal),
+ (operator.gt, np.greater),
+ (operator.ge, np.greater_equal),
+ (operator.eq, np.equal),
+ (operator.ne, np.not_equal)
+ ])
+ @pytest.mark.parametrize("flip", [True, False])
+ def test_unsigned_signed_direct_comparison(
+ self, dtype, py_comp_func, np_comp_func, flip):
+ if flip:
+ py_comp = lambda x, y: py_comp_func(y, x)
+ np_comp = lambda x, y: np_comp_func(y, x)
+ else:
+ py_comp = py_comp_func
+ np_comp = np_comp_func
+
+ arr = np.array([np.iinfo(dtype).max], dtype=dtype)
+ expected = py_comp(int(arr[0]), -1)
+
+ assert py_comp(arr, -1) == expected
+ assert np_comp(arr, -1) == expected
+ scalar = arr[0]
+ assert isinstance(scalar, np.integer)
+ # The Python operator here is mainly interesting:
+ assert py_comp(scalar, -1) == expected
+ assert np_comp(scalar, -1) == expected
+
class TestAdd:
def test_reduce_alignment(self):
@@ -408,7 +482,7 @@ class TestDivision:
def test_division_int_boundary(self, dtype, ex_val):
fo = np.iinfo(dtype)
neg = -1 if fo.min < 0 else 1
- # Large enough to test SIMD loops and remaind elements
+ # Large enough to test SIMD loops and remainder elements
lsize = 512 + 7
a, b, divisors = eval(ex_val)
a_lst, b_lst = a.tolist(), b.tolist()
@@ -573,6 +647,8 @@ class TestDivision:
assert_equal(np.signbit(x//1), 0)
assert_equal(np.signbit((-x)//1), 1)
+ @pytest.mark.skipif(hasattr(np.__config__, "blas_ssl2_info"),
+ reason="gh-22982")
@pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
@pytest.mark.parametrize('dtype', np.typecodes['Float'])
def test_floor_division_errors(self, dtype):
@@ -715,6 +791,8 @@ class TestRemainder:
# inf / 0 does not set any flags, only the modulo creates a NaN
np.divmod(finf, fzero)
+ @pytest.mark.skipif(hasattr(np.__config__, "blas_ssl2_info"),
+ reason="gh-22982")
@pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
@pytest.mark.xfail(sys.platform.startswith("darwin"),
reason="MacOS seems to not give the correct 'invalid' warning for "
@@ -1086,7 +1164,7 @@ class TestPower:
assert_complex_equal(np.power(zero, 1+1j), zero)
assert_complex_equal(np.power(zero, 1+0j), zero)
assert_complex_equal(np.power(zero, 1-1j), zero)
- #Complex powers will negative real part or 0 (provided imaginary
+ #Complex powers with negative real part or 0 (provided imaginary
# part is not zero) will generate a NAN and hence a RUNTIME warning
with pytest.warns(expected_warning=RuntimeWarning) as r:
assert_complex_equal(np.power(zero, -1+1j), cnan)
@@ -1276,9 +1354,20 @@ class TestLog:
# test log() of max for dtype does not raise
for dt in ['f', 'd', 'g']:
- with np.errstate(all='raise'):
- x = np.finfo(dt).max
- np.log(x)
+ try:
+ with np.errstate(all='raise'):
+ x = np.finfo(dt).max
+ np.log(x)
+ except FloatingPointError as exc:
+ if dt == 'g' and IS_MUSL:
+ # FloatingPointError is known to occur on longdouble
+ # for musllinux_x86_64 because x is very large
+ pytest.skip(
+ "Overflow has occurred for"
+ " np.log(np.finfo(np.longdouble).max)"
+ )
+ else:
+ raise exc
def test_log_strides(self):
np.random.seed(42)
@@ -1395,23 +1484,50 @@ class TestSpecialFloats:
np.log(a)
@pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
- def test_sincos_values(self):
+ @pytest.mark.parametrize('dtype', ['e', 'f', 'd', 'g'])
+ def test_sincos_values(self, dtype):
with np.errstate(all='ignore'):
x = [np.nan, np.nan, np.nan, np.nan]
y = [np.nan, -np.nan, np.inf, -np.inf]
- for dt in ['e', 'f', 'd', 'g']:
- xf = np.array(x, dtype=dt)
- yf = np.array(y, dtype=dt)
- assert_equal(np.sin(yf), xf)
- assert_equal(np.cos(yf), xf)
+ xf = np.array(x, dtype=dtype)
+ yf = np.array(y, dtype=dtype)
+ assert_equal(np.sin(yf), xf)
+ assert_equal(np.cos(yf), xf)
+ @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
+ @pytest.mark.xfail(
+ sys.platform.startswith("darwin"),
+ reason="underflow is triggered for scalar 'sin'"
+ )
+ def test_sincos_underflow(self):
+ with np.errstate(under='raise'):
+ underflow_trigger = np.array(
+ float.fromhex("0x1.f37f47a03f82ap-511"),
+ dtype=np.float64
+ )
+ np.sin(underflow_trigger)
+ np.cos(underflow_trigger)
+ @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
+ @pytest.mark.parametrize('callable', [np.sin, np.cos])
+ @pytest.mark.parametrize('dtype', ['e', 'f', 'd'])
+ @pytest.mark.parametrize('value', [np.inf, -np.inf])
+ def test_sincos_errors(self, callable, dtype, value):
with np.errstate(invalid='raise'):
- for callable in [np.sin, np.cos]:
- for value in [np.inf, -np.inf]:
- for dt in ['e', 'f', 'd']:
- assert_raises(FloatingPointError, callable,
- np.array([value], dtype=dt))
+ assert_raises(FloatingPointError, callable,
+ np.array([value], dtype=dtype))
+
+ @pytest.mark.parametrize('callable', [np.sin, np.cos])
+ @pytest.mark.parametrize('dtype', ['f', 'd'])
+ @pytest.mark.parametrize('stride', [-1, 1, 2, 4, 5])
+ def test_sincos_overlaps(self, callable, dtype, stride):
+ N = 100
+ M = N // abs(stride)
+ rng = np.random.default_rng(42)
+ x = rng.standard_normal(N, dtype)
+ y = callable(x[::stride])
+ callable(x[::stride], out=x[:M])
+ assert_equal(x[:M], y)
@pytest.mark.parametrize('dt', ['e', 'f', 'd', 'g'])
def test_sqrt_values(self, dt):
@@ -1631,18 +1747,78 @@ class TestSpecialFloats:
np.array(value, dtype=dt))
# test to ensure no spurious FP exceptions are raised due to SIMD
- def test_spurious_fpexception(self):
- for dt in ['e', 'f', 'd']:
- arr = np.array([1.0, 2.0], dtype=dt)
- with assert_no_warnings():
- np.log(arr)
- np.log2(arr)
- np.log10(arr)
- np.arccosh(arr)
-
+ INF_INVALID_ERR = [
+ np.cos, np.sin, np.tan, np.arccos, np.arcsin, np.spacing, np.arctanh
+ ]
+ NEG_INVALID_ERR = [
+ np.log, np.log2, np.log10, np.log1p, np.sqrt, np.arccosh,
+ np.arctanh
+ ]
+ ONE_INVALID_ERR = [
+ np.arctanh,
+ ]
+ LTONE_INVALID_ERR = [
+ np.arccosh,
+ ]
+ BYZERO_ERR = [
+ np.log, np.log2, np.log10, np.reciprocal, np.arccosh
+ ]
+
+ @pytest.mark.parametrize("ufunc", UFUNCS_UNARY_FP)
+ @pytest.mark.parametrize("dtype", ('e', 'f', 'd'))
+ @pytest.mark.parametrize("data, escape", (
+ ([0.03], LTONE_INVALID_ERR),
+ ([0.03]*32, LTONE_INVALID_ERR),
+ # neg
+ ([-1.0], NEG_INVALID_ERR),
+ ([-1.0]*32, NEG_INVALID_ERR),
+ # flat
+ ([1.0], ONE_INVALID_ERR),
+ ([1.0]*32, ONE_INVALID_ERR),
+ # zero
+ ([0.0], BYZERO_ERR),
+ ([0.0]*32, BYZERO_ERR),
+ ([-0.0], BYZERO_ERR),
+ ([-0.0]*32, BYZERO_ERR),
+ # nan
+ ([0.5, 0.5, 0.5, np.nan], LTONE_INVALID_ERR),
+ ([0.5, 0.5, 0.5, np.nan]*32, LTONE_INVALID_ERR),
+ ([np.nan, 1.0, 1.0, 1.0], ONE_INVALID_ERR),
+ ([np.nan, 1.0, 1.0, 1.0]*32, ONE_INVALID_ERR),
+ ([np.nan], []),
+ ([np.nan]*32, []),
+ # inf
+ ([0.5, 0.5, 0.5, np.inf], INF_INVALID_ERR + LTONE_INVALID_ERR),
+ ([0.5, 0.5, 0.5, np.inf]*32, INF_INVALID_ERR + LTONE_INVALID_ERR),
+ ([np.inf, 1.0, 1.0, 1.0], INF_INVALID_ERR),
+ ([np.inf, 1.0, 1.0, 1.0]*32, INF_INVALID_ERR),
+ ([np.inf], INF_INVALID_ERR),
+ ([np.inf]*32, INF_INVALID_ERR),
+ # ninf
+ ([0.5, 0.5, 0.5, -np.inf],
+ NEG_INVALID_ERR + INF_INVALID_ERR + LTONE_INVALID_ERR),
+ ([0.5, 0.5, 0.5, -np.inf]*32,
+ NEG_INVALID_ERR + INF_INVALID_ERR + LTONE_INVALID_ERR),
+ ([-np.inf, 1.0, 1.0, 1.0], NEG_INVALID_ERR + INF_INVALID_ERR),
+ ([-np.inf, 1.0, 1.0, 1.0]*32, NEG_INVALID_ERR + INF_INVALID_ERR),
+ ([-np.inf], NEG_INVALID_ERR + INF_INVALID_ERR),
+ ([-np.inf]*32, NEG_INVALID_ERR + INF_INVALID_ERR),
+ ))
+ def test_unary_spurious_fpexception(self, ufunc, dtype, data, escape):
+ if escape and ufunc in escape:
+ return
+ # FIXME: NAN raises FP invalid exception:
+ # - ceil/float16 on MSVC:32-bit
+ # - spacing/float16 on almost all platforms
+ if ufunc in (np.spacing, np.ceil) and dtype == 'e':
+ return
+ array = np.array(data, dtype=dtype)
+ with assert_no_warnings():
+ ufunc(array)
class TestFPClass:
- @pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
+ @pytest.mark.parametrize("stride", [-5, -4, -3, -2, -1, 1,
+ 2, 4, 5, 6, 7, 8, 9, 10])
def test_fpclass(self, stride):
arr_f64 = np.array([np.nan, -np.nan, np.inf, -np.inf, -1.0, 1.0, -0.0, 0.0, 2.2251e-308, -2.2251e-308], dtype='d')
arr_f32 = np.array([np.nan, -np.nan, np.inf, -np.inf, -1.0, 1.0, -0.0, 0.0, 1.4013e-045, -1.4013e-045], dtype='f')
@@ -1659,6 +1835,52 @@ class TestFPClass:
assert_equal(np.isfinite(arr_f32[::stride]), finite[::stride])
assert_equal(np.isfinite(arr_f64[::stride]), finite[::stride])
+ @pytest.mark.parametrize("dtype", ['d', 'f'])
+ def test_fp_noncontiguous(self, dtype):
+ data = np.array([np.nan, -np.nan, np.inf, -np.inf, -1.0,
+ 1.0, -0.0, 0.0, 2.2251e-308,
+ -2.2251e-308], dtype=dtype)
+ nan = np.array([True, True, False, False, False, False,
+ False, False, False, False])
+ inf = np.array([False, False, True, True, False, False,
+ False, False, False, False])
+ sign = np.array([False, True, False, True, True, False,
+ True, False, False, True])
+ finite = np.array([False, False, False, False, True, True,
+ True, True, True, True])
+ out = np.ndarray(data.shape, dtype='bool')
+ ncontig_in = data[1::3]
+ ncontig_out = out[1::3]
+ contig_in = np.array(ncontig_in)
+ assert_equal(ncontig_in.flags.c_contiguous, False)
+ assert_equal(ncontig_out.flags.c_contiguous, False)
+ assert_equal(contig_in.flags.c_contiguous, True)
+ # ncontig in, ncontig out
+ assert_equal(np.isnan(ncontig_in, out=ncontig_out), nan[1::3])
+ assert_equal(np.isinf(ncontig_in, out=ncontig_out), inf[1::3])
+ assert_equal(np.signbit(ncontig_in, out=ncontig_out), sign[1::3])
+ assert_equal(np.isfinite(ncontig_in, out=ncontig_out), finite[1::3])
+ # contig in, ncontig out
+ assert_equal(np.isnan(contig_in, out=ncontig_out), nan[1::3])
+ assert_equal(np.isinf(contig_in, out=ncontig_out), inf[1::3])
+ assert_equal(np.signbit(contig_in, out=ncontig_out), sign[1::3])
+ assert_equal(np.isfinite(contig_in, out=ncontig_out), finite[1::3])
+ # ncontig in, contig out
+ assert_equal(np.isnan(ncontig_in), nan[1::3])
+ assert_equal(np.isinf(ncontig_in), inf[1::3])
+ assert_equal(np.signbit(ncontig_in), sign[1::3])
+ assert_equal(np.isfinite(ncontig_in), finite[1::3])
+ # contig in, contig out, nd stride
+ data_split = np.array(np.array_split(data, 2))
+ nan_split = np.array(np.array_split(nan, 2))
+ inf_split = np.array(np.array_split(inf, 2))
+ sign_split = np.array(np.array_split(sign, 2))
+ finite_split = np.array(np.array_split(finite, 2))
+ assert_equal(np.isnan(data_split), nan_split)
+ assert_equal(np.isinf(data_split), inf_split)
+ assert_equal(np.signbit(data_split), sign_split)
+ assert_equal(np.isfinite(data_split), finite_split)
+
class TestLDExp:
@pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
@pytest.mark.parametrize("dtype", ['f', 'd'])
@@ -1672,6 +1894,7 @@ class TestLDExp:
class TestFRExp:
@pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
@pytest.mark.parametrize("dtype", ['f', 'd'])
+ @pytest.mark.xfail(IS_MUSL, reason="gh23048")
@pytest.mark.skipif(not sys.platform.startswith('linux'),
reason="np.frexp gives different answers for NAN/INF on windows and linux")
def test_frexp(self, dtype, stride):
@@ -3359,6 +3582,79 @@ class TestSpecialMethods:
assert_raises(ValueError, np.modf, a, out=('one', 'two', 'three'))
assert_raises(ValueError, np.modf, a, out=('one',))
+ def test_ufunc_override_where(self):
+
+ class OverriddenArrayOld(np.ndarray):
+
+ def _unwrap(self, objs):
+ cls = type(self)
+ result = []
+ for obj in objs:
+ if isinstance(obj, cls):
+ obj = np.array(obj)
+ elif type(obj) != np.ndarray:
+ return NotImplemented
+ result.append(obj)
+ return result
+
+ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+
+ inputs = self._unwrap(inputs)
+ if inputs is NotImplemented:
+ return NotImplemented
+
+ kwargs = kwargs.copy()
+ if "out" in kwargs:
+ kwargs["out"] = self._unwrap(kwargs["out"])
+ if kwargs["out"] is NotImplemented:
+ return NotImplemented
+
+ r = super().__array_ufunc__(ufunc, method, *inputs, **kwargs)
+ if r is not NotImplemented:
+ r = r.view(type(self))
+
+ return r
+
+ class OverriddenArrayNew(OverriddenArrayOld):
+ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+
+ kwargs = kwargs.copy()
+ if "where" in kwargs:
+ kwargs["where"] = self._unwrap((kwargs["where"], ))
+ if kwargs["where"] is NotImplemented:
+ return NotImplemented
+ else:
+ kwargs["where"] = kwargs["where"][0]
+
+ r = super().__array_ufunc__(ufunc, method, *inputs, **kwargs)
+ if r is not NotImplemented:
+ r = r.view(type(self))
+
+ return r
+
+ ufunc = np.negative
+
+ array = np.array([1, 2, 3])
+ where = np.array([True, False, True])
+ expected = ufunc(array, where=where)
+
+ with pytest.raises(TypeError):
+ ufunc(array, where=where.view(OverriddenArrayOld))
+
+ result_1 = ufunc(
+ array,
+ where=where.view(OverriddenArrayNew)
+ )
+ assert isinstance(result_1, OverriddenArrayNew)
+ assert np.all(np.array(result_1) == expected, where=where)
+
+ result_2 = ufunc(
+ array.view(OverriddenArrayNew),
+ where=where.view(OverriddenArrayNew)
+ )
+ assert isinstance(result_2, OverriddenArrayNew)
+ assert np.all(np.array(result_2) == expected, where=where)
+
def test_ufunc_override_exception(self):
class A:
@@ -3780,6 +4076,7 @@ class TestComplexFunctions:
assert_almost_equal(fz.real, fr, err_msg='real part %s' % f)
assert_almost_equal(fz.imag, 0., err_msg='imag part %s' % f)
+ @pytest.mark.xfail(IS_MUSL, reason="gh23049")
@pytest.mark.xfail(IS_WASM, reason="doesn't work")
def test_precisions_consistent(self):
z = 1 + 1j
@@ -3790,6 +4087,7 @@ class TestComplexFunctions:
assert_almost_equal(fcf, fcd, decimal=6, err_msg='fch-fcd %s' % f)
assert_almost_equal(fcl, fcd, decimal=15, err_msg='fch-fcl %s' % f)
+ @pytest.mark.xfail(IS_MUSL, reason="gh23049")
@pytest.mark.xfail(IS_WASM, reason="doesn't work")
def test_branch_cuts(self):
# check branch cuts and continuity on them
@@ -3816,6 +4114,7 @@ class TestComplexFunctions:
_check_branch_cut(np.arccosh, [0-2j, 2j, 2], [1, 1, 1j], 1, 1)
_check_branch_cut(np.arctanh, [0-2j, 2j, 0], [1, 1, 1j], 1, 1)
+ @pytest.mark.xfail(IS_MUSL, reason="gh23049")
@pytest.mark.xfail(IS_WASM, reason="doesn't work")
def test_branch_cuts_complex64(self):
# check branch cuts and continuity on them
@@ -3861,6 +4160,7 @@ class TestComplexFunctions:
b = cfunc(p)
assert_(abs(a - b) < atol, "%s %s: %s; cmath: %s" % (fname, p, a, b))
+ @pytest.mark.xfail(IS_MUSL, reason="gh23049")
@pytest.mark.xfail(IS_WASM, reason="doesn't work")
@pytest.mark.parametrize('dtype', [np.complex64, np.complex_, np.longcomplex])
def test_loss_of_precision(self, dtype):
@@ -3952,6 +4252,14 @@ class TestComplexFunctions:
check(func, pts, 1j)
check(func, pts, 1+1j)
+ @np.errstate(all="ignore")
+ def test_promotion_corner_cases(self):
+ for func in self.funcs:
+ assert func(np.float16(1)).dtype == np.float16
+ # Integer to low precision float promotion is a dubious choice:
+ assert func(np.uint8(1)).dtype == np.float16
+ assert func(np.int16(1)).dtype == np.float32
+
class TestAttributes:
def test_attributes(self):
@@ -4125,7 +4433,7 @@ def _test_spacing(t):
nan = t(np.nan)
inf = t(np.inf)
with np.errstate(invalid='ignore'):
- assert_(np.spacing(one) == eps)
+ assert_equal(np.spacing(one), eps)
assert_(np.isnan(np.spacing(nan)))
assert_(np.isnan(np.spacing(inf)))
assert_(np.isnan(np.spacing(-inf)))
@@ -4261,6 +4569,7 @@ def test_rint_big_int():
# Rint should not change the value
assert_equal(val, np.rint(val))
+
@pytest.mark.parametrize('ftype', [np.float32, np.float64])
def test_memoverlap_accumulate(ftype):
# Reproduces bug https://github.com/numpy/numpy/issues/15597
@@ -4270,6 +4579,38 @@ def test_memoverlap_accumulate(ftype):
assert_equal(np.maximum.accumulate(arr), out_max)
assert_equal(np.minimum.accumulate(arr), out_min)
+@pytest.mark.parametrize("ufunc, dtype", [
+ (ufunc, t[0])
+ for ufunc in UFUNCS_BINARY_ACC
+ for t in ufunc.types
+ if t[-1] == '?' and t[0] not in 'DFGMmO'
+])
+def test_memoverlap_accumulate_cmp(ufunc, dtype):
+ if ufunc.signature:
+ pytest.skip('For generic signatures only')
+ for size in (2, 8, 32, 64, 128, 256):
+ arr = np.array([0, 1, 1]*size, dtype=dtype)
+ acc = ufunc.accumulate(arr, dtype='?')
+ acc_u8 = acc.view(np.uint8)
+ exp = np.array(list(itertools.accumulate(arr, ufunc)), dtype=np.uint8)
+ assert_equal(exp, acc_u8)
+
+@pytest.mark.parametrize("ufunc, dtype", [
+ (ufunc, t[0])
+ for ufunc in UFUNCS_BINARY_ACC
+ for t in ufunc.types
+ if t[0] == t[1] and t[0] == t[-1] and t[0] not in 'DFGMmO?'
+])
+def test_memoverlap_accumulate_symmetric(ufunc, dtype):
+ if ufunc.signature:
+ pytest.skip('For generic signatures only')
+ with np.errstate(all='ignore'):
+ for size in (2, 8, 32, 64, 128, 256):
+ arr = np.array([0, 1, 2]*size).astype(dtype)
+ acc = ufunc.accumulate(arr, dtype=dtype)
+ exp = np.array(list(itertools.accumulate(arr, ufunc)), dtype=dtype)
+ assert_equal(exp, acc)
+
def test_signaling_nan_exceptions():
with assert_no_warnings():
a = np.ndarray(shape=(), dtype='float32', buffer=b'\x00\xe0\xbf\xff')
diff --git a/numpy/ctypeslib.pyi b/numpy/ctypeslib.pyi
index 0313cd82a..3edf98e14 100644
--- a/numpy/ctypeslib.pyi
+++ b/numpy/ctypeslib.pyi
@@ -91,10 +91,10 @@ class _ndptr(ctypes.c_void_p, Generic[_DTypeOptional]):
@overload
@classmethod
- def from_param(cls: type[_ndptr[None]], obj: ndarray[Any, Any]) -> _ctypes: ...
+ def from_param(cls: type[_ndptr[None]], obj: ndarray[Any, Any]) -> _ctypes[Any]: ...
@overload
@classmethod
- def from_param(cls: type[_ndptr[_DType]], obj: ndarray[Any, _DType]) -> _ctypes: ...
+ def from_param(cls: type[_ndptr[_DType]], obj: ndarray[Any, _DType]) -> _ctypes[Any]: ...
class _concrete_ndptr(_ndptr[_DType]):
_dtype_: ClassVar[_DType]
diff --git a/numpy/distutils/ccompiler.py b/numpy/distutils/ccompiler.py
index f0487cb64..40f495fc7 100644
--- a/numpy/distutils/ccompiler.py
+++ b/numpy/distutils/ccompiler.py
@@ -1,10 +1,12 @@
import os
import re
import sys
+import platform
import shlex
import time
import subprocess
from copy import copy
+from pathlib import Path
from distutils import ccompiler
from distutils.ccompiler import (
compiler_class, gen_lib_options, get_default_compiler, new_compiler,
@@ -57,7 +59,7 @@ def _needs_build(obj, cc_args, extra_postargs, pp_opts):
# the last line contains the compiler commandline arguments as some
# projects may compile an extension multiple times with different
# arguments
- with open(dep_file, "r") as f:
+ with open(dep_file) as f:
lines = f.readlines()
cmdline =_commandline_dep_string(cc_args, extra_postargs, pp_opts)
@@ -279,7 +281,8 @@ def CCompiler_compile(self, sources, output_dir=None, macros=None,
if not sources:
return []
- from numpy.distutils.fcompiler import (FCompiler, is_f_file,
+ from numpy.distutils.fcompiler import (FCompiler,
+ FORTRAN_COMMON_FIXED_EXTENSIONS,
has_f90_header)
if isinstance(self, FCompiler):
display = []
@@ -338,7 +341,8 @@ def CCompiler_compile(self, sources, output_dir=None, macros=None,
if self.compiler_type=='absoft':
obj = cyg2win32(obj)
src = cyg2win32(src)
- if is_f_file(src) and not has_f90_header(src):
+ if Path(src).suffix.lower() in FORTRAN_COMMON_FIXED_EXTENSIONS \
+ and not has_f90_header(src):
f77_objects.append((obj, (src, ext)))
else:
other_objects.append((obj, (src, ext)))
@@ -391,8 +395,14 @@ def CCompiler_customize_cmd(self, cmd, ignore=()):
log.info('customize %s using %s' % (self.__class__.__name__,
cmd.__class__.__name__))
- if hasattr(self, 'compiler') and 'clang' in self.compiler[0]:
+ if (
+ hasattr(self, 'compiler') and
+ 'clang' in self.compiler[0] and
+ not (platform.machine() == 'arm64' and sys.platform == 'darwin')
+ ):
# clang defaults to a non-strict floating error point model.
+ # However, '-ftrapping-math' is not currently supported (2023-04-08)
+ # for macosx_arm64.
# Since NumPy and most Python libs give warnings for these, override:
self.compiler.append('-ftrapping-math')
self.compiler_so.append('-ftrapping-math')
@@ -717,6 +727,8 @@ compiler_class['pathcc'] = ('pathccompiler', 'PathScaleCCompiler',
"PathScale Compiler for SiCortex-based applications")
compiler_class['arm'] = ('armccompiler', 'ArmCCompiler',
"Arm C Compiler")
+compiler_class['fujitsu'] = ('fujitsuccompiler', 'FujitsuCCompiler',
+ "Fujitsu C Compiler")
ccompiler._default_compilers += (('linux.*', 'intel'),
('linux.*', 'intele'),
diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py
index 5f18efc7d..6ba4cd816 100644
--- a/numpy/distutils/ccompiler_opt.py
+++ b/numpy/distutils/ccompiler_opt.py
@@ -16,15 +16,6 @@ import re
import subprocess
import textwrap
-# These flags are used to compile any C++ source within Numpy.
-# They are chosen to have very few runtime dependencies.
-NPY_CXX_FLAGS = [
- '-std=c++11', # Minimal standard version
- '-D__STDC_VERSION__=0', # for compatibility with C headers
- '-fno-exceptions', # no exception support
- '-fno-rtti'] # no runtime type information
-
-
class _Config:
"""An abstract class holds all configurable attributes of `CCompilerOpt`,
these class attributes can be used to change the default behavior
@@ -229,6 +220,11 @@ class _Config:
native = None,
opt = '/O2',
werror = '/WX',
+ ),
+ fcc = dict(
+ native = '-mcpu=a64fx',
+ opt = None,
+ werror = None,
)
)
conf_min_features = dict(
@@ -295,6 +291,10 @@ class _Config:
group="AVX512VBMI2 AVX512BITALG AVX512VPOPCNTDQ",
detect="AVX512_ICL", implies_detect=False
),
+ AVX512_SPR = dict(
+ interest=46, implies="AVX512_ICL", group="AVX512FP16",
+ detect="AVX512_SPR", implies_detect=False
+ ),
# IBM/Power
## Power7/ISA 2.06
VSX = dict(interest=1, headers="altivec.h", extra_checks="VSX_ASM"),
@@ -338,7 +338,7 @@ class _Config:
return {}
on_x86 = self.cc_on_x86 or self.cc_on_x64
- is_unix = self.cc_is_gcc or self.cc_is_clang
+ is_unix = self.cc_is_gcc or self.cc_is_clang or self.cc_is_fcc
if on_x86 and is_unix: return dict(
SSE = dict(flags="-msse"),
@@ -365,7 +365,8 @@ class _Config:
AVX512_CNL = dict(flags="-mavx512ifma -mavx512vbmi"),
AVX512_ICL = dict(
flags="-mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq"
- )
+ ),
+ AVX512_SPR = dict(flags="-mavx512fp16"),
)
if on_x86 and self.cc_is_icc: return dict(
SSE = dict(flags="-msse"),
@@ -397,6 +398,7 @@ class _Config:
AVX512_CLX = dict(flags="-xCASCADELAKE"),
AVX512_CNL = dict(flags="-xCANNONLAKE"),
AVX512_ICL = dict(flags="-xICELAKE-CLIENT"),
+ AVX512_SPR = dict(disable="Not supported yet")
)
if on_x86 and self.cc_is_iccw: return dict(
SSE = dict(flags="/arch:SSE"),
@@ -429,7 +431,8 @@ class _Config:
AVX512_SKX = dict(flags="/Qx:SKYLAKE-AVX512"),
AVX512_CLX = dict(flags="/Qx:CASCADELAKE"),
AVX512_CNL = dict(flags="/Qx:CANNONLAKE"),
- AVX512_ICL = dict(flags="/Qx:ICELAKE-CLIENT")
+ AVX512_ICL = dict(flags="/Qx:ICELAKE-CLIENT"),
+ AVX512_SPR = dict(disable="Not supported yet")
)
if on_x86 and self.cc_is_msvc: return dict(
SSE = dict(flags="/arch:SSE") if self.cc_on_x86 else {},
@@ -467,7 +470,10 @@ class _Config:
AVX512_SKX = dict(flags="/arch:AVX512"),
AVX512_CLX = {},
AVX512_CNL = {},
- AVX512_ICL = {}
+ AVX512_ICL = {},
+ AVX512_SPR= dict(
+ disable="MSVC compiler doesn't support it"
+ )
)
on_power = self.cc_on_ppc64le or self.cc_on_ppc64
@@ -959,8 +965,12 @@ class _CCompiler:
detect_arch = (
("cc_on_x64", ".*(x|x86_|amd)64.*", ""),
("cc_on_x86", ".*(win32|x86|i386|i686).*", ""),
- ("cc_on_ppc64le", ".*(powerpc|ppc)64(el|le).*", ""),
- ("cc_on_ppc64", ".*(powerpc|ppc)64.*", ""),
+ ("cc_on_ppc64le", ".*(powerpc|ppc)64(el|le).*|.*powerpc.*",
+ "defined(__powerpc64__) && "
+ "defined(__LITTLE_ENDIAN__)"),
+ ("cc_on_ppc64", ".*(powerpc|ppc).*|.*powerpc.*",
+ "defined(__powerpc64__) && "
+ "defined(__BIG_ENDIAN__)"),
("cc_on_aarch64", ".*(aarch64|arm64).*", ""),
("cc_on_armhf", ".*arm.*", "defined(__ARM_ARCH_7__) || "
"defined(__ARM_ARCH_7A__)"),
@@ -975,12 +985,14 @@ class _CCompiler:
("cc_is_iccw", ".*(intelw|intelemw|iccw).*", ""),
("cc_is_icc", ".*(intel|icc).*", ""), # intel unix like
("cc_is_msvc", ".*msvc.*", ""),
+ ("cc_is_fcc", ".*fcc.*", ""),
# undefined compiler will be treat it as gcc
("cc_is_nocc", "", ""),
)
detect_args = (
("cc_has_debug", ".*(O0|Od|ggdb|coverage|debug:full).*", ""),
- ("cc_has_native", ".*(-march=native|-xHost|/QxHost).*", ""),
+ ("cc_has_native",
+ ".*(-march=native|-xHost|/QxHost|-mcpu=a64fx).*", ""),
# in case if the class run with -DNPY_DISABLE_OPTIMIZATION
("cc_noopt", ".*DISABLE_OPT.*", ""),
)
@@ -1041,7 +1053,7 @@ class _CCompiler:
break
self.cc_name = "unknown"
- for name in ("gcc", "clang", "iccw", "icc", "msvc"):
+ for name in ("gcc", "clang", "iccw", "icc", "msvc", "fcc"):
if getattr(self, "cc_is_" + name):
self.cc_name = name
break
@@ -1163,7 +1175,7 @@ class _CCompiler:
continue
lower_flags = flags[:-(i+1)]
upper_flags = flags[-i:]
- filterd = list(filter(
+ filtered = list(filter(
self._cc_normalize_unix_frgx.search, lower_flags
))
# gather subflags
@@ -1175,7 +1187,7 @@ class _CCompiler:
subflags = xsubflags + subflags
cur_flag = arch + '+' + '+'.join(subflags)
- flags = filterd + [cur_flag]
+ flags = filtered + [cur_flag]
if i > 0:
flags += upper_flags
break
diff --git a/numpy/distutils/checks/cpu_avx512_spr.c b/numpy/distutils/checks/cpu_avx512_spr.c
new file mode 100644
index 000000000..9710d0b2f
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_spr.c
@@ -0,0 +1,26 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+ /*
+ * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+ * whether or not the build options for those features are specified.
+ * Therefore, we must test #definitions of CPU features when option native/host
+ * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+ * the test will be broken and leads to enable all possible features.
+ */
+ #if !defined(__AVX512FP16__)
+ #error "HOST/ARCH doesn't support Sapphire Rapids AVX512FP16 features"
+ #endif
+#endif
+
+#include <immintrin.h>
+
+int main(int argc, char **argv)
+{
+/* clang has a bug regarding our spr code, see gh-23730. */
+#if __clang__
+#error
+#endif
+ __m512h a = _mm512_loadu_ph((void*)argv[argc-1]);
+ __m512h temp = _mm512_fmadd_ph(a, a, a);
+ _mm512_storeu_ph((void*)(argv[argc-1]), temp);
+ return 0;
+}
diff --git a/numpy/distutils/command/build_clib.py b/numpy/distutils/command/build_clib.py
index 45201f98f..11999dae2 100644
--- a/numpy/distutils/command/build_clib.py
+++ b/numpy/distutils/command/build_clib.py
@@ -307,6 +307,7 @@ class build_clib(old_build_clib):
# problem, msvc uses its own convention :(
c_sources += cxx_sources
cxx_sources = []
+ extra_cflags += extra_cxxflags
# filtering C dispatch-table sources when optimization is not disabled,
# otherwise treated as normal sources.
diff --git a/numpy/distutils/command/build_ext.py b/numpy/distutils/command/build_ext.py
index 6dc6b4265..68b13c0dd 100644
--- a/numpy/distutils/command/build_ext.py
+++ b/numpy/distutils/command/build_ext.py
@@ -259,7 +259,7 @@ class build_ext (old_build_ext):
log.warn('resetting extension %r language from %r to %r.' %
(ext.name, l, ext_language))
- ext.language = ext_language
+ ext.language = ext_language
# global language
all_languages.update(ext_languages)
@@ -407,6 +407,7 @@ class build_ext (old_build_ext):
if cxx_sources:
# Needed to compile kiva.agg._agg extension.
extra_args.append('/Zm1000')
+ extra_cflags += extra_cxxflags
# this hack works around the msvc compiler attributes
# problem, msvc uses its own convention :(
c_sources += cxx_sources
@@ -641,12 +642,12 @@ class build_ext (old_build_ext):
if os.path.isfile(fake_lib):
# Replace fake static library
libraries.remove(lib)
- with open(fake_lib, 'r') as f:
+ with open(fake_lib) as f:
unlinkable_fobjects.extend(f.read().splitlines())
# Expand C objects
c_lib = os.path.join(libdir, lib + '.cobjects')
- with open(c_lib, 'r') as f:
+ with open(c_lib) as f:
objects.extend(f.read().splitlines())
# Wrap unlinkable objects to a linkable one
diff --git a/numpy/distutils/command/build_src.py b/numpy/distutils/command/build_src.py
index 5581011f6..bf3d03c70 100644
--- a/numpy/distutils/command/build_src.py
+++ b/numpy/distutils/command/build_src.py
@@ -725,7 +725,7 @@ _has_c_header = re.compile(r'-\*-\s*c\s*-\*-', re.I).search
_has_cpp_header = re.compile(r'-\*-\s*c\+\+\s*-\*-', re.I).search
def get_swig_target(source):
- with open(source, 'r') as f:
+ with open(source) as f:
result = None
line = f.readline()
if _has_cpp_header(line):
@@ -735,7 +735,7 @@ def get_swig_target(source):
return result
def get_swig_modulename(source):
- with open(source, 'r') as f:
+ with open(source) as f:
name = None
for line in f:
m = _swig_module_name_match(line)
diff --git a/numpy/distutils/command/install.py b/numpy/distutils/command/install.py
index 2eff2d145..efa9b4740 100644
--- a/numpy/distutils/command/install.py
+++ b/numpy/distutils/command/install.py
@@ -62,7 +62,7 @@ class install(old_install):
# bdist_rpm fails when INSTALLED_FILES contains
# paths with spaces. Such paths must be enclosed
# with double-quotes.
- with open(self.record, 'r') as f:
+ with open(self.record) as f:
lines = []
need_rewrite = False
for l in f:
diff --git a/numpy/distutils/fcompiler/__init__.py b/numpy/distutils/fcompiler/__init__.py
index ecba3e5d5..5160e2abf 100644
--- a/numpy/distutils/fcompiler/__init__.py
+++ b/numpy/distutils/fcompiler/__init__.py
@@ -19,6 +19,7 @@ __all__ = ['FCompiler', 'new_fcompiler', 'show_fcompilers',
import os
import sys
import re
+from pathlib import Path
from distutils.sysconfig import get_python_lib
from distutils.fancy_getopt import FancyGetopt
@@ -37,6 +38,10 @@ from .environment import EnvironmentConfig
__metaclass__ = type
+
+FORTRAN_COMMON_FIXED_EXTENSIONS = ['.for', '.ftn', '.f77', '.f']
+
+
class CompilerNotFound(Exception):
pass
@@ -571,7 +576,8 @@ class FCompiler(CCompiler):
def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts):
"""Compile 'src' to product 'obj'."""
src_flags = {}
- if is_f_file(src) and not has_f90_header(src):
+ if Path(src).suffix.lower() in FORTRAN_COMMON_FIXED_EXTENSIONS \
+ and not has_f90_header(src):
flavor = ':f77'
compiler = self.compiler_f77
src_flags = get_f77flags(src)
@@ -970,7 +976,6 @@ def dummy_fortran_file():
return name[:-2]
-is_f_file = re.compile(r'.*\.(for|ftn|f77|f)\Z', re.I).match
_has_f_header = re.compile(r'-\*-\s*fortran\s*-\*-', re.I).search
_has_f90_header = re.compile(r'-\*-\s*f90\s*-\*-', re.I).search
_has_fix_header = re.compile(r'-\*-\s*fix\s*-\*-', re.I).search
diff --git a/numpy/distutils/fcompiler/absoft.py b/numpy/distutils/fcompiler/absoft.py
index efe3a4cb5..68f516b92 100644
--- a/numpy/distutils/fcompiler/absoft.py
+++ b/numpy/distutils/fcompiler/absoft.py
@@ -1,6 +1,6 @@
-# http://www.absoft.com/literature/osxuserguide.pdf
-# http://www.absoft.com/documentation.html
+# Absoft Corporation ceased operations on 12/31/2022.
+# Thus, all links to <http://www.absoft.com> are invalid.
# Notes:
# - when using -g77 then use -DUNDERSCORE_G77 to compile f2py
diff --git a/numpy/distutils/fcompiler/ibm.py b/numpy/distutils/fcompiler/ibm.py
index eff24401a..29927518c 100644
--- a/numpy/distutils/fcompiler/ibm.py
+++ b/numpy/distutils/fcompiler/ibm.py
@@ -76,7 +76,7 @@ class IBMFCompiler(FCompiler):
xlf_cfg = '/etc/opt/ibmcmp/xlf/%s/xlf.cfg' % version
fo, new_cfg = make_temp_file(suffix='_xlf.cfg')
log.info('Creating '+new_cfg)
- with open(xlf_cfg, 'r') as fi:
+ with open(xlf_cfg) as fi:
crt1_match = re.compile(r'\s*crt\s*=\s*(?P<path>.*)/crt1.o').match
for line in fi:
m = crt1_match(line)
diff --git a/numpy/distutils/fujitsuccompiler.py b/numpy/distutils/fujitsuccompiler.py
new file mode 100644
index 000000000..c25900b34
--- /dev/null
+++ b/numpy/distutils/fujitsuccompiler.py
@@ -0,0 +1,28 @@
+from distutils.unixccompiler import UnixCCompiler
+
+class FujitsuCCompiler(UnixCCompiler):
+
+ """
+ Fujitsu compiler.
+ """
+
+ compiler_type = 'fujitsu'
+ cc_exe = 'fcc'
+ cxx_exe = 'FCC'
+
+ def __init__(self, verbose=0, dry_run=0, force=0):
+ UnixCCompiler.__init__(self, verbose, dry_run, force)
+ cc_compiler = self.cc_exe
+ cxx_compiler = self.cxx_exe
+ self.set_executables(
+ compiler=cc_compiler +
+ ' -O3 -Nclang -fPIC',
+ compiler_so=cc_compiler +
+ ' -O3 -Nclang -fPIC',
+ compiler_cxx=cxx_compiler +
+ ' -O3 -Nclang -fPIC',
+ linker_exe=cc_compiler +
+ ' -lfj90i -lfj90f -lfjsrcinfo -lelf -shared',
+ linker_so=cc_compiler +
+ ' -lfj90i -lfj90f -lfjsrcinfo -lelf -shared'
+ )
diff --git a/numpy/distutils/mingw32ccompiler.py b/numpy/distutils/mingw32ccompiler.py
index 5d1c27a65..4763f41ad 100644
--- a/numpy/distutils/mingw32ccompiler.py
+++ b/numpy/distutils/mingw32ccompiler.py
@@ -8,7 +8,6 @@ Support code for building Python extensions on Windows.
"""
import os
-import platform
import sys
import subprocess
import re
@@ -203,11 +202,11 @@ def find_python_dll():
# search in the file system for possible candidates
major_version, minor_version = tuple(sys.version_info[:2])
- implementation = platform.python_implementation()
- if implementation == 'CPython':
+ implementation = sys.implementation.name
+ if implementation == 'cpython':
dllname = f'python{major_version}{minor_version}.dll'
- elif implementation == 'PyPy':
- dllname = f'libpypy{major_version}-c.dll'
+ elif implementation == 'pypy':
+ dllname = f'libpypy{major_version}.{minor_version}-c.dll'
else:
dllname = f'Unknown platform {implementation}'
print("Looking for %s" % dllname)
diff --git a/numpy/distutils/misc_util.py b/numpy/distutils/misc_util.py
index 79ba08515..e226b4744 100644
--- a/numpy/distutils/misc_util.py
+++ b/numpy/distutils/misc_util.py
@@ -475,7 +475,7 @@ def _get_f90_modules(source):
if not f90_ext_match(source):
return []
modules = []
- with open(source, 'r') as f:
+ with open(source) as f:
for line in f:
m = f90_module_name_match(line)
if m:
@@ -1932,7 +1932,7 @@ class Configuration:
revision0 = f.read().strip()
branch_map = {}
- with open(branch_cache_fn, 'r') as f:
+ with open(branch_cache_fn) as f:
for line in f:
branch1, revision1 = line.split()[:2]
if revision1==revision0:
diff --git a/numpy/distutils/system_info.py b/numpy/distutils/system_info.py
index d5a1687da..3dca7fb5a 100644
--- a/numpy/distutils/system_info.py
+++ b/numpy/distutils/system_info.py
@@ -62,6 +62,7 @@ Currently, the following classes are available, along with their section names:
blas_ilp64_plain_opt_info:ALL # usage recommended (general ILP64 BLAS, no symbol suffix)
blas_info:blas
blas_mkl_info:mkl
+ blas_ssl2_info:ssl2
blas_opt_info:ALL # usage recommended
blas_src_info:blas_src
blis_info:blis
@@ -93,9 +94,11 @@ Currently, the following classes are available, along with their section names:
lapack_ilp64_plain_opt_info:ALL # usage recommended (general ILP64 LAPACK, no symbol suffix)
lapack_info:lapack
lapack_mkl_info:mkl
+ lapack_ssl2_info:ssl2
lapack_opt_info:ALL # usage recommended
lapack_src_info:lapack_src
mkl_info:mkl
+ ssl2_info:ssl2
numarray_info:numarray
numerix_info:numerix
numpy_info:numpy
@@ -519,6 +522,7 @@ def get_info(name, notfound_action=0):
'lapack_atlas_3_10_threads': lapack_atlas_3_10_threads_info, # ditto
'flame': flame_info, # use lapack_opt instead
'mkl': mkl_info,
+ 'ssl2': ssl2_info,
# openblas which may or may not have embedded lapack
'openblas': openblas_info, # use blas_opt instead
# openblas with embedded lapack
@@ -527,6 +531,8 @@ def get_info(name, notfound_action=0):
'blis': blis_info, # use blas_opt instead
'lapack_mkl': lapack_mkl_info, # use lapack_opt instead
'blas_mkl': blas_mkl_info, # use blas_opt instead
+ 'lapack_ssl2': lapack_ssl2_info,
+ 'blas_ssl2': blas_ssl2_info,
'accelerate': accelerate_info, # use blas_opt instead
'openblas64_': openblas64__info,
'openblas64__lapack': openblas64__lapack_info,
@@ -1260,7 +1266,7 @@ class mkl_info(system_info):
paths = os.environ.get('LD_LIBRARY_PATH', '').split(os.pathsep)
ld_so_conf = '/etc/ld.so.conf'
if os.path.isfile(ld_so_conf):
- with open(ld_so_conf, 'r') as f:
+ with open(ld_so_conf) as f:
for d in f:
d = d.strip()
if d:
@@ -1325,6 +1331,63 @@ class blas_mkl_info(mkl_info):
pass
+class ssl2_info(system_info):
+ section = 'ssl2'
+ dir_env_var = 'SSL2_DIR'
+ # Multi-threaded version. Python itself must be built by Fujitsu compiler.
+ _lib_ssl2 = ['fjlapackexsve']
+ # Single-threaded version
+ #_lib_ssl2 = ['fjlapacksve']
+
+ def get_tcsds_rootdir(self):
+ tcsdsroot = os.environ.get('TCSDS_PATH', None)
+ if tcsdsroot is not None:
+ return tcsdsroot
+ return None
+
+ def __init__(self):
+ tcsdsroot = self.get_tcsds_rootdir()
+ if tcsdsroot is None:
+ system_info.__init__(self)
+ else:
+ system_info.__init__(
+ self,
+ default_lib_dirs=[os.path.join(tcsdsroot, 'lib64')],
+ default_include_dirs=[os.path.join(tcsdsroot,
+ 'clang-comp/include')])
+
+ def calc_info(self):
+ tcsdsroot = self.get_tcsds_rootdir()
+
+ lib_dirs = self.get_lib_dirs()
+ if lib_dirs is None:
+ lib_dirs = os.path.join(tcsdsroot, 'lib64')
+
+ incl_dirs = self.get_include_dirs()
+ if incl_dirs is None:
+ incl_dirs = os.path.join(tcsdsroot, 'clang-comp/include')
+
+ ssl2_libs = self.get_libs('ssl2_libs', self._lib_ssl2)
+
+ info = self.check_libs2(lib_dirs, ssl2_libs)
+ if info is None:
+ return
+ dict_append(info,
+ define_macros=[('HAVE_CBLAS', None),
+ ('HAVE_SSL2', 1)],
+ include_dirs=incl_dirs,)
+ self.set_info(**info)
+
+
+class lapack_ssl2_info(ssl2_info):
+ pass
+
+
+class blas_ssl2_info(ssl2_info):
+ pass
+
+
+
class armpl_info(system_info):
section = 'armpl'
dir_env_var = 'ARMPL_DIR'
@@ -1787,7 +1850,7 @@ class lapack_opt_info(system_info):
notfounderror = LapackNotFoundError
# List of all known LAPACK libraries, in the default order
- lapack_order = ['armpl', 'mkl', 'openblas', 'flame',
+ lapack_order = ['armpl', 'mkl', 'ssl2', 'openblas', 'flame',
'accelerate', 'atlas', 'lapack']
order_env_var_name = 'NPY_LAPACK_ORDER'
@@ -1805,6 +1868,13 @@ class lapack_opt_info(system_info):
return True
return False
+ def _calc_info_ssl2(self):
+ info = get_info('lapack_ssl2')
+ if info:
+ self.set_info(**info)
+ return True
+ return False
+
def _calc_info_openblas(self):
info = get_info('openblas_lapack')
if info:
@@ -1971,7 +2041,7 @@ class blas_opt_info(system_info):
notfounderror = BlasNotFoundError
# List of all known BLAS libraries, in the default order
- blas_order = ['armpl', 'mkl', 'blis', 'openblas',
+ blas_order = ['armpl', 'mkl', 'ssl2', 'blis', 'openblas',
'accelerate', 'atlas', 'blas']
order_env_var_name = 'NPY_BLAS_ORDER'
@@ -1989,6 +2059,13 @@ class blas_opt_info(system_info):
return True
return False
+ def _calc_info_ssl2(self):
+ info = get_info('blas_ssl2')
+ if info:
+ self.set_info(**info)
+ return True
+ return False
+
def _calc_info_blis(self):
info = get_info('blis')
if info:
@@ -2190,7 +2267,7 @@ class blas_info(system_info):
}""")
src = os.path.join(tmpdir, 'source.c')
try:
- with open(src, 'wt') as f:
+ with open(src, 'w') as f:
f.write(s)
try:
@@ -2345,7 +2422,7 @@ class openblas_info(blas_info):
except Exception:
extra_args = []
try:
- with open(src, 'wt') as f:
+ with open(src, 'w') as f:
f.write(s)
obj = c.compile([src], output_dir=tmpdir)
try:
@@ -2456,7 +2533,7 @@ class flame_info(system_info):
# Add the additional "extra" arguments
extra_args = info.get('extra_link_args', [])
try:
- with open(src, 'wt') as f:
+ with open(src, 'w') as f:
f.write(s)
obj = c.compile([src], output_dir=tmpdir)
try:
diff --git a/numpy/distutils/tests/test_ccompiler_opt.py b/numpy/distutils/tests/test_ccompiler_opt.py
index 657ebdb68..a1b780336 100644
--- a/numpy/distutils/tests/test_ccompiler_opt.py
+++ b/numpy/distutils/tests/test_ccompiler_opt.py
@@ -31,7 +31,7 @@ arch_compilers = dict(
ppc64 = ("gcc", "clang"),
ppc64le = ("gcc", "clang"),
armhf = ("gcc", "clang"),
- aarch64 = ("gcc", "clang"),
+ aarch64 = ("gcc", "clang", "fcc"),
s390x = ("gcc", "clang"),
noarch = ("gcc",)
)
@@ -422,8 +422,8 @@ class _Test_CCompilerOpt:
# when option "native" is activated through the args
try:
self.expect("native",
- trap_flags=".*(-march=native|-xHost|/QxHost).*",
- x86=".*", ppc64=".*", armhf=".*", s390x=".*"
+ trap_flags=".*(-march=native|-xHost|/QxHost|-mcpu=a64fx).*",
+ x86=".*", ppc64=".*", armhf=".*", s390x=".*", aarch64=".*",
)
if self.march() != "unknown":
raise AssertionError(
diff --git a/numpy/distutils/tests/test_system_info.py b/numpy/distutils/tests/test_system_info.py
index eb7235e04..66304a5e5 100644
--- a/numpy/distutils/tests/test_system_info.py
+++ b/numpy/distutils/tests/test_system_info.py
@@ -271,7 +271,7 @@ class TestSystemInfoReading:
# But if we copy the values to a '[mkl]' section the value
# is correct
- with open(cfg, 'r') as fid:
+ with open(cfg) as fid:
mkl = fid.read().replace('[ALL]', '[mkl]', 1)
with open(cfg, 'w') as fid:
fid.write(mkl)
@@ -279,7 +279,7 @@ class TestSystemInfoReading:
assert info.get_lib_dirs() == lib_dirs
# Also, the values will be taken from a section named '[DEFAULT]'
- with open(cfg, 'r') as fid:
+ with open(cfg) as fid:
dflt = fid.read().replace('[mkl]', '[DEFAULT]', 1)
with open(cfg, 'w') as fid:
fid.write(dflt)
diff --git a/numpy/dtypes.py b/numpy/dtypes.py
new file mode 100644
index 000000000..068a6a1a0
--- /dev/null
+++ b/numpy/dtypes.py
@@ -0,0 +1,77 @@
+"""
+DType classes and utility (:mod:`numpy.dtypes`)
+===============================================
+
+This module is home to specific dtypes related functionality and their classes.
+For more general information about dtypes, also see `numpy.dtype` and
+:ref:`arrays.dtypes`.
+
+Similar to the builtin ``types`` module, this submodule defines types (classes)
+that are not widely used directly.
+
+.. versionadded:: NumPy 1.25
+
+ The dtypes module is new in NumPy 1.25. Previously DType classes were
+ only accessible indirectly.
+
+
+DType classes
+-------------
+
+The following are the classes of the corresponding NumPy dtype instances and
+NumPy scalar types. The classes can be used in ``isinstance`` checks and can
+also be instantiated or used directly. Direct use of these classes is not
+typical, since their scalar counterparts (e.g. ``np.float64``) or strings
+like ``"float64"`` can be used.
+
+.. list-table::
+ :header-rows: 1
+
+ * - Group
+ - DType class
+
+ * - Boolean
+ - ``BoolDType``
+
+ * - Bit-sized integers
+ - ``Int8DType``, ``UInt8DType``, ``Int16DType``, ``UInt16DType``,
+ ``Int32DType``, ``UInt32DType``, ``Int64DType``, ``UInt64DType``
+
+ * - C-named integers (may be aliases)
+ - ``ByteDType``, ``UByteDType``, ``ShortDType``, ``UShortDType``,
+ ``IntDType``, ``UIntDType``, ``LongDType``, ``ULongDType``,
+ ``LongLongDType``, ``ULongLongDType``
+
+ * - Floating point
+ - ``Float16DType``, ``Float32DType``, ``Float64DType``,
+ ``LongDoubleDType``
+
+ * - Complex
+ - ``Complex64DType``, ``Complex128DType``, ``CLongDoubleDType``
+
+ * - Strings
+ - ``BytesDType``, ``StrDType``
+
+ * - Times
+ - ``DateTime64DType``, ``TimeDelta64DType``
+
+ * - Others
+ - ``ObjectDType``, ``VoidDType``
+
+"""
+
+__all__ = []
+
+
+def _add_dtype_helper(DType, alias):
+ # Function to add DTypes a bit more conveniently without channeling them
+ # through `numpy.core._multiarray_umath` namespace or similar.
+ from numpy import dtypes
+
+ setattr(dtypes, DType.__name__, DType)
+ __all__.append(DType.__name__)
+
+ if alias:
+ alias = alias.removeprefix("numpy.dtypes.")
+ setattr(dtypes, alias, DType)
+ __all__.append(alias)
diff --git a/numpy/dtypes.pyi b/numpy/dtypes.pyi
new file mode 100644
index 000000000..2f7e846f2
--- /dev/null
+++ b/numpy/dtypes.pyi
@@ -0,0 +1,43 @@
+import numpy as np
+
+
+__all__: list[str]
+
+# Boolean:
+BoolDType = np.dtype[np.bool_]
+# Sized integers:
+Int8DType = np.dtype[np.int8]
+UInt8DType = np.dtype[np.uint8]
+Int16DType = np.dtype[np.int16]
+UInt16DType = np.dtype[np.uint16]
+Int32DType = np.dtype[np.int32]
+UInt32DType = np.dtype[np.uint32]
+Int64DType = np.dtype[np.int64]
+UInt64DType = np.dtype[np.uint64]
+# Standard C-named version/alias:
+ByteDType = np.dtype[np.byte]
+UByteDType = np.dtype[np.ubyte]
+ShortDType = np.dtype[np.short]
+UShortDType = np.dtype[np.ushort]
+IntDType = np.dtype[np.intc]
+UIntDType = np.dtype[np.uintc]
+LongDType = np.dtype[np.int_] # Unfortunately, the correct scalar
+ULongDType = np.dtype[np.uint] # Unfortunately, the correct scalar
+LongLongDType = np.dtype[np.longlong]
+ULongLongDType = np.dtype[np.ulonglong]
+# Floats
+Float16DType = np.dtype[np.float16]
+Float32DType = np.dtype[np.float32]
+Float64DType = np.dtype[np.float64]
+LongDoubleDType = np.dtype[np.longdouble]
+# Complex:
+Complex64DType = np.dtype[np.complex64]
+Complex128DType = np.dtype[np.complex128]
+CLongDoubleDType = np.dtype[np.clongdouble]
+# Others:
+ObjectDType = np.dtype[np.object_]
+BytesDType = np.dtype[np.bytes_]
+StrDType = np.dtype[np.str_]
+VoidDType = np.dtype[np.void]
+DateTime64DType = np.dtype[np.datetime64]
+TimeDelta64DType = np.dtype[np.timedelta64]
diff --git a/numpy/dual.py b/numpy/dual.py
deleted file mode 100644
index eb7e61aac..000000000
--- a/numpy/dual.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""
-.. deprecated:: 1.20
-
-*This module is deprecated. Instead of importing functions from*
-``numpy.dual``, *the functions should be imported directly from NumPy
-or SciPy*.
-
-Aliases for functions which may be accelerated by SciPy.
-
-SciPy_ can be built to use accelerated or otherwise improved libraries
-for FFTs, linear algebra, and special functions. This module allows
-developers to transparently support these accelerated functions when
-SciPy is available but still support users who have only installed
-NumPy.
-
-.. _SciPy : https://www.scipy.org
-
-"""
-import warnings
-
-
-warnings.warn('The module numpy.dual is deprecated. Instead of using dual, '
- 'use the functions directly from numpy or scipy.',
- category=DeprecationWarning,
- stacklevel=2)
-
-# This module should be used for functions both in numpy and scipy if
-# you want to use the numpy version if available but the scipy version
-# otherwise.
-# Usage --- from numpy.dual import fft, inv
-
-__all__ = ['fft', 'ifft', 'fftn', 'ifftn', 'fft2', 'ifft2',
- 'norm', 'inv', 'svd', 'solve', 'det', 'eig', 'eigvals',
- 'eigh', 'eigvalsh', 'lstsq', 'pinv', 'cholesky', 'i0']
-
-import numpy.linalg as linpkg
-import numpy.fft as fftpkg
-from numpy.lib import i0
-import sys
-
-
-fft = fftpkg.fft
-ifft = fftpkg.ifft
-fftn = fftpkg.fftn
-ifftn = fftpkg.ifftn
-fft2 = fftpkg.fft2
-ifft2 = fftpkg.ifft2
-
-norm = linpkg.norm
-inv = linpkg.inv
-svd = linpkg.svd
-solve = linpkg.solve
-det = linpkg.det
-eig = linpkg.eig
-eigvals = linpkg.eigvals
-eigh = linpkg.eigh
-eigvalsh = linpkg.eigvalsh
-lstsq = linpkg.lstsq
-pinv = linpkg.pinv
-cholesky = linpkg.cholesky
-
-_restore_dict = {}
-
-def register_func(name, func):
- if name not in __all__:
- raise ValueError("{} not a dual function.".format(name))
- f = sys._getframe(0).f_globals
- _restore_dict[name] = f[name]
- f[name] = func
-
-def restore_func(name):
- if name not in __all__:
- raise ValueError("{} not a dual function.".format(name))
- try:
- val = _restore_dict[name]
- except KeyError:
- return
- else:
- sys._getframe(0).f_globals[name] = val
-
-def restore_all():
- for name in _restore_dict.keys():
- restore_func(name)
diff --git a/numpy/exceptions.py b/numpy/exceptions.py
index d2fe9257b..2f8438101 100644
--- a/numpy/exceptions.py
+++ b/numpy/exceptions.py
@@ -32,8 +32,6 @@ Exceptions
"""
-from ._utils import set_module as _set_module
-
__all__ = [
"ComplexWarning", "VisibleDeprecationWarning", "ModuleDeprecationWarning",
"TooHardError", "AxisError", "DTypePromotionError"]
@@ -46,12 +44,6 @@ if '_is_loaded' in globals():
_is_loaded = True
-# TODO: One day, we should remove the _set_module here before removing them
-# fully. Not doing it now, just to allow unpickling to work on older
-# versions for a bit. (Module exists since NumPy 1.25.)
-# This then also means that the typing stubs should be moved!
-
-
class ComplexWarning(RuntimeWarning):
"""
The warning raised when casting a complex dtype to a real dtype.
@@ -68,7 +60,7 @@ class ModuleDeprecationWarning(DeprecationWarning):
.. warning::
- This warning should not be used, since nose testing is not relvant
+ This warning should not be used, since nose testing is not relevant
anymore.
The nose tester turns ordinary Deprecation warnings into test failures.
diff --git a/numpy/exceptions.pyi b/numpy/exceptions.pyi
index 53b7a0c16..c76a0946b 100644
--- a/numpy/exceptions.pyi
+++ b/numpy/exceptions.pyi
@@ -1,8 +1,18 @@
-from numpy.exceptions import (
- ComplexWarning as ComplexWarning,
- ModuleDeprecationWarning as ModuleDeprecationWarning,
- VisibleDeprecationWarning as VisibleDeprecationWarning,
- TooHardError as TooHardError,
- AxisError as AxisError,
-)
+from typing import overload
+__all__: list[str]
+
+class ComplexWarning(RuntimeWarning): ...
+class ModuleDeprecationWarning(DeprecationWarning): ...
+class VisibleDeprecationWarning(UserWarning): ...
+class TooHardError(RuntimeError): ...
+class DTypePromotionError(TypeError): ...
+
+class AxisError(ValueError, IndexError):
+ axis: None | int
+ ndim: None | int
+ @overload
+ def __init__(self, axis: str, ndim: None = ..., msg_prefix: None = ...) -> None: ...
+ @overload
+ def __init__(self, axis: int, ndim: int, msg_prefix: None | str = ...) -> None: ...
+ def __str__(self) -> str: ...
diff --git a/numpy/f2py/capi_maps.py b/numpy/f2py/capi_maps.py
index f07066a09..f0a7221b7 100644
--- a/numpy/f2py/capi_maps.py
+++ b/numpy/f2py/capi_maps.py
@@ -196,7 +196,7 @@ def load_f2cmap_file(f2cmap_file):
# they use PARAMETERS in type specifications.
try:
outmess('Reading f2cmap from {!r} ...\n'.format(f2cmap_file))
- with open(f2cmap_file, 'r') as f:
+ with open(f2cmap_file) as f:
d = eval(f.read().lower(), {}, {})
for k, d1 in d.items():
for k1 in d1.keys():
@@ -342,9 +342,9 @@ def getstrlength(var):
def getarrdims(a, var, verbose=0):
ret = {}
if isstring(var) and not isarray(var):
- ret['dims'] = getstrlength(var)
- ret['size'] = ret['dims']
- ret['rank'] = '1'
+ ret['size'] = getstrlength(var)
+ ret['rank'] = '0'
+ ret['dims'] = ''
elif isscalar(var):
ret['size'] = '1'
ret['rank'] = '0'
diff --git a/numpy/f2py/cfuncs.py b/numpy/f2py/cfuncs.py
index 741692562..2d27b6524 100644
--- a/numpy/f2py/cfuncs.py
+++ b/numpy/f2py/cfuncs.py
@@ -800,16 +800,23 @@ character_from_pyobj(character* v, PyObject *obj, const char *errmess) {
}
}
{
+ /* TODO: This error (and most other) error handling needs cleaning. */
char mess[F2PY_MESSAGE_BUFFER_SIZE];
strcpy(mess, errmess);
PyObject* err = PyErr_Occurred();
if (err == NULL) {
err = PyExc_TypeError;
+ Py_INCREF(err);
+ }
+ else {
+ Py_INCREF(err);
+ PyErr_Clear();
}
sprintf(mess + strlen(mess),
" -- expected str|bytes|sequence-of-str-or-bytes, got ");
f2py_describe(obj, mess + strlen(mess));
PyErr_SetString(err, mess);
+ Py_DECREF(err);
}
return 0;
}
@@ -1227,8 +1234,8 @@ static int try_pyarr_from_character(PyObject* obj, character* v) {
strcpy(mess, "try_pyarr_from_character failed"
" -- expected bytes array-scalar|array, got ");
f2py_describe(obj, mess + strlen(mess));
+ PyErr_SetString(err, mess);
}
- PyErr_SetString(err, mess);
}
return 0;
}
diff --git a/numpy/f2py/crackfortran.py b/numpy/f2py/crackfortran.py
index 27e257c48..4871d2628 100755
--- a/numpy/f2py/crackfortran.py
+++ b/numpy/f2py/crackfortran.py
@@ -147,10 +147,11 @@ import os
import copy
import platform
import codecs
+from pathlib import Path
try:
- import chardet
+ import charset_normalizer
except ImportError:
- chardet = None
+ charset_normalizer = None
from . import __version__
@@ -289,69 +290,69 @@ def undo_rmbadname(names):
return [undo_rmbadname1(_m) for _m in names]
-def getextension(name):
- i = name.rfind('.')
- if i == -1:
- return ''
- if '\\' in name[i:]:
- return ''
- if '/' in name[i:]:
- return ''
- return name[i + 1:]
-
-is_f_file = re.compile(r'.*\.(for|ftn|f77|f)\Z', re.I).match
_has_f_header = re.compile(r'-\*-\s*fortran\s*-\*-', re.I).search
_has_f90_header = re.compile(r'-\*-\s*f90\s*-\*-', re.I).search
_has_fix_header = re.compile(r'-\*-\s*fix\s*-\*-', re.I).search
_free_f90_start = re.compile(r'[^c*]\s*[^\s\d\t]', re.I).match
+# Extensions
+COMMON_FREE_EXTENSIONS = ['.f90', '.f95', '.f03', '.f08']
+COMMON_FIXED_EXTENSIONS = ['.for', '.ftn', '.f77', '.f']
+
def openhook(filename, mode):
"""Ensures that filename is opened with correct encoding parameter.
- This function uses chardet package, when available, for
- determining the encoding of the file to be opened. When chardet is
- not available, the function detects only UTF encodings, otherwise,
- ASCII encoding is used as fallback.
+ This function uses charset_normalizer package, when available, for
+ determining the encoding of the file to be opened. When charset_normalizer
+ is not available, the function detects only UTF encodings, otherwise, ASCII
+ encoding is used as fallback.
"""
- bytes = min(32, os.path.getsize(filename))
- with open(filename, 'rb') as f:
- raw = f.read(bytes)
- if raw.startswith(codecs.BOM_UTF8):
- encoding = 'UTF-8-SIG'
- elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
- encoding = 'UTF-32'
- elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
- encoding = 'UTF-16'
+ # Reads in the entire file. Robust detection of encoding.
+ # Correctly handles comments or late stage unicode characters
+ # gh-22871
+ if charset_normalizer is not None:
+ encoding = charset_normalizer.from_path(filename).best().encoding
else:
- if chardet is not None:
- encoding = chardet.detect(raw)['encoding']
- else:
- # hint: install chardet to ensure correct encoding handling
- encoding = 'ascii'
+ # hint: install charset_normalizer for correct encoding handling
+ # No need to read the whole file for trying with startswith
+ nbytes = min(32, os.path.getsize(filename))
+ with open(filename, 'rb') as fhandle:
+ raw = fhandle.read(nbytes)
+ if raw.startswith(codecs.BOM_UTF8):
+ encoding = 'UTF-8-SIG'
+ elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
+ encoding = 'UTF-32'
+ elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
+ encoding = 'UTF-16'
+ else:
+ # Fallback, without charset_normalizer
+ encoding = 'ascii'
return open(filename, mode, encoding=encoding)
-def is_free_format(file):
+def is_free_format(fname):
"""Check if file is in free format Fortran."""
# f90 allows both fixed and free format, assuming fixed unless
# signs of free format are detected.
- result = 0
- with openhook(file, 'r') as f:
- line = f.readline()
+ result = False
+ if Path(fname).suffix.lower() in COMMON_FREE_EXTENSIONS:
+ result = True
+ with openhook(fname, 'r') as fhandle:
+ line = fhandle.readline()
n = 15 # the number of non-comment lines to scan for hints
if _has_f_header(line):
n = 0
elif _has_f90_header(line):
n = 0
- result = 1
+ result = True
while n > 0 and line:
if line[0] != '!' and line.strip():
n -= 1
if (line[0] != '\t' and _free_f90_start(line[:5])) or line[-2:-1] == '&':
- result = 1
+ result = True
break
- line = f.readline()
+ line = fhandle.readline()
return result
@@ -394,7 +395,7 @@ def readfortrancode(ffile, dowithline=show, istop=1):
except UnicodeDecodeError as msg:
raise Exception(
f'readfortrancode: reading {fin.filename()}#{fin.lineno()}'
- f' failed with\n{msg}.\nIt is likely that installing chardet'
+ f' failed with\n{msg}.\nIt is likely that installing charset_normalizer'
' package will help f2py determine the input file encoding'
' correctly.')
if not l:
@@ -407,7 +408,7 @@ def readfortrancode(ffile, dowithline=show, istop=1):
strictf77 = 0
sourcecodeform = 'fix'
ext = os.path.splitext(currentfilename)[1]
- if is_f_file(currentfilename) and \
+ if Path(currentfilename).suffix.lower() in COMMON_FIXED_EXTENSIONS and \
not (_has_f90_header(l) or _has_fix_header(l)):
strictf77 = 1
elif is_free_format(currentfilename) and not _has_fix_header(l):
@@ -612,15 +613,15 @@ beginpattern90 = re.compile(
groupends = (r'end|endprogram|endblockdata|endmodule|endpythonmodule|'
r'endinterface|endsubroutine|endfunction')
endpattern = re.compile(
- beforethisafter % ('', groupends, groupends, r'.*'), re.I), 'end'
+ beforethisafter % ('', groupends, groupends, '.*'), re.I), 'end'
endifs = r'end\s*(if|do|where|select|while|forall|associate|block|' + \
r'critical|enum|team)'
endifpattern = re.compile(
- beforethisafter % (r'[\w]*?', endifs, endifs, r'[\w\s]*'), re.I), 'endif'
+ beforethisafter % (r'[\w]*?', endifs, endifs, '.*'), re.I), 'endif'
#
moduleprocedures = r'module\s*procedure'
moduleprocedurepattern = re.compile(
- beforethisafter % ('', moduleprocedures, moduleprocedures, r'.*'), re.I), \
+ beforethisafter % ('', moduleprocedures, moduleprocedures, '.*'), re.I), \
'moduleprocedure'
implicitpattern = re.compile(
beforethisafter % ('', 'implicit', 'implicit', '.*'), re.I), 'implicit'
@@ -934,7 +935,7 @@ typedefpattern = re.compile(
r'(?:,(?P<attributes>[\w(),]+))?(::)?(?P<name>\b[a-z$_][\w$]*\b)'
r'(?:\((?P<params>[\w,]*)\))?\Z', re.I)
nameargspattern = re.compile(
- r'\s*(?P<name>\b[\w$]+\b)\s*(@\(@\s*(?P<args>[\w\s,]*)\s*@\)@|)\s*((result(\s*@\(@\s*(?P<result>\b[\w$]+\b)\s*@\)@|))|(bind\s*@\(@\s*(?P<bind>.*)\s*@\)@))*\s*\Z', re.I)
+ r'\s*(?P<name>\b[\w$]+\b)\s*(@\(@\s*(?P<args>[\w\s,]*)\s*@\)@|)\s*((result(\s*@\(@\s*(?P<result>\b[\w$]+\b)\s*@\)@|))|(bind\s*@\(@\s*(?P<bind>(?:(?!@\)@).)*)\s*@\)@))*\s*\Z', re.I)
operatorpattern = re.compile(
r'\s*(?P<scheme>(operator|assignment))'
r'@\(@\s*(?P<name>[^)]+)\s*@\)@\s*\Z', re.I)
@@ -1739,6 +1740,28 @@ def updatevars(typespec, selector, attrspec, entitydecl):
d1[k] = unmarkouterparen(d1[k])
else:
del d1[k]
+
+ if 'len' in d1:
+ if typespec in ['complex', 'integer', 'logical', 'real']:
+ if ('kindselector' not in edecl) or (not edecl['kindselector']):
+ edecl['kindselector'] = {}
+ edecl['kindselector']['*'] = d1['len']
+ del d1['len']
+ elif typespec == 'character':
+ if ('charselector' not in edecl) or (not edecl['charselector']):
+ edecl['charselector'] = {}
+ if 'len' in edecl['charselector']:
+ del edecl['charselector']['len']
+ edecl['charselector']['*'] = d1['len']
+ del d1['len']
+
+ if 'init' in d1:
+ if '=' in edecl and (not edecl['='] == d1['init']):
+ outmess('updatevars: attempt to change the init expression of "%s" ("%s") to "%s". Ignoring.\n' % (
+ ename, edecl['='], d1['init']))
+ else:
+ edecl['='] = d1['init']
+
if 'len' in d1 and 'array' in d1:
if d1['len'] == '':
d1['len'] = d1['array']
@@ -1748,6 +1771,7 @@ def updatevars(typespec, selector, attrspec, entitydecl):
del d1['len']
errmess('updatevars: "%s %s" is mapped to "%s %s(%s)"\n' % (
typespec, e, typespec, ename, d1['array']))
+
if 'array' in d1:
dm = 'dimension(%s)' % d1['array']
if 'attrspec' not in edecl or (not edecl['attrspec']):
@@ -1761,23 +1785,6 @@ def updatevars(typespec, selector, attrspec, entitydecl):
% (ename, dm1, dm))
break
- if 'len' in d1:
- if typespec in ['complex', 'integer', 'logical', 'real']:
- if ('kindselector' not in edecl) or (not edecl['kindselector']):
- edecl['kindselector'] = {}
- edecl['kindselector']['*'] = d1['len']
- elif typespec == 'character':
- if ('charselector' not in edecl) or (not edecl['charselector']):
- edecl['charselector'] = {}
- if 'len' in edecl['charselector']:
- del edecl['charselector']['len']
- edecl['charselector']['*'] = d1['len']
- if 'init' in d1:
- if '=' in edecl and (not edecl['='] == d1['init']):
- outmess('updatevars: attempt to change the init expression of "%s" ("%s") to "%s". Ignoring.\n' % (
- ename, edecl['='], d1['init']))
- else:
- edecl['='] = d1['init']
else:
outmess('updatevars: could not crack entity declaration "%s". Ignoring.\n' % (
ename + m.group('after')))
@@ -2386,19 +2393,19 @@ def _selected_int_kind_func(r):
def _selected_real_kind_func(p, r=0, radix=0):
# XXX: This should be processor dependent
- # This is only good for 0 <= p <= 20
+ # This is only verified for 0 <= p <= 20, and possibly good up to p <= 33
if p < 7:
return 4
if p < 16:
return 8
machine = platform.machine().lower()
- if machine.startswith(('aarch64', 'power', 'ppc', 'riscv', 's390x', 'sparc')):
- if p <= 20:
+ if machine.startswith(('aarch64', 'arm64', 'power', 'ppc', 'riscv', 's390x', 'sparc')):
+ if p <= 33:
return 16
else:
if p < 19:
return 10
- elif p <= 20:
+ elif p <= 33:
return 16
return -1
@@ -2849,6 +2856,11 @@ def analyzevars(block):
kindselect, charselect, typename = cracktypespec(
typespec, selector)
vars[n]['typespec'] = typespec
+ try:
+ if block['result']:
+ vars[block['result']]['typespec'] = typespec
+ except Exception:
+ pass
if kindselect:
if 'kind' in kindselect:
try:
diff --git a/numpy/f2py/diagnose.py b/numpy/f2py/diagnose.py
index 21ee399f0..86d7004ab 100644
--- a/numpy/f2py/diagnose.py
+++ b/numpy/f2py/diagnose.py
@@ -30,15 +30,15 @@ def run():
try:
import numpy
has_newnumpy = 1
- except ImportError:
- print('Failed to import new numpy:', sys.exc_info()[1])
+ except ImportError as e:
+ print('Failed to import new numpy:', e)
has_newnumpy = 0
try:
from numpy.f2py import f2py2e
has_f2py2e = 1
- except ImportError:
- print('Failed to import f2py2e:', sys.exc_info()[1])
+ except ImportError as e:
+ print('Failed to import f2py2e:', e)
has_f2py2e = 0
try:
@@ -48,8 +48,8 @@ def run():
try:
import numpy_distutils
has_numpy_distutils = 1
- except ImportError:
- print('Failed to import numpy_distutils:', sys.exc_info()[1])
+ except ImportError as e:
+ print('Failed to import numpy_distutils:', e)
has_numpy_distutils = 0
if has_newnumpy:
diff --git a/numpy/f2py/func2subr.py b/numpy/f2py/func2subr.py
index 2a05f065b..cc3cdc5b4 100644
--- a/numpy/f2py/func2subr.py
+++ b/numpy/f2py/func2subr.py
@@ -119,6 +119,12 @@ def createfuncwrapper(rout, signature=0):
sargs = ', '.join(args)
if f90mode:
+ # gh-23598 fix warning
+ # Essentially, this gets called again with modules where the name of the
+ # function is added to the arguments, which is not required, and removed
+ sargs = sargs.replace(f"{name}, ", '')
+ args = [arg for arg in args if arg != name]
+ rout['args'] = args
add('subroutine f2pywrap_%s_%s (%s)' %
(rout['modulename'], name, sargs))
if not signature:
diff --git a/numpy/f2py/symbolic.py b/numpy/f2py/symbolic.py
index c2ab0f140..b1b9f5b6a 100644
--- a/numpy/f2py/symbolic.py
+++ b/numpy/f2py/symbolic.py
@@ -801,7 +801,7 @@ def normalize(obj):
else:
_pairs_add(d, t, c)
if len(d) == 0:
- # TODO: deterimine correct kind
+ # TODO: determine correct kind
return as_number(0)
elif len(d) == 1:
(t, c), = d.items()
@@ -836,7 +836,7 @@ def normalize(obj):
else:
_pairs_add(d, b, e)
if len(d) == 0 or coeff == 0:
- # TODO: deterimine correct kind
+ # TODO: determine correct kind
assert isinstance(coeff, number_types)
return as_number(coeff)
elif len(d) == 1:
diff --git a/numpy/f2py/tests/src/crackfortran/gh23533.f b/numpy/f2py/tests/src/crackfortran/gh23533.f
new file mode 100644
index 000000000..db522afa7
--- /dev/null
+++ b/numpy/f2py/tests/src/crackfortran/gh23533.f
@@ -0,0 +1,5 @@
+ SUBROUTINE EXAMPLE( )
+ IF( .TRUE. ) THEN
+ CALL DO_SOMETHING()
+ END IF ! ** .TRUE. **
+ END
diff --git a/numpy/f2py/tests/src/crackfortran/gh23598.f90 b/numpy/f2py/tests/src/crackfortran/gh23598.f90
new file mode 100644
index 000000000..e0dffb5ef
--- /dev/null
+++ b/numpy/f2py/tests/src/crackfortran/gh23598.f90
@@ -0,0 +1,4 @@
+integer function intproduct(a, b) result(res)
+ integer, intent(in) :: a, b
+ res = a*b
+end function
diff --git a/numpy/f2py/tests/src/crackfortran/gh23598Warn.f90 b/numpy/f2py/tests/src/crackfortran/gh23598Warn.f90
new file mode 100644
index 000000000..3b44efc5e
--- /dev/null
+++ b/numpy/f2py/tests/src/crackfortran/gh23598Warn.f90
@@ -0,0 +1,11 @@
+module test_bug
+ implicit none
+ private
+ public :: intproduct
+
+contains
+ integer function intproduct(a, b) result(res)
+ integer, intent(in) :: a, b
+ res = a*b
+ end function
+end module
diff --git a/numpy/f2py/tests/src/crackfortran/unicode_comment.f90 b/numpy/f2py/tests/src/crackfortran/unicode_comment.f90
new file mode 100644
index 000000000..13515ce98
--- /dev/null
+++ b/numpy/f2py/tests/src/crackfortran/unicode_comment.f90
@@ -0,0 +1,4 @@
+subroutine foo(x)
+ real(8), intent(in) :: x
+ ! Écrit à l'écran la valeur de x
+end subroutine
diff --git a/numpy/f2py/tests/src/string/scalar_string.f90 b/numpy/f2py/tests/src/string/scalar_string.f90
new file mode 100644
index 000000000..f8f076172
--- /dev/null
+++ b/numpy/f2py/tests/src/string/scalar_string.f90
@@ -0,0 +1,9 @@
+MODULE string_test
+
+ character(len=8) :: string
+ character string77 * 8
+
+ character(len=12), dimension(5,7) :: strarr
+ character strarr77(5,7) * 12
+
+END MODULE string_test
diff --git a/numpy/f2py/tests/test_character.py b/numpy/f2py/tests/test_character.py
index b54b4d981..0bb0f4290 100644
--- a/numpy/f2py/tests/test_character.py
+++ b/numpy/f2py/tests/test_character.py
@@ -457,9 +457,10 @@ class TestMiscCharacter(util.F2PyTest):
character(len=*), intent(in) :: x(:)
!f2py intent(out) x
integer :: i
- do i=1, size(x)
- print*, "x(",i,")=", x(i)
- end do
+ ! Uncomment for debug printing:
+ !do i=1, size(x)
+ ! print*, "x(",i,")=", x(i)
+ !end do
end subroutine {fprefix}_gh4519
pure function {fprefix}_gh3425(x) result (y)
@@ -568,3 +569,25 @@ class TestMiscCharacter(util.F2PyTest):
assert_equal(len(a), 2)
assert_raises(Exception, lambda: f(b'c'))
+
+
+class TestStringScalarArr(util.F2PyTest):
+ sources = [util.getpath("tests", "src", "string", "scalar_string.f90")]
+
+ @pytest.mark.slow
+ def test_char(self):
+ for out in (self.module.string_test.string,
+ self.module.string_test.string77):
+ expected = ()
+ assert out.shape == expected
+ expected = '|S8'
+ assert out.dtype == expected
+
+ @pytest.mark.slow
+ def test_char_arr(self):
+ for out in (self.module.string_test.strarr,
+ self.module.string_test.strarr77):
+ expected = (5,7)
+ assert out.shape == expected
+ expected = '|S12'
+ assert out.dtype == expected
diff --git a/numpy/f2py/tests/test_crackfortran.py b/numpy/f2py/tests/test_crackfortran.py
index dcf8760db..49bfc13af 100644
--- a/numpy/f2py/tests/test_crackfortran.py
+++ b/numpy/f2py/tests/test_crackfortran.py
@@ -1,7 +1,10 @@
+import importlib
import codecs
+import time
+import unicodedata
import pytest
import numpy as np
-from numpy.f2py.crackfortran import markinnerspaces
+from numpy.f2py.crackfortran import markinnerspaces, nameargspattern
from . import util
from numpy.f2py import crackfortran
import textwrap
@@ -132,6 +135,7 @@ class TestMarkinnerspaces:
assert markinnerspaces("a 'b c' 'd e'") == "a 'b@_@c' 'd@_@e'"
assert markinnerspaces(r'a "b c" "d e"') == r'a "b@_@c" "d@_@e"'
+
class TestDimSpec(util.F2PyTest):
"""This test suite tests various expressions that are used as dimension
specifications.
@@ -241,6 +245,7 @@ class TestModuleDeclaration:
assert len(mod) == 1
assert mod[0]["vars"]["abar"]["="] == "bar('abar')"
+
class TestEval(util.F2PyTest):
def test_eval_scalar(self):
eval_scalar = crackfortran._eval_scalar
@@ -257,13 +262,76 @@ class TestFortranReader(util.F2PyTest):
def test_input_encoding(self, tmp_path, encoding):
# gh-635
f_path = tmp_path / f"input_with_{encoding}_encoding.f90"
- # explicit BOM is required for UTF8
- bom = {'utf-8': codecs.BOM_UTF8}.get(encoding, b'')
with f_path.open('w', encoding=encoding) as ff:
- ff.write(bom.decode(encoding) +
- """
+ ff.write("""
subroutine foo()
end subroutine foo
""")
mod = crackfortran.crackfortran([str(f_path)])
assert mod[0]['name'] == 'foo'
+
+
+class TestUnicodeComment(util.F2PyTest):
+ sources = [util.getpath("tests", "src", "crackfortran", "unicode_comment.f90")]
+
+ @pytest.mark.skipif(
+ (importlib.util.find_spec("charset_normalizer") is None),
+ reason="test requires charset_normalizer which is not installed",
+ )
+ def test_encoding_comment(self):
+ self.module.foo(3)
+
+
+class TestNameArgsPatternBacktracking:
+ @pytest.mark.parametrize(
+ ['adversary'],
+ [
+ ('@)@bind@(@',),
+ ('@)@bind @(@',),
+ ('@)@bind foo bar baz@(@',)
+ ]
+ )
+ def test_nameargspattern_backtracking(self, adversary):
+ '''address ReDOS vulnerability:
+ https://github.com/numpy/numpy/issues/23338'''
+ trials_per_batch = 12
+ batches_per_regex = 4
+ start_reps, end_reps = 15, 25
+ for ii in range(start_reps, end_reps):
+ repeated_adversary = adversary * ii
+ # test times in small batches.
+ # this gives us more chances to catch a bad regex
+ # while still catching it before too long if it is bad
+ for _ in range(batches_per_regex):
+ times = []
+ for _ in range(trials_per_batch):
+ t0 = time.perf_counter()
+ mtch = nameargspattern.search(repeated_adversary)
+ times.append(time.perf_counter() - t0)
+ # our pattern should be much faster than 0.2s per search
+ # it's unlikely that a bad regex will pass even on fast CPUs
+ assert np.median(times) < 0.2
+ assert not mtch
+ # if the adversary is capped with @)@, it becomes acceptable
+ # according to the old version of the regex.
+ # that should still be true.
+ good_version_of_adversary = repeated_adversary + '@)@'
+ assert nameargspattern.search(good_version_of_adversary)
+
+
+class TestFunctionReturn(util.F2PyTest):
+ sources = [util.getpath("tests", "src", "crackfortran", "gh23598.f90")]
+
+ def test_function_rettype(self):
+ # gh-23598
+ assert self.module.intproduct(3, 4) == 12
+
+
+class TestFortranGroupCounters(util.F2PyTest):
+ def test_end_if_comment(self):
+ # gh-23533
+ fpath = util.getpath("tests", "src", "crackfortran", "gh23533.f")
+ try:
+ crackfortran.crackfortran([str(fpath)])
+ except Exception as exc:
+ assert False, f"'crackfortran.crackfortran' raised an exception {exc}"
diff --git a/numpy/f2py/tests/test_f2py2e.py b/numpy/f2py/tests/test_f2py2e.py
index 2c10f046f..5f7b56a68 100644
--- a/numpy/f2py/tests/test_f2py2e.py
+++ b/numpy/f2py/tests/test_f2py2e.py
@@ -63,6 +63,15 @@ def hello_world_f90(tmpdir_factory):
@pytest.fixture(scope="session")
+def gh23598_warn(tmpdir_factory):
+ """F90 file for testing warnings in gh23598"""
+ fdat = util.getpath("tests", "src", "crackfortran", "gh23598Warn.f90").read_text()
+ fn = tmpdir_factory.getbasetemp() / "gh23598Warn.f90"
+ fn.write_text(fdat, encoding="ascii")
+ return fn
+
+
+@pytest.fixture(scope="session")
def hello_world_f77(tmpdir_factory):
"""Generates a single f77 file for testing"""
fdat = util.getpath("tests", "src", "cli", "hi77.f").read_text()
@@ -91,6 +100,19 @@ def f2cmap_f90(tmpdir_factory):
return fn
+def test_gh23598_warn(capfd, gh23598_warn, monkeypatch):
+ foutl = get_io_paths(gh23598_warn, mname="test")
+ ipath = foutl.f90inp
+ monkeypatch.setattr(
+ sys, "argv",
+ f'f2py {ipath} -m test'.split())
+
+ with util.switchdir(ipath.parent):
+ f2pycli() # Generate files
+ wrapper = foutl.wrap90.read_text()
+ assert "intproductf2pywrap, intpr" not in wrapper
+
+
def test_gen_pyf(capfd, hello_world_f90, monkeypatch):
"""Ensures that a signature file is generated via the CLI
CLI :: -h
diff --git a/numpy/f2py/tests/test_kind.py b/numpy/f2py/tests/test_kind.py
index f0cb61fb6..69b85aaad 100644
--- a/numpy/f2py/tests/test_kind.py
+++ b/numpy/f2py/tests/test_kind.py
@@ -1,5 +1,6 @@
import os
import pytest
+import platform
from numpy.f2py.crackfortran import (
_selected_int_kind_func as selected_int_kind,
@@ -11,8 +12,8 @@ from . import util
class TestKind(util.F2PyTest):
sources = [util.getpath("tests", "src", "kind", "foo.f90")]
- def test_all(self):
- selectedrealkind = self.module.selectedrealkind
+ def test_int(self):
+ """Test `int` kind_func for integers up to 10**40."""
selectedintkind = self.module.selectedintkind
for i in range(40):
@@ -20,7 +21,27 @@ class TestKind(util.F2PyTest):
i
), f"selectedintkind({i}): expected {selected_int_kind(i)!r} but got {selectedintkind(i)!r}"
- for i in range(20):
+ def test_real(self):
+ """
+ Test (processor-dependent) `real` kind_func for real numbers
+ of up to 31 digits precision (extended/quadruple).
+ """
+ selectedrealkind = self.module.selectedrealkind
+
+ for i in range(32):
+ assert selectedrealkind(i) == selected_real_kind(
+ i
+ ), f"selectedrealkind({i}): expected {selected_real_kind(i)!r} but got {selectedrealkind(i)!r}"
+
+ @pytest.mark.xfail(platform.machine().lower().startswith("ppc"),
+ reason="Some PowerPC may not support full IEEE 754 precision")
+ def test_quad_precision(self):
+ """
+ Test kind_func for quadruple precision [`real(16)`] of 32+ digits.
+ """
+ selectedrealkind = self.module.selectedrealkind
+
+ for i in range(32, 40):
assert selectedrealkind(i) == selected_real_kind(
i
), f"selectedrealkind({i}): expected {selected_real_kind(i)!r} but got {selectedrealkind(i)!r}"
diff --git a/numpy/f2py/tests/util.py b/numpy/f2py/tests/util.py
index 1534c4e7d..26fa7e49d 100644
--- a/numpy/f2py/tests/util.py
+++ b/numpy/f2py/tests/util.py
@@ -6,6 +6,7 @@ Utility functions for
- determining paths to tests
"""
+import glob
import os
import sys
import subprocess
@@ -30,6 +31,10 @@ from importlib import import_module
_module_dir = None
_module_num = 5403
+if sys.platform == "cygwin":
+ NUMPY_INSTALL_ROOT = Path(__file__).parent.parent.parent
+ _module_list = list(NUMPY_INSTALL_ROOT.glob("**/*.dll"))
+
def _cleanup():
global _module_dir
@@ -147,6 +152,21 @@ def build_module(source_files, options=[], skip=[], only=[], module_name=None):
for fn in dst_sources:
os.unlink(fn)
+ # Rebase (Cygwin-only)
+ if sys.platform == "cygwin":
+ # If someone starts deleting modules after import, this will
+ # need to change to record how big each module is, rather than
+ # relying on rebase being able to find that from the files.
+ _module_list.extend(
+ glob.glob(os.path.join(d, "{:s}*".format(module_name)))
+ )
+ subprocess.check_call(
+ ["/usr/bin/rebase", "--database", "--oblivious", "--verbose"]
+ + _module_list
+ )
+
+
+
# Import
return import_module(module_name)
diff --git a/numpy/fft/_pocketfft.c b/numpy/fft/_pocketfft.c
index 86de57bd3..37649386f 100644
--- a/numpy/fft/_pocketfft.c
+++ b/numpy/fft/_pocketfft.c
@@ -23,7 +23,7 @@
#include <stdlib.h>
#define RALLOC(type,num) \
- ((type *)malloc((num)*sizeof(type)))
+ (assert(num != 0), ((type *)malloc((num)*sizeof(type))))
#define DEALLOC(ptr) \
do { free(ptr); (ptr)=NULL; } while(0)
@@ -1056,8 +1056,10 @@ static cfftp_plan make_cfftp_plan (size_t length)
if (length==1) return plan;
if (cfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
size_t tws=cfftp_twsize(plan);
- plan->mem=RALLOC(cmplx,tws);
- if (!plan->mem) { DEALLOC(plan); return NULL; }
+ if (tws != 0) {
+ plan->mem=RALLOC(cmplx,tws);
+ if (!plan->mem) { DEALLOC(plan); return NULL; }
+ }
if (cfftp_comp_twiddle(plan)!=0)
{ DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
return plan;
@@ -1820,7 +1822,6 @@ static size_t rfftp_twsize(rfftp_plan plan)
l1*=ip;
}
return twsize;
- return 0;
}
WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan)
@@ -1876,8 +1877,10 @@ NOINLINE static rfftp_plan make_rfftp_plan (size_t length)
if (length==1) return plan;
if (rfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
size_t tws=rfftp_twsize(plan);
- plan->mem=RALLOC(double,tws);
- if (!plan->mem) { DEALLOC(plan); return NULL; }
+ if (tws != 0) {
+ plan->mem=RALLOC(double,tws);
+ if (!plan->mem) { DEALLOC(plan); return NULL; }
+ }
if (rfftp_comp_twiddle(plan)!=0)
{ DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
return plan;
@@ -2229,6 +2232,8 @@ execute_real_forward(PyObject *a1, double fct)
{
rfft_plan plan=NULL;
int fail = 0;
+ npy_intp tdim[NPY_MAXDIMS];
+
PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
PyArray_DescrFromType(NPY_DOUBLE), 1, 0,
NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
@@ -2238,15 +2243,11 @@ execute_real_forward(PyObject *a1, double fct)
int ndim = PyArray_NDIM(data);
const npy_intp *odim = PyArray_DIMS(data);
int npts = odim[ndim - 1];
- npy_intp *tdim=(npy_intp *)malloc(ndim*sizeof(npy_intp));
- if (!tdim)
- { Py_XDECREF(data); return NULL; }
for (int d=0; d<ndim-1; ++d)
tdim[d] = odim[d];
tdim[ndim-1] = npts/2 + 1;
PyArrayObject *ret = (PyArrayObject *)PyArray_Empty(ndim,
tdim, PyArray_DescrFromType(NPY_CDOUBLE), 0);
- free(tdim);
if (!ret) fail=1;
if (!fail) {
int rstep = PyArray_DIM(ret, PyArray_NDIM(ret) - 1)*2;
diff --git a/numpy/fft/helper.pyi b/numpy/fft/helper.pyi
index b49fc88f7..9b6525190 100644
--- a/numpy/fft/helper.pyi
+++ b/numpy/fft/helper.pyi
@@ -27,21 +27,21 @@ def ifftshift(x: ArrayLike, axes: None | _ShapeLike = ...) -> NDArray[Any]: ...
@overload
def fftfreq(
n: int | integer[Any],
- d: _ArrayLikeFloat_co,
+ d: _ArrayLikeFloat_co = ...,
) -> NDArray[floating[Any]]: ...
@overload
def fftfreq(
n: int | integer[Any],
- d: _ArrayLikeComplex_co,
+ d: _ArrayLikeComplex_co = ...,
) -> NDArray[complexfloating[Any, Any]]: ...
@overload
def rfftfreq(
n: int | integer[Any],
- d: _ArrayLikeFloat_co,
+ d: _ArrayLikeFloat_co = ...,
) -> NDArray[floating[Any]]: ...
@overload
def rfftfreq(
n: int | integer[Any],
- d: _ArrayLikeComplex_co,
+ d: _ArrayLikeComplex_co = ...,
) -> NDArray[complexfloating[Any, Any]]: ...
diff --git a/numpy/lib/__init__.py b/numpy/lib/__init__.py
index 58166d4b1..d3cc9fee4 100644
--- a/numpy/lib/__init__.py
+++ b/numpy/lib/__init__.py
@@ -11,7 +11,6 @@ Most contains basic functions that are used by several submodules and are
useful to have in the main name-space.
"""
-import math
from numpy.version import version as __version__
@@ -58,7 +57,7 @@ from .arraypad import *
from ._version import *
from numpy.core._multiarray_umath import tracemalloc_domain
-__all__ = ['emath', 'math', 'tracemalloc_domain', 'Arrayterator']
+__all__ = ['emath', 'tracemalloc_domain', 'Arrayterator']
__all__ += type_check.__all__
__all__ += index_tricks.__all__
__all__ += function_base.__all__
@@ -77,3 +76,19 @@ __all__ += histograms.__all__
from numpy._pytesttester import PytestTester
test = PytestTester(__name__)
del PytestTester
+
+def __getattr__(attr):
+ # Warn for deprecated attributes
+ import math
+ import warnings
+
+ if attr == 'math':
+ warnings.warn(
+ "`np.lib.math` is a deprecated alias for the standard library "
+ "`math` module (Deprecated Numpy 1.25). Replace usages of "
+ "`numpy.lib.math` with `math`", DeprecationWarning, stacklevel=2)
+ return math
+ else:
+ raise AttributeError("module {!r} has no attribute "
+ "{!r}".format(__name__, attr))
+
diff --git a/numpy/lib/__init__.pyi b/numpy/lib/__init__.pyi
index 1fa2d226e..d3553bbcc 100644
--- a/numpy/lib/__init__.pyi
+++ b/numpy/lib/__init__.pyi
@@ -64,7 +64,6 @@ from numpy.lib.function_base import (
digitize as digitize,
cov as cov,
corrcoef as corrcoef,
- msort as msort,
median as median,
sinc as sinc,
hamming as hamming,
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index 4a5ac1285..534d1b3ee 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -513,8 +513,8 @@ class StringConverter:
(nx.complexfloating, complex, nx.nan + 0j),
# Last, try with the string types (must be last, because
# `_mapper[-1]` is used as default in some cases)
- (nx.unicode_, asunicode, '???'),
- (nx.string_, asbytes, '???'),
+ (nx.str_, asunicode, '???'),
+ (nx.bytes_, asbytes, '???'),
])
@classmethod
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
index cf5f47a82..300bbda26 100644
--- a/numpy/lib/arraysetops.py
+++ b/numpy/lib/arraysetops.py
@@ -649,8 +649,24 @@ def in1d(ar1, ar2, assume_unique=False, invert=False, *, kind=None):
ar2_range = int(ar2_max) - int(ar2_min)
# Constraints on whether we can actually use the table method:
- range_safe_from_overflow = ar2_range < np.iinfo(ar2.dtype).max
+ # 1. Assert memory usage is not too large
below_memory_constraint = ar2_range <= 6 * (ar1.size + ar2.size)
+ # 2. Check overflows for (ar2 - ar2_min); dtype=ar2.dtype
+ range_safe_from_overflow = ar2_range <= np.iinfo(ar2.dtype).max
+ # 3. Check overflows for (ar1 - ar2_min); dtype=ar1.dtype
+ if ar1.size > 0:
+ ar1_min = np.min(ar1)
+ ar1_max = np.max(ar1)
+
+ # After masking, the range of ar1 is guaranteed to be
+ # within the range of ar2:
+ ar1_upper = min(int(ar1_max), int(ar2_max))
+ ar1_lower = max(int(ar1_min), int(ar2_min))
+
+ range_safe_from_overflow &= all((
+ ar1_upper - int(ar2_min) <= np.iinfo(ar1.dtype).max,
+ ar1_lower - int(ar2_min) >= np.iinfo(ar1.dtype).min
+ ))
# Optimal performance is for approximately
# log10(size) > (log10(range) - 2.27) / 0.927.
@@ -687,7 +703,7 @@ def in1d(ar1, ar2, assume_unique=False, invert=False, *, kind=None):
elif kind == 'table': # not range_safe_from_overflow
raise RuntimeError(
"You have specified kind='table', "
- "but the range of values in `ar2` exceeds the "
+ "but the range of values in `ar2` or `ar1` exceed the "
"maximum integer of the datatype. "
"Please set `kind` to None or 'sort'."
)
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 54fd0b0bc..ef50fb19d 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -437,15 +437,15 @@ def _write_array_header(fp, d, version=None):
header.append("'%s': %s, " % (key, repr(value)))
header.append("}")
header = "".join(header)
-
+
# Add some spare space so that the array header can be modified in-place
# when changing the array size, e.g. when growing it by appending data at
- # the end.
+ # the end.
shape = d['shape']
header += " " * ((GROWTH_AXIS_MAX_DIGITS - len(repr(
shape[-1 if d['fortran_order'] else 0]
))) if len(shape) > 0 else 0)
-
+
if version is None:
header = _wrap_header_guess_version(header)
else:
@@ -505,7 +505,7 @@ def read_array_header_1_0(fp, max_header_size=_MAX_HEADER_SIZE):
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
- See :py:meth:`ast.literal_eval()` for details.
+ See :py:func:`ast.literal_eval()` for details.
Raises
------
@@ -532,7 +532,7 @@ def read_array_header_2_0(fp, max_header_size=_MAX_HEADER_SIZE):
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
- See :py:meth:`ast.literal_eval()` for details.
+ See :py:func:`ast.literal_eval()` for details.
Returns
-------
@@ -623,13 +623,27 @@ def _read_array_header(fp, version, max_header_size=_MAX_HEADER_SIZE):
# "descr" : dtype.descr
# Versions (2, 0) and (1, 0) could have been created by a Python 2
# implementation before header filtering was implemented.
- if version <= (2, 0):
- header = _filter_header(header)
+ #
+ # For performance reasons, we try without _filter_header first though
try:
d = safe_eval(header)
except SyntaxError as e:
- msg = "Cannot parse header: {!r}"
- raise ValueError(msg.format(header)) from e
+ if version <= (2, 0):
+ header = _filter_header(header)
+ try:
+ d = safe_eval(header)
+ except SyntaxError as e2:
+ msg = "Cannot parse header: {!r}"
+ raise ValueError(msg.format(header)) from e2
+ else:
+ warnings.warn(
+ "Reading `.npy` or `.npz` file required additional "
+ "header parsing as it was created on Python 2. Save the "
+ "file again to speed up loading and avoid this warning.",
+ UserWarning, stacklevel=4)
+ else:
+ msg = "Cannot parse header: {!r}"
+ raise ValueError(msg.format(header)) from e
if not isinstance(d, dict):
msg = "Header is not a dictionary: {!r}"
raise ValueError(msg.format(d))
@@ -750,7 +764,7 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None, *,
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
- See :py:meth:`ast.literal_eval()` for details.
+ See :py:func:`ast.literal_eval()` for details.
This option is ignored when `allow_pickle` is passed. In that case
the file is by definition trusted and the limit is unnecessary.
@@ -869,7 +883,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
- See :py:meth:`ast.literal_eval()` for details.
+ See :py:func:`ast.literal_eval()` for details.
Returns
-------
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 5e666c17e..02e141920 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -24,7 +24,7 @@ from numpy.core import overrides
from numpy.core.function_base import add_newdoc
from numpy.lib.twodim_base import diag
from numpy.core.multiarray import (
- _insert, add_docstring, bincount, normalize_axis_index, _monotonicity,
+ _place, add_docstring, bincount, normalize_axis_index, _monotonicity,
interp as compiled_interp, interp_complex as compiled_interp_complex
)
from numpy.core.umath import _add_newdoc_ufunc as add_newdoc_ufunc
@@ -161,6 +161,8 @@ def rot90(m, k=1, axes=(0, 1)):
Rotate an array by 90 degrees in the plane specified by axes.
Rotation direction is from the first towards the second axis.
+ This means for a 2D array with the default `k` and `axes`, the
+ rotation will be counterclockwise.
Parameters
----------
@@ -1309,6 +1311,8 @@ def gradient(f, *varargs, axis=None, edge_order=1):
if len_axes == 1:
return outvals[0]
+ elif np._using_numpy2_behavior():
+ return tuple(outvals)
else:
return outvals
@@ -1947,11 +1951,7 @@ def place(arr, mask, vals):
[44, 55, 44]])
"""
- if not isinstance(arr, np.ndarray):
- raise TypeError("argument 1 must be numpy.ndarray, "
- "not {name}".format(name=type(arr).__name__))
-
- return _insert(arr, mask, vals)
+ return _place(arr, mask, vals)
def disp(mesg, device=None, linefeed=True):
@@ -2117,10 +2117,10 @@ def _create_arrays(broadcast_shape, dim_sizes, list_of_core_dims, dtypes,
@set_module('numpy')
class vectorize:
"""
- vectorize(pyfunc, otypes=None, doc=None, excluded=None, cache=False,
- signature=None)
+ vectorize(pyfunc=np._NoValue, otypes=None, doc=None, excluded=None,
+ cache=False, signature=None)
- Generalized function class.
+ Returns an object that acts like pyfunc, but takes arrays as input.
Define a vectorized function which takes a nested sequence of objects or
numpy arrays as inputs and returns a single numpy array or a tuple of numpy
@@ -2134,8 +2134,9 @@ class vectorize:
Parameters
----------
- pyfunc : callable
+ pyfunc : callable, optional
A python function or method.
+ Can be omitted to produce a decorator with keyword arguments.
otypes : str or list of dtypes, optional
The output data type. It must be specified as either a string of
typecode characters or a list of data type specifiers. There should
@@ -2167,8 +2168,9 @@ class vectorize:
Returns
-------
- vectorized : callable
- Vectorized function.
+ out : callable
+ A vectorized function if ``pyfunc`` was provided,
+ a decorator otherwise.
See Also
--------
@@ -2265,18 +2267,44 @@ class vectorize:
[0., 0., 1., 2., 1., 0.],
[0., 0., 0., 1., 2., 1.]])
+ Decorator syntax is supported. The decorator can be called as
+ a function to provide keyword arguments.
+ >>> @np.vectorize
+ ... def identity(x):
+ ... return x
+ ...
+ >>> identity([0, 1, 2])
+ array([0, 1, 2])
+ >>> @np.vectorize(otypes=[float])
+ ... def as_float(x):
+ ... return x
+ ...
+ >>> as_float([0, 1, 2])
+ array([0., 1., 2.])
"""
- def __init__(self, pyfunc, otypes=None, doc=None, excluded=None,
- cache=False, signature=None):
+ def __init__(self, pyfunc=np._NoValue, otypes=None, doc=None,
+ excluded=None, cache=False, signature=None):
+
+ if (pyfunc != np._NoValue) and (not callable(pyfunc)):
+ # Splitting the error message to keep
+ # the length below 79 characters.
+ part1 = "When used as a decorator, "
+ part2 = "only accepts keyword arguments."
+ raise TypeError(part1 + part2)
+
self.pyfunc = pyfunc
self.cache = cache
self.signature = signature
- self._ufunc = {} # Caching to improve default performance
+ if pyfunc != np._NoValue and hasattr(pyfunc, '__name__'):
+ self.__name__ = pyfunc.__name__
- if doc is None:
+ self._ufunc = {} # Caching to improve default performance
+ self._doc = None
+ self.__doc__ = doc
+ if doc is None and hasattr(pyfunc, '__doc__'):
self.__doc__ = pyfunc.__doc__
else:
- self.__doc__ = doc
+ self._doc = doc
if isinstance(otypes, str):
for char in otypes:
@@ -2298,7 +2326,15 @@ class vectorize:
else:
self._in_and_out_core_dims = None
- def __call__(self, *args, **kwargs):
+ def _init_stage_2(self, pyfunc, *args, **kwargs):
+ self.__name__ = pyfunc.__name__
+ self.pyfunc = pyfunc
+ if self._doc is None:
+ self.__doc__ = pyfunc.__doc__
+ else:
+ self.__doc__ = self._doc
+
+ def _call_as_normal(self, *args, **kwargs):
"""
Return arrays with the results of `pyfunc` broadcast (vectorized) over
`args` and `kwargs` not in `excluded`.
@@ -2328,6 +2364,13 @@ class vectorize:
return self._vectorize_call(func=func, args=vargs)
+ def __call__(self, *args, **kwargs):
+ if self.pyfunc is np._NoValue:
+ self._init_stage_2(*args, **kwargs)
+ return self
+
+ return self._call_as_normal(*args, **kwargs)
+
def _get_ufunc_and_otypes(self, func, args):
"""Return (ufunc, otypes)."""
# frompyfunc will fail if args is empty
@@ -2693,7 +2736,7 @@ def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None,
if fact <= 0:
warnings.warn("Degrees of freedom <= 0 for slice",
- RuntimeWarning, stacklevel=3)
+ RuntimeWarning, stacklevel=2)
fact = 0.0
X -= avg[:, None]
@@ -2842,7 +2885,7 @@ def corrcoef(x, y=None, rowvar=True, bias=np._NoValue, ddof=np._NoValue, *,
if bias is not np._NoValue or ddof is not np._NoValue:
# 2015-03-15, 1.10
warnings.warn('bias and ddof have no effect and are deprecated',
- DeprecationWarning, stacklevel=3)
+ DeprecationWarning, stacklevel=2)
c = cov(x, y, rowvar, dtype=dtype)
try:
d = diag(c)
@@ -2956,10 +2999,15 @@ def blackman(M):
>>> plt.show()
"""
+ # Ensures at least float64 via 0.0. M should be an integer, but conversion
+ # to double is safe for a range.
+ values = np.array([0.0, M])
+ M = values[1]
+
if M < 1:
- return array([], dtype=np.result_type(M, 0.0))
+ return array([], dtype=values.dtype)
if M == 1:
- return ones(1, dtype=np.result_type(M, 0.0))
+ return ones(1, dtype=values.dtype)
n = arange(1-M, M, 2)
return 0.42 + 0.5*cos(pi*n/(M-1)) + 0.08*cos(2.0*pi*n/(M-1))
@@ -3064,10 +3112,15 @@ def bartlett(M):
>>> plt.show()
"""
+ # Ensures at least float64 via 0.0. M should be an integer, but conversion
+ # to double is safe for a range.
+ values = np.array([0.0, M])
+ M = values[1]
+
if M < 1:
- return array([], dtype=np.result_type(M, 0.0))
+ return array([], dtype=values.dtype)
if M == 1:
- return ones(1, dtype=np.result_type(M, 0.0))
+ return ones(1, dtype=values.dtype)
n = arange(1-M, M, 2)
return where(less_equal(n, 0), 1 + n/(M-1), 1 - n/(M-1))
@@ -3168,10 +3221,15 @@ def hanning(M):
>>> plt.show()
"""
+ # Ensures at least float64 via 0.0. M should be an integer, but conversion
+ # to double is safe for a range.
+ values = np.array([0.0, M])
+ M = values[1]
+
if M < 1:
- return array([], dtype=np.result_type(M, 0.0))
+ return array([], dtype=values.dtype)
if M == 1:
- return ones(1, dtype=np.result_type(M, 0.0))
+ return ones(1, dtype=values.dtype)
n = arange(1-M, M, 2)
return 0.5 + 0.5*cos(pi*n/(M-1))
@@ -3268,10 +3326,15 @@ def hamming(M):
>>> plt.show()
"""
+ # Ensures at least float64 via 0.0. M should be an integer, but conversion
+ # to double is safe for a range.
+ values = np.array([0.0, M])
+ M = values[1]
+
if M < 1:
- return array([], dtype=np.result_type(M, 0.0))
+ return array([], dtype=values.dtype)
if M == 1:
- return ones(1, dtype=np.result_type(M, 0.0))
+ return ones(1, dtype=values.dtype)
n = arange(1-M, M, 2)
return 0.54 + 0.46*cos(pi*n/(M-1))
@@ -3547,11 +3610,19 @@ def kaiser(M, beta):
>>> plt.show()
"""
+ # Ensures at least float64 via 0.0. M should be an integer, but conversion
+ # to double is safe for a range. (Simplified result_type with 0.0
+ # strongly typed. result-type is not/less order sensitive, but that mainly
+ # matters for integers anyway.)
+ values = np.array([0.0, M, beta])
+ M = values[1]
+ beta = values[2]
+
if M == 1:
- return np.ones(1, dtype=np.result_type(M, 0.0))
+ return np.ones(1, dtype=values.dtype)
n = arange(0, M)
alpha = (M-1)/2.0
- return i0(beta * sqrt(1-((n-alpha)/alpha)**2.0))/i0(float(beta))
+ return i0(beta * sqrt(1-((n-alpha)/alpha)**2.0))/i0(beta)
def _sinc_dispatcher(x):
@@ -3682,7 +3753,7 @@ def msort(a):
warnings.warn(
"msort is deprecated, use np.sort(a, axis=0) instead",
DeprecationWarning,
- stacklevel=3,
+ stacklevel=2,
)
b = array(a, subok=True, copy=True)
b.sort(0)
@@ -3937,8 +4008,8 @@ def percentile(a,
a : array_like of real numbers
Input array or object that can be converted to an array.
q : array_like of float
- Percentile or sequence of percentiles to compute, which must be between
- 0 and 100 inclusive.
+ Percentage or sequence of percentages for the percentiles to compute.
+ Values must be between 0 and 100 inclusive.
axis : {int, tuple of int, None}, optional
Axis or axes along which the percentiles are computed. The
default is to compute the percentile(s) along a flattened
@@ -4185,7 +4256,8 @@ def percentile(a,
xlabel='Percentile',
ylabel='Estimated percentile value',
yticks=a)
- ax.legend()
+ ax.legend(bbox_to_anchor=(1.03, 1))
+ plt.tight_layout()
plt.show()
References
@@ -4236,8 +4308,8 @@ def quantile(a,
a : array_like of real numbers
Input array or object that can be converted to an array.
q : array_like of float
- Quantile or sequence of quantiles to compute, which must be between
- 0 and 1 inclusive.
+ Probability or sequence of probabilities for the quantiles to compute.
+ Values must be between 0 and 1 inclusive.
axis : {int, tuple of int, None}, optional
Axis or axes along which the quantiles are computed. The default is
to compute the quantile(s) along a flattened version of the array.
@@ -4291,8 +4363,8 @@ def quantile(a,
Returns
-------
quantile : scalar or ndarray
- If `q` is a single quantile and `axis=None`, then the result
- is a scalar. If multiple quantiles are given, first axis of
+ If `q` is a single probability and `axis=None`, then the result
+ is a scalar. If multiple probability levels are given, first axis of
the result corresponds to the quantiles. The other axes are
the axes that remain after the reduction of `a`. If the input
contains integers or floats smaller than ``float64``, the output
@@ -4909,6 +4981,24 @@ def trapz(y, x=None, dx=1.0, axis=-1):
return ret
+# __array_function__ has no __code__ or other attributes normal Python funcs
+# have, since we wrap everything into a C callable. SciPy tries to "clone" `trapz`
+# into a new Python function which requires `__code__` and a few other
+# attributes. So we create a dummy clone and copy over its attributes allowing
+# SciPy <= 1.10 to work: https://github.com/scipy/scipy/issues/17811
+assert not hasattr(trapz, "__code__")
+
+def _fake_trapz(y, x=None, dx=1.0, axis=-1):
+ return trapz(y, x=x, dx=dx, axis=axis)
+
+
+trapz.__code__ = _fake_trapz.__code__
+trapz.__globals__ = _fake_trapz.__globals__
+trapz.__defaults__ = _fake_trapz.__defaults__
+trapz.__closure__ = _fake_trapz.__closure__
+trapz.__kwdefaults__ = _fake_trapz.__kwdefaults__
+
+
def _meshgrid_dispatcher(*xi, copy=None, sparse=None, indexing=None):
return xi
@@ -4917,7 +5007,7 @@ def _meshgrid_dispatcher(*xi, copy=None, sparse=None, indexing=None):
@array_function_dispatch(_meshgrid_dispatcher)
def meshgrid(*xi, copy=True, sparse=False, indexing='xy'):
"""
- Return coordinate matrices from coordinate vectors.
+ Return a list of coordinate matrices from coordinate vectors.
Make N-D coordinate arrays for vectorized evaluations of
N-D scalar/vector fields over N-D grids, given
@@ -4958,7 +5048,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing='xy'):
Returns
-------
- X1, X2,..., XN : ndarray
+ X1, X2,..., XN : list of ndarrays
For vectors `x1`, `x2`,..., `xn` with lengths ``Ni=len(xi)``,
returns ``(N1, N2, N3,..., Nn)`` shaped arrays if indexing='ij'
or ``(N2, N1, N3,..., Nn)`` shaped arrays if indexing='xy'
@@ -5395,7 +5485,7 @@ def insert(arr, obj, values, axis=None):
warnings.warn(
"in the future insert will treat boolean arrays and "
"array-likes as a boolean index instead of casting it to "
- "integer", FutureWarning, stacklevel=3)
+ "integer", FutureWarning, stacklevel=2)
indices = indices.astype(intp)
# Code after warning period:
#if obj.ndim != 1:
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index 0dfa7b4c1..35745e6dd 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -981,7 +981,7 @@ def histogramdd(sample, bins=10, range=None, density=None, weights=None):
if M != D:
raise ValueError(
'The dimension of bins must be equal to the dimension of the '
- ' sample x.')
+ 'sample x.')
except TypeError:
# bins is an integer
bins = D*[bins]
diff --git a/numpy/lib/index_tricks.py b/numpy/lib/index_tricks.py
index 58dd394e1..6913d2b95 100644
--- a/numpy/lib/index_tricks.py
+++ b/numpy/lib/index_tricks.py
@@ -3,12 +3,11 @@ import sys
import math
import warnings
+import numpy as np
from .._utils import set_module
import numpy.core.numeric as _nx
-from numpy.core.numeric import (
- asarray, ScalarType, array, alltrue, cumprod, arange, ndim
-)
-from numpy.core.numerictypes import find_common_type, issubdtype
+from numpy.core.numeric import ScalarType, array
+from numpy.core.numerictypes import issubdtype
import numpy.matrixlib as matrixlib
from .function_base import diff
@@ -94,7 +93,7 @@ def ix_(*args):
nd = len(args)
for k, new in enumerate(args):
if not isinstance(new, _nx.ndarray):
- new = asarray(new)
+ new = np.asarray(new)
if new.size == 0:
# Explicitly type empty arrays to avoid float default
new = new.astype(_nx.intp)
@@ -211,13 +210,13 @@ class nd_grid:
class MGridClass(nd_grid):
"""
- `nd_grid` instance which returns a dense multi-dimensional "meshgrid".
+ An instance which returns a dense multi-dimensional "meshgrid".
- An instance of `numpy.lib.index_tricks.nd_grid` which returns an dense
- (or fleshed out) mesh-grid when indexed, so that each returned argument
- has the same shape. The dimensions and number of the output arrays are
- equal to the number of indexing dimensions. If the step length is not a
- complex number, then the stop is not inclusive.
+ An instance which returns a dense (or fleshed out) mesh-grid
+ when indexed, so that each returned argument has the same shape.
+ The dimensions and number of the output arrays are equal to the
+ number of indexing dimensions. If the step length is not a complex
+ number, then the stop is not inclusive.
However, if the step length is a **complex number** (e.g. 5j), then
the integer part of its magnitude is interpreted as specifying the
@@ -230,8 +229,7 @@ class MGridClass(nd_grid):
See Also
--------
- lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects
- ogrid : like mgrid but returns open (not fleshed out) mesh grids
+ ogrid : like `mgrid` but returns open (not fleshed out) mesh grids
meshgrid: return coordinate matrices from coordinate vectors
r_ : array concatenator
:ref:`how-to-partition`
@@ -263,13 +261,13 @@ mgrid = MGridClass()
class OGridClass(nd_grid):
"""
- `nd_grid` instance which returns an open multi-dimensional "meshgrid".
+ An instance which returns an open multi-dimensional "meshgrid".
- An instance of `numpy.lib.index_tricks.nd_grid` which returns an open
- (i.e. not fleshed out) mesh-grid when indexed, so that only one dimension
- of each returned array is greater than 1. The dimension and number of the
- output arrays are equal to the number of indexing dimensions. If the step
- length is not a complex number, then the stop is not inclusive.
+ An instance which returns an open (i.e. not fleshed out) mesh-grid
+ when indexed, so that only one dimension of each returned array is
+ greater than 1. The dimension and number of the output arrays are
+ equal to the number of indexing dimensions. If the step length is
+ not a complex number, then the stop is not inclusive.
However, if the step length is a **complex number** (e.g. 5j), then
the integer part of its magnitude is interpreted as specifying the
@@ -283,7 +281,6 @@ class OGridClass(nd_grid):
See Also
--------
- np.lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects
mgrid : like `ogrid` but returns dense (or fleshed out) mesh grids
meshgrid: return coordinate matrices from coordinate vectors
r_ : array concatenator
@@ -343,9 +340,8 @@ class AxisConcatenator:
axis = self.axis
objs = []
- scalars = []
- arraytypes = []
- scalartypes = []
+ # dtypes or scalars for weak scalar handling in result_type
+ result_type_objs = []
for k, item in enumerate(key):
scalar = False
@@ -391,12 +387,10 @@ class AxisConcatenator:
except (ValueError, TypeError) as e:
raise ValueError("unknown special directive") from e
elif type(item) in ScalarType:
- newobj = array(item, ndmin=ndmin)
- scalars.append(len(objs))
scalar = True
- scalartypes.append(newobj.dtype)
+ newobj = item
else:
- item_ndim = ndim(item)
+ item_ndim = np.ndim(item)
newobj = array(item, copy=False, subok=True, ndmin=ndmin)
if trans1d != -1 and item_ndim < ndmin:
k2 = ndmin - item_ndim
@@ -406,15 +400,20 @@ class AxisConcatenator:
defaxes = list(range(ndmin))
axes = defaxes[:k1] + defaxes[k2:] + defaxes[k1:k2]
newobj = newobj.transpose(axes)
+
objs.append(newobj)
- if not scalar and isinstance(newobj, _nx.ndarray):
- arraytypes.append(newobj.dtype)
+ if scalar:
+ result_type_objs.append(item)
+ else:
+ result_type_objs.append(newobj.dtype)
- # Ensure that scalars won't up-cast unless warranted
- final_dtype = find_common_type(arraytypes, scalartypes)
- if final_dtype is not None:
- for k in scalars:
- objs[k] = objs[k].astype(final_dtype)
+ # Ensure that scalars won't up-cast unless warranted, for 0, drops
+ # through to error in concatenate.
+ if len(result_type_objs) != 0:
+ final_dtype = _nx.result_type(*result_type_objs)
+        # concatenate could do cast, but that can be overridden:
+ objs = [array(obj, copy=False, subok=True,
+ ndmin=ndmin, dtype=final_dtype) for obj in objs]
res = self.concatenate(tuple(objs), axis=axis)
@@ -596,7 +595,7 @@ class ndenumerate:
"""
def __init__(self, arr):
- self.iter = asarray(arr).flat
+ self.iter = np.asarray(arr).flat
def __next__(self):
"""
@@ -909,9 +908,9 @@ def fill_diagonal(a, val, wrap=False):
else:
# For more than d=2, the strided formula is only valid for arrays with
# all dimensions equal, so we check first.
- if not alltrue(diff(a.shape) == 0):
+ if not np.all(diff(a.shape) == 0):
raise ValueError("All dimensions of input must be of equal length")
- step = 1 + (cumprod(a.shape[:-1])).sum()
+ step = 1 + (np.cumprod(a.shape[:-1])).sum()
# Write the value out into the diagonal.
a.flat[:end:step] = val
@@ -982,7 +981,7 @@ def diag_indices(n, ndim=2):
[0, 1]]])
"""
- idx = arange(n)
+ idx = np.arange(n)
return (idx,) * ndim
@@ -1009,13 +1008,39 @@ def diag_indices_from(arr):
-----
.. versionadded:: 1.4.0
+ Examples
+ --------
+
+ Create a 4 by 4 array.
+
+ >>> a = np.arange(16).reshape(4, 4)
+ >>> a
+ array([[ 0, 1, 2, 3],
+ [ 4, 5, 6, 7],
+ [ 8, 9, 10, 11],
+ [12, 13, 14, 15]])
+
+ Get the indices of the diagonal elements.
+
+ >>> di = np.diag_indices_from(a)
+ >>> di
+ (array([0, 1, 2, 3]), array([0, 1, 2, 3]))
+
+ >>> a[di]
+ array([ 0, 5, 10, 15])
+
+ This is simply syntactic sugar for diag_indices.
+
+ >>> np.diag_indices(a.shape[0])
+ (array([0, 1, 2, 3]), array([0, 1, 2, 3]))
+
"""
if not arr.ndim >= 2:
raise ValueError("input array must be at least 2-d")
# For more than d=2, the strided formula is only valid for arrays with
# all dimensions equal, so we check first.
- if not alltrue(diff(arr.shape) == 0):
+ if not np.all(diff(arr.shape) == 0):
raise ValueError("All dimensions of input must be of equal length")
return diag_indices(arr.shape[0], arr.ndim)
diff --git a/numpy/lib/index_tricks.pyi b/numpy/lib/index_tricks.pyi
index c9251abd1..29a6b9e2b 100644
--- a/numpy/lib/index_tricks.pyi
+++ b/numpy/lib/index_tricks.pyi
@@ -119,7 +119,7 @@ class AxisConcatenator:
@staticmethod
def makemat(
data: ArrayLike, dtype: DTypeLike = ..., copy: bool = ...
- ) -> _Matrix: ...
+ ) -> _Matrix[Any, Any]: ...
# TODO: Sort out this `__getitem__` method
def __getitem__(self, key: Any) -> Any: ...
diff --git a/numpy/lib/mixins.py b/numpy/lib/mixins.py
index c81239f6b..117cc7851 100644
--- a/numpy/lib/mixins.py
+++ b/numpy/lib/mixins.py
@@ -133,6 +133,7 @@ class NDArrayOperatorsMixin:
.. versionadded:: 1.13
"""
+ __slots__ = ()
# Like np.ndarray, this mixin class implements "Option 1" from the ufunc
# overrides NEP.
diff --git a/numpy/lib/nanfunctions.py b/numpy/lib/nanfunctions.py
index 786d2021e..b3b570860 100644
--- a/numpy/lib/nanfunctions.py
+++ b/numpy/lib/nanfunctions.py
@@ -169,7 +169,7 @@ def _remove_nan_1d(arr1d, overwrite_input=False):
s = np.nonzero(c)[0]
if s.size == arr1d.size:
warnings.warn("All-NaN slice encountered", RuntimeWarning,
- stacklevel=5)
+ stacklevel=6)
return arr1d[:0], True
elif s.size == 0:
return arr1d, overwrite_input
@@ -343,7 +343,7 @@ def nanmin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
res = np.fmin.reduce(a, axis=axis, out=out, **kwargs)
if np.isnan(res).any():
warnings.warn("All-NaN slice encountered", RuntimeWarning,
- stacklevel=3)
+ stacklevel=2)
else:
# Slow, but safe for subclasses of ndarray
a, mask = _replace_nan(a, +np.inf)
@@ -357,7 +357,7 @@ def nanmin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
if np.any(mask):
res = _copyto(res, np.nan, mask)
warnings.warn("All-NaN axis encountered", RuntimeWarning,
- stacklevel=3)
+ stacklevel=2)
return res
@@ -476,7 +476,7 @@ def nanmax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
res = np.fmax.reduce(a, axis=axis, out=out, **kwargs)
if np.isnan(res).any():
warnings.warn("All-NaN slice encountered", RuntimeWarning,
- stacklevel=3)
+ stacklevel=2)
else:
# Slow, but safe for subclasses of ndarray
a, mask = _replace_nan(a, -np.inf)
@@ -490,7 +490,7 @@ def nanmax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
if np.any(mask):
res = _copyto(res, np.nan, mask)
warnings.warn("All-NaN axis encountered", RuntimeWarning,
- stacklevel=3)
+ stacklevel=2)
return res
@@ -1049,7 +1049,7 @@ def nanmean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue,
isbad = (cnt == 0)
if isbad.any():
- warnings.warn("Mean of empty slice", RuntimeWarning, stacklevel=3)
+ warnings.warn("Mean of empty slice", RuntimeWarning, stacklevel=2)
# NaN is the only possible bad value, so no further
# action is needed to handle bad results.
return avg
@@ -1109,7 +1109,7 @@ def _nanmedian_small(a, axis=None, out=None, overwrite_input=False):
m = np.ma.median(a, axis=axis, overwrite_input=overwrite_input)
for i in range(np.count_nonzero(m.mask.ravel())):
warnings.warn("All-NaN slice encountered", RuntimeWarning,
- stacklevel=4)
+ stacklevel=5)
fill_value = np.timedelta64("NaT") if m.dtype.kind == "m" else np.nan
if out is not None:
@@ -1415,8 +1415,8 @@ def nanquantile(
Input array or object that can be converted to an array, containing
nan values to be ignored
q : array_like of float
- Quantile or sequence of quantiles to compute, which must be between
- 0 and 1 inclusive.
+ Probability or sequence of probabilities for the quantiles to compute.
+ Values must be between 0 and 1 inclusive.
axis : {int, tuple of int, None}, optional
Axis or axes along which the quantiles are computed. The
default is to compute the quantile(s) along a flattened
@@ -1476,8 +1476,8 @@ def nanquantile(
Returns
-------
quantile : scalar or ndarray
- If `q` is a single percentile and `axis=None`, then the result
- is a scalar. If multiple quantiles are given, first axis of
+ If `q` is a single probability and `axis=None`, then the result
+ is a scalar. If multiple probability levels are given, first axis of
the result corresponds to the quantiles. The other axes are
the axes that remain after the reduction of `a`. If the input
contains integers or floats smaller than ``float64``, the output
@@ -1763,7 +1763,7 @@ def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue,
isbad = (dof <= 0)
if np.any(isbad):
warnings.warn("Degrees of freedom <= 0 for slice.", RuntimeWarning,
- stacklevel=3)
+ stacklevel=2)
# NaN, inf, or negative numbers are all possible bad
# values, so explicitly replace them with NaN.
var = _copyto(var, np.nan, isbad)
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 4a27c7898..339b1dc62 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -142,7 +142,7 @@ class NpzFile(Mapping):
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
- See :py:meth:`ast.literal_eval()` for details.
+ See :py:func:`ast.literal_eval()` for details.
This option is ignored when `allow_pickle` is passed. In that case
the file is by definition trusted and the limit is unnecessary.
@@ -167,6 +167,8 @@ class NpzFile(Mapping):
>>> npz = np.load(outfile)
>>> isinstance(npz, np.lib.npyio.NpzFile)
True
+ >>> npz
+ NpzFile 'object' with keys x, y
>>> sorted(npz.files)
['x', 'y']
>>> npz['x'] # getitem access
@@ -178,6 +180,7 @@ class NpzFile(Mapping):
# Make __exit__ safe if zipfile_factory raises an exception
zip = None
fid = None
+ _MAX_REPR_ARRAY_COUNT = 5
def __init__(self, fid, own_fid=False, allow_pickle=False,
pickle_kwargs=None, *,
@@ -257,7 +260,23 @@ class NpzFile(Mapping):
else:
return self.zip.read(key)
else:
- raise KeyError("%s is not a file in the archive" % key)
+ raise KeyError(f"{key} is not a file in the archive")
+
+ def __contains__(self, key):
+ return (key in self._files or key in self.files)
+
+ def __repr__(self):
+ # Get filename or default to `object`
+ if isinstance(self.fid, str):
+ filename = self.fid
+ else:
+ filename = getattr(self.fid, "name", "object")
+
+ # Get the name of arrays
+ array_names = ', '.join(self.files[:self._MAX_REPR_ARRAY_COUNT])
+ if len(self.files) > self._MAX_REPR_ARRAY_COUNT:
+ array_names += "..."
+ return f"NpzFile {filename!r} with keys: {array_names}"
@set_module('numpy')
@@ -309,7 +328,7 @@ def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
max_header_size : int, optional
Maximum allowed size of the header. Large headers may not be safe
to load securely and thus require explicitly passing a larger value.
- See :py:meth:`ast.literal_eval()` for details.
+ See :py:func:`ast.literal_eval()` for details.
This option is ignored when `allow_pickle` is passed. In that case
the file is by definition trusted and the limit is unnecessary.
@@ -327,6 +346,9 @@ def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
If ``allow_pickle=True``, but the file cannot be loaded as a pickle.
ValueError
The file contains an object array, but ``allow_pickle=False`` given.
+ EOFError
+ When calling ``np.load`` multiple times on the same file handle,
+ if all data has already been read
See Also
--------
@@ -410,6 +432,8 @@ def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
_ZIP_SUFFIX = b'PK\x05\x06' # empty zip files start with this
N = len(format.MAGIC_PREFIX)
magic = fid.read(N)
+ if not magic:
+ raise EOFError("No data left in file")
# If the file size is less than N, we need to make sure not
# to seek past the beginning of the file
fid.seek(-min(N, len(magic)), 1) # back-up
@@ -760,13 +784,6 @@ def _ensure_ndmin_ndarray(a, *, ndmin: int):
_loadtxt_chunksize = 50000
-def _loadtxt_dispatcher(
- fname, dtype=None, comments=None, delimiter=None,
- converters=None, skiprows=None, usecols=None, unpack=None,
- ndmin=None, encoding=None, max_rows=None, *, like=None):
- return (like,)
-
-
def _check_nonneg_int(value, name="argument"):
try:
operator.index(value)
@@ -1161,10 +1178,10 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
while such lines are counted in `skiprows`.
.. versionadded:: 1.16.0
-
+
.. versionchanged:: 1.23.0
- Lines containing no data, including comment lines (e.g., lines
- starting with '#' or as specified via `comments`) are not counted
+ Lines containing no data, including comment lines (e.g., lines
+ starting with '#' or as specified via `comments`) are not counted
towards `max_rows`.
quotechar : unicode character or None, optional
The character used to denote the start and end of a quoted item.
@@ -1303,6 +1320,14 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
array([('alpha, #42', 10.), ('beta, #64', 2.)],
dtype=[('label', '<U12'), ('value', '<f8')])
+ Quoted fields can be separated by multiple whitespace characters:
+
+ >>> s = StringIO('"alpha, #42" 10.0\n"beta, #64" 2.0\n')
+ >>> dtype = np.dtype([("label", "U12"), ("value", float)])
+ >>> np.loadtxt(s, dtype=dtype, delimiter=None, quotechar='"')
+ array([('alpha, #42', 10.), ('beta, #64', 2.)],
+ dtype=[('label', '<U12'), ('value', '<f8')])
+
Two consecutive quote characters within a quoted field are treated as a
single escaped character:
@@ -1323,10 +1348,10 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
if like is not None:
return _loadtxt_with_like(
- fname, dtype=dtype, comments=comments, delimiter=delimiter,
+ like, fname, dtype=dtype, comments=comments, delimiter=delimiter,
converters=converters, skiprows=skiprows, usecols=usecols,
unpack=unpack, ndmin=ndmin, encoding=encoding,
- max_rows=max_rows, like=like
+ max_rows=max_rows
)
if isinstance(delimiter, bytes):
@@ -1353,9 +1378,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
return arr
-_loadtxt_with_like = array_function_dispatch(
- _loadtxt_dispatcher
-)(loadtxt)
+_loadtxt_with_like = array_function_dispatch()(loadtxt)
def _savetxt_dispatcher(fname, X, fmt=None, delimiter=None, newline=None,
@@ -1716,17 +1739,6 @@ def fromregex(file, regexp, dtype, encoding=None):
#####--------------------------------------------------------------------------
-def _genfromtxt_dispatcher(fname, dtype=None, comments=None, delimiter=None,
- skip_header=None, skip_footer=None, converters=None,
- missing_values=None, filling_values=None, usecols=None,
- names=None, excludelist=None, deletechars=None,
- replace_space=None, autostrip=None, case_sensitive=None,
- defaultfmt=None, unpack=None, usemask=None, loose=None,
- invalid_raise=None, max_rows=None, encoding=None,
- *, ndmin=None, like=None):
- return (like,)
-
-
@set_array_function_like_doc
@set_module('numpy')
def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
@@ -1924,7 +1936,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
if like is not None:
return _genfromtxt_with_like(
- fname, dtype=dtype, comments=comments, delimiter=delimiter,
+ like, fname, dtype=dtype, comments=comments, delimiter=delimiter,
skip_header=skip_header, skip_footer=skip_footer,
converters=converters, missing_values=missing_values,
filling_values=filling_values, usecols=usecols, names=names,
@@ -1934,7 +1946,6 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
unpack=unpack, usemask=usemask, loose=loose,
invalid_raise=invalid_raise, max_rows=max_rows, encoding=encoding,
ndmin=ndmin,
- like=like
)
_ensure_ndmin_ndarray_check_param(ndmin)
@@ -2327,7 +2338,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
column_types = [conv.type for conv in converters]
# Find the columns with strings...
strcolidx = [i for (i, v) in enumerate(column_types)
- if v == np.unicode_]
+ if v == np.str_]
if byte_converters and strcolidx:
# convert strings back to bytes for backward compatibility
@@ -2463,9 +2474,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
return output
-_genfromtxt_with_like = array_function_dispatch(
- _genfromtxt_dispatcher
-)(genfromtxt)
+_genfromtxt_with_like = array_function_dispatch()(genfromtxt)
def recfromtxt(fname, **kwargs):
diff --git a/numpy/lib/npyio.pyi b/numpy/lib/npyio.pyi
index 8007b2dc7..ef0f2a5f1 100644
--- a/numpy/lib/npyio.pyi
+++ b/numpy/lib/npyio.pyi
@@ -72,6 +72,7 @@ class NpzFile(Mapping[str, NDArray[Any]]):
files: list[str]
allow_pickle: bool
pickle_kwargs: None | Mapping[str, Any]
+ _MAX_REPR_ARRAY_COUNT: int
# Represent `f` as a mutable property so we can access the type of `self`
@property
def f(self: _T) -> BagObj[_T]: ...
@@ -97,6 +98,8 @@ class NpzFile(Mapping[str, NDArray[Any]]):
def __iter__(self) -> Iterator[str]: ...
def __len__(self) -> int: ...
def __getitem__(self, key: str) -> NDArray[Any]: ...
+ def __contains__(self, key: str) -> bool: ...
+ def __repr__(self) -> str: ...
# NOTE: Returns a `NpzFile` if file is a zip file;
# returns an `ndarray`/`memmap` otherwise
@@ -236,7 +239,7 @@ def genfromtxt(
*,
ndmin: L[0, 1, 2] = ...,
like: None | _SupportsArrayFunc = ...,
-) -> NDArray[float64]: ...
+) -> NDArray[Any]: ...
@overload
def genfromtxt(
fname: str | os.PathLike[str] | Iterable[str] | Iterable[bytes],
diff --git a/numpy/lib/polynomial.py b/numpy/lib/polynomial.py
index 0f7ab0334..3b8db2a95 100644
--- a/numpy/lib/polynomial.py
+++ b/numpy/lib/polynomial.py
@@ -104,7 +104,7 @@ def poly(seq_of_zeros):
References
----------
- .. [1] M. Sullivan and M. Sullivan, III, "Algebra and Trignometry,
+ .. [1] M. Sullivan and M. Sullivan, III, "Algebra and Trigonometry,
Enhanced With Graphing Utilities," Prentice-Hall, pg. 318, 1996.
.. [2] G. Strang, "Linear Algebra and Its Applications, 2nd Edition,"
@@ -672,7 +672,7 @@ def polyfit(x, y, deg, rcond=None, full=False, w=None, cov=False):
# warn on rank reduction, which indicates an ill conditioned matrix
if rank != order and not full:
msg = "Polyfit may be poorly conditioned"
- warnings.warn(msg, RankWarning, stacklevel=4)
+ warnings.warn(msg, RankWarning, stacklevel=2)
if full:
return c, resids, rank, s, rcond
diff --git a/numpy/lib/shape_base.py b/numpy/lib/shape_base.py
index 3cb4dc19c..5d8a41bfe 100644
--- a/numpy/lib/shape_base.py
+++ b/numpy/lib/shape_base.py
@@ -643,10 +643,6 @@ def column_stack(tup):
[3, 4]])
"""
- if not overrides.ARRAY_FUNCTION_ENABLED:
- # raise warning if necessary
- _arrays_for_stack_dispatcher(tup, stacklevel=2)
-
arrays = []
for v in tup:
arr = asanyarray(v)
@@ -713,10 +709,6 @@ def dstack(tup):
[[3, 4]]])
"""
- if not overrides.ARRAY_FUNCTION_ENABLED:
- # raise warning if necessary
- _arrays_for_stack_dispatcher(tup, stacklevel=2)
-
arrs = atleast_3d(*tup)
if not isinstance(arrs, list):
arrs = [arrs]
diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py
index bb07e25a9..a180accbe 100644
--- a/numpy/lib/tests/test_arraysetops.py
+++ b/numpy/lib/tests/test_arraysetops.py
@@ -414,13 +414,48 @@ class TestSetOps:
with pytest.raises(ValueError):
in1d(a, b, kind="table")
+ @pytest.mark.parametrize(
+ "dtype1,dtype2",
+ [
+ (np.int8, np.int16),
+ (np.int16, np.int8),
+ (np.uint8, np.uint16),
+ (np.uint16, np.uint8),
+ (np.uint8, np.int16),
+ (np.int16, np.uint8),
+ ]
+ )
+ @pytest.mark.parametrize("kind", [None, "sort", "table"])
+ def test_in1d_mixed_dtype(self, dtype1, dtype2, kind):
+ """Test that in1d works as expected for mixed dtype input."""
+ is_dtype2_signed = np.issubdtype(dtype2, np.signedinteger)
+ ar1 = np.array([0, 0, 1, 1], dtype=dtype1)
+
+ if is_dtype2_signed:
+ ar2 = np.array([-128, 0, 127], dtype=dtype2)
+ else:
+ ar2 = np.array([127, 0, 255], dtype=dtype2)
+
+ expected = np.array([True, True, False, False])
+
+ expect_failure = kind == "table" and any((
+ dtype1 == np.int8 and dtype2 == np.int16,
+ dtype1 == np.int16 and dtype2 == np.int8
+ ))
+
+ if expect_failure:
+ with pytest.raises(RuntimeError, match="exceed the maximum"):
+ in1d(ar1, ar2, kind=kind)
+ else:
+ assert_array_equal(in1d(ar1, ar2, kind=kind), expected)
+
@pytest.mark.parametrize("kind", [None, "sort", "table"])
def test_in1d_mixed_boolean(self, kind):
"""Test that in1d works as expected for bool/int input."""
for dtype in np.typecodes["AllInteger"]:
a = np.array([True, False, False], dtype=bool)
- b = np.array([1, 1, 1, 1], dtype=dtype)
- expected = np.array([True, False, False], dtype=bool)
+ b = np.array([0, 0, 0, 0], dtype=dtype)
+ expected = np.array([False, True, True], dtype=bool)
assert_array_equal(in1d(a, b, kind=kind), expected)
a, b = b, a
diff --git a/numpy/lib/tests/test_format.py b/numpy/lib/tests/test_format.py
index 6f6406cf8..58d08f1e5 100644
--- a/numpy/lib/tests/test_format.py
+++ b/numpy/lib/tests/test_format.py
@@ -531,7 +531,8 @@ def test_load_padded_dtype(tmpdir, dt):
def test_python2_python3_interoperability():
fname = 'win64python2.npy'
path = os.path.join(os.path.dirname(__file__), 'data', fname)
- data = np.load(path)
+ with pytest.warns(UserWarning, match="Reading.*this warning\\."):
+ data = np.load(path)
assert_array_equal(data, np.ones(2))
def test_pickle_python2_python3():
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index e38a187d8..b0944ec85 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -8,7 +8,7 @@ import pytest
import hypothesis
from hypothesis.extra.numpy import arrays
import hypothesis.strategies as st
-
+from functools import partial
import numpy as np
from numpy import ma
@@ -229,8 +229,8 @@ class TestAny:
def test_nd(self):
y1 = [[0, 0, 0], [0, 1, 0], [1, 1, 0]]
assert_(np.any(y1))
- assert_array_equal(np.sometrue(y1, axis=0), [1, 1, 0])
- assert_array_equal(np.sometrue(y1, axis=1), [0, 1, 1])
+ assert_array_equal(np.any(y1, axis=0), [1, 1, 0])
+ assert_array_equal(np.any(y1, axis=1), [0, 1, 1])
class TestAll:
@@ -247,8 +247,8 @@ class TestAll:
def test_nd(self):
y1 = [[0, 0, 1], [0, 1, 1], [1, 1, 1]]
assert_(not np.all(y1))
- assert_array_equal(np.alltrue(y1, axis=0), [0, 0, 1])
- assert_array_equal(np.alltrue(y1, axis=1), [0, 0, 1])
+ assert_array_equal(np.all(y1, axis=0), [0, 0, 1])
+ assert_array_equal(np.all(y1, axis=1), [0, 0, 1])
class TestCopy:
@@ -1217,6 +1217,13 @@ class TestGradient:
dfdx = gradient(f, x)
assert_array_equal(dfdx, [0.5, 0.5])
+ def test_return_type(self):
+ res = np.gradient(([1, 2], [2, 3]))
+ if np._using_numpy2_behavior():
+ assert type(res) is tuple
+ else:
+ assert type(res) is list
+
class TestAngle:
@@ -1780,6 +1787,70 @@ class TestVectorize:
assert_equal(type(r), subclass)
assert_equal(r, m * v)
+ def test_name(self):
+ #See gh-23021
+ @np.vectorize
+ def f2(a, b):
+ return a + b
+
+ assert f2.__name__ == 'f2'
+
+ def test_decorator(self):
+ @vectorize
+ def addsubtract(a, b):
+ if a > b:
+ return a - b
+ else:
+ return a + b
+
+ r = addsubtract([0, 3, 6, 9], [1, 3, 5, 7])
+ assert_array_equal(r, [1, 6, 1, 2])
+
+ def test_docstring(self):
+ @vectorize
+ def f(x):
+ """Docstring"""
+ return x
+
+ if sys.flags.optimize < 2:
+ assert f.__doc__ == "Docstring"
+
+ def test_partial(self):
+ def foo(x, y):
+ return x + y
+
+ bar = partial(foo, 3)
+ vbar = np.vectorize(bar)
+ assert vbar(1) == 4
+
+ def test_signature_otypes_decorator(self):
+ @vectorize(signature='(n)->(n)', otypes=['float64'])
+ def f(x):
+ return x
+
+ r = f([1, 2, 3])
+ assert_equal(r.dtype, np.dtype('float64'))
+ assert_array_equal(r, [1, 2, 3])
+ assert f.__name__ == 'f'
+
+ def test_bad_input(self):
+ with assert_raises(TypeError):
+ A = np.vectorize(pyfunc = 3)
+
+ def test_no_keywords(self):
+ with assert_raises(TypeError):
+ @np.vectorize("string")
+ def foo():
+ return "bar"
+
+ def test_positional_regression_9477(self):
+ # This supplies the first keyword argument as a positional,
+ # to ensure that they are still properly forwarded after the
+ # enhancement for #9477
+ f = vectorize((lambda x: x), ['float64'])
+ r = f([2])
+ assert_equal(r.dtype, np.dtype('float64'))
+
class TestLeaks:
class A:
@@ -3467,9 +3538,20 @@ class TestPercentile:
np.percentile([1, 2, 3, 4.0], q)
+quantile_methods = [
+ 'inverted_cdf', 'averaged_inverted_cdf', 'closest_observation',
+ 'interpolated_inverted_cdf', 'hazen', 'weibull', 'linear',
+ 'median_unbiased', 'normal_unbiased', 'nearest', 'lower', 'higher',
+ 'midpoint']
+
+
class TestQuantile:
# most of this is already tested by TestPercentile
+ def V(self, x, y, alpha):
+ # Identification function used in several tests.
+ return (x >= y) - alpha
+
def test_max_ulp(self):
x = [0.0, 0.2, 0.4]
a = np.quantile(x, 0.45)
@@ -3484,7 +3566,6 @@ class TestQuantile:
assert_equal(np.quantile(x, 1), 3.5)
assert_equal(np.quantile(x, 0.5), 1.75)
- @pytest.mark.xfail(reason="See gh-19154")
def test_correct_quantile_value(self):
a = np.array([True])
tf_quant = np.quantile(True, False)
@@ -3549,11 +3630,7 @@ class TestQuantile:
method="nearest")
assert res.dtype == dtype
- @pytest.mark.parametrize("method",
- ['inverted_cdf', 'averaged_inverted_cdf', 'closest_observation',
- 'interpolated_inverted_cdf', 'hazen', 'weibull', 'linear',
- 'median_unbiased', 'normal_unbiased',
- 'nearest', 'lower', 'higher', 'midpoint'])
+ @pytest.mark.parametrize("method", quantile_methods)
def test_quantile_monotonic(self, method):
# GH 14685
# test that the return value of quantile is monotonic if p0 is ordered
@@ -3584,6 +3661,94 @@ class TestQuantile:
assert np.isscalar(actual)
assert_equal(np.quantile(a, 0.5), np.nan)
+ @pytest.mark.parametrize("method", quantile_methods)
+ @pytest.mark.parametrize("alpha", [0.2, 0.5, 0.9])
+ def test_quantile_identification_equation(self, method, alpha):
+ # Test that the identification equation holds for the empirical
+ # CDF:
+ # E[V(x, Y)] = 0 <=> x is quantile
+ # with Y the random variable for which we have observed values and
+ # V(x, y) the canonical identification function for the quantile (at
+ # level alpha), see
+ # https://doi.org/10.48550/arXiv.0912.0902
+ rng = np.random.default_rng(4321)
+ # We choose n and alpha such that we cover 3 cases:
+ # - n * alpha is an integer
+ # - n * alpha is a float that gets rounded down
+ # - n * alpha is a float that gets rounded up
+ n = 102 # n * alpha = 20.4, 51. , 91.8
+ y = rng.random(n)
+ x = np.quantile(y, alpha, method=method)
+ if method in ("higher",):
+ # These methods do not fulfill the identification equation.
+ assert np.abs(np.mean(self.V(x, y, alpha))) > 0.1 / n
+ elif int(n * alpha) == n * alpha:
+ # We can expect exact results, up to machine precision.
+ assert_allclose(np.mean(self.V(x, y, alpha)), 0, atol=1e-14)
+ else:
+ # V = (x >= y) - alpha cannot sum to zero exactly but within
+ # "sample precision".
+ assert_allclose(np.mean(self.V(x, y, alpha)), 0,
+ atol=1 / n / np.amin([alpha, 1 - alpha]))
+
+ @pytest.mark.parametrize("method", quantile_methods)
+ @pytest.mark.parametrize("alpha", [0.2, 0.5, 0.9])
+ def test_quantile_add_and_multiply_constant(self, method, alpha):
+ # Test that
+ # 1. quantile(c + x) = c + quantile(x)
+ # 2. quantile(c * x) = c * quantile(x)
+ # 3. quantile(-x) = -quantile(x, 1 - alpha)
+ # On empirical quantiles, this equation does not hold exactly.
+ # Koenker (2005) "Quantile Regression" Chapter 2.2.3 calls these
+ # properties equivariance.
+ rng = np.random.default_rng(4321)
+ # We choose n and alpha such that we have cases for
+ # - n * alpha is an integer
+ # - n * alpha is a float that gets rounded down
+ # - n * alpha is a float that gets rounded up
+ n = 102 # n * alpha = 20.4, 51. , 91.8
+ y = rng.random(n)
+ q = np.quantile(y, alpha, method=method)
+ c = 13.5
+
+ # 1
+ assert_allclose(np.quantile(c + y, alpha, method=method), c + q)
+ # 2
+ assert_allclose(np.quantile(c * y, alpha, method=method), c * q)
+ # 3
+ q = -np.quantile(-y, 1 - alpha, method=method)
+ if method == "inverted_cdf":
+ if (
+ n * alpha == int(n * alpha)
+ or np.round(n * alpha) == int(n * alpha) + 1
+ ):
+ assert_allclose(q, np.quantile(y, alpha, method="higher"))
+ else:
+ assert_allclose(q, np.quantile(y, alpha, method="lower"))
+ elif method == "closest_observation":
+ if n * alpha == int(n * alpha):
+ assert_allclose(q, np.quantile(y, alpha, method="higher"))
+ elif np.round(n * alpha) == int(n * alpha) + 1:
+ assert_allclose(
+ q, np.quantile(y, alpha + 1/n, method="higher"))
+ else:
+ assert_allclose(q, np.quantile(y, alpha, method="lower"))
+ elif method == "interpolated_inverted_cdf":
+ assert_allclose(q, np.quantile(y, alpha + 1/n, method=method))
+ elif method == "nearest":
+ if n * alpha == int(n * alpha):
+ assert_allclose(q, np.quantile(y, alpha + 1/n, method=method))
+ else:
+ assert_allclose(q, np.quantile(y, alpha, method=method))
+ elif method == "lower":
+ assert_allclose(q, np.quantile(y, alpha, method="higher"))
+ elif method == "higher":
+ assert_allclose(q, np.quantile(y, alpha, method="lower"))
+ else:
+ # "averaged_inverted_cdf", "hazen", "weibull", "linear",
+ # "median_unbiased", "normal_unbiased", "midpoint"
+ assert_allclose(q, np.quantile(y, alpha, method=method))
+
class TestLerp:
@hypothesis.given(t0=st.floats(allow_nan=False, allow_infinity=False,
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index 4699935ca..c1032df8e 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -232,6 +232,17 @@ class TestSavezLoad(RoundtripTest):
assert_equal(a, l['file_a'])
assert_equal(b, l['file_b'])
+
+ def test_tuple_getitem_raises(self):
+ # gh-23748
+ a = np.array([1, 2, 3])
+ f = BytesIO()
+ np.savez(f, a=a)
+ f.seek(0)
+ l = np.load(f)
+ with pytest.raises(KeyError, match="(1, 2)"):
+ l[1, 2]
+
def test_BagObj(self):
a = np.array([[1, 2], [3, 4]], float)
b = np.array([[1 + 2j, 2 + 7j], [3 - 6j, 4 + 12j]], complex)
@@ -321,6 +332,21 @@ class TestSavezLoad(RoundtripTest):
data.close()
assert_(fp.closed)
+ @pytest.mark.parametrize("count, expected_repr", [
+ (1, "NpzFile {fname!r} with keys: arr_0"),
+ (5, "NpzFile {fname!r} with keys: arr_0, arr_1, arr_2, arr_3, arr_4"),
+ # _MAX_REPR_ARRAY_COUNT is 5, so files with more than 5 keys are
+ # expected to end in '...'
+ (6, "NpzFile {fname!r} with keys: arr_0, arr_1, arr_2, arr_3, arr_4..."),
+ ])
+ def test_repr_lists_keys(self, count, expected_repr):
+ a = np.array([[1, 2], [3, 4]], float)
+ with temppath(suffix='.npz') as tmp:
+ np.savez(tmp, *[a]*count)
+ l = np.load(tmp)
+ assert repr(l) == expected_repr.format(fname=tmp)
+ l.close()
+
class TestSaveTxt:
def test_array(self):
@@ -522,7 +548,7 @@ class TestSaveTxt:
def test_unicode(self):
utf8 = b'\xcf\x96'.decode('UTF-8')
- a = np.array([utf8], dtype=np.unicode_)
+ a = np.array([utf8], dtype=np.str_)
with tempdir() as tmpdir:
# set encoding as on windows it may not be unicode even on py3
np.savetxt(os.path.join(tmpdir, 'test.csv'), a, fmt=['%s'],
@@ -530,7 +556,7 @@ class TestSaveTxt:
def test_unicode_roundtrip(self):
utf8 = b'\xcf\x96'.decode('UTF-8')
- a = np.array([utf8], dtype=np.unicode_)
+ a = np.array([utf8], dtype=np.str_)
# our gz wrapper support encoding
suffixes = ['', '.gz']
if HAS_BZ2:
@@ -542,12 +568,12 @@ class TestSaveTxt:
np.savetxt(os.path.join(tmpdir, 'test.csv' + suffix), a,
fmt=['%s'], encoding='UTF-16-LE')
b = np.loadtxt(os.path.join(tmpdir, 'test.csv' + suffix),
- encoding='UTF-16-LE', dtype=np.unicode_)
+ encoding='UTF-16-LE', dtype=np.str_)
assert_array_equal(a, b)
def test_unicode_bytestream(self):
utf8 = b'\xcf\x96'.decode('UTF-8')
- a = np.array([utf8], dtype=np.unicode_)
+ a = np.array([utf8], dtype=np.str_)
s = BytesIO()
np.savetxt(s, a, fmt=['%s'], encoding='UTF-8')
s.seek(0)
@@ -555,7 +581,7 @@ class TestSaveTxt:
def test_unicode_stringstream(self):
utf8 = b'\xcf\x96'.decode('UTF-8')
- a = np.array([utf8], dtype=np.unicode_)
+ a = np.array([utf8], dtype=np.str_)
s = StringIO()
np.savetxt(s, a, fmt=['%s'], encoding='UTF-8')
s.seek(0)
@@ -597,8 +623,8 @@ class TestSaveTxt:
# in our process if needed, see gh-16889
memoryerror_raised = Value(c_bool)
- # Since Python 3.8, the default start method for multiprocessing has
- # been changed from 'fork' to 'spawn' on macOS, causing inconsistency
+ # Since Python 3.8, the default start method for multiprocessing has
+ # been changed from 'fork' to 'spawn' on macOS, causing inconsistency
# on memory sharing model, lead to failed test for check_large_zip
ctx = get_context('fork')
p = ctx.Process(target=check_large_zip, args=(memoryerror_raised,))
@@ -652,12 +678,12 @@ class LoadTxtBase:
with temppath() as path:
with open(path, "wb") as f:
f.write(nonascii.encode("UTF-16"))
- x = self.loadfunc(path, encoding="UTF-16", dtype=np.unicode_)
+ x = self.loadfunc(path, encoding="UTF-16", dtype=np.str_)
assert_array_equal(x, nonascii)
def test_binary_decode(self):
utf16 = b'\xff\xfeh\x04 \x00i\x04 \x00j\x04'
- v = self.loadfunc(BytesIO(utf16), dtype=np.unicode_, encoding='UTF-16')
+ v = self.loadfunc(BytesIO(utf16), dtype=np.str_, encoding='UTF-16')
assert_array_equal(v, np.array(utf16.decode('UTF-16').split()))
def test_converters_decode(self):
@@ -665,7 +691,7 @@ class LoadTxtBase:
c = TextIO()
c.write(b'\xcf\x96')
c.seek(0)
- x = self.loadfunc(c, dtype=np.unicode_,
+ x = self.loadfunc(c, dtype=np.str_,
converters={0: lambda x: x.decode('UTF-8')})
a = np.array([b'\xcf\x96'.decode('UTF-8')])
assert_array_equal(x, a)
@@ -676,7 +702,7 @@ class LoadTxtBase:
with temppath() as path:
with io.open(path, 'wt', encoding='UTF-8') as f:
f.write(utf8)
- x = self.loadfunc(path, dtype=np.unicode_,
+ x = self.loadfunc(path, dtype=np.str_,
converters={0: lambda x: x + 't'},
encoding='UTF-8')
a = np.array([utf8 + 't'])
@@ -1161,7 +1187,7 @@ class TestLoadTxt(LoadTxtBase):
with open(path, "wb") as f:
f.write(butf8)
with open(path, "rb") as f:
- x = np.loadtxt(f, encoding="UTF-8", dtype=np.unicode_)
+ x = np.loadtxt(f, encoding="UTF-8", dtype=np.str_)
assert_array_equal(x, sutf8)
# test broken latin1 conversion people now rely on
with open(path, "rb") as f:
@@ -2219,7 +2245,7 @@ M 33 21.99
ctl = np.array([
["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"],
["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"]],
- dtype=np.unicode_)
+ dtype=np.str_)
assert_array_equal(test, ctl)
# test a mixed dtype
@@ -2262,7 +2288,7 @@ M 33 21.99
["norm1", "norm2", "norm3"],
["norm1", latin1, "norm3"],
["test1", "testNonethe" + utf8, "test3"]],
- dtype=np.unicode_)
+ dtype=np.str_)
assert_array_equal(test, ctl)
def test_recfromtxt(self):
@@ -2737,3 +2763,13 @@ def test_load_refcount():
with assert_no_gc_cycles():
x = np.loadtxt(TextIO("0 1 2 3"), dtype=dt)
assert_equal(x, np.array([((0, 1), (2, 3))], dtype=dt))
+
+def test_load_multiple_arrays_until_eof():
+ f = BytesIO()
+ np.save(f, 1)
+ np.save(f, 2)
+ f.seek(0)
+ assert np.load(f) == 1
+ assert np.load(f) == 2
+ with pytest.raises(EOFError):
+ np.load(f)
diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index 0b8fe3c47..2d805e434 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -244,6 +244,14 @@ def test_converters_negative_indices_with_usecols():
usecols=[0, -1], converters={-1: (lambda x: -1)})
assert_array_equal(res, [[0, -1], [0, -1]])
+
+def test_ragged_error():
+ rows = ["1,2,3", "1,2,3", "4,3,2,1"]
+ with pytest.raises(ValueError,
+ match="the number of columns changed from 3 to 4 at row 3"):
+ np.loadtxt(rows, delimiter=",")
+
+
def test_ragged_usecols():
# usecols, and negative ones, work even with varying number of columns.
txt = StringIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n")
@@ -534,12 +542,27 @@ def test_quoted_field(q):
assert_array_equal(res, expected)
+@pytest.mark.parametrize("q", ('"', "'", "`"))
+def test_quoted_field_with_whitepace_delimiter(q):
+ txt = StringIO(
+ f"{q}alpha, x{q} 2.5\n{q}beta, y{q} 4.5\n{q}gamma, z{q} 5.0\n"
+ )
+ dtype = np.dtype([('f0', 'U8'), ('f1', np.float64)])
+ expected = np.array(
+ [("alpha, x", 2.5), ("beta, y", 4.5), ("gamma, z", 5.0)], dtype=dtype
+ )
+
+ res = np.loadtxt(txt, dtype=dtype, delimiter=None, quotechar=q)
+ assert_array_equal(res, expected)
+
+
def test_quote_support_default():
"""Support for quoted fields is disabled by default."""
txt = StringIO('"lat,long", 45, 30\n')
dtype = np.dtype([('f0', 'U24'), ('f1', np.float64), ('f2', np.float64)])
- with pytest.raises(ValueError, match="the number of columns changed"):
+ with pytest.raises(ValueError,
+ match="the dtype passed requires 3 columns but 4 were"):
np.loadtxt(txt, dtype=dtype, delimiter=",")
# Enable quoting support with non-None value for quotechar param
@@ -1011,3 +1034,15 @@ def test_control_characters_as_bytes():
"""Byte control characters (comments, delimiter) are supported."""
a = np.loadtxt(StringIO("#header\n1,2,3"), comments=b"#", delimiter=b",")
assert_equal(a, [1, 2, 3])
+
+
+@pytest.mark.filterwarnings('ignore::UserWarning')
+def test_field_growing_cases():
+ # Test empty field appending/growing (each field still takes 1 character)
+ # to see if the final field appending does not create issues.
+ res = np.loadtxt([""], delimiter=",", dtype=bytes)
+ assert len(res) == 0
+
+ for i in range(1, 1024):
+ res = np.loadtxt(["," * i], delimiter=",", dtype=bytes)
+ assert len(res) == i+1
diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py
index 45cacb792..257de381b 100644
--- a/numpy/lib/tests/test_nanfunctions.py
+++ b/numpy/lib/tests/test_nanfunctions.py
@@ -824,6 +824,7 @@ class TestNanFunctions_Median:
(-3, -1),
]
)
+ @pytest.mark.filterwarnings("ignore:All-NaN slice:RuntimeWarning")
def test_keepdims_out(self, axis):
d = np.ones((3, 5, 7, 11))
# Randomly set some elements to NaN:
@@ -1027,6 +1028,7 @@ class TestNanFunctions_Percentile:
(-3, -1),
]
)
+ @pytest.mark.filterwarnings("ignore:All-NaN slice:RuntimeWarning")
def test_keepdims_out(self, q, axis):
d = np.ones((3, 5, 7, 11))
# Randomly set some elements to NaN:
diff --git a/numpy/lib/tests/test_shape_base.py b/numpy/lib/tests/test_shape_base.py
index 76058cf20..eb6628904 100644
--- a/numpy/lib/tests/test_shape_base.py
+++ b/numpy/lib/tests/test_shape_base.py
@@ -492,7 +492,7 @@ class TestColumnStack:
assert_equal(actual, expected)
def test_generator(self):
- with assert_warns(FutureWarning):
+ with pytest.raises(TypeError, match="arrays to stack must be"):
column_stack((np.arange(3) for _ in range(2)))
@@ -529,7 +529,7 @@ class TestDstack:
assert_array_equal(res, desired)
def test_generator(self):
- with assert_warns(FutureWarning):
+ with pytest.raises(TypeError, match="arrays to stack must be"):
dstack((np.arange(3) for _ in range(2)))
diff --git a/numpy/lib/tests/test_twodim_base.py b/numpy/lib/tests/test_twodim_base.py
index 141f508fd..eb008c600 100644
--- a/numpy/lib/tests/test_twodim_base.py
+++ b/numpy/lib/tests/test_twodim_base.py
@@ -4,20 +4,14 @@
from numpy.testing import (
assert_equal, assert_array_equal, assert_array_max_ulp,
assert_array_almost_equal, assert_raises, assert_
- )
-
+)
from numpy import (
arange, add, fliplr, flipud, zeros, ones, eye, array, diag, histogram2d,
tri, mask_indices, triu_indices, triu_indices_from, tril_indices,
tril_indices_from, vander,
- )
-
+)
import numpy as np
-
-from numpy.core.tests.test_overrides import requires_array_function
-
-
import pytest
@@ -283,7 +277,6 @@ class TestHistogram2d:
assert_array_equal(H, answer)
assert_array_equal(xe, array([0., 0.25, 0.5, 0.75, 1]))
- @requires_array_function
def test_dispatch(self):
class ShouldDispatch:
def __array_function__(self, function, types, args, kwargs):
diff --git a/numpy/lib/tests/test_type_check.py b/numpy/lib/tests/test_type_check.py
index 3f4ca6309..ea0326139 100644
--- a/numpy/lib/tests/test_type_check.py
+++ b/numpy/lib/tests/test_type_check.py
@@ -155,7 +155,7 @@ class TestIscomplex:
def test_fail(self):
z = np.array([-1, 0, 1])
res = iscomplex(z)
- assert_(not np.sometrue(res, axis=0))
+ assert_(not np.any(res, axis=0))
def test_pass(self):
z = np.array([-1j, 1, 0])
diff --git a/numpy/lib/tests/test_ufunclike.py b/numpy/lib/tests/test_ufunclike.py
index c280b6969..fac4f41d0 100644
--- a/numpy/lib/tests/test_ufunclike.py
+++ b/numpy/lib/tests/test_ufunclike.py
@@ -80,12 +80,6 @@ class TestUfunclike:
assert_(isinstance(f0d, MyArray))
assert_equal(f0d.metadata, 'bar')
- def test_deprecated(self):
- # NumPy 1.13.0, 2017-04-26
- assert_warns(DeprecationWarning, ufl.fix, [1, 2], y=nx.empty(2))
- assert_warns(DeprecationWarning, ufl.isposinf, [1, 2], y=nx.empty(2))
- assert_warns(DeprecationWarning, ufl.isneginf, [1, 2], y=nx.empty(2))
-
def test_scalar(self):
x = np.inf
actual = np.isposinf(x)
diff --git a/numpy/lib/twodim_base.py b/numpy/lib/twodim_base.py
index 654ee4cf5..6dcb65651 100644
--- a/numpy/lib/twodim_base.py
+++ b/numpy/lib/twodim_base.py
@@ -155,10 +155,6 @@ def flipud(m):
return m[::-1, ...]
-def _eye_dispatcher(N, M=None, k=None, dtype=None, order=None, *, like=None):
- return (like,)
-
-
@set_array_function_like_doc
@set_module('numpy')
def eye(N, M=None, k=0, dtype=float, order='C', *, like=None):
@@ -209,7 +205,7 @@ def eye(N, M=None, k=0, dtype=float, order='C', *, like=None):
"""
if like is not None:
- return _eye_with_like(N, M=M, k=k, dtype=dtype, order=order, like=like)
+ return _eye_with_like(like, N, M=M, k=k, dtype=dtype, order=order)
if M is None:
M = N
m = zeros((N, M), dtype=dtype, order=order)
@@ -228,9 +224,7 @@ def eye(N, M=None, k=0, dtype=float, order='C', *, like=None):
return m
-_eye_with_like = array_function_dispatch(
- _eye_dispatcher
-)(eye)
+_eye_with_like = array_function_dispatch()(eye)
def _diag_dispatcher(v, k=None):
@@ -369,10 +363,6 @@ def diagflat(v, k=0):
return wrap(res)
-def _tri_dispatcher(N, M=None, k=None, dtype=None, *, like=None):
- return (like,)
-
-
@set_array_function_like_doc
@set_module('numpy')
def tri(N, M=None, k=0, dtype=float, *, like=None):
@@ -416,7 +406,7 @@ def tri(N, M=None, k=0, dtype=float, *, like=None):
"""
if like is not None:
- return _tri_with_like(N, M=M, k=k, dtype=dtype, like=like)
+ return _tri_with_like(like, N, M=M, k=k, dtype=dtype)
if M is None:
M = N
@@ -430,9 +420,7 @@ def tri(N, M=None, k=0, dtype=float, *, like=None):
return m
-_tri_with_like = array_function_dispatch(
- _tri_dispatcher
-)(tri)
+_tri_with_like = array_function_dispatch()(tri)
def _trilu_dispatcher(m, k=None):
@@ -766,7 +754,7 @@ def histogram2d(x, y, bins=10, range=None, density=None, weights=None):
>>> xcenters = (xedges[:-1] + xedges[1:]) / 2
>>> ycenters = (yedges[:-1] + yedges[1:]) / 2
>>> im.set_data(xcenters, ycenters, H)
- >>> ax.images.append(im)
+ >>> ax.add_image(im)
>>> plt.show()
It is also possible to construct a 2-D histogram without specifying bin
@@ -995,9 +983,42 @@ def tril_indices_from(arr, k=0):
k : int, optional
Diagonal offset (see `tril` for details).
+ Examples
+ --------
+
+ Create a 4 by 4 array.
+
+ >>> a = np.arange(16).reshape(4, 4)
+ >>> a
+ array([[ 0, 1, 2, 3],
+ [ 4, 5, 6, 7],
+ [ 8, 9, 10, 11],
+ [12, 13, 14, 15]])
+
+ Pass the array to get the indices of the lower triangular elements.
+
+ >>> trili = np.tril_indices_from(a)
+ >>> trili
+ (array([0, 1, 1, 2, 2, 2, 3, 3, 3, 3]), array([0, 0, 1, 0, 1, 2, 0, 1, 2, 3]))
+
+ >>> a[trili]
+ array([ 0, 4, 5, 8, 9, 10, 12, 13, 14, 15])
+
+ This is syntactic sugar for tril_indices().
+
+ >>> np.tril_indices(a.shape[0])
+ (array([0, 1, 1, 2, 2, 2, 3, 3, 3, 3]), array([0, 0, 1, 0, 1, 2, 0, 1, 2, 3]))
+
+ Use the `k` parameter to return the indices for the lower triangular array
+ up to the k-th diagonal.
+
+ >>> trili1 = np.tril_indices_from(a, k=1)
+ >>> a[trili1]
+ array([ 0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15])
+
See Also
--------
- tril_indices, tril
+ tril_indices, tril, triu_indices_from
Notes
-----
@@ -1114,9 +1135,43 @@ def triu_indices_from(arr, k=0):
triu_indices_from : tuple, shape(2) of ndarray, shape(N)
Indices for the upper-triangle of `arr`.
+ Examples
+ --------
+
+ Create a 4 by 4 array.
+
+ >>> a = np.arange(16).reshape(4, 4)
+ >>> a
+ array([[ 0, 1, 2, 3],
+ [ 4, 5, 6, 7],
+ [ 8, 9, 10, 11],
+ [12, 13, 14, 15]])
+
+ Pass the array to get the indices of the upper triangular elements.
+
+ >>> triui = np.triu_indices_from(a)
+ >>> triui
+ (array([0, 0, 0, 0, 1, 1, 1, 2, 2, 3]), array([0, 1, 2, 3, 1, 2, 3, 2, 3, 3]))
+
+ >>> a[triui]
+ array([ 0, 1, 2, 3, 5, 6, 7, 10, 11, 15])
+
+ This is syntactic sugar for triu_indices().
+
+ >>> np.triu_indices(a.shape[0])
+ (array([0, 0, 0, 0, 1, 1, 1, 2, 2, 3]), array([0, 1, 2, 3, 1, 2, 3, 2, 3, 3]))
+
+ Use the `k` parameter to return the indices for the upper triangular array
+ from the k-th diagonal.
+
+ >>> triuim1 = np.triu_indices_from(a, k=1)
+ >>> a[triuim1]
+ array([ 1, 2, 3, 6, 7, 11])
+
+
See Also
--------
- triu_indices, triu
+ triu_indices, triu, tril_indices_from
Notes
-----
diff --git a/numpy/lib/type_check.py b/numpy/lib/type_check.py
index 0dc014d76..3f84b80e5 100644
--- a/numpy/lib/type_check.py
+++ b/numpy/lib/type_check.py
@@ -2,7 +2,6 @@
"""
import functools
-import warnings
__all__ = ['iscomplexobj', 'isrealobj', 'imag', 'iscomplex',
'isreal', 'nan_to_num', 'real', 'real_if_close',
@@ -12,7 +11,7 @@ __all__ = ['iscomplexobj', 'isrealobj', 'imag', 'iscomplex',
from .._utils import set_module
import numpy.core.numeric as _nx
from numpy.core.numeric import asarray, asanyarray, isnan, zeros
-from numpy.core import overrides
+from numpy.core import overrides, getlimits
from .ufunclike import isneginf, isposinf
@@ -541,7 +540,8 @@ def real_if_close(a, tol=100):
Input array.
tol : float
Tolerance in machine epsilons for the complex part of the elements
- in the array.
+ in the array. If the tolerance is <=1, then the absolute tolerance
+ is used.
Returns
-------
@@ -572,11 +572,11 @@ def real_if_close(a, tol=100):
"""
a = asanyarray(a)
- if not issubclass(a.dtype.type, _nx.complexfloating):
+ type_ = a.dtype.type
+ if not issubclass(type_, _nx.complexfloating):
return a
if tol > 1:
- from numpy.core import getlimits
- f = getlimits.finfo(a.dtype.type)
+ f = getlimits.finfo(type_)
tol = f.eps * tol
if _nx.all(_nx.absolute(a.imag) < tol):
a = a.real
diff --git a/numpy/lib/ufunclike.py b/numpy/lib/ufunclike.py
index a93c4773b..05fe60c5b 100644
--- a/numpy/lib/ufunclike.py
+++ b/numpy/lib/ufunclike.py
@@ -6,72 +6,16 @@ storing results in an output array.
__all__ = ['fix', 'isneginf', 'isposinf']
import numpy.core.numeric as nx
-from numpy.core.overrides import (
- array_function_dispatch, ARRAY_FUNCTION_ENABLED,
-)
+from numpy.core.overrides import array_function_dispatch
import warnings
import functools
-def _deprecate_out_named_y(f):
- """
- Allow the out argument to be passed as the name `y` (deprecated)
-
- In future, this decorator should be removed.
- """
- @functools.wraps(f)
- def func(x, out=None, **kwargs):
- if 'y' in kwargs:
- if 'out' in kwargs:
- raise TypeError(
- "{} got multiple values for argument 'out'/'y'"
- .format(f.__name__)
- )
- out = kwargs.pop('y')
- # NumPy 1.13.0, 2017-04-26
- warnings.warn(
- "The name of the out argument to {} has changed from `y` to "
- "`out`, to match other ufuncs.".format(f.__name__),
- DeprecationWarning, stacklevel=3)
- return f(x, out=out, **kwargs)
-
- return func
-
-
-def _fix_out_named_y(f):
- """
- Allow the out argument to be passed as the name `y` (deprecated)
-
- This decorator should only be used if _deprecate_out_named_y is used on
- a corresponding dispatcher function.
- """
- @functools.wraps(f)
- def func(x, out=None, **kwargs):
- if 'y' in kwargs:
- # we already did error checking in _deprecate_out_named_y
- out = kwargs.pop('y')
- return f(x, out=out, **kwargs)
-
- return func
-
-
-def _fix_and_maybe_deprecate_out_named_y(f):
- """
- Use the appropriate decorator, depending upon if dispatching is being used.
- """
- if ARRAY_FUNCTION_ENABLED:
- return _fix_out_named_y(f)
- else:
- return _deprecate_out_named_y(f)
-
-
-@_deprecate_out_named_y
def _dispatcher(x, out=None):
return (x, out)
@array_function_dispatch(_dispatcher, verify=False, module='numpy')
-@_fix_and_maybe_deprecate_out_named_y
def fix(x, out=None):
"""
Round to nearest integer towards zero.
@@ -125,7 +69,6 @@ def fix(x, out=None):
@array_function_dispatch(_dispatcher, verify=False, module='numpy')
-@_fix_and_maybe_deprecate_out_named_y
def isposinf(x, out=None):
"""
Test element-wise for positive infinity, return result as bool array.
@@ -197,7 +140,6 @@ def isposinf(x, out=None):
@array_function_dispatch(_dispatcher, verify=False, module='numpy')
-@_fix_and_maybe_deprecate_out_named_y
def isneginf(x, out=None):
"""
Test element-wise for negative infinity, return result as bool array.
diff --git a/numpy/lib/utils.py b/numpy/lib/utils.py
index 2a9d30b16..095c914db 100644
--- a/numpy/lib/utils.py
+++ b/numpy/lib/utils.py
@@ -5,6 +5,7 @@ import types
import re
import warnings
import functools
+import platform
from .._utils import set_module
from numpy.core.numerictypes import issubclass_, issubsctype, issubdtype
@@ -24,6 +25,8 @@ def show_runtime():
including available intrinsic support and BLAS/LAPACK library
in use
+ .. versionadded:: 1.24.0
+
See Also
--------
show_config : Show libraries in the system on which NumPy was built.
@@ -31,45 +34,20 @@ def show_runtime():
Notes
-----
1. Information is derived with the help of `threadpoolctl <https://pypi.org/project/threadpoolctl/>`_
- library.
+ library if available.
2. SIMD related information is derived from ``__cpu_features__``,
``__cpu_baseline__`` and ``__cpu_dispatch__``
- Examples
- --------
- >>> import numpy as np
- >>> np.show_runtime()
- [{'simd_extensions': {'baseline': ['SSE', 'SSE2', 'SSE3'],
- 'found': ['SSSE3',
- 'SSE41',
- 'POPCNT',
- 'SSE42',
- 'AVX',
- 'F16C',
- 'FMA3',
- 'AVX2'],
- 'not_found': ['AVX512F',
- 'AVX512CD',
- 'AVX512_KNL',
- 'AVX512_KNM',
- 'AVX512_SKX',
- 'AVX512_CLX',
- 'AVX512_CNL',
- 'AVX512_ICL']}},
- {'architecture': 'Zen',
- 'filepath': '/usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so',
- 'internal_api': 'openblas',
- 'num_threads': 12,
- 'prefix': 'libopenblas',
- 'threading_layer': 'pthreads',
- 'user_api': 'blas',
- 'version': '0.3.20'}]
"""
from numpy.core._multiarray_umath import (
__cpu_features__, __cpu_baseline__, __cpu_dispatch__
)
from pprint import pprint
- config_found = []
+ config_found = [{
+ "numpy_version": np.__version__,
+ "python": sys.version,
+ "uname": platform.uname(),
+ }]
features_found, features_not_found = [], []
for feature in __cpu_dispatch__:
if __cpu_features__[feature]:
@@ -550,15 +528,16 @@ def _info(obj, output=None):
@set_module('numpy')
def info(object=None, maxwidth=76, output=None, toplevel='numpy'):
"""
- Get help information for a function, class, or module.
+ Get help information for an array, function, class, or module.
Parameters
----------
object : object or str, optional
- Input object or name to get information about. If `object` is a
- numpy object, its docstring is given. If it is a string, available
- modules are searched for matching objects. If None, information
- about `info` itself is returned.
+ Input object or name to get information about. If `object` is
+ an `ndarray` instance, information about the array is printed.
+ If `object` is a numpy object, its docstring is given. If it is
+ a string, available modules are searched for matching objects.
+ If None, information about `info` itself is returned.
maxwidth : int, optional
Printing width.
output : file like object, optional
@@ -597,6 +576,22 @@ def info(object=None, maxwidth=76, output=None, toplevel='numpy'):
*** Repeat reference found in numpy.fft.fftpack ***
*** Total of 3 references found. ***
+ When the argument is an array, information about the array is printed.
+
+ >>> a = np.array([[1 + 2j, 3, -4], [-5j, 6, 0]], dtype=np.complex64)
+ >>> np.info(a)
+ class: ndarray
+ shape: (2, 3)
+ strides: (24, 8)
+ itemsize: 8
+ aligned: True
+ contiguous: True
+ fortran: False
+ data pointer: 0x562b6e0d2860 # may vary
+ byteorder: little
+ byteswap: False
+ type: complex64
+
"""
global _namedict, _dictlist
# Local import to speed up numpy's import time.
diff --git a/numpy/linalg/lapack_lite/clapack_scrub.py b/numpy/linalg/lapack_lite/clapack_scrub.py
index fffd70910..d4da9070d 100644
--- a/numpy/linalg/lapack_lite/clapack_scrub.py
+++ b/numpy/linalg/lapack_lite/clapack_scrub.py
@@ -297,7 +297,7 @@ def scrubSource(source, nsteps=None, verbose=False):
if __name__ == '__main__':
filename = sys.argv[1]
outfilename = os.path.join(sys.argv[2], os.path.basename(filename))
- with open(filename, 'r') as fo:
+ with open(filename) as fo:
source = fo.read()
if len(sys.argv) > 3:
diff --git a/numpy/linalg/lapack_lite/make_lite.py b/numpy/linalg/lapack_lite/make_lite.py
index ca8d4c62c..20b212792 100755
--- a/numpy/linalg/lapack_lite/make_lite.py
+++ b/numpy/linalg/lapack_lite/make_lite.py
@@ -253,7 +253,7 @@ def dumpRoutineNames(library, output_dir):
def concatenateRoutines(routines, output_file):
with open(output_file, 'w') as output_fo:
for r in routines:
- with open(r.filename, 'r') as fo:
+ with open(r.filename) as fo:
source = fo.read()
output_fo.write(source)
@@ -296,7 +296,7 @@ def create_name_header(output_dir):
if not fn.endswith('.f'):
continue
- with open(fn, 'r') as f:
+ with open(fn) as f:
for line in f:
m = routine_re.match(line)
if m:
@@ -304,7 +304,7 @@ def create_name_header(output_dir):
# f2c symbols
f2c_symbols = set()
- with open('f2c.h', 'r') as f:
+ with open('f2c.h') as f:
for line in f:
m = extern_re.match(line)
if m:
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 0f06c8520..b838b9397 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -24,7 +24,7 @@ from numpy.core import (
array, asarray, zeros, empty, empty_like, intc, single, double,
csingle, cdouble, inexact, complexfloating, newaxis, all, Inf, dot,
add, multiply, sqrt, sum, isfinite,
- finfo, errstate, geterrobj, moveaxis, amin, amax, product, abs,
+ finfo, errstate, geterrobj, moveaxis, amin, amax, prod, abs,
atleast_2d, intp, asanyarray, object_, matmul,
swapaxes, divide, count_nonzero, isnan, sign, argsort, sort,
reciprocal
@@ -65,11 +65,11 @@ fortran_int = intc
@set_module('numpy.linalg')
-class LinAlgError(Exception):
+class LinAlgError(ValueError):
"""
Generic Python-exception-derived object raised by linalg functions.
- General purpose exception class, derived from Python's exception.Exception
+ General purpose exception class, derived from Python's ValueError
class, programmatically raised in linalg functions when a Linear
Algebra-related condition would prevent further correct execution of the
function.
@@ -161,24 +161,24 @@ def _commonType(*arrays):
result_type = single
is_complex = False
for a in arrays:
- if issubclass(a.dtype.type, inexact):
- if isComplexType(a.dtype.type):
+ type_ = a.dtype.type
+ if issubclass(type_, inexact):
+ if isComplexType(type_):
is_complex = True
- rt = _realType(a.dtype.type, default=None)
- if rt is None:
+ rt = _realType(type_, default=None)
+ if rt is double:
+ result_type = double
+ elif rt is None:
# unsupported inexact scalar
raise TypeError("array type %s is unsupported in linalg" %
(a.dtype.name,))
else:
- rt = double
- if rt is double:
result_type = double
if is_complex:
- t = cdouble
result_type = _complex_types_map[result_type]
+ return cdouble, result_type
else:
- t = double
- return t, result_type
+ return double, result_type
def _to_native_byte_order(*arrays):
@@ -219,7 +219,7 @@ def _assert_finite(*arrays):
def _is_empty_2d(arr):
# check size first for efficiency
- return arr.size == 0 and product(arr.shape[-2:]) == 0
+ return arr.size == 0 and prod(arr.shape[-2:]) == 0
def transpose(a):
@@ -924,12 +924,12 @@ def qr(a, mode='reduced'):
msg = "".join((
"The 'full' option is deprecated in favor of 'reduced'.\n",
"For backward compatibility let mode default."))
- warnings.warn(msg, DeprecationWarning, stacklevel=3)
+ warnings.warn(msg, DeprecationWarning, stacklevel=2)
mode = 'reduced'
elif mode in ('e', 'economic'):
# 2013-04-01, 1.8
msg = "The 'economic' option is deprecated."
- warnings.warn(msg, DeprecationWarning, stacklevel=3)
+ warnings.warn(msg, DeprecationWarning, stacklevel=2)
mode = 'economic'
else:
raise ValueError(f"Unrecognized mode '{mode}'")
@@ -2308,7 +2308,7 @@ def lstsq(a, b, rcond="warn"):
"To use the future default and silence this warning "
"we advise to pass `rcond=None`, to keep using the old, "
"explicitly pass `rcond=-1`.",
- FutureWarning, stacklevel=3)
+ FutureWarning, stacklevel=2)
rcond = -1
if rcond is None:
rcond = finfo(t).eps * max(n, m)
diff --git a/numpy/linalg/setup.py b/numpy/linalg/setup.py
index 1c4e1295e..6f72635ab 100644
--- a/numpy/linalg/setup.py
+++ b/numpy/linalg/setup.py
@@ -4,7 +4,6 @@ import sysconfig
def configuration(parent_package='', top_path=None):
from numpy.distutils.misc_util import Configuration
- from numpy.distutils.ccompiler_opt import NPY_CXX_FLAGS
from numpy.distutils.system_info import get_info, system_info
config = Configuration('linalg', parent_package, top_path)
@@ -81,7 +80,6 @@ def configuration(parent_package='', top_path=None):
sources=['umath_linalg.cpp', get_lapack_lite_sources],
depends=['lapack_lite/f2c.h'],
extra_info=lapack_info,
- extra_cxx_compile_args=NPY_CXX_FLAGS,
libraries=['npymath'],
)
config.add_data_files('*.pyi')
diff --git a/numpy/linalg/umath_linalg.cpp b/numpy/linalg/umath_linalg.cpp
index bbb4bb896..68db2b2f1 100644
--- a/numpy/linalg/umath_linalg.cpp
+++ b/numpy/linalg/umath_linalg.cpp
@@ -22,6 +22,7 @@
#include <cstdio>
#include <cassert>
#include <cmath>
+#include <type_traits>
#include <utility>
@@ -1148,7 +1149,7 @@ slogdet(char **args,
void *NPY_UNUSED(func))
{
fortran_int m;
- npy_uint8 *tmp_buff = NULL;
+ char *tmp_buff = NULL;
size_t matrix_size;
size_t pivot_size;
size_t safe_m;
@@ -1162,10 +1163,11 @@ slogdet(char **args,
*/
INIT_OUTER_LOOP_3
m = (fortran_int) dimensions[0];
- safe_m = m;
+ /* avoid empty malloc (buffers likely unused) and ensure m is `size_t` */
+ safe_m = m != 0 ? m : 1;
matrix_size = safe_m * safe_m * sizeof(typ);
pivot_size = safe_m * sizeof(fortran_int);
- tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size);
+ tmp_buff = (char *)malloc(matrix_size + pivot_size);
if (tmp_buff) {
LINEARIZE_DATA_t lin_data;
@@ -1182,6 +1184,13 @@ slogdet(char **args,
free(tmp_buff);
}
+ else {
+ /* TODO: Requires use of new ufunc API to indicate error return */
+ NPY_ALLOW_C_API_DEF
+ NPY_ALLOW_C_API;
+ PyErr_NoMemory();
+ NPY_DISABLE_C_API;
+ }
}
template<typename typ, typename basetyp>
@@ -1192,7 +1201,7 @@ det(char **args,
void *NPY_UNUSED(func))
{
fortran_int m;
- npy_uint8 *tmp_buff;
+ char *tmp_buff;
size_t matrix_size;
size_t pivot_size;
size_t safe_m;
@@ -1206,10 +1215,11 @@ det(char **args,
*/
INIT_OUTER_LOOP_2
m = (fortran_int) dimensions[0];
- safe_m = m;
+ /* avoid empty malloc (buffers likely unused) and ensure m is `size_t` */
+ safe_m = m != 0 ? m : 1;
matrix_size = safe_m * safe_m * sizeof(typ);
pivot_size = safe_m * sizeof(fortran_int);
- tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size);
+ tmp_buff = (char *)malloc(matrix_size + pivot_size);
if (tmp_buff) {
LINEARIZE_DATA_t lin_data;
@@ -1230,6 +1240,13 @@ det(char **args,
free(tmp_buff);
}
+ else {
+ /* TODO: Requires use of new ufunc API to indicate error return */
+ NPY_ALLOW_C_API_DEF
+ NPY_ALLOW_C_API;
+ PyErr_NoMemory();
+ NPY_DISABLE_C_API;
+ }
}
@@ -3737,16 +3754,16 @@ scalar_trait)
fortran_int lda = fortran_int_max(1, m);
fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
- mem_buff = (npy_uint8 *)malloc(a_size + b_size + s_size);
-
- if (!mem_buff)
- goto error;
+ size_t msize = a_size + b_size + s_size;
+ mem_buff = (npy_uint8 *)malloc(msize != 0 ? msize : 1);
+ if (!mem_buff) {
+ goto no_memory;
+ }
a = mem_buff;
b = a + a_size;
s = b + b_size;
-
params->M = m;
params->N = n;
params->NRHS = nrhs;
@@ -3766,9 +3783,9 @@ scalar_trait)
params->RWORK = NULL;
params->LWORK = -1;
- if (call_gelsd(params) != 0)
+ if (call_gelsd(params) != 0) {
goto error;
-
+ }
work_count = (fortran_int)work_size_query;
work_size = (size_t) work_size_query * sizeof(ftyp);
@@ -3776,9 +3793,9 @@ scalar_trait)
}
mem_buff2 = (npy_uint8 *)malloc(work_size + iwork_size);
- if (!mem_buff2)
- goto error;
-
+ if (!mem_buff2) {
+ goto no_memory;
+ }
work = mem_buff2;
iwork = work + work_size;
@@ -3788,12 +3805,18 @@ scalar_trait)
params->LWORK = work_count;
return 1;
+
+ no_memory:
+ NPY_ALLOW_C_API_DEF
+ NPY_ALLOW_C_API;
+ PyErr_NoMemory();
+ NPY_DISABLE_C_API;
+
error:
TRACE_TXT("%s failed init\n", __FUNCTION__);
free(mem_buff);
free(mem_buff2);
memset(params, 0, sizeof(*params));
-
return 0;
}
@@ -3857,16 +3880,17 @@ using frealtyp = basetype_t<ftyp>;
fortran_int lda = fortran_int_max(1, m);
fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
- mem_buff = (npy_uint8 *)malloc(a_size + b_size + s_size);
+ size_t msize = a_size + b_size + s_size;
+ mem_buff = (npy_uint8 *)malloc(msize != 0 ? msize : 1);
- if (!mem_buff)
- goto error;
+ if (!mem_buff) {
+ goto no_memory;
+ }
a = mem_buff;
b = a + a_size;
s = b + b_size;
-
params->M = m;
params->N = n;
params->NRHS = nrhs;
@@ -3887,8 +3911,9 @@ using frealtyp = basetype_t<ftyp>;
params->RWORK = &rwork_size_query;
params->LWORK = -1;
- if (call_gelsd(params) != 0)
+ if (call_gelsd(params) != 0) {
goto error;
+ }
work_count = (fortran_int)work_size_query.r;
@@ -3898,8 +3923,9 @@ using frealtyp = basetype_t<ftyp>;
}
mem_buff2 = (npy_uint8 *)malloc(work_size + rwork_size + iwork_size);
- if (!mem_buff2)
- goto error;
+ if (!mem_buff2) {
+ goto no_memory;
+ }
work = mem_buff2;
rwork = work + work_size;
@@ -3911,6 +3937,13 @@ using frealtyp = basetype_t<ftyp>;
params->LWORK = work_count;
return 1;
+
+ no_memory:
+ NPY_ALLOW_C_API_DEF
+ NPY_ALLOW_C_API;
+ PyErr_NoMemory();
+ NPY_DISABLE_C_API;
+
error:
TRACE_TXT("%s failed init\n", __FUNCTION__);
free(mem_buff);
diff --git a/numpy/ma/__init__.pyi b/numpy/ma/__init__.pyi
index 7f5cb56a8..ce72383e5 100644
--- a/numpy/ma/__init__.pyi
+++ b/numpy/ma/__init__.pyi
@@ -155,7 +155,6 @@ from numpy.ma.core import (
resize as resize,
right_shift as right_shift,
round as round,
- round_ as round_,
set_fill_value as set_fill_value,
shape as shape,
sin as sin,
diff --git a/numpy/ma/core.py b/numpy/ma/core.py
index 59ba3a593..2fe326885 100644
--- a/numpy/ma/core.py
+++ b/numpy/ma/core.py
@@ -2310,7 +2310,7 @@ def masked_values(x, value, rtol=1e-5, atol=1e-8, copy=True, shrink=True):
mask=False,
fill_value=2.1)
- Unlike `masked_equal`, `masked_values` can perform approximate equalities.
+ Unlike `masked_equal`, `masked_values` can perform approximate equalities.
>>> ma.masked_values(x, 2.1, atol=1e-1)
masked_array(data=[1.0, 1.1, --, 1.1, 3.0],
@@ -2356,8 +2356,13 @@ def masked_invalid(a, copy=True):
fill_value=1e+20)
"""
-
- return masked_where(~(np.isfinite(getdata(a))), a, copy=copy)
+ a = np.array(a, copy=False, subok=True)
+ res = masked_where(~(np.isfinite(a)), a, copy=copy)
+ # masked_invalid previously never returned nomask as a mask and doing so
+ # threw off matplotlib (gh-22842). So use shrink=False:
+ if res._mask is nomask:
+ res._mask = make_mask_none(res.shape, res.dtype)
+ return res
###############################################################################
# Printing options #
@@ -2852,7 +2857,7 @@ class MaskedArray(ndarray):
mask = np.array(
[getmaskarray(np.asanyarray(m, dtype=_data.dtype))
for m in data], dtype=mdtype)
- except ValueError:
+ except (ValueError, TypeError):
# If data is nested
mask = nomask
# Force shrinking of the mask if needed (and possible)
@@ -2865,11 +2870,17 @@ class MaskedArray(ndarray):
_data._mask = _data._mask.copy()
# Reset the shape of the original mask
if getmask(data) is not nomask:
- data._mask.shape = data.shape
+ # gh-21022 encounters an issue here
+ # because data._mask.shape is not writeable, but
+ # the op was also pointless in that case, because
+ # the shapes were the same, so we can at least
+ # avoid that path
+ if data._mask.shape != data.shape:
+ data._mask.shape = data.shape
else:
# Case 2. : With a mask in input.
# If mask is boolean, create an array of True or False
-
+
# if users pass `mask=None` be forgiving here and cast it False
# for speed; although the default is `mask=nomask` and can differ.
if mask is None:
@@ -2922,7 +2933,7 @@ class MaskedArray(ndarray):
else:
_data._mask = np.logical_or(mask, _data._mask)
_data._sharedmask = False
-
+
# Update fill_value.
if fill_value is None:
fill_value = getattr(data, '_fill_value', None)
@@ -3329,6 +3340,10 @@ class MaskedArray(ndarray):
# Note: Don't try to check for m.any(), that'll take too long
return dout
+ # setitem may put NaNs into integer arrays or occasionally overflow a
+ # float. But this may happen in masked values, so avoid otherwise
+ # correct warnings (as is typical also in masked calculations).
+ @np.errstate(over='ignore', invalid='ignore')
def __setitem__(self, indx, value):
"""
x.__setitem__(i, y) <==> x[i]=y
@@ -4620,6 +4635,7 @@ class MaskedArray(ndarray):
otherwise. 'K' means to read the elements in the order they occur
in memory, except for reversing the data when strides are negative.
By default, 'C' index order is used.
+ (Masked arrays currently use 'A' on the data when 'K' is passed.)
Returns
-------
@@ -4646,6 +4662,13 @@ class MaskedArray(ndarray):
fill_value=999999)
"""
+ # The order of _data and _mask could be different (it shouldn't be
+ # normally). Passing order `K` or `A` would be incorrect.
+ # So we ignore the mask memory order.
+ # TODO: We don't actually support K, so use A instead. We could
+ # try to guess this correct by sorting strides or deprecate.
+ if order in "kKaA":
+ order = "F" if self._data.flags.fnc else "C"
r = ndarray.ravel(self._data, order=order).view(type(self))
r._update_from(self)
if self._mask is not nomask:
@@ -6295,6 +6318,12 @@ class MaskedArray(ndarray):
memo[id(self)] = copied
for (k, v) in self.__dict__.items():
copied.__dict__[k] = deepcopy(v, memo)
+ # as clearly documented for np.copy(), you need to use
+ # deepcopy() directly for arrays of object type that may
+ # contain compound types--you cannot depend on normal
+ # copy semantics to do the right thing here
+ if self.dtype.hasobject:
+ copied._data[...] = deepcopy(copied._data)
return copied
@@ -6991,6 +7020,21 @@ def sort(a, axis=-1, kind=None, order=None, endwith=True, fill_value=None):
See Also
--------
MaskedArray.sort : equivalent method
+
+ Examples
+ --------
+ >>> import numpy.ma as ma
+ >>> x = [11.2, -3.973, 0.801, -1.41]
+ >>> mask = [0, 0, 0, 1]
+ >>> masked_x = ma.masked_array(x, mask)
+ >>> masked_x
+ masked_array(data=[11.2, -3.973, 0.801, --],
+ mask=[False, False, False, True],
+ fill_value=1e+20)
+ >>> ma.sort(masked_x)
+ masked_array(data=[-3.973, 0.801, 11.2, --],
+ mask=[False, False, False, True],
+ fill_value=1e+20)
"""
a = np.array(a, copy=True, subok=True)
if axis is None:
@@ -7016,6 +7060,29 @@ def compressed(x):
--------
ma.MaskedArray.compressed : Equivalent method.
+ Examples
+ --------
+
+ Create an array with negative values masked:
+
+ >>> import numpy as np
+ >>> x = np.array([[1, -1, 0], [2, -1, 3], [7, 4, -1]])
+ >>> masked_x = np.ma.masked_array(x, mask=x < 0)
+ >>> masked_x
+ masked_array(
+ data=[[1, --, 0],
+ [2, --, 3],
+ [7, 4, --]],
+ mask=[[False, True, False],
+ [False, True, False],
+ [False, False, True]],
+ fill_value=999999)
+
+ Compress the masked array into a 1-D array of non-masked values:
+
+ >>> np.ma.compressed(masked_x)
+ array([1, 0, 2, 3, 7, 4])
+
"""
return asanyarray(x).compressed()
@@ -7091,6 +7158,38 @@ def diag(v, k=0):
--------
numpy.diag : Equivalent function for ndarrays.
+ Examples
+ --------
+
+ Create an array with negative values masked:
+
+ >>> import numpy as np
+ >>> x = np.array([[11.2, -3.973, 18], [0.801, -1.41, 12], [7, 33, -12]])
+ >>> masked_x = np.ma.masked_array(x, mask=x < 0)
+ >>> masked_x
+ masked_array(
+ data=[[11.2, --, 18.0],
+ [0.801, --, 12.0],
+ [7.0, 33.0, --]],
+ mask=[[False, True, False],
+ [False, True, False],
+ [False, False, True]],
+ fill_value=1e+20)
+
+ Isolate the main diagonal from the masked array:
+
+ >>> np.ma.diag(masked_x)
+ masked_array(data=[11.2, --, --],
+ mask=[False, True, True],
+ fill_value=1e+20)
+
+ Isolate the first diagonal below the main diagonal:
+
+ >>> np.ma.diag(masked_x, -1)
+ masked_array(data=[0.801, 33.0],
+ mask=[False, False],
+ fill_value=1e+20)
+
"""
output = np.diag(v, k).view(MaskedArray)
if getmask(v) is not nomask:
@@ -7130,6 +7229,21 @@ def right_shift(a, n):
--------
numpy.right_shift
+ Examples
+ --------
+ >>> import numpy.ma as ma
+ >>> x = [11, 3, 8, 1]
+ >>> mask = [0, 0, 0, 1]
+ >>> masked_x = ma.masked_array(x, mask)
+ >>> masked_x
+ masked_array(data=[11, 3, 8, --],
+ mask=[False, False, False, True],
+ fill_value=999999)
+ >>> ma.right_shift(masked_x,1)
+ masked_array(data=[5, 1, 4, --],
+ mask=[False, False, False, True],
+ fill_value=999999)
+
"""
m = getmask(a)
if m is nomask:
@@ -7341,6 +7455,141 @@ def size(obj, axis=None):
size.__doc__ = np.size.__doc__
+def diff(a, /, n=1, axis=-1, prepend=np._NoValue, append=np._NoValue):
+ """
+ Calculate the n-th discrete difference along the given axis.
+ The first difference is given by ``out[i] = a[i+1] - a[i]`` along
+ the given axis, higher differences are calculated by using `diff`
+ recursively.
+ Preserves the input mask.
+
+ Parameters
+ ----------
+ a : array_like
+ Input array
+ n : int, optional
+ The number of times values are differenced. If zero, the input
+ is returned as-is.
+ axis : int, optional
+ The axis along which the difference is taken, default is the
+ last axis.
+ prepend, append : array_like, optional
+ Values to prepend or append to `a` along axis prior to
+ performing the difference. Scalar values are expanded to
+ arrays with length 1 in the direction of axis and the shape
+ of the input array in along all other axes. Otherwise the
+ dimension and shape must match `a` except along axis.
+
+ Returns
+ -------
+ diff : MaskedArray
+ The n-th differences. The shape of the output is the same as `a`
+ except along `axis` where the dimension is smaller by `n`. The
+ type of the output is the same as the type of the difference
+ between any two elements of `a`. This is the same as the type of
+ `a` in most cases. A notable exception is `datetime64`, which
+ results in a `timedelta64` output array.
+
+ See Also
+ --------
+ numpy.diff : Equivalent function in the top-level NumPy module.
+
+ Notes
+ -----
+ Type is preserved for boolean arrays, so the result will contain
+ `False` when consecutive elements are the same and `True` when they
+ differ.
+
+ For unsigned integer arrays, the results will also be unsigned. This
+ should not be surprising, as the result is consistent with
+ calculating the difference directly:
+
+ >>> u8_arr = np.array([1, 0], dtype=np.uint8)
+ >>> np.ma.diff(u8_arr)
+ masked_array(data=[255],
+ mask=False,
+ fill_value=999999,
+ dtype=uint8)
+ >>> u8_arr[1,...] - u8_arr[0,...]
+ 255
+
+ If this is not desirable, then the array should be cast to a larger
+ integer type first:
+
+ >>> i16_arr = u8_arr.astype(np.int16)
+ >>> np.ma.diff(i16_arr)
+ masked_array(data=[-1],
+ mask=False,
+ fill_value=999999,
+ dtype=int16)
+
+ Examples
+ --------
+ >>> a = np.array([1, 2, 3, 4, 7, 0, 2, 3])
+ >>> x = np.ma.masked_where(a < 2, a)
+ >>> np.ma.diff(x)
+ masked_array(data=[--, 1, 1, 3, --, --, 1],
+ mask=[ True, False, False, False, True, True, False],
+ fill_value=999999)
+
+ >>> np.ma.diff(x, n=2)
+ masked_array(data=[--, 0, 2, --, --, --],
+ mask=[ True, False, False, True, True, True],
+ fill_value=999999)
+
+ >>> a = np.array([[1, 3, 1, 5, 10], [0, 1, 5, 6, 8]])
+ >>> x = np.ma.masked_equal(a, value=1)
+ >>> np.ma.diff(x)
+ masked_array(
+ data=[[--, --, --, 5],
+ [--, --, 1, 2]],
+ mask=[[ True, True, True, False],
+ [ True, True, False, False]],
+ fill_value=1)
+
+ >>> np.ma.diff(x, axis=0)
+ masked_array(data=[[--, --, --, 1, -2]],
+ mask=[[ True, True, True, False, False]],
+ fill_value=1)
+
+ """
+ if n == 0:
+ return a
+ if n < 0:
+ raise ValueError("order must be non-negative but got " + repr(n))
+
+ a = np.ma.asanyarray(a)
+ if a.ndim == 0:
+ raise ValueError(
+ "diff requires input that is at least one dimensional"
+ )
+
+ combined = []
+ if prepend is not np._NoValue:
+ prepend = np.ma.asanyarray(prepend)
+ if prepend.ndim == 0:
+ shape = list(a.shape)
+ shape[axis] = 1
+ prepend = np.broadcast_to(prepend, tuple(shape))
+ combined.append(prepend)
+
+ combined.append(a)
+
+ if append is not np._NoValue:
+ append = np.ma.asanyarray(append)
+ if append.ndim == 0:
+ shape = list(a.shape)
+ shape[axis] = 1
+ append = np.broadcast_to(append, tuple(shape))
+ combined.append(append)
+
+ if len(combined) > 1:
+ a = np.ma.concatenate(combined, axis)
+
+ # GH 22465 np.diff without prepend/append preserves the mask
+ return np.diff(a, n, axis)
+
+
##############################################################################
# Extra functions #
##############################################################################
@@ -7568,94 +7817,18 @@ def round_(a, decimals=0, out=None):
round = round_
-# Needed by dot, so move here from extras.py. It will still be exported
-# from extras.py for compatibility.
-def mask_rowcols(a, axis=None):
+def _mask_propagate(a, axis):
"""
- Mask rows and/or columns of a 2D array that contain masked values.
-
- Mask whole rows and/or columns of a 2D array that contain
- masked values. The masking behavior is selected using the
- `axis` parameter.
-
- - If `axis` is None, rows *and* columns are masked.
- - If `axis` is 0, only rows are masked.
- - If `axis` is 1 or -1, only columns are masked.
-
- Parameters
- ----------
- a : array_like, MaskedArray
- The array to mask. If not a MaskedArray instance (or if no array
- elements are masked). The result is a MaskedArray with `mask` set
- to `nomask` (False). Must be a 2D array.
- axis : int, optional
- Axis along which to perform the operation. If None, applies to a
- flattened version of the array.
-
- Returns
- -------
- a : MaskedArray
- A modified version of the input array, masked depending on the value
- of the `axis` parameter.
-
- Raises
- ------
- NotImplementedError
- If input array `a` is not 2D.
-
- See Also
- --------
- mask_rows : Mask rows of a 2D array that contain masked values.
- mask_cols : Mask cols of a 2D array that contain masked values.
- masked_where : Mask where a condition is met.
-
- Notes
- -----
- The input array's mask is modified by this function.
-
- Examples
- --------
- >>> import numpy.ma as ma
- >>> a = np.zeros((3, 3), dtype=int)
- >>> a[1, 1] = 1
- >>> a
- array([[0, 0, 0],
- [0, 1, 0],
- [0, 0, 0]])
- >>> a = ma.masked_equal(a, 1)
- >>> a
- masked_array(
- data=[[0, 0, 0],
- [0, --, 0],
- [0, 0, 0]],
- mask=[[False, False, False],
- [False, True, False],
- [False, False, False]],
- fill_value=1)
- >>> ma.mask_rowcols(a)
- masked_array(
- data=[[0, --, 0],
- [--, --, --],
- [0, --, 0]],
- mask=[[False, True, False],
- [ True, True, True],
- [False, True, False]],
- fill_value=1)
-
+ Mask whole 1-d vectors of an array that contain masked values.
"""
a = array(a, subok=False)
- if a.ndim != 2:
- raise NotImplementedError("mask_rowcols works for 2D arrays only.")
m = getmask(a)
- # Nothing is masked: return a
- if m is nomask or not m.any():
+ if m is nomask or not m.any() or axis is None:
return a
- maskedval = m.nonzero()
a._mask = a._mask.copy()
- if not axis:
- a[np.unique(maskedval[0])] = masked
- if axis in [None, 1, -1]:
- a[:, np.unique(maskedval[1])] = masked
+ axes = normalize_axis_tuple(axis, a.ndim)
+ for ax in axes:
+ a._mask |= m.any(axis=ax, keepdims=True)
return a
@@ -7672,10 +7845,6 @@ def dot(a, b, strict=False, out=None):
corresponding method, it is recommended that the optional arguments be
treated as keyword only. At some point that may be mandatory.
- .. note::
- Works only with 2-D arrays at the moment.
-
-
Parameters
----------
a, b : masked_array_like
@@ -7719,18 +7888,22 @@ def dot(a, b, strict=False, out=None):
fill_value=999999)
"""
- # !!!: Works only with 2D arrays. There should be a way to get it to run
- # with higher dimension
- if strict and (a.ndim == 2) and (b.ndim == 2):
- a = mask_rowcols(a, 0)
- b = mask_rowcols(b, 1)
+ if strict is True:
+ if np.ndim(a) == 0 or np.ndim(b) == 0:
+ pass
+ elif b.ndim == 1:
+ a = _mask_propagate(a, a.ndim - 1)
+ b = _mask_propagate(b, b.ndim - 1)
+ else:
+ a = _mask_propagate(a, a.ndim - 1)
+ b = _mask_propagate(b, b.ndim - 2)
am = ~getmaskarray(a)
bm = ~getmaskarray(b)
if out is None:
d = np.dot(filled(a, 0), filled(b, 0))
m = ~np.dot(am, bm)
- if d.ndim == 0:
+ if np.ndim(d) == 0:
d = np.asarray(d)
r = d.view(get_masked_subclass(a, b))
r.__setmask__(m)
@@ -8281,12 +8454,6 @@ clip = _convert2ma(
np_ret='clipped_array : ndarray',
np_ma_ret='clipped_array : MaskedArray',
)
-diff = _convert2ma(
- 'diff',
- params=dict(fill_value=None, hardmask=False),
- np_ret='diff : ndarray',
- np_ma_ret='diff : MaskedArray',
-)
empty = _convert2ma(
'empty',
params=dict(fill_value=None, hardmask=False),
diff --git a/numpy/ma/core.pyi b/numpy/ma/core.pyi
index 94a91da85..e94ebce3c 100644
--- a/numpy/ma/core.pyi
+++ b/numpy/ma/core.pyi
@@ -7,7 +7,6 @@ from numpy import (
amin as amin,
bool_ as bool_,
expand_dims as expand_dims,
- diff as diff,
clip as clip,
indices as indices,
ones_like as ones_like,
@@ -220,6 +219,10 @@ class MaskedArray(ndarray[_ShapeType, _DType_co]):
def compress(self, condition, axis=..., out=...): ...
def __eq__(self, other): ...
def __ne__(self, other): ...
+ def __ge__(self, other): ...
+ def __gt__(self, other): ...
+ def __le__(self, other): ...
+ def __lt__(self, other): ...
def __add__(self, other): ...
def __radd__(self, other): ...
def __sub__(self, other): ...
@@ -429,10 +432,10 @@ def resize(x, new_shape): ...
def ndim(obj): ...
def shape(obj): ...
def size(obj, axis=...): ...
+def diff(a, /, n=..., axis=..., prepend=..., append=...): ...
def where(condition, x=..., y=...): ...
def choose(indices, choices, out=..., mode=...): ...
-def round_(a, decimals=..., out=...): ...
-round = round_
+def round(a, decimals=..., out=...): ...
def inner(a, b): ...
innerproduct = inner
diff --git a/numpy/ma/extras.py b/numpy/ma/extras.py
index 41bce0f22..8a6246c36 100644
--- a/numpy/ma/extras.py
+++ b/numpy/ma/extras.py
@@ -27,8 +27,7 @@ from . import core as ma
from .core import (
MaskedArray, MAError, add, array, asarray, concatenate, filled, count,
getmask, getmaskarray, make_mask_descr, masked, masked_array, mask_or,
- nomask, ones, sort, zeros, getdata, get_masked_subclass, dot,
- mask_rowcols
+ nomask, ones, sort, zeros, getdata, get_masked_subclass, dot
)
import numpy as np
@@ -398,7 +397,7 @@ def apply_along_axis(func1d, axis, arr, *args, **kwargs):
dtypes.append(np.asarray(res).dtype)
outarr = zeros(outshape, object)
outarr[tuple(ind)] = res
- Ntot = np.product(outshape)
+ Ntot = np.prod(outshape)
k = 1
while k < Ntot:
# increment the index
@@ -418,7 +417,7 @@ def apply_along_axis(func1d, axis, arr, *args, **kwargs):
j = i.copy()
j[axis] = ([slice(None, None)] * res.ndim)
j.put(indlist, ind)
- Ntot = np.product(outshape)
+ Ntot = np.prod(outshape)
holdshape = outshape
outshape = list(arr.shape)
outshape[axis] = res.shape
@@ -955,6 +954,95 @@ def compress_cols(a):
return compress_rowcols(a, 1)
+def mask_rowcols(a, axis=None):
+ """
+ Mask rows and/or columns of a 2D array that contain masked values.
+
+ Mask whole rows and/or columns of a 2D array that contain
+ masked values. The masking behavior is selected using the
+ `axis` parameter.
+
+ - If `axis` is None, rows *and* columns are masked.
+ - If `axis` is 0, only rows are masked.
+ - If `axis` is 1 or -1, only columns are masked.
+
+ Parameters
+ ----------
+ a : array_like, MaskedArray
+ The array to mask. If not a MaskedArray instance (or if no array
+ elements are masked), the result is a MaskedArray with `mask` set
+ to `nomask` (False). Must be a 2D array.
+ axis : int, optional
+ Axis along which to perform the operation. If None, applies to a
+ flattened version of the array.
+
+ Returns
+ -------
+ a : MaskedArray
+ A modified version of the input array, masked depending on the value
+ of the `axis` parameter.
+
+ Raises
+ ------
+ NotImplementedError
+ If input array `a` is not 2D.
+
+ See Also
+ --------
+ mask_rows : Mask rows of a 2D array that contain masked values.
+ mask_cols : Mask cols of a 2D array that contain masked values.
+ masked_where : Mask where a condition is met.
+
+ Notes
+ -----
+ The input array's mask is modified by this function.
+
+ Examples
+ --------
+ >>> import numpy.ma as ma
+ >>> a = np.zeros((3, 3), dtype=int)
+ >>> a[1, 1] = 1
+ >>> a
+ array([[0, 0, 0],
+ [0, 1, 0],
+ [0, 0, 0]])
+ >>> a = ma.masked_equal(a, 1)
+ >>> a
+ masked_array(
+ data=[[0, 0, 0],
+ [0, --, 0],
+ [0, 0, 0]],
+ mask=[[False, False, False],
+ [False, True, False],
+ [False, False, False]],
+ fill_value=1)
+ >>> ma.mask_rowcols(a)
+ masked_array(
+ data=[[0, --, 0],
+ [--, --, --],
+ [0, --, 0]],
+ mask=[[False, True, False],
+ [ True, True, True],
+ [False, True, False]],
+ fill_value=1)
+
+ """
+ a = array(a, subok=False)
+ if a.ndim != 2:
+ raise NotImplementedError("mask_rowcols works for 2D arrays only.")
+ m = getmask(a)
+ # Nothing is masked: return a
+ if m is nomask or not m.any():
+ return a
+ maskedval = m.nonzero()
+ a._mask = a._mask.copy()
+ if not axis:
+ a[np.unique(maskedval[0])] = masked
+ if axis in [None, 1, -1]:
+ a[:, np.unique(maskedval[1])] = masked
+ return a
+
+
def mask_rows(a, axis=np._NoValue):
"""
Mask rows of a 2D array that contain masked values.
diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py
index bc897d731..6ab1d7e4f 100644
--- a/numpy/ma/tests/test_core.py
+++ b/numpy/ma/tests/test_core.py
@@ -8,6 +8,7 @@ __author__ = "Pierre GF Gerard-Marchant"
import sys
import warnings
+import copy
import operator
import itertools
import textwrap
@@ -23,6 +24,7 @@ import numpy.core.umath as umath
from numpy.testing import (
assert_raises, assert_warns, suppress_warnings, IS_WASM
)
+from numpy.testing._private.utils import requires_memory
from numpy import ndarray
from numpy.compat import asbytes
from numpy.ma.testutils import (
@@ -375,6 +377,24 @@ class TestMaskedArray:
assert_equal(s1, s2)
assert_(x1[1:1].shape == (0,))
+ def test_setitem_no_warning(self):
+ # Setitem shouldn't warn, because the assignment might be masked
+ # and warning for a masked assignment is weird (see gh-23000)
+ # (When the value is masked, otherwise a warning would be acceptable
+ # but is not given currently.)
+ x = np.ma.arange(60).reshape((6, 10))
+ index = (slice(1, 5, 2), [7, 5])
+ value = np.ma.masked_all((2, 2))
+ value._data[...] = np.inf # not a valid integer...
+ x[index] = value
+ # The masked scalar is special cased, but test anyway (it's NaN):
+ x[...] = np.ma.masked
+ # Finally, a large value that cannot be cast to the float32 `x`
+ x = np.ma.arange(3., dtype=np.float32)
+ value = np.ma.array([2e234, 1, 1], mask=[True, False, False])
+ x[...] = value
+ x[[0, 1, 2]] = value
+
@suppress_copy_mask_on_assignment
def test_copy(self):
# Tests of some subtle points of copying and sizing.
@@ -1334,16 +1354,16 @@ class TestMaskedArrayArithmetic:
assert_equal(np.sum(x, axis=0), sum(x, axis=0))
assert_equal(np.sum(filled(xm, 0), axis=0), sum(xm, axis=0))
assert_equal(np.sum(x, 0), sum(x, 0))
- assert_equal(np.product(x, axis=0), product(x, axis=0))
- assert_equal(np.product(x, 0), product(x, 0))
- assert_equal(np.product(filled(xm, 1), axis=0), product(xm, axis=0))
+ assert_equal(np.prod(x, axis=0), product(x, axis=0))
+ assert_equal(np.prod(x, 0), product(x, 0))
+ assert_equal(np.prod(filled(xm, 1), axis=0), product(xm, axis=0))
s = (3, 4)
x.shape = y.shape = xm.shape = ym.shape = s
if len(s) > 1:
assert_equal(np.concatenate((x, y), 1), concatenate((xm, ym), 1))
assert_equal(np.add.reduce(x, 1), add.reduce(x, 1))
assert_equal(np.sum(x, 1), sum(x, 1))
- assert_equal(np.product(x, 1), product(x, 1))
+ assert_equal(np.prod(x, 1), product(x, 1))
def test_binops_d2D(self):
# Test binary operations on 2D data
@@ -3409,6 +3429,24 @@ class TestMaskedArrayMethods:
assert_equal(a.ravel(order='C'), [1, 2, 3, 4])
assert_equal(a.ravel(order='F'), [1, 3, 2, 4])
+ @pytest.mark.parametrize("order", "AKCF")
+ @pytest.mark.parametrize("data_order", "CF")
+ def test_ravel_order(self, order, data_order):
+ # Ravelling must ravel mask and data in the same order always to avoid
+ # misaligning the two in the ravel result.
+ arr = np.ones((5, 10), order=data_order)
+ arr[0, :] = 0
+ mask = np.ones((10, 5), dtype=bool, order=data_order).T
+ mask[0, :] = False
+ x = array(arr, mask=mask)
+ assert x._data.flags.fnc != x._mask.flags.fnc
+ assert (x.filled(0) == 0).all()
+ raveled = x.ravel(order)
+ assert (raveled.filled(0) == 0).all()
+
+ # NOTE: Can be wrong if arr order is neither C nor F and `order="K"`
+ assert_array_equal(arr.ravel(order), x.ravel(order)._data)
+
def test_reshape(self):
# Tests reshape
x = arange(4)
@@ -4084,6 +4122,7 @@ class TestMaskedArrayMathMethods:
assert_equal(a.max(-1), [3, 6])
assert_equal(a.max(1), [3, 6])
+ @requires_memory(free_bytes=2 * 10000 * 1000 * 2)
def test_mean_overflow(self):
# Test overflow in masked arrays
# gh-20272
@@ -4091,6 +4130,46 @@ class TestMaskedArrayMathMethods:
mask=np.zeros((10000, 10000)))
assert_equal(a.mean(), 65535.0)
+ def test_diff_with_prepend(self):
+ # GH 22465
+ x = np.array([1, 2, 2, 3, 4, 2, 1, 1])
+
+ a = np.ma.masked_equal(x[3:], value=2)
+ a_prep = np.ma.masked_equal(x[:3], value=2)
+ diff1 = np.ma.diff(a, prepend=a_prep, axis=0)
+
+ b = np.ma.masked_equal(x, value=2)
+ diff2 = np.ma.diff(b, axis=0)
+
+ assert_(np.ma.allequal(diff1, diff2))
+
+ def test_diff_with_append(self):
+ # GH 22465
+ x = np.array([1, 2, 2, 3, 4, 2, 1, 1])
+
+ a = np.ma.masked_equal(x[:3], value=2)
+ a_app = np.ma.masked_equal(x[3:], value=2)
+ diff1 = np.ma.diff(a, append=a_app, axis=0)
+
+ b = np.ma.masked_equal(x, value=2)
+ diff2 = np.ma.diff(b, axis=0)
+
+ assert_(np.ma.allequal(diff1, diff2))
+
+ def test_diff_with_dim_0(self):
+ with pytest.raises(
+ ValueError,
+ match="diff requires input that is at least one dimensional"
+ ):
+ np.ma.diff(np.array(1))
+
+ def test_diff_with_n_0(self):
+ a = np.ma.masked_equal([1, 2, 2, 3, 4, 2, 1, 1], value=2)
+ diff = np.ma.diff(a, n=0, axis=0)
+
+ assert_(np.ma.allequal(a, diff))
+
+
class TestMaskedArrayMathMethodsComplex:
# Test class for miscellaneous MaskedArrays methods.
def setup_method(self):
@@ -4464,7 +4543,7 @@ class TestMaskedArrayFunctions:
x = np.arange(4, dtype=np.int32)
y = np.arange(4, dtype=np.float32) * 2.2
test = where(x > 1.5, y, x).dtype
- control = np.find_common_type([np.int32, np.float32], [])
+ control = np.result_type(np.int32, np.float32)
assert_equal(test, control)
def test_where_broadcast(self):
@@ -4507,6 +4586,32 @@ class TestMaskedArrayFunctions:
match="not supported for the input types"):
np.ma.masked_invalid(a)
+ def test_masked_invalid_pandas(self):
+ # getdata() used to be bad for pandas series due to its _data
+ # attribute. This test is a regression test mainly and may be
+ # removed if getdata() is adjusted.
+ class Series():
+ _data = "nonsense"
+
+ def __array__(self):
+ return np.array([5, np.nan, np.inf])
+
+ arr = np.ma.masked_invalid(Series())
+ assert_array_equal(arr._data, np.array(Series()))
+ assert_array_equal(arr._mask, [False, True, True])
+
+ @pytest.mark.parametrize("copy", [True, False])
+ def test_masked_invalid_full_mask(self, copy):
+ # Matplotlib relied on masked_invalid always returning a full mask
+ # (Also astropy projects, but were ok with it gh-22720 and gh-22842)
+ a = np.ma.array([1, 2, 3, 4])
+ assert a._mask is nomask
+ res = np.ma.masked_invalid(a, copy=copy)
+ assert res.mask is not nomask
+ # mask of a should not be mutated
+ assert a.mask is nomask
+ assert np.may_share_memory(a._data, res._data) != copy
+
def test_choose(self):
# Test choose
choices = [[0, 1, 2, 3], [10, 11, 12, 13],
@@ -5502,3 +5607,47 @@ note
original note"""
assert_equal(np.ma.core.doc_note(method.__doc__, "note"), expected_doc)
+
+
+def test_gh_22556():
+ source = np.ma.array([0, [0, 1, 2]], dtype=object)
+ deepcopy = copy.deepcopy(source)
+ deepcopy[1].append('this should not appear in source')
+ assert len(source[1]) == 3
+
+
+def test_gh_21022():
+ # testing for absence of reported error
+ source = np.ma.masked_array(data=[-1, -1], mask=True, dtype=np.float64)
+ axis = np.array(0)
+ result = np.prod(source, axis=axis, keepdims=False)
+ result = np.ma.masked_array(result,
+ mask=np.ones(result.shape, dtype=np.bool_))
+ array = np.ma.masked_array(data=-1, mask=True, dtype=np.float64)
+ copy.deepcopy(array)
+ copy.deepcopy(result)
+
+
+def test_deepcopy_2d_obj():
+ source = np.ma.array([[0, "dog"],
+ [1, 1],
+ [[1, 2], "cat"]],
+ mask=[[0, 1],
+ [0, 0],
+ [0, 0]],
+ dtype=object)
+ deepcopy = copy.deepcopy(source)
+ deepcopy[2, 0].extend(['this should not appear in source', 3])
+ assert len(source[2, 0]) == 2
+ assert len(deepcopy[2, 0]) == 4
+ assert_equal(deepcopy._mask, source._mask)
+ deepcopy._mask[0, 0] = 1
+ assert source._mask[0, 0] == 0
+
+
+def test_deepcopy_0d_obj():
+ source = np.ma.array(0, mask=[0], dtype=object)
+ deepcopy = copy.deepcopy(source)
+ deepcopy[...] = 17
+ assert_equal(source, 0)
+ assert_equal(deepcopy, 17)
diff --git a/numpy/ma/tests/test_extras.py b/numpy/ma/tests/test_extras.py
index 38603fb84..d09a50fec 100644
--- a/numpy/ma/tests/test_extras.py
+++ b/numpy/ma/tests/test_extras.py
@@ -387,8 +387,8 @@ class TestConcatenator:
# Tests mr_ on 2D arrays.
a_1 = np.random.rand(5, 5)
a_2 = np.random.rand(5, 5)
- m_1 = np.round_(np.random.rand(5, 5), 0)
- m_2 = np.round_(np.random.rand(5, 5), 0)
+ m_1 = np.round(np.random.rand(5, 5), 0)
+ m_2 = np.round(np.random.rand(5, 5), 0)
b_1 = masked_array(a_1, mask=m_1)
b_2 = masked_array(a_2, mask=m_2)
# append columns
@@ -730,6 +730,47 @@ class TestCompressFunctions:
assert_equal(c.mask, [[0, 0, 1], [1, 1, 1], [0, 0, 1]])
c = dot(b, a, strict=False)
assert_equal(c, np.dot(b.filled(0), a.filled(0)))
+ #
+ a = masked_array(np.arange(8).reshape(2, 2, 2),
+ mask=[[[1, 0], [0, 0]], [[0, 0], [0, 0]]])
+ b = masked_array(np.arange(8).reshape(2, 2, 2),
+ mask=[[[0, 0], [0, 0]], [[0, 0], [0, 1]]])
+ c = dot(a, b, strict=True)
+ assert_equal(c.mask,
+ [[[[1, 1], [1, 1]], [[0, 0], [0, 1]]],
+ [[[0, 0], [0, 1]], [[0, 0], [0, 1]]]])
+ c = dot(a, b, strict=False)
+ assert_equal(c.mask,
+ [[[[0, 0], [0, 1]], [[0, 0], [0, 0]]],
+ [[[0, 0], [0, 0]], [[0, 0], [0, 0]]]])
+ c = dot(b, a, strict=True)
+ assert_equal(c.mask,
+ [[[[1, 0], [0, 0]], [[1, 0], [0, 0]]],
+ [[[1, 0], [0, 0]], [[1, 1], [1, 1]]]])
+ c = dot(b, a, strict=False)
+ assert_equal(c.mask,
+ [[[[0, 0], [0, 0]], [[0, 0], [0, 0]]],
+ [[[0, 0], [0, 0]], [[1, 0], [0, 0]]]])
+ #
+ a = masked_array(np.arange(8).reshape(2, 2, 2),
+ mask=[[[1, 0], [0, 0]], [[0, 0], [0, 0]]])
+ b = 5.
+ c = dot(a, b, strict=True)
+ assert_equal(c.mask, [[[1, 0], [0, 0]], [[0, 0], [0, 0]]])
+ c = dot(a, b, strict=False)
+ assert_equal(c.mask, [[[1, 0], [0, 0]], [[0, 0], [0, 0]]])
+ c = dot(b, a, strict=True)
+ assert_equal(c.mask, [[[1, 0], [0, 0]], [[0, 0], [0, 0]]])
+ c = dot(b, a, strict=False)
+ assert_equal(c.mask, [[[1, 0], [0, 0]], [[0, 0], [0, 0]]])
+ #
+ a = masked_array(np.arange(8).reshape(2, 2, 2),
+ mask=[[[1, 0], [0, 0]], [[0, 0], [0, 0]]])
+ b = masked_array(np.arange(2), mask=[0, 1])
+ c = dot(a, b, strict=True)
+ assert_equal(c.mask, [[1, 1], [1, 1]])
+ c = dot(a, b, strict=False)
+ assert_equal(c.mask, [[1, 0], [0, 0]])
def test_dot_returns_maskedarray(self):
# See gh-6611
diff --git a/numpy/ma/tests/test_old_ma.py b/numpy/ma/tests/test_old_ma.py
index 8465b1153..7b892ad23 100644
--- a/numpy/ma/tests/test_old_ma.py
+++ b/numpy/ma/tests/test_old_ma.py
@@ -194,16 +194,16 @@ class TestMa:
assert_(eq(np.sum(x, axis=0), sum(x, axis=0)))
assert_(eq(np.sum(filled(xm, 0), axis=0), sum(xm, axis=0)))
assert_(eq(np.sum(x, 0), sum(x, 0)))
- assert_(eq(np.product(x, axis=0), product(x, axis=0)))
- assert_(eq(np.product(x, 0), product(x, 0)))
- assert_(eq(np.product(filled(xm, 1), axis=0),
+ assert_(eq(np.prod(x, axis=0), product(x, axis=0)))
+ assert_(eq(np.prod(x, 0), product(x, 0)))
+ assert_(eq(np.prod(filled(xm, 1), axis=0),
product(xm, axis=0)))
if len(s) > 1:
assert_(eq(np.concatenate((x, y), 1),
concatenate((xm, ym), 1)))
assert_(eq(np.add.reduce(x, 1), add.reduce(x, 1)))
assert_(eq(np.sum(x, 1), sum(x, 1)))
- assert_(eq(np.product(x, 1), product(x, 1)))
+ assert_(eq(np.prod(x, 1), product(x, 1)))
def test_testCI(self):
# Test of conversions and indexing
diff --git a/numpy/ma/tests/test_regression.py b/numpy/ma/tests/test_regression.py
index cb3d0349f..f4f32cc7a 100644
--- a/numpy/ma/tests/test_regression.py
+++ b/numpy/ma/tests/test_regression.py
@@ -89,3 +89,9 @@ class TestRegression:
def test_masked_array_tobytes_fortran(self):
ma = np.ma.arange(4).reshape((2,2))
assert_array_equal(ma.tobytes(order='F'), ma.T.tobytes())
+
+ def test_structured_array(self):
+ # see gh-22041
+ np.ma.array((1, (b"", b"")),
+ dtype=[("x", np.int_),
+ ("y", [("i", np.void), ("j", np.void)])])
diff --git a/numpy/ma/tests/test_subclassing.py b/numpy/ma/tests/test_subclassing.py
index 64c66eeb9..e3c885253 100644
--- a/numpy/ma/tests/test_subclassing.py
+++ b/numpy/ma/tests/test_subclassing.py
@@ -154,6 +154,7 @@ class WrappedArray(NDArrayOperatorsMixin):
ufunc deferrals are commutative.
See: https://github.com/numpy/numpy/issues/15200)
"""
+ __slots__ = ('_array', 'attrs')
__array_priority__ = 20
def __init__(self, array, **attrs):
@@ -448,3 +449,12 @@ class TestClassWrapping:
assert_(isinstance(np.divide(wm, m2), WrappedArray))
assert_(isinstance(np.divide(m2, wm), WrappedArray))
assert_equal(np.divide(m2, wm), np.divide(wm, m2))
+
+ def test_mixins_have_slots(self):
+ mixin = NDArrayOperatorsMixin()
+ # Should raise an error
+ assert_raises(AttributeError, mixin.__setattr__, "not_a_real_attr", 1)
+
+ m = np.ma.masked_array([1, 3, 5], mask=[False, True, False])
+ wm = WrappedArray(m)
+ assert_raises(AttributeError, wm.__setattr__, "not_an_attr", 2)
diff --git a/numpy/ma/testutils.py b/numpy/ma/testutils.py
index 2dd479abe..7a633906b 100644
--- a/numpy/ma/testutils.py
+++ b/numpy/ma/testutils.py
@@ -233,7 +233,7 @@ def fail_if_array_equal(x, y, err_msg='', verbose=True):
"""
def compare(x, y):
- return (not np.alltrue(approx(x, y)))
+ return (not np.all(approx(x, y)))
assert_array_compare(compare, x, y, err_msg=err_msg, verbose=verbose,
header='Arrays are not equal')
diff --git a/numpy/meson.build b/numpy/meson.build
index 23bd83a04..de36e0d49 100644
--- a/numpy/meson.build
+++ b/numpy/meson.build
@@ -66,14 +66,22 @@ endif
blas = dependency(blas_name, required: false)
lapack = dependency(lapack_name, required: false)
+dependency_map = {
+ 'BLAS': blas,
+ 'LAPACK': lapack,
+}
+
# BLAS and LAPACK are optional dependencies for NumPy. We can only use a BLAS
# which provides a CBLAS interface.
# TODO: add ILP64 support
have_blas = blas.found() # TODO: and blas.has_cblas()
have_lapack = lapack.found()
-
-
-# TODO: generate __config__.py (see scipy/meson.build)
+if have_blas
+ # TODO: this is a shortcut - it needs to be checked rather than used
+ # unconditionally, and then added to the extension modules that need the
+ # macro, rather than as a project argument.
+ add_project_arguments('-DHAVE_CBLAS', language: ['c', 'cpp'])
+endif
# Copy the main __init__.py|pxd files to the build dir (needed for Cython)
__init__py = fs.copyfile('__init__.py')
@@ -94,9 +102,10 @@ python_sources = [
'conftest.py',
'ctypeslib.py',
'ctypeslib.pyi',
- 'dual.py',
'exceptions.py',
'exceptions.pyi',
+ 'dtypes.py',
+ 'dtypes.pyi',
'matlib.py',
'py.typed',
'version.py'
@@ -143,12 +152,60 @@ foreach subdir: pure_subdirs
install_subdir(subdir, install_dir: np_dir)
endforeach
-custom_target('__config__.py',
- output: '__config__.py',
+compilers = {
+ 'C': cc,
+ 'CPP': cpp,
+ 'CYTHON': meson.get_compiler('cython')
+}
+
+machines = {
+ 'HOST': host_machine,
+ 'BUILD': build_machine,
+}
+
+conf_data = configuration_data()
+
+# Set compiler information
+foreach name, compiler : compilers
+ conf_data.set(name + '_COMP', compiler.get_id())
+ conf_data.set(name + '_COMP_LINKER_ID', compiler.get_linker_id())
+ conf_data.set(name + '_COMP_VERSION', compiler.version())
+ conf_data.set(name + '_COMP_CMD_ARRAY', ', '.join(compiler.cmd_array()))
+endforeach
+
+# Machines CPU and system information
+foreach name, machine : machines
+ conf_data.set(name + '_CPU', machine.cpu())
+ conf_data.set(name + '_CPU_FAMILY', machine.cpu_family())
+ conf_data.set(name + '_CPU_ENDIAN', machine.endian())
+ conf_data.set(name + '_CPU_SYSTEM', machine.system())
+endforeach
+
+conf_data.set('CROSS_COMPILED', meson.is_cross_build())
+
+# Python information
+conf_data.set('PYTHON_PATH', py.full_path())
+conf_data.set('PYTHON_VERSION', py.language_version())
+
+# Dependencies information
+foreach name, dep : dependency_map
+ conf_data.set(name + '_NAME', dep.name())
+ conf_data.set(name + '_FOUND', dep.found())
+ if dep.found()
+ conf_data.set(name + '_VERSION', dep.version())
+ conf_data.set(name + '_TYPE_NAME', dep.type_name())
+ conf_data.set(name + '_INCLUDEDIR', dep.get_variable('includedir'))
+ conf_data.set(name + '_LIBDIR', dep.get_variable('libdir'))
+ conf_data.set(name + '_OPENBLAS_CONFIG', dep.get_variable('openblas_config'))
+ conf_data.set(name + '_PCFILEDIR', dep.get_variable('pcfiledir'))
+ endif
+endforeach
+
+configure_file(
input: '__config__.py.in',
- command: [tempita_cli, '@INPUT@', '-o', '@OUTPUT@'],
- install: true,
- install_dir: np_dir
+ output: '__config__.py',
+ configuration : conf_data,
+ install_dir: np_dir,
)
subdir('core')
diff --git a/numpy/polynomial/_polybase.py b/numpy/polynomial/_polybase.py
index 3bea91dd2..9730574cf 100644
--- a/numpy/polynomial/_polybase.py
+++ b/numpy/polynomial/_polybase.py
@@ -682,6 +682,28 @@ class ABCPolyBase(abc.ABC):
degree : int
Degree of the series, one less than the number of coefficients.
+ Examples
+ --------
+
+ Create a polynomial object for ``1 + 7*x + 4*x**2``:
+
+ >>> poly = np.polynomial.Polynomial([1, 7, 4])
+ >>> print(poly)
+ 1.0 + 7.0·x + 4.0·x²
+ >>> poly.degree()
+ 2
+
+ Note that this method does not check for non-zero coefficients.
+ You must trim the polynomial to remove any trailing zeroes:
+
+ >>> poly = np.polynomial.Polynomial([1, 7, 0])
+ >>> print(poly)
+ 1.0 + 7.0·x + 0.0·x²
+ >>> poly.degree()
+ 2
+ >>> poly.trim().degree()
+ 1
+
"""
return len(self) - 1
@@ -887,7 +909,7 @@ class ABCPolyBase(abc.ABC):
"""Return the roots of the series polynomial.
Compute the roots for the series. Note that the accuracy of the
- roots decrease the further outside the domain they lie.
+ roots decreases the further outside the `domain` they lie.
Returns
-------
diff --git a/numpy/polynomial/chebyshev.py b/numpy/polynomial/chebyshev.py
index c663ffab0..efbe13e0c 100644
--- a/numpy/polynomial/chebyshev.py
+++ b/numpy/polynomial/chebyshev.py
@@ -2012,6 +2012,12 @@ class Chebyshev(ABCPolyBase):
Window, see `domain` for its use. The default value is [-1, 1].
.. versionadded:: 1.6.0
+ symbol : str, optional
+ Symbol used to represent the independent variable in string
+ representations of the polynomial expression, e.g. for printing.
+ The symbol must be a valid Python identifier. Default value is 'x'.
+
+ .. versionadded:: 1.24
"""
# Virtual Functions
diff --git a/numpy/polynomial/hermite.py b/numpy/polynomial/hermite.py
index e20339121..210df25f5 100644
--- a/numpy/polynomial/hermite.py
+++ b/numpy/polynomial/hermite.py
@@ -1675,6 +1675,12 @@ class Hermite(ABCPolyBase):
Window, see `domain` for its use. The default value is [-1, 1].
.. versionadded:: 1.6.0
+ symbol : str, optional
+ Symbol used to represent the independent variable in string
+ representations of the polynomial expression, e.g. for printing.
+ The symbol must be a valid Python identifier. Default value is 'x'.
+
+ .. versionadded:: 1.24
"""
# Virtual Functions
diff --git a/numpy/polynomial/hermite_e.py b/numpy/polynomial/hermite_e.py
index 182c562c2..bdf29405b 100644
--- a/numpy/polynomial/hermite_e.py
+++ b/numpy/polynomial/hermite_e.py
@@ -1667,6 +1667,12 @@ class HermiteE(ABCPolyBase):
Window, see `domain` for its use. The default value is [-1, 1].
.. versionadded:: 1.6.0
+ symbol : str, optional
+ Symbol used to represent the independent variable in string
+ representations of the polynomial expression, e.g. for printing.
+ The symbol must be a valid Python identifier. Default value is 'x'.
+
+ .. versionadded:: 1.24
"""
# Virtual Functions
diff --git a/numpy/polynomial/laguerre.py b/numpy/polynomial/laguerre.py
index 2eacceced..925d4898e 100644
--- a/numpy/polynomial/laguerre.py
+++ b/numpy/polynomial/laguerre.py
@@ -1623,6 +1623,12 @@ class Laguerre(ABCPolyBase):
Window, see `domain` for its use. The default value is [0, 1].
.. versionadded:: 1.6.0
+ symbol : str, optional
+ Symbol used to represent the independent variable in string
+ representations of the polynomial expression, e.g. for printing.
+ The symbol must be a valid Python identifier. Default value is 'x'.
+
+ .. versionadded:: 1.24
"""
# Virtual Functions
diff --git a/numpy/polynomial/legendre.py b/numpy/polynomial/legendre.py
index 028e2fe7b..8e9c19d94 100644
--- a/numpy/polynomial/legendre.py
+++ b/numpy/polynomial/legendre.py
@@ -1636,6 +1636,12 @@ class Legendre(ABCPolyBase):
Window, see `domain` for its use. The default value is [-1, 1].
.. versionadded:: 1.6.0
+ symbol : str, optional
+ Symbol used to represent the independent variable in string
+ representations of the polynomial expression, e.g. for printing.
+ The symbol must be a valid Python identifier. Default value is 'x'.
+
+ .. versionadded:: 1.24
"""
# Virtual Functions
diff --git a/numpy/polynomial/polynomial.py b/numpy/polynomial/polynomial.py
index d102f5a30..ceadff0bf 100644
--- a/numpy/polynomial/polynomial.py
+++ b/numpy/polynomial/polynomial.py
@@ -1489,6 +1489,12 @@ class Polynomial(ABCPolyBase):
Window, see `domain` for its use. The default value is [-1, 1].
.. versionadded:: 1.6.0
+ symbol : str, optional
+ Symbol used to represent the independent variable in string
+ representations of the polynomial expression, e.g. for printing.
+ The symbol must be a valid Python identifier. Default value is 'x'.
+
+ .. versionadded:: 1.24
"""
# Virtual Functions
diff --git a/numpy/polynomial/tests/test_printing.py b/numpy/polynomial/tests/test_printing.py
index 990a0d179..6f2a5092d 100644
--- a/numpy/polynomial/tests/test_printing.py
+++ b/numpy/polynomial/tests/test_printing.py
@@ -478,6 +478,10 @@ class TestPrintOptions:
are too small or too large.
"""
+ @pytest.fixture(scope='class', autouse=True)
+ def use_ascii(self):
+ poly.set_default_printstyle('ascii')
+
def test_str(self):
p = poly.Polynomial([1/2, 1/7, 1/7*10**8, 1/7*10**9])
assert_equal(str(p), '0.5 + 0.14285714 x + 14285714.28571429 x**2 '
diff --git a/numpy/random/_bounded_integers.pyx.in b/numpy/random/_bounded_integers.pyx.in
index 7eb6aff1e..6743001d6 100644
--- a/numpy/random/_bounded_integers.pyx.in
+++ b/numpy/random/_bounded_integers.pyx.in
@@ -99,8 +99,12 @@ cdef object _rand_{{nptype}}_broadcast(np.ndarray low, np.ndarray high, object s
is_open = not closed
low_arr = <np.ndarray>low
high_arr = <np.ndarray>high
- if np.any(np.less(low_arr, {{lb}})):
+
+ if np.can_cast(low_arr, np.{{otype}}):
+ pass # cannot be out-of-bounds
+ elif np.any(np.less(low_arr, np.{{otype}}({{lb}}))):
raise ValueError('low is out of bounds for {{nptype}}')
+
if closed:
high_comp = np.greater_equal
low_high_comp = np.greater
@@ -108,8 +112,11 @@ cdef object _rand_{{nptype}}_broadcast(np.ndarray low, np.ndarray high, object s
high_comp = np.greater
low_high_comp = np.greater_equal
- if np.any(high_comp(high_arr, {{ub}})):
+ if np.can_cast(high_arr, np.{{otype}}):
+ pass # cannot be out-of-bounds
+ elif np.any(high_comp(high_arr, np.{{nptype_up}}({{ub}}))):
raise ValueError('high is out of bounds for {{nptype}}')
+
if np.any(low_high_comp(low_arr, high_arr)):
raise ValueError(format_bounds_error(closed, low_arr))
@@ -165,50 +172,69 @@ cdef object _rand_{{nptype}}_broadcast(object low, object high, object size,
64 bit integer type.
"""
- cdef np.ndarray low_arr, high_arr, out_arr, highm1_arr
+ cdef np.ndarray low_arr, low_arr_orig, high_arr, high_arr_orig, out_arr
cdef np.npy_intp i, cnt, n
cdef np.broadcast it
cdef object closed_upper
cdef uint64_t *out_data
cdef {{nptype}}_t *highm1_data
cdef {{nptype}}_t low_v, high_v
- cdef uint64_t rng, last_rng, val, mask, off, out_val
+ cdef uint64_t rng, last_rng, val, mask, off, out_val, is_open
- low_arr = <np.ndarray>low
- high_arr = <np.ndarray>high
+ low_arr_orig = <np.ndarray>low
+ high_arr_orig = <np.ndarray>high
- if np.any(np.less(low_arr, {{lb}})):
- raise ValueError('low is out of bounds for {{nptype}}')
- dt = high_arr.dtype
- if closed or np.issubdtype(dt, np.integer):
- # Avoid object dtype path if already an integer
- high_lower_comp = np.less if closed else np.less_equal
- if np.any(high_lower_comp(high_arr, {{lb}})):
- raise ValueError(format_bounds_error(closed, low_arr))
- high_m1 = high_arr if closed else high_arr - dt.type(1)
- if np.any(np.greater(high_m1, {{ub}})):
- raise ValueError('high is out of bounds for {{nptype}}')
- highm1_arr = <np.ndarray>np.PyArray_FROM_OTF(high_m1, np.{{npctype}}, np.NPY_ALIGNED | np.NPY_FORCECAST)
+ is_open = not closed
+
+ # The following code tries to cast safely, but failing that goes via
+ # Python `int()` because it is very difficult to cast integers in a
+ # truly safe way (i.e. so it raises on out-of-bound).
+ # We correct if the interval is not closed in this step if we go the long
+ # route. (Not otherwise, since the -1 could overflow in theory.)
+ if np.can_cast(low_arr_orig, np.{{otype}}):
+ low_arr = <np.ndarray>np.PyArray_FROM_OTF(low_arr_orig, np.{{npctype}}, np.NPY_ALIGNED)
+ else:
+ low_arr = <np.ndarray>np.empty_like(low_arr_orig, dtype=np.{{otype}})
+ flat = low_arr_orig.flat
+ low_data = <{{nptype}}_t *>np.PyArray_DATA(low_arr)
+ cnt = np.PyArray_SIZE(low_arr)
+ for i in range(cnt):
+ lower = int(flat[i])
+ if lower < {{lb}} or lower > {{ub}}:
+ raise ValueError('low is out of bounds for {{nptype}}')
+ low_data[i] = lower
+
+ del low_arr_orig
+
+ if np.can_cast(high_arr_orig, np.{{otype}}):
+ high_arr = <np.ndarray>np.PyArray_FROM_OTF(high_arr_orig, np.{{npctype}}, np.NPY_ALIGNED)
else:
- # If input is object or a floating type
- highm1_arr = <np.ndarray>np.empty_like(high_arr, dtype=np.{{otype}})
- highm1_data = <{{nptype}}_t *>np.PyArray_DATA(highm1_arr)
+ high_arr = np.empty_like(high_arr_orig, dtype=np.{{otype}})
+ flat = high_arr_orig.flat
+ high_data = <{{nptype}}_t *>np.PyArray_DATA(high_arr)
cnt = np.PyArray_SIZE(high_arr)
- flat = high_arr.flat
for i in range(cnt):
- # Subtract 1 since generator produces values on the closed int [off, off+rng]
- closed_upper = int(flat[i]) - 1
+ closed_upper = int(flat[i]) - is_open
if closed_upper > {{ub}}:
raise ValueError('high is out of bounds for {{nptype}}')
if closed_upper < {{lb}}:
raise ValueError(format_bounds_error(closed, low_arr))
- highm1_data[i] = <{{nptype}}_t>closed_upper
+ high_data[i] = closed_upper
- if np.any(np.greater(low_arr, highm1_arr)):
- raise ValueError(format_bounds_error(closed, low_arr))
+ is_open = 0 # we applied is_open in this path already
- high_arr = highm1_arr
- low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.{{npctype}}, np.NPY_ALIGNED | np.NPY_FORCECAST)
+ del high_arr_orig
+
+ # Since we have the largest supported integer dtypes, they must be within
+ # range at this point; otherwise conversion would have failed. Check that
+ # it is never true that `high <= low`` if closed and `high < low` if not
+ if not is_open:
+ low_high_comp = np.greater
+ else:
+ low_high_comp = np.greater_equal
+
+ if np.any(low_high_comp(low_arr, high_arr)):
+ raise ValueError(format_bounds_error(closed, low_arr))
if size is not None:
out_arr = <np.ndarray>np.empty(size, np.{{otype}})
@@ -224,8 +250,8 @@ cdef object _rand_{{nptype}}_broadcast(object low, object high, object size,
for i in range(n):
low_v = (<{{nptype}}_t*>np.PyArray_MultiIter_DATA(it, 0))[0]
high_v = (<{{nptype}}_t*>np.PyArray_MultiIter_DATA(it, 1))[0]
- # Generator produces values on the closed int [off, off+rng], -1 subtracted above
- rng = <{{utype}}_t>(high_v - low_v)
+ # Generator produces values on the closed int [off, off+rng]
+ rng = <{{utype}}_t>((high_v - is_open) - low_v)
off = <{{utype}}_t>(<{{nptype}}_t>low_v)
if rng != last_rng:
diff --git a/numpy/random/_common.pxd b/numpy/random/_common.pxd
index 3eaf39ddf..659da0d2d 100644
--- a/numpy/random/_common.pxd
+++ b/numpy/random/_common.pxd
@@ -39,32 +39,32 @@ cdef extern from "include/aligned_malloc.h":
cdef void *PyArray_calloc_aligned(size_t n, size_t s)
cdef void PyArray_free_aligned(void *p)
-ctypedef void (*random_double_fill)(bitgen_t *state, np.npy_intp count, double* out) nogil
-ctypedef double (*random_double_0)(void *state) nogil
-ctypedef double (*random_double_1)(void *state, double a) nogil
-ctypedef double (*random_double_2)(void *state, double a, double b) nogil
-ctypedef double (*random_double_3)(void *state, double a, double b, double c) nogil
+ctypedef void (*random_double_fill)(bitgen_t *state, np.npy_intp count, double* out) noexcept nogil
+ctypedef double (*random_double_0)(void *state) noexcept nogil
+ctypedef double (*random_double_1)(void *state, double a) noexcept nogil
+ctypedef double (*random_double_2)(void *state, double a, double b) noexcept nogil
+ctypedef double (*random_double_3)(void *state, double a, double b, double c) noexcept nogil
-ctypedef void (*random_float_fill)(bitgen_t *state, np.npy_intp count, float* out) nogil
-ctypedef float (*random_float_0)(bitgen_t *state) nogil
-ctypedef float (*random_float_1)(bitgen_t *state, float a) nogil
+ctypedef void (*random_float_fill)(bitgen_t *state, np.npy_intp count, float* out) noexcept nogil
+ctypedef float (*random_float_0)(bitgen_t *state) noexcept nogil
+ctypedef float (*random_float_1)(bitgen_t *state, float a) noexcept nogil
-ctypedef int64_t (*random_uint_0)(void *state) nogil
-ctypedef int64_t (*random_uint_d)(void *state, double a) nogil
-ctypedef int64_t (*random_uint_dd)(void *state, double a, double b) nogil
-ctypedef int64_t (*random_uint_di)(void *state, double a, uint64_t b) nogil
-ctypedef int64_t (*random_uint_i)(void *state, int64_t a) nogil
-ctypedef int64_t (*random_uint_iii)(void *state, int64_t a, int64_t b, int64_t c) nogil
+ctypedef int64_t (*random_uint_0)(void *state) noexcept nogil
+ctypedef int64_t (*random_uint_d)(void *state, double a) noexcept nogil
+ctypedef int64_t (*random_uint_dd)(void *state, double a, double b) noexcept nogil
+ctypedef int64_t (*random_uint_di)(void *state, double a, uint64_t b) noexcept nogil
+ctypedef int64_t (*random_uint_i)(void *state, int64_t a) noexcept nogil
+ctypedef int64_t (*random_uint_iii)(void *state, int64_t a, int64_t b, int64_t c) noexcept nogil
-ctypedef uint32_t (*random_uint_0_32)(bitgen_t *state) nogil
-ctypedef uint32_t (*random_uint_1_i_32)(bitgen_t *state, uint32_t a) nogil
+ctypedef uint32_t (*random_uint_0_32)(bitgen_t *state) noexcept nogil
+ctypedef uint32_t (*random_uint_1_i_32)(bitgen_t *state, uint32_t a) noexcept nogil
-ctypedef int32_t (*random_int_2_i_32)(bitgen_t *state, int32_t a, int32_t b) nogil
-ctypedef int64_t (*random_int_2_i)(bitgen_t *state, int64_t a, int64_t b) nogil
+ctypedef int32_t (*random_int_2_i_32)(bitgen_t *state, int32_t a, int32_t b) noexcept nogil
+ctypedef int64_t (*random_int_2_i)(bitgen_t *state, int64_t a, int64_t b) noexcept nogil
-cdef double kahan_sum(double *darr, np.npy_intp n)
+cdef double kahan_sum(double *darr, np.npy_intp n) noexcept
-cdef inline double uint64_to_double(uint64_t rnd) nogil:
+cdef inline double uint64_to_double(uint64_t rnd) noexcept nogil:
return (rnd >> 11) * (1.0 / 9007199254740992.0)
cdef object double_fill(void *func, bitgen_t *state, object size, object lock, object out)
diff --git a/numpy/random/_common.pyx b/numpy/random/_common.pyx
index 7b6f69303..c5e4e3297 100644
--- a/numpy/random/_common.pyx
+++ b/numpy/random/_common.pyx
@@ -171,7 +171,7 @@ cdef object prepare_ctypes(bitgen_t *bitgen):
ctypes.c_void_p(<uintptr_t>bitgen))
return _ctypes
-cdef double kahan_sum(double *darr, np.npy_intp n):
+cdef double kahan_sum(double *darr, np.npy_intp n) noexcept:
"""
Parameters
----------
diff --git a/numpy/random/_generator.pyi b/numpy/random/_generator.pyi
index f0d814fef..e1cdefb15 100644
--- a/numpy/random/_generator.pyi
+++ b/numpy/random/_generator.pyi
@@ -29,6 +29,7 @@ from numpy._typing import (
_DTypeLikeUInt,
_Float32Codes,
_Float64Codes,
+ _FloatLike_co,
_Int8Codes,
_Int16Codes,
_Int32Codes,
@@ -72,6 +73,7 @@ class Generator:
def __reduce__(self) -> tuple[Callable[[str], Generator], tuple[str], dict[str, Any]]: ...
@property
def bit_generator(self) -> BitGenerator: ...
+ def spawn(self, n_children: int) -> list[Generator]: ...
def bytes(self, length: int) -> bytes: ...
@overload
def standard_normal( # type: ignore[misc]
@@ -187,13 +189,18 @@ class Generator:
out: None | ndarray[Any, dtype[float64]] = ...,
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def beta(self, a: float, b: float, size: None = ...) -> float: ... # type: ignore[misc]
+ def beta(
+ self,
+ a: _FloatLike_co,
+ b: _FloatLike_co,
+ size: None = ...,
+ ) -> float: ... # type: ignore[misc]
@overload
def beta(
self, a: _ArrayLikeFloat_co, b: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def exponential(self, scale: float = ..., size: None = ...) -> float: ... # type: ignore[misc]
+ def exponential(self, scale: _FloatLike_co = ..., size: None = ...) -> float: ... # type: ignore[misc]
@overload
def exponential(
self, scale: _ArrayLikeFloat_co = ..., size: None | _ShapeLike = ...
@@ -370,7 +377,12 @@ class Generator:
shuffle: bool = ...,
) -> ndarray[Any, Any]: ...
@overload
- def uniform(self, low: float = ..., high: float = ..., size: None = ...) -> float: ... # type: ignore[misc]
+ def uniform(
+ self,
+ low: _FloatLike_co = ...,
+ high: _FloatLike_co = ...,
+ size: None = ...,
+ ) -> float: ... # type: ignore[misc]
@overload
def uniform(
self,
@@ -379,7 +391,12 @@ class Generator:
size: None | _ShapeLike = ...,
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def normal(self, loc: float = ..., scale: float = ..., size: None = ...) -> float: ... # type: ignore[misc]
+ def normal(
+ self,
+ loc: _FloatLike_co = ...,
+ scale: _FloatLike_co = ...,
+ size: None = ...,
+ ) -> float: ... # type: ignore[misc]
@overload
def normal(
self,
@@ -390,7 +407,7 @@ class Generator:
@overload
def standard_gamma( # type: ignore[misc]
self,
- shape: float,
+ shape: _FloatLike_co,
size: None = ...,
dtype: _DTypeLikeFloat32 | _DTypeLikeFloat64 = ...,
out: None = ...,
@@ -425,7 +442,7 @@ class Generator:
out: None | ndarray[Any, dtype[float64]] = ...,
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def gamma(self, shape: float, scale: float = ..., size: None = ...) -> float: ... # type: ignore[misc]
+ def gamma(self, shape: _FloatLike_co, scale: _FloatLike_co = ..., size: None = ...) -> float: ... # type: ignore[misc]
@overload
def gamma(
self,
@@ -434,13 +451,13 @@ class Generator:
size: None | _ShapeLike = ...,
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def f(self, dfnum: float, dfden: float, size: None = ...) -> float: ... # type: ignore[misc]
+ def f(self, dfnum: _FloatLike_co, dfden: _FloatLike_co, size: None = ...) -> float: ... # type: ignore[misc]
@overload
def f(
self, dfnum: _ArrayLikeFloat_co, dfden: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def noncentral_f(self, dfnum: float, dfden: float, nonc: float, size: None = ...) -> float: ... # type: ignore[misc]
+ def noncentral_f(self, dfnum: _FloatLike_co, dfden: _FloatLike_co, nonc: _FloatLike_co, size: None = ...) -> float: ... # type: ignore[misc]
@overload
def noncentral_f(
self,
@@ -450,19 +467,19 @@ class Generator:
size: None | _ShapeLike = ...,
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def chisquare(self, df: float, size: None = ...) -> float: ... # type: ignore[misc]
+ def chisquare(self, df: _FloatLike_co, size: None = ...) -> float: ... # type: ignore[misc]
@overload
def chisquare(
self, df: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def noncentral_chisquare(self, df: float, nonc: float, size: None = ...) -> float: ... # type: ignore[misc]
+ def noncentral_chisquare(self, df: _FloatLike_co, nonc: _FloatLike_co, size: None = ...) -> float: ... # type: ignore[misc]
@overload
def noncentral_chisquare(
self, df: _ArrayLikeFloat_co, nonc: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def standard_t(self, df: float, size: None = ...) -> float: ... # type: ignore[misc]
+ def standard_t(self, df: _FloatLike_co, size: None = ...) -> float: ... # type: ignore[misc]
@overload
def standard_t(
self, df: _ArrayLikeFloat_co, size: None = ...
@@ -472,25 +489,25 @@ class Generator:
self, df: _ArrayLikeFloat_co, size: _ShapeLike = ...
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def vonmises(self, mu: float, kappa: float, size: None = ...) -> float: ... # type: ignore[misc]
+ def vonmises(self, mu: _FloatLike_co, kappa: _FloatLike_co, size: None = ...) -> float: ... # type: ignore[misc]
@overload
def vonmises(
self, mu: _ArrayLikeFloat_co, kappa: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def pareto(self, a: float, size: None = ...) -> float: ... # type: ignore[misc]
+ def pareto(self, a: _FloatLike_co, size: None = ...) -> float: ... # type: ignore[misc]
@overload
def pareto(
self, a: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def weibull(self, a: float, size: None = ...) -> float: ... # type: ignore[misc]
+ def weibull(self, a: _FloatLike_co, size: None = ...) -> float: ... # type: ignore[misc]
@overload
def weibull(
self, a: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def power(self, a: float, size: None = ...) -> float: ... # type: ignore[misc]
+ def power(self, a: _FloatLike_co, size: None = ...) -> float: ... # type: ignore[misc]
@overload
def power(
self, a: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
@@ -500,7 +517,12 @@ class Generator:
@overload
def standard_cauchy(self, size: _ShapeLike = ...) -> ndarray[Any, dtype[float64]]: ...
@overload
- def laplace(self, loc: float = ..., scale: float = ..., size: None = ...) -> float: ... # type: ignore[misc]
+ def laplace(
+ self,
+ loc: _FloatLike_co = ...,
+ scale: _FloatLike_co = ...,
+ size: None = ...,
+ ) -> float: ... # type: ignore[misc]
@overload
def laplace(
self,
@@ -509,7 +531,12 @@ class Generator:
size: None | _ShapeLike = ...,
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def gumbel(self, loc: float = ..., scale: float = ..., size: None = ...) -> float: ... # type: ignore[misc]
+ def gumbel(
+ self,
+ loc: _FloatLike_co = ...,
+ scale: _FloatLike_co = ...,
+ size: None = ...,
+ ) -> float: ... # type: ignore[misc]
@overload
def gumbel(
self,
@@ -518,7 +545,12 @@ class Generator:
size: None | _ShapeLike = ...,
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def logistic(self, loc: float = ..., scale: float = ..., size: None = ...) -> float: ... # type: ignore[misc]
+ def logistic(
+ self,
+ loc: _FloatLike_co = ...,
+ scale: _FloatLike_co = ...,
+ size: None = ...,
+ ) -> float: ... # type: ignore[misc]
@overload
def logistic(
self,
@@ -527,7 +559,12 @@ class Generator:
size: None | _ShapeLike = ...,
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def lognormal(self, mean: float = ..., sigma: float = ..., size: None = ...) -> float: ... # type: ignore[misc]
+ def lognormal(
+ self,
+ mean: _FloatLike_co = ...,
+ sigma: _FloatLike_co = ...,
+ size: None = ...,
+ ) -> float: ... # type: ignore[misc]
@overload
def lognormal(
self,
@@ -536,19 +573,25 @@ class Generator:
size: None | _ShapeLike = ...,
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def rayleigh(self, scale: float = ..., size: None = ...) -> float: ... # type: ignore[misc]
+ def rayleigh(self, scale: _FloatLike_co = ..., size: None = ...) -> float: ... # type: ignore[misc]
@overload
def rayleigh(
self, scale: _ArrayLikeFloat_co = ..., size: None | _ShapeLike = ...
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def wald(self, mean: float, scale: float, size: None = ...) -> float: ... # type: ignore[misc]
+ def wald(self, mean: _FloatLike_co, scale: _FloatLike_co, size: None = ...) -> float: ... # type: ignore[misc]
@overload
def wald(
self, mean: _ArrayLikeFloat_co, scale: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def triangular(self, left: float, mode: float, right: float, size: None = ...) -> float: ... # type: ignore[misc]
+ def triangular(
+ self,
+ left: _FloatLike_co,
+ mode: _FloatLike_co,
+ right: _FloatLike_co,
+ size: None = ...,
+ ) -> float: ... # type: ignore[misc]
@overload
def triangular(
self,
@@ -558,31 +601,31 @@ class Generator:
size: None | _ShapeLike = ...,
) -> ndarray[Any, dtype[float64]]: ...
@overload
- def binomial(self, n: int, p: float, size: None = ...) -> int: ... # type: ignore[misc]
+ def binomial(self, n: int, p: _FloatLike_co, size: None = ...) -> int: ... # type: ignore[misc]
@overload
def binomial(
self, n: _ArrayLikeInt_co, p: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
) -> ndarray[Any, dtype[int64]]: ...
@overload
- def negative_binomial(self, n: float, p: float, size: None = ...) -> int: ... # type: ignore[misc]
+ def negative_binomial(self, n: _FloatLike_co, p: _FloatLike_co, size: None = ...) -> int: ... # type: ignore[misc]
@overload
def negative_binomial(
self, n: _ArrayLikeFloat_co, p: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
) -> ndarray[Any, dtype[int64]]: ...
@overload
- def poisson(self, lam: float = ..., size: None = ...) -> int: ... # type: ignore[misc]
+ def poisson(self, lam: _FloatLike_co = ..., size: None = ...) -> int: ... # type: ignore[misc]
@overload
def poisson(
self, lam: _ArrayLikeFloat_co = ..., size: None | _ShapeLike = ...
) -> ndarray[Any, dtype[int64]]: ...
@overload
- def zipf(self, a: float, size: None = ...) -> int: ... # type: ignore[misc]
+ def zipf(self, a: _FloatLike_co, size: None = ...) -> int: ... # type: ignore[misc]
@overload
def zipf(
self, a: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
) -> ndarray[Any, dtype[int64]]: ...
@overload
- def geometric(self, p: float, size: None = ...) -> int: ... # type: ignore[misc]
+ def geometric(self, p: _FloatLike_co, size: None = ...) -> int: ... # type: ignore[misc]
@overload
def geometric(
self, p: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
@@ -598,7 +641,7 @@ class Generator:
size: None | _ShapeLike = ...,
) -> ndarray[Any, dtype[int64]]: ...
@overload
- def logseries(self, p: float, size: None = ...) -> int: ... # type: ignore[misc]
+ def logseries(self, p: _FloatLike_co, size: None = ...) -> int: ... # type: ignore[misc]
@overload
def logseries(
self, p: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx
index da66c1cac..a30d116c2 100644
--- a/numpy/random/_generator.pyx
+++ b/numpy/random/_generator.pyx
@@ -238,6 +238,64 @@ cdef class Generator:
"""
return self._bit_generator
+ def spawn(self, int n_children):
+ """
+ spawn(n_children)
+
+ Create new independent child generators.
+
+ See :ref:`seedsequence-spawn` for additional notes on spawning
+ children.
+
+ .. versionadded:: 1.25.0
+
+ Parameters
+ ----------
+ n_children : int
+
+ Returns
+ -------
+ child_generators : list of Generators
+
+ Raises
+ ------
+ TypeError
+ When the underlying SeedSequence does not implement spawning.
+
+ See Also
+ --------
+ random.BitGenerator.spawn, random.SeedSequence.spawn :
+ Equivalent method on the bit generator and seed sequence.
+ bit_generator :
+ The bit generator instance used by the generator.
+
+ Examples
+ --------
+ Starting from a seeded default generator:
+
+ >>> # High quality entropy created with: f"0x{secrets.randbits(128):x}"
+ >>> entropy = 0x3034c61a9ae04ff8cb62ab8ec2c4b501
+ >>> rng = np.random.default_rng(entropy)
+
+ Create two new generators, for example for parallel execution:
+
+ >>> child_rng1, child_rng2 = rng.spawn(2)
+
+ Numbers drawn from each are independent but derived from the initial
+ seeding entropy:
+
+ >>> rng.uniform(), child_rng1.uniform(), child_rng2.uniform()
+ (0.19029263503854454, 0.9475673279178444, 0.4702687338396767)
+
+ It is safe to spawn additional children from the original ``rng`` or
+ the children:
+
+ >>> more_child_rngs = rng.spawn(20)
+ >>> nested_spawn = child_rng1.spawn(20)
+
+ """
+ return [type(self)(g) for g in self._bit_generator.spawn(n_children)]
+
def random(self, size=None, dtype=np.float64, out=None):
"""
random(size=None, dtype=np.float64, out=None)
@@ -380,6 +438,22 @@ cdef class Generator:
out : ndarray or scalar
Drawn samples from the parameterized exponential distribution.
+ Examples
+ --------
+ A real-world example: Assume a company has 10000 customer support
+ agents and the average time between customer calls is 4 minutes.
+
+ >>> n = 10000
+ >>> time_between_calls = np.random.default_rng().exponential(scale=4, size=n)
+
+ What is the probability that a customer will call in the next
+ 4 to 5 minutes?
+
+ >>> x = ((time_between_calls < 5).sum())/n
+ >>> y = ((time_between_calls < 4).sum())/n
+ >>> x-y
+ 0.08 # may vary
+
References
----------
.. [1] Peyton Z. Peebles Jr., "Probability, Random Variables and
@@ -2560,7 +2634,7 @@ cdef class Generator:
>>> b = []
>>> for i in range(1000):
... a = 10. + rng.standard_normal(100)
- ... b.append(np.product(a))
+ ... b.append(np.prod(a))
>>> b = np.array(b) / np.min(b) # scale values to be positive
>>> count, bins, ignored = plt.hist(b, 100, density=True, align='mid')
@@ -3533,8 +3607,8 @@ cdef class Generator:
generalization of the one-dimensional normal distribution to higher
dimensions. Such a distribution is specified by its mean and
covariance matrix. These parameters are analogous to the mean
- (average or "center") and variance (standard deviation, or "width,"
- squared) of the one-dimensional normal distribution.
+ (average or "center") and variance (the squared standard deviation,
+ or "width") of the one-dimensional normal distribution.
Parameters
----------
@@ -3611,6 +3685,12 @@ cdef class Generator:
nonnegative-definite). Otherwise, the behavior of this method is
undefined and backwards compatibility is not guaranteed.
+ This function internally uses linear algebra routines, and thus results
+ may not be identical (even up to precision) across architectures, OSes,
+ or even builds. For example, this is likely if ``cov`` has multiple equal
+ singular values and ``method`` is ``'svd'`` (default). In this case,
+ ``method='cholesky'`` may be more robust.
+
References
----------
.. [1] Papoulis, A., "Probability, Random Variables, and Stochastic
@@ -4247,7 +4327,7 @@ cdef class Generator:
Raises
------
ValueError
- If any value in ``alpha`` is less than or equal to zero
+ If any value in ``alpha`` is less than zero
Notes
-----
@@ -4326,8 +4406,8 @@ cdef class Generator:
alpha_arr = <np.ndarray>np.PyArray_FROMANY(
alpha, np.NPY_DOUBLE, 1, 1,
np.NPY_ARRAY_ALIGNED | np.NPY_ARRAY_C_CONTIGUOUS)
- if np.any(np.less_equal(alpha_arr, 0)):
- raise ValueError('alpha <= 0')
+ if np.any(np.less(alpha_arr, 0)):
+ raise ValueError('alpha < 0')
alpha_data = <double*>np.PyArray_DATA(alpha_arr)
if size is None:
@@ -4803,6 +4883,8 @@ def default_rng(seed=None):
-----
If ``seed`` is not a `BitGenerator` or a `Generator`, a new `BitGenerator`
is instantiated. This function does not manage a default global instance.
+
+ See :ref:`seeding_and_entropy` for more information about seeding.
Examples
--------
diff --git a/numpy/random/_mt19937.pyx b/numpy/random/_mt19937.pyx
index 5a8d52e6b..8b991254a 100644
--- a/numpy/random/_mt19937.pyx
+++ b/numpy/random/_mt19937.pyx
@@ -28,16 +28,16 @@ cdef extern from "src/mt19937/mt19937.h":
enum:
RK_STATE_LEN
-cdef uint64_t mt19937_uint64(void *st) nogil:
+cdef uint64_t mt19937_uint64(void *st) noexcept nogil:
return mt19937_next64(<mt19937_state *> st)
-cdef uint32_t mt19937_uint32(void *st) nogil:
+cdef uint32_t mt19937_uint32(void *st) noexcept nogil:
return mt19937_next32(<mt19937_state *> st)
-cdef double mt19937_double(void *st) nogil:
+cdef double mt19937_double(void *st) noexcept nogil:
return mt19937_next_double(<mt19937_state *> st)
-cdef uint64_t mt19937_raw(void *st) nogil:
+cdef uint64_t mt19937_raw(void *st) noexcept nogil:
return <uint64_t>mt19937_next32(<mt19937_state *> st)
cdef class MT19937(BitGenerator):
diff --git a/numpy/random/_pcg64.pyx b/numpy/random/_pcg64.pyx
index c0a10a812..f7891aa85 100644
--- a/numpy/random/_pcg64.pyx
+++ b/numpy/random/_pcg64.pyx
@@ -26,26 +26,26 @@ cdef extern from "src/pcg64/pcg64.h":
void pcg64_get_state(pcg64_state *state, uint64_t *state_arr, int *has_uint32, uint32_t *uinteger)
void pcg64_set_state(pcg64_state *state, uint64_t *state_arr, int has_uint32, uint32_t uinteger)
- uint64_t pcg64_cm_next64(pcg64_state *state) nogil
- uint32_t pcg64_cm_next32(pcg64_state *state) nogil
+ uint64_t pcg64_cm_next64(pcg64_state *state) noexcept nogil
+ uint32_t pcg64_cm_next32(pcg64_state *state) noexcept nogil
void pcg64_cm_advance(pcg64_state *state, uint64_t *step)
-cdef uint64_t pcg64_uint64(void* st) nogil:
+cdef uint64_t pcg64_uint64(void* st) noexcept nogil:
return pcg64_next64(<pcg64_state *>st)
-cdef uint32_t pcg64_uint32(void *st) nogil:
+cdef uint32_t pcg64_uint32(void *st) noexcept nogil:
return pcg64_next32(<pcg64_state *> st)
-cdef double pcg64_double(void* st) nogil:
+cdef double pcg64_double(void* st) noexcept nogil:
return uint64_to_double(pcg64_next64(<pcg64_state *>st))
-cdef uint64_t pcg64_cm_uint64(void* st) nogil:
+cdef uint64_t pcg64_cm_uint64(void* st) noexcept nogil:
return pcg64_cm_next64(<pcg64_state *>st)
-cdef uint32_t pcg64_cm_uint32(void *st) nogil:
+cdef uint32_t pcg64_cm_uint32(void *st) noexcept nogil:
return pcg64_cm_next32(<pcg64_state *> st)
-cdef double pcg64_cm_double(void* st) nogil:
+cdef double pcg64_cm_double(void* st) noexcept nogil:
return uint64_to_double(pcg64_cm_next64(<pcg64_state *>st))
cdef class PCG64(BitGenerator):
@@ -515,4 +515,3 @@ cdef class PCG64DXSM(BitGenerator):
pcg64_cm_advance(&self.rng_state, <uint64_t *>np.PyArray_DATA(d))
self._reset_state_variables()
return self
-
diff --git a/numpy/random/_philox.pyx b/numpy/random/_philox.pyx
index d9a366e86..e5353460c 100644
--- a/numpy/random/_philox.pyx
+++ b/numpy/random/_philox.pyx
@@ -36,19 +36,19 @@ cdef extern from 'src/philox/philox.h':
ctypedef s_philox_state philox_state
- uint64_t philox_next64(philox_state *state) nogil
- uint32_t philox_next32(philox_state *state) nogil
+ uint64_t philox_next64(philox_state *state) noexcept nogil
+ uint32_t philox_next32(philox_state *state) noexcept nogil
void philox_jump(philox_state *state)
void philox_advance(uint64_t *step, philox_state *state)
-cdef uint64_t philox_uint64(void*st) nogil:
+cdef uint64_t philox_uint64(void*st) noexcept nogil:
return philox_next64(<philox_state *> st)
-cdef uint32_t philox_uint32(void *st) nogil:
+cdef uint32_t philox_uint32(void *st) noexcept nogil:
return philox_next32(<philox_state *> st)
-cdef double philox_double(void*st) nogil:
+cdef double philox_double(void*st) noexcept nogil:
return uint64_to_double(philox_next64(<philox_state *> st))
cdef class Philox(BitGenerator):
diff --git a/numpy/random/_sfc64.pyx b/numpy/random/_sfc64.pyx
index 1daee34f8..419045c1d 100644
--- a/numpy/random/_sfc64.pyx
+++ b/numpy/random/_sfc64.pyx
@@ -21,13 +21,13 @@ cdef extern from "src/sfc64/sfc64.h":
void sfc64_set_state(sfc64_state *state, uint64_t *state_arr, int has_uint32, uint32_t uinteger)
-cdef uint64_t sfc64_uint64(void* st) nogil:
+cdef uint64_t sfc64_uint64(void* st) noexcept nogil:
return sfc64_next64(<sfc64_state *>st)
-cdef uint32_t sfc64_uint32(void *st) nogil:
+cdef uint32_t sfc64_uint32(void *st) noexcept nogil:
return sfc64_next32(<sfc64_state *> st)
-cdef double sfc64_double(void* st) nogil:
+cdef double sfc64_double(void* st) noexcept nogil:
return uint64_to_double(sfc64_next64(<sfc64_state *>st))
diff --git a/numpy/random/bit_generator.pyi b/numpy/random/bit_generator.pyi
index e6e3b10cd..8b9779cad 100644
--- a/numpy/random/bit_generator.pyi
+++ b/numpy/random/bit_generator.pyi
@@ -96,6 +96,9 @@ class BitGenerator(abc.ABC):
def state(self) -> Mapping[str, Any]: ...
@state.setter
def state(self, value: Mapping[str, Any]) -> None: ...
+ @property
+ def seed_seq(self) -> ISeedSequence: ...
+ def spawn(self, n_children: int) -> list[BitGenerator]: ...
@overload
def random_raw(self, size: None = ..., output: Literal[True] = ...) -> int: ... # type: ignore[misc]
@overload
diff --git a/numpy/random/bit_generator.pyx b/numpy/random/bit_generator.pyx
index 3da4fabce..83441747a 100644
--- a/numpy/random/bit_generator.pyx
+++ b/numpy/random/bit_generator.pyx
@@ -212,6 +212,9 @@ class ISpawnableSeedSequence(ISeedSequence):
Spawn a number of child `SeedSequence` s by extending the
`spawn_key`.
+ See :ref:`seedsequence-spawn` for additional notes on spawning
+ children.
+
Parameters
----------
n_children : int
@@ -260,6 +263,7 @@ cdef class SeedSequence():
----------
entropy : {None, int, sequence[int]}, optional
The entropy for creating a `SeedSequence`.
+ All integer values must be non-negative.
spawn_key : {(), sequence[int]}, optional
An additional source of entropy based on the position of this
`SeedSequence` in the tree of such objects created with the
@@ -450,6 +454,9 @@ cdef class SeedSequence():
Spawn a number of child `SeedSequence` s by extending the
`spawn_key`.
+ See :ref:`seedsequence-spawn` for additional notes on spawning
+ children.
+
Parameters
----------
n_children : int
@@ -457,6 +464,12 @@ cdef class SeedSequence():
Returns
-------
seqs : list of `SeedSequence` s
+
+ See Also
+ --------
+ random.Generator.spawn, random.BitGenerator.spawn :
+ Equivalent method on the generator and bit generator.
+
"""
cdef uint32_t i
@@ -490,6 +503,7 @@ cdef class BitGenerator():
``array_like[ints]`` is passed, then it will be passed to
`~numpy.random.SeedSequence` to derive the initial `BitGenerator` state.
One may also pass in a `SeedSequence` instance.
+ All integer values must be non-negative.
Attributes
----------
@@ -549,6 +563,59 @@ cdef class BitGenerator():
def state(self, value):
raise NotImplementedError('Not implemented in base BitGenerator')
+ @property
+ def seed_seq(self):
+ """
+ Get the seed sequence used to initialize the bit generator.
+
+ .. versionadded:: 1.25.0
+
+ Returns
+ -------
+ seed_seq : ISeedSequence
+ The SeedSequence object used to initialize the BitGenerator.
+ This is normally a `np.random.SeedSequence` instance.
+
+ """
+ return self._seed_seq
+
+ def spawn(self, int n_children):
+ """
+ spawn(n_children)
+
+ Create new independent child bit generators.
+
+ See :ref:`seedsequence-spawn` for additional notes on spawning
+ children. Some bit generators also implement ``jumped``
+ as a different approach for creating independent streams.
+
+ .. versionadded:: 1.25.0
+
+ Parameters
+ ----------
+ n_children : int
+
+ Returns
+ -------
+ child_bit_generators : list of BitGenerators
+
+ Raises
+ ------
+ TypeError
+ When the underlying SeedSequence does not implement spawning.
+
+ See Also
+ --------
+ random.Generator.spawn, random.SeedSequence.spawn :
+ Equivalent method on the generator and seed sequence.
+
+ """
+ if not isinstance(self._seed_seq, ISpawnableSeedSequence):
+ raise TypeError(
+ "The underlying SeedSequence does not implement spawning.")
+
+ return [type(self)(seed=s) for s in self._seed_seq.spawn(n_children)]
+
def random_raw(self, size=None, output=True):
"""
random_raw(self, size=None)
diff --git a/numpy/random/c_distributions.pxd b/numpy/random/c_distributions.pxd
index 6f905edc1..b978d1350 100644
--- a/numpy/random/c_distributions.pxd
+++ b/numpy/random/c_distributions.pxd
@@ -28,18 +28,24 @@ cdef extern from "numpy/random/distributions.h":
ctypedef s_binomial_t binomial_t
+ float random_standard_uniform_f(bitgen_t *bitgen_state) nogil
double random_standard_uniform(bitgen_t *bitgen_state) nogil
void random_standard_uniform_fill(bitgen_t* bitgen_state, npy_intp cnt, double *out) nogil
+ void random_standard_uniform_fill_f(bitgen_t *bitgen_state, npy_intp cnt, float *out) nogil
+
double random_standard_exponential(bitgen_t *bitgen_state) nogil
- double random_standard_exponential_f(bitgen_t *bitgen_state) nogil
+ float random_standard_exponential_f(bitgen_t *bitgen_state) nogil
void random_standard_exponential_fill(bitgen_t *bitgen_state, npy_intp cnt, double *out) nogil
- void random_standard_exponential_fill_f(bitgen_t *bitgen_state, npy_intp cnt, double *out) nogil
+ void random_standard_exponential_fill_f(bitgen_t *bitgen_state, npy_intp cnt, float *out) nogil
void random_standard_exponential_inv_fill(bitgen_t *bitgen_state, npy_intp cnt, double *out) nogil
- void random_standard_exponential_inv_fill_f(bitgen_t *bitgen_state, npy_intp cnt, double *out) nogil
+ void random_standard_exponential_inv_fill_f(bitgen_t *bitgen_state, npy_intp cnt, float *out) nogil
+
double random_standard_normal(bitgen_t* bitgen_state) nogil
+ float random_standard_normal_f(bitgen_t *bitgen_state) nogil
void random_standard_normal_fill(bitgen_t *bitgen_state, npy_intp count, double *out) nogil
void random_standard_normal_fill_f(bitgen_t *bitgen_state, npy_intp count, float *out) nogil
double random_standard_gamma(bitgen_t *bitgen_state, double shape) nogil
+ float random_standard_gamma_f(bitgen_t *bitgen_state, float shape) nogil
float random_standard_uniform_f(bitgen_t *bitgen_state) nogil
void random_standard_uniform_fill_f(bitgen_t* bitgen_state, npy_intp cnt, float *out) nogil
diff --git a/numpy/random/meson.build b/numpy/random/meson.build
index cc61c66dd..036cd81b9 100644
--- a/numpy/random/meson.build
+++ b/numpy/random/meson.build
@@ -66,7 +66,7 @@ random_pyx_sources = [
fs.copyfile('mtrand.pyx'),
'src/distributions/distributions.c',
'src/legacy/legacy-distributions.c'
- ], ['-DNPY_RANDOM_LEGACY=1'], npymath_lib,
+ ], ['-DNP_RANDOM_LEGACY=1'], npymath_lib,
],
]
foreach gen: random_pyx_sources
diff --git a/numpy/random/mtrand.pyx b/numpy/random/mtrand.pyx
index edf812a4d..dfa553ee4 100644
--- a/numpy/random/mtrand.pyx
+++ b/numpy/random/mtrand.pyx
@@ -537,6 +537,22 @@ cdef class RandomState:
out : ndarray or scalar
Drawn samples from the parameterized exponential distribution.
+ Examples
+ --------
+ A real-world example: Assume a company has 10000 customer support
+ agents and the average time between customer calls is 4 minutes.
+
+ >>> n = 10000
+ >>> time_between_calls = np.random.default_rng().exponential(scale=4, size=n)
+
+ What is the probability that a customer will call in the next
+ 4 to 5 minutes?
+
+ >>> x = ((time_between_calls < 5).sum())/n
+ >>> y = ((time_between_calls < 4).sum())/n
+ >>> x-y
+ 0.08 # may vary
+
See Also
--------
random.Generator.exponential: which should be used for new code.
@@ -3050,7 +3066,7 @@ cdef class RandomState:
>>> b = []
>>> for i in range(1000):
... a = 10. + np.random.standard_normal(100)
- ... b.append(np.product(a))
+ ... b.append(np.prod(a))
>>> b = np.array(b) / np.min(b) # scale values to be positive
>>> count, bins, ignored = plt.hist(b, 100, density=True, align='mid')
diff --git a/numpy/random/src/distributions/distributions.c b/numpy/random/src/distributions/distributions.c
index 9ab3f94a0..cebeb07cf 100644
--- a/numpy/random/src/distributions/distributions.c
+++ b/numpy/random/src/distributions/distributions.c
@@ -960,7 +960,15 @@ RAND_INT_TYPE random_geometric_search(bitgen_t *bitgen_state, double p) {
}
int64_t random_geometric_inversion(bitgen_t *bitgen_state, double p) {
- return (int64_t)ceil(-random_standard_exponential(bitgen_state) / npy_log1p(-p));
+ double z = ceil(-random_standard_exponential(bitgen_state) / npy_log1p(-p));
+ /*
+ * The constant 9.223372036854776e+18 is the smallest double that is
+ * larger than INT64_MAX.
+ */
+ if (z >= 9.223372036854776e+18) {
+ return INT64_MAX;
+ }
+ return (int64_t) z;
}
int64_t random_geometric(bitgen_t *bitgen_state, double p) {
diff --git a/numpy/random/tests/test_direct.py b/numpy/random/tests/test_direct.py
index 58d966adf..fa2ae866b 100644
--- a/numpy/random/tests/test_direct.py
+++ b/numpy/random/tests/test_direct.py
@@ -148,6 +148,46 @@ def test_seedsequence():
assert len(dummy.spawn(10)) == 10
+def test_generator_spawning():
+ """ Test spawning new generators and bit_generators directly.
+ """
+ rng = np.random.default_rng()
+ seq = rng.bit_generator.seed_seq
+ new_ss = seq.spawn(5)
+ expected_keys = [seq.spawn_key + (i,) for i in range(5)]
+ assert [c.spawn_key for c in new_ss] == expected_keys
+
+ new_bgs = rng.bit_generator.spawn(5)
+ expected_keys = [seq.spawn_key + (i,) for i in range(5, 10)]
+ assert [bg.seed_seq.spawn_key for bg in new_bgs] == expected_keys
+
+ new_rngs = rng.spawn(5)
+ expected_keys = [seq.spawn_key + (i,) for i in range(10, 15)]
+ found_keys = [rng.bit_generator.seed_seq.spawn_key for rng in new_rngs]
+ assert found_keys == expected_keys
+
+ # Sanity check that streams are actually different:
+ assert new_rngs[0].uniform() != new_rngs[1].uniform()
+
+
+def test_non_spawnable():
+ from numpy.random.bit_generator import ISeedSequence
+
+ class FakeSeedSequence:
+ def generate_state(self, n_words, dtype=np.uint32):
+ return np.zeros(n_words, dtype=dtype)
+
+ ISeedSequence.register(FakeSeedSequence)
+
+ rng = np.random.default_rng(FakeSeedSequence())
+
+ with pytest.raises(TypeError, match="The underlying SeedSequence"):
+ rng.spawn(5)
+
+ with pytest.raises(TypeError, match="The underlying SeedSequence"):
+ rng.bit_generator.spawn(5)
+
+
class Base:
dtype = np.uint64
data2 = data1 = {}
diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py
index 54a5b73a3..5c4c2cbf9 100644
--- a/numpy/random/tests/test_generator_mt19937.py
+++ b/numpy/random/tests/test_generator_mt19937.py
@@ -345,6 +345,8 @@ class TestIntegers:
endpoint=endpoint, dtype=dt)
assert_raises(ValueError, self.rfunc, 1, [0],
endpoint=endpoint, dtype=dt)
+ assert_raises(ValueError, self.rfunc, [ubnd+1], [ubnd],
+ endpoint=endpoint, dtype=dt)
def test_bounds_checking_array(self, endpoint):
for dt in self.itype:
diff --git a/numpy/random/tests/test_generator_mt19937_regressions.py b/numpy/random/tests/test_generator_mt19937_regressions.py
index 0227d6502..7c2b6867c 100644
--- a/numpy/random/tests/test_generator_mt19937_regressions.py
+++ b/numpy/random/tests/test_generator_mt19937_regressions.py
@@ -3,32 +3,32 @@ import numpy as np
import pytest
from numpy.random import Generator, MT19937
-mt19937 = Generator(MT19937())
-
class TestRegression:
+ def setup_method(self):
+ self.mt19937 = Generator(MT19937(121263137472525314065))
+
def test_vonmises_range(self):
# Make sure generated random variables are in [-pi, pi].
# Regression test for ticket #986.
for mu in np.linspace(-7., 7., 5):
- r = mt19937.vonmises(mu, 1, 50)
+ r = self.mt19937.vonmises(mu, 1, 50)
assert_(np.all(r > -np.pi) and np.all(r <= np.pi))
def test_hypergeometric_range(self):
# Test for ticket #921
- assert_(np.all(mt19937.hypergeometric(3, 18, 11, size=10) < 4))
- assert_(np.all(mt19937.hypergeometric(18, 3, 11, size=10) > 0))
+ assert_(np.all(self.mt19937.hypergeometric(3, 18, 11, size=10) < 4))
+ assert_(np.all(self.mt19937.hypergeometric(18, 3, 11, size=10) > 0))
# Test for ticket #5623
args = (2**20 - 2, 2**20 - 2, 2**20 - 2) # Check for 32-bit systems
- assert_(mt19937.hypergeometric(*args) > 0)
+ assert_(self.mt19937.hypergeometric(*args) > 0)
def test_logseries_convergence(self):
# Test for ticket #923
N = 1000
- mt19937 = Generator(MT19937(0))
- rvsn = mt19937.logseries(0.8, size=N)
+ rvsn = self.mt19937.logseries(0.8, size=N)
# these two frequency counts should be close to theoretical
# numbers with this large sample
# theoretical large N result is 0.49706795
@@ -65,41 +65,38 @@ class TestRegression:
# Test for multivariate_normal issue with 'size' argument.
# Check that the multivariate_normal size argument can be a
# numpy integer.
- mt19937.multivariate_normal([0], [[0]], size=1)
- mt19937.multivariate_normal([0], [[0]], size=np.int_(1))
- mt19937.multivariate_normal([0], [[0]], size=np.int64(1))
+ self.mt19937.multivariate_normal([0], [[0]], size=1)
+ self.mt19937.multivariate_normal([0], [[0]], size=np.int_(1))
+ self.mt19937.multivariate_normal([0], [[0]], size=np.int64(1))
def test_beta_small_parameters(self):
# Test that beta with small a and b parameters does not produce
# NaNs due to roundoff errors causing 0 / 0, gh-5851
- mt19937 = Generator(MT19937(1234567890))
- x = mt19937.beta(0.0001, 0.0001, size=100)
+ x = self.mt19937.beta(0.0001, 0.0001, size=100)
assert_(not np.any(np.isnan(x)), 'Nans in mt19937.beta')
def test_choice_sum_of_probs_tolerance(self):
# The sum of probs should be 1.0 with some tolerance.
# For low precision dtypes the tolerance was too tight.
# See numpy github issue 6123.
- mt19937 = Generator(MT19937(1234))
a = [1, 2, 3]
counts = [4, 4, 2]
for dt in np.float16, np.float32, np.float64:
probs = np.array(counts, dtype=dt) / sum(counts)
- c = mt19937.choice(a, p=probs)
+ c = self.mt19937.choice(a, p=probs)
assert_(c in a)
with pytest.raises(ValueError):
- mt19937.choice(a, p=probs*0.9)
+ self.mt19937.choice(a, p=probs*0.9)
def test_shuffle_of_array_of_different_length_strings(self):
# Test that permuting an array of different length strings
# will not cause a segfault on garbage collection
# Tests gh-7710
- mt19937 = Generator(MT19937(1234))
a = np.array(['a', 'a' * 1000])
for _ in range(100):
- mt19937.shuffle(a)
+ self.mt19937.shuffle(a)
# Force Garbage Collection - should not segfault.
import gc
@@ -109,17 +106,17 @@ class TestRegression:
# Test that permuting an array of objects will not cause
# a segfault on garbage collection.
# See gh-7719
- mt19937 = Generator(MT19937(1234))
a = np.array([np.arange(1), np.arange(4)], dtype=object)
for _ in range(1000):
- mt19937.shuffle(a)
+ self.mt19937.shuffle(a)
# Force Garbage Collection - should not segfault.
import gc
gc.collect()
def test_permutation_subclass(self):
+
class N(np.ndarray):
pass
@@ -142,9 +139,16 @@ class TestRegression:
assert_array_equal(m.__array__(), np.arange(5))
def test_gamma_0(self):
- assert mt19937.standard_gamma(0.0) == 0.0
- assert_array_equal(mt19937.standard_gamma([0.0]), 0.0)
+ assert self.mt19937.standard_gamma(0.0) == 0.0
+ assert_array_equal(self.mt19937.standard_gamma([0.0]), 0.0)
- actual = mt19937.standard_gamma([0.0], dtype='float')
+ actual = self.mt19937.standard_gamma([0.0], dtype='float')
expected = np.array([0.], dtype=np.float32)
assert_array_equal(actual, expected)
+
+ def test_geometric_tiny_prob(self):
+ # Regression test for gh-17007.
+ # When p = 1e-30, the probability that a sample will exceed 2**63-1
+ # is 0.9999999999907766, so we expect the result to be all 2**63-1.
+ assert_array_equal(self.mt19937.geometric(p=1e-30, size=3),
+ np.iinfo(np.int64).max)
diff --git a/numpy/random/tests/test_randomstate.py b/numpy/random/tests/test_randomstate.py
index 8b911cb3a..3a2961098 100644
--- a/numpy/random/tests/test_randomstate.py
+++ b/numpy/random/tests/test_randomstate.py
@@ -812,6 +812,10 @@ class TestRandomDist:
alpha = np.array([5.4e-01, -1.0e-16])
assert_raises(ValueError, random.dirichlet, alpha)
+ def test_dirichlet_zero_alpha(self):
+ y = random.default_rng().dirichlet([5, 9, 0, 8])
+ assert_equal(y[2], 0)
+
def test_dirichlet_alpha_non_contiguous(self):
a = np.array([51.72840233779265162, -1.0, 39.74494232180943953])
alpha = a[::2]
diff --git a/numpy/testing/__init__.py b/numpy/testing/__init__.py
index 087527e43..8a34221e4 100644
--- a/numpy/testing/__init__.py
+++ b/numpy/testing/__init__.py
@@ -10,12 +10,12 @@ from unittest import TestCase
from . import _private
from ._private.utils import *
from ._private.utils import (_assert_valid_refcount, _gen_alignment_data)
-from ._private import extbuild, decorators as dec
-from ._private.nosetester import (
- run_module_suite, NoseTester as Tester
- )
+from ._private import extbuild
+from . import overrides
-__all__ = _private.utils.__all__ + ['TestCase', 'run_module_suite']
+__all__ = (
+ _private.utils.__all__ + ['TestCase', 'overrides']
+)
from numpy._pytesttester import PytestTester
test = PytestTester(__name__)
diff --git a/numpy/testing/__init__.pyi b/numpy/testing/__init__.pyi
index a981d6113..d65860ccb 100644
--- a/numpy/testing/__init__.pyi
+++ b/numpy/testing/__init__.pyi
@@ -18,7 +18,6 @@ from numpy.testing._private.utils import (
jiffies as jiffies,
memusage as memusage,
print_assert_equal as print_assert_equal,
- raises as raises,
rundocs as rundocs,
runstring as runstring,
verbose as verbose,
@@ -49,8 +48,3 @@ from numpy.testing._private.utils import (
__all__: list[str]
__path__: list[str]
test: PytestTester
-
-def run_module_suite(
- file_to_run: None | str = ...,
- argv: None | list[str] = ...,
-) -> None: ...
diff --git a/numpy/testing/_private/decorators.py b/numpy/testing/_private/decorators.py
deleted file mode 100644
index cb49d9a73..000000000
--- a/numpy/testing/_private/decorators.py
+++ /dev/null
@@ -1,331 +0,0 @@
-"""
-Decorators for labeling and modifying behavior of test objects.
-
-Decorators that merely return a modified version of the original
-function object are straightforward. Decorators that return a new
-function object need to use
-::
-
- nose.tools.make_decorator(original_function)(decorator)
-
-in returning the decorator, in order to preserve meta-data such as
-function name, setup and teardown functions and so on - see
-``nose.tools`` for more information.
-
-"""
-import collections.abc
-import warnings
-
-from .utils import SkipTest, assert_warns, HAS_REFCOUNT
-
-__all__ = ['slow', 'setastest', 'skipif', 'knownfailureif', 'deprecated',
- 'parametrize', '_needs_refcount',]
-
-
-def slow(t):
- """
- .. deprecated:: 1.21
- This decorator is retained for compatibility with the nose testing framework, which is being phased out.
- Please use the nose2 or pytest frameworks instead.
-
- Label a test as 'slow'.
-
- The exact definition of a slow test is obviously both subjective and
- hardware-dependent, but in general any individual test that requires more
- than a second or two should be labeled as slow (the whole suite consists of
- thousands of tests, so even a second is significant).
-
- Parameters
- ----------
- t : callable
- The test to label as slow.
-
- Returns
- -------
- t : callable
- The decorated test `t`.
-
- Examples
- --------
- The `numpy.testing` module includes ``import decorators as dec``.
- A test can be decorated as slow like this::
-
- from numpy.testing import *
-
- @dec.slow
- def test_big(self):
- print('Big, slow test')
-
- """
- # Numpy 1.21, 2020-12-20
- warnings.warn('the np.testing.dec decorators are included for nose support, and are '
- 'deprecated since NumPy v1.21. Use the nose2 or pytest frameworks instead.', DeprecationWarning, stacklevel=2)
-
- t.slow = True
- return t
-
-def setastest(tf=True):
- """
- .. deprecated:: 1.21
- This decorator is retained for compatibility with the nose testing framework, which is being phased out.
- Please use the nose2 or pytest frameworks instead.
-
- Signals to nose that this function is or is not a test.
-
- Parameters
- ----------
- tf : bool
- If True, specifies that the decorated callable is a test.
- If False, specifies that the decorated callable is not a test.
- Default is True.
-
- Notes
- -----
- This decorator can't use the nose namespace, because it can be
- called from a non-test module. See also ``istest`` and ``nottest`` in
- ``nose.tools``.
-
- Examples
- --------
- `setastest` can be used in the following way::
-
- from numpy.testing import dec
-
- @dec.setastest(False)
- def func_with_test_in_name(arg1, arg2):
- pass
-
- """
- # Numpy 1.21, 2020-12-20
- warnings.warn('the np.testing.dec decorators are included for nose support, and are '
- 'deprecated since NumPy v1.21. Use the nose2 or pytest frameworks instead.', DeprecationWarning, stacklevel=2)
- def set_test(t):
- t.__test__ = tf
- return t
- return set_test
-
-def skipif(skip_condition, msg=None):
- """
- .. deprecated:: 1.21
- This decorator is retained for compatibility with the nose testing framework, which is being phased out.
- Please use the nose2 or pytest frameworks instead.
-
- Make function raise SkipTest exception if a given condition is true.
-
- If the condition is a callable, it is used at runtime to dynamically
- make the decision. This is useful for tests that may require costly
- imports, to delay the cost until the test suite is actually executed.
-
- Parameters
- ----------
- skip_condition : bool or callable
- Flag to determine whether to skip the decorated test.
- msg : str, optional
- Message to give on raising a SkipTest exception. Default is None.
-
- Returns
- -------
- decorator : function
- Decorator which, when applied to a function, causes SkipTest
- to be raised when `skip_condition` is True, and the function
- to be called normally otherwise.
-
- Notes
- -----
- The decorator itself is decorated with the ``nose.tools.make_decorator``
- function in order to transmit function name, and various other metadata.
-
- """
-
- def skip_decorator(f):
- # Local import to avoid a hard nose dependency and only incur the
- # import time overhead at actual test-time.
- import nose
-
- # Numpy 1.21, 2020-12-20
- warnings.warn('the np.testing.dec decorators are included for nose support, and are '
- 'deprecated since NumPy v1.21. Use the nose2 or pytest frameworks instead.', DeprecationWarning, stacklevel=2)
-
- # Allow for both boolean or callable skip conditions.
- if isinstance(skip_condition, collections.abc.Callable):
- skip_val = lambda: skip_condition()
- else:
- skip_val = lambda: skip_condition
-
- def get_msg(func,msg=None):
- """Skip message with information about function being skipped."""
- if msg is None:
- out = 'Test skipped due to test condition'
- else:
- out = msg
-
- return f'Skipping test: {func.__name__}: {out}'
-
- # We need to define *two* skippers because Python doesn't allow both
- # return with value and yield inside the same function.
- def skipper_func(*args, **kwargs):
- """Skipper for normal test functions."""
- if skip_val():
- raise SkipTest(get_msg(f, msg))
- else:
- return f(*args, **kwargs)
-
- def skipper_gen(*args, **kwargs):
- """Skipper for test generators."""
- if skip_val():
- raise SkipTest(get_msg(f, msg))
- else:
- yield from f(*args, **kwargs)
-
- # Choose the right skipper to use when building the actual decorator.
- if nose.util.isgenerator(f):
- skipper = skipper_gen
- else:
- skipper = skipper_func
-
- return nose.tools.make_decorator(f)(skipper)
-
- return skip_decorator
-
-
-def knownfailureif(fail_condition, msg=None):
- """
- .. deprecated:: 1.21
- This decorator is retained for compatibility with the nose testing framework, which is being phased out.
- Please use the nose2 or pytest frameworks instead.
-
- Make function raise KnownFailureException exception if given condition is true.
-
- If the condition is a callable, it is used at runtime to dynamically
- make the decision. This is useful for tests that may require costly
- imports, to delay the cost until the test suite is actually executed.
-
- Parameters
- ----------
- fail_condition : bool or callable
- Flag to determine whether to mark the decorated test as a known
- failure (if True) or not (if False).
- msg : str, optional
- Message to give on raising a KnownFailureException exception.
- Default is None.
-
- Returns
- -------
- decorator : function
- Decorator, which, when applied to a function, causes
- KnownFailureException to be raised when `fail_condition` is True,
- and the function to be called normally otherwise.
-
- Notes
- -----
- The decorator itself is decorated with the ``nose.tools.make_decorator``
- function in order to transmit function name, and various other metadata.
-
- """
- # Numpy 1.21, 2020-12-20
- warnings.warn('the np.testing.dec decorators are included for nose support, and are '
- 'deprecated since NumPy v1.21. Use the nose2 or pytest frameworks instead.', DeprecationWarning, stacklevel=2)
-
- if msg is None:
- msg = 'Test skipped due to known failure'
-
- # Allow for both boolean or callable known failure conditions.
- if isinstance(fail_condition, collections.abc.Callable):
- fail_val = lambda: fail_condition()
- else:
- fail_val = lambda: fail_condition
-
- def knownfail_decorator(f):
- # Local import to avoid a hard nose dependency and only incur the
- # import time overhead at actual test-time.
- import nose
- from .noseclasses import KnownFailureException
-
- def knownfailer(*args, **kwargs):
- if fail_val():
- raise KnownFailureException(msg)
- else:
- return f(*args, **kwargs)
- return nose.tools.make_decorator(f)(knownfailer)
-
- return knownfail_decorator
-
-def deprecated(conditional=True):
- """
- .. deprecated:: 1.21
- This decorator is retained for compatibility with the nose testing framework, which is being phased out.
- Please use the nose2 or pytest frameworks instead.
-
- Filter deprecation warnings while running the test suite.
-
- This decorator can be used to filter DeprecationWarning's, to avoid
- printing them during the test suite run, while checking that the test
- actually raises a DeprecationWarning.
-
- Parameters
- ----------
- conditional : bool or callable, optional
- Flag to determine whether to mark test as deprecated or not. If the
- condition is a callable, it is used at runtime to dynamically make the
- decision. Default is True.
-
- Returns
- -------
- decorator : function
- The `deprecated` decorator itself.
-
- Notes
- -----
- .. versionadded:: 1.4.0
-
- """
- def deprecate_decorator(f):
- # Local import to avoid a hard nose dependency and only incur the
- # import time overhead at actual test-time.
- import nose
-
- # Numpy 1.21, 2020-12-20
- warnings.warn('the np.testing.dec decorators are included for nose support, and are '
- 'deprecated since NumPy v1.21. Use the nose2 or pytest frameworks instead.', DeprecationWarning, stacklevel=2)
-
- def _deprecated_imp(*args, **kwargs):
- # Poor man's replacement for the with statement
- with assert_warns(DeprecationWarning):
- f(*args, **kwargs)
-
- if isinstance(conditional, collections.abc.Callable):
- cond = conditional()
- else:
- cond = conditional
- if cond:
- return nose.tools.make_decorator(f)(_deprecated_imp)
- else:
- return f
- return deprecate_decorator
-
-
-def parametrize(vars, input):
- """
- .. deprecated:: 1.21
- This decorator is retained for compatibility with the nose testing framework, which is being phased out.
- Please use the nose2 or pytest frameworks instead.
-
- Pytest compatibility class. This implements the simplest level of
- pytest.mark.parametrize for use in nose as an aid in making the transition
- to pytest. It achieves that by adding a dummy var parameter and ignoring
- the doc_func parameter of the base class. It does not support variable
- substitution by name, nor does it support nesting or classes. See the
- pytest documentation for usage.
-
- .. versionadded:: 1.14.0
-
- """
- from .parameterized import parameterized
-
- # Numpy 1.21, 2020-12-20
- warnings.warn('the np.testing.dec decorators are included for nose support, and are '
- 'deprecated since NumPy v1.21. Use the nose2 or pytest frameworks instead.', DeprecationWarning, stacklevel=2)
-
- return parameterized(input)
-
-_needs_refcount = skipif(not HAS_REFCOUNT, "python has no sys.getrefcount")
diff --git a/numpy/testing/_private/noseclasses.py b/numpy/testing/_private/noseclasses.py
deleted file mode 100644
index 48fa4dc1f..000000000
--- a/numpy/testing/_private/noseclasses.py
+++ /dev/null
@@ -1,364 +0,0 @@
-# These classes implement a doctest runner plugin for nose, a "known failure"
-# error class, and a customized TestProgram for NumPy.
-
-# Because this module imports nose directly, it should not
-# be used except by nosetester.py to avoid a general NumPy
-# dependency on nose.
-import os
-import sys
-import doctest
-import inspect
-
-import numpy
-import nose
-from nose.plugins import doctests as npd
-from nose.plugins.errorclass import ErrorClass, ErrorClassPlugin
-from nose.plugins.base import Plugin
-from nose.util import src
-from .nosetester import get_package_name
-from .utils import KnownFailureException, KnownFailureTest
-
-
-# Some of the classes in this module begin with 'Numpy' to clearly distinguish
-# them from the plethora of very similar names from nose/unittest/doctest
-
-#-----------------------------------------------------------------------------
-# Modified version of the one in the stdlib, that fixes a python bug (doctests
-# not found in extension modules, https://bugs.python.org/issue3158)
-class NumpyDocTestFinder(doctest.DocTestFinder):
-
- def _from_module(self, module, object):
- """
- Return true if the given object is defined in the given
- module.
- """
- if module is None:
- return True
- elif inspect.isfunction(object):
- return module.__dict__ is object.__globals__
- elif inspect.isbuiltin(object):
- return module.__name__ == object.__module__
- elif inspect.isclass(object):
- return module.__name__ == object.__module__
- elif inspect.ismethod(object):
- # This one may be a bug in cython that fails to correctly set the
- # __module__ attribute of methods, but since the same error is easy
- # to make by extension code writers, having this safety in place
- # isn't such a bad idea
- return module.__name__ == object.__self__.__class__.__module__
- elif inspect.getmodule(object) is not None:
- return module is inspect.getmodule(object)
- elif hasattr(object, '__module__'):
- return module.__name__ == object.__module__
- elif isinstance(object, property):
- return True # [XX] no way not be sure.
- else:
- raise ValueError("object must be a class or function")
-
- def _find(self, tests, obj, name, module, source_lines, globs, seen):
- """
- Find tests for the given object and any contained objects, and
- add them to `tests`.
- """
-
- doctest.DocTestFinder._find(self, tests, obj, name, module,
- source_lines, globs, seen)
-
- # Below we re-run pieces of the above method with manual modifications,
- # because the original code is buggy and fails to correctly identify
- # doctests in extension modules.
-
- # Local shorthands
- from inspect import (
- isroutine, isclass, ismodule, isfunction, ismethod
- )
-
- # Look for tests in a module's contained objects.
- if ismodule(obj) and self._recurse:
- for valname, val in obj.__dict__.items():
- valname1 = f'{name}.{valname}'
- if ( (isroutine(val) or isclass(val))
- and self._from_module(module, val)):
-
- self._find(tests, val, valname1, module, source_lines,
- globs, seen)
-
- # Look for tests in a class's contained objects.
- if isclass(obj) and self._recurse:
- for valname, val in obj.__dict__.items():
- # Special handling for staticmethod/classmethod.
- if isinstance(val, staticmethod):
- val = getattr(obj, valname)
- if isinstance(val, classmethod):
- val = getattr(obj, valname).__func__
-
- # Recurse to methods, properties, and nested classes.
- if ((isfunction(val) or isclass(val) or
- ismethod(val) or isinstance(val, property)) and
- self._from_module(module, val)):
- valname = f'{name}.{valname}'
- self._find(tests, val, valname, module, source_lines,
- globs, seen)
-
-
-# second-chance checker; if the default comparison doesn't
-# pass, then see if the expected output string contains flags that
-# tell us to ignore the output
-class NumpyOutputChecker(doctest.OutputChecker):
- def check_output(self, want, got, optionflags):
- ret = doctest.OutputChecker.check_output(self, want, got,
- optionflags)
- if not ret:
- if "#random" in want:
- return True
-
- # it would be useful to normalize endianness so that
- # bigendian machines don't fail all the tests (and there are
- # actually some bigendian examples in the doctests). Let's try
- # making them all little endian
- got = got.replace("'>", "'<")
- want = want.replace("'>", "'<")
-
- # try to normalize out 32 and 64 bit default int sizes
- for sz in [4, 8]:
- got = got.replace("'<i%d'" % sz, "int")
- want = want.replace("'<i%d'" % sz, "int")
-
- ret = doctest.OutputChecker.check_output(self, want,
- got, optionflags)
-
- return ret
-
-
-# Subclass nose.plugins.doctests.DocTestCase to work around a bug in
-# its constructor that blocks non-default arguments from being passed
-# down into doctest.DocTestCase
-class NumpyDocTestCase(npd.DocTestCase):
- def __init__(self, test, optionflags=0, setUp=None, tearDown=None,
- checker=None, obj=None, result_var='_'):
- self._result_var = result_var
- self._nose_obj = obj
- doctest.DocTestCase.__init__(self, test,
- optionflags=optionflags,
- setUp=setUp, tearDown=tearDown,
- checker=checker)
-
-
-print_state = numpy.get_printoptions()
-
-class NumpyDoctest(npd.Doctest):
- name = 'numpydoctest' # call nosetests with --with-numpydoctest
- score = 1000 # load late, after doctest builtin
-
- # always use whitespace and ellipsis options for doctests
- doctest_optflags = doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS
-
- # files that should be ignored for doctests
- doctest_ignore = ['generate_numpy_api.py',
- 'setup.py']
-
- # Custom classes; class variables to allow subclassing
- doctest_case_class = NumpyDocTestCase
- out_check_class = NumpyOutputChecker
- test_finder_class = NumpyDocTestFinder
-
- # Don't use the standard doctest option handler; hard-code the option values
- def options(self, parser, env=os.environ):
- Plugin.options(self, parser, env)
- # Test doctests in 'test' files / directories. Standard plugin default
- # is False
- self.doctest_tests = True
- # Variable name; if defined, doctest results stored in this variable in
- # the top-level namespace. None is the standard default
- self.doctest_result_var = None
-
- def configure(self, options, config):
- # parent method sets enabled flag from command line --with-numpydoctest
- Plugin.configure(self, options, config)
- self.finder = self.test_finder_class()
- self.parser = doctest.DocTestParser()
- if self.enabled:
- # Pull standard doctest out of plugin list; there's no reason to run
- # both. In practice the Unplugger plugin above would cover us when
- # run from a standard numpy.test() call; this is just in case
- # someone wants to run our plugin outside the numpy.test() machinery
- config.plugins.plugins = [p for p in config.plugins.plugins
- if p.name != 'doctest']
-
- def set_test_context(self, test):
- """ Configure `test` object to set test context
-
- We set the numpy / scipy standard doctest namespace
-
- Parameters
- ----------
- test : test object
- with ``globs`` dictionary defining namespace
-
- Returns
- -------
- None
-
- Notes
- -----
- `test` object modified in place
- """
- # set the namespace for tests
- pkg_name = get_package_name(os.path.dirname(test.filename))
-
- # Each doctest should execute in an environment equivalent to
- # starting Python and executing "import numpy as np", and,
- # for SciPy packages, an additional import of the local
- # package (so that scipy.linalg.basic.py's doctests have an
- # implicit "from scipy import linalg" as well).
- #
- # Note: __file__ allows the doctest in NoseTester to run
- # without producing an error
- test.globs = {'__builtins__':__builtins__,
- '__file__':'__main__',
- '__name__':'__main__',
- 'np':numpy}
- # add appropriate scipy import for SciPy tests
- if 'scipy' in pkg_name:
- p = pkg_name.split('.')
- p2 = p[-1]
- test.globs[p2] = __import__(pkg_name, test.globs, {}, [p2])
-
- # Override test loading to customize test context (with set_test_context
- # method), set standard docstring options, and install our own test output
- # checker
- def loadTestsFromModule(self, module):
- if not self.matches(module.__name__):
- npd.log.debug("Doctest doesn't want module %s", module)
- return
- try:
- tests = self.finder.find(module)
- except AttributeError:
- # nose allows module.__test__ = False; doctest does not and
- # throws AttributeError
- return
- if not tests:
- return
- tests.sort()
- module_file = src(module.__file__)
- for test in tests:
- if not test.examples:
- continue
- if not test.filename:
- test.filename = module_file
- # Set test namespace; test altered in place
- self.set_test_context(test)
- yield self.doctest_case_class(test,
- optionflags=self.doctest_optflags,
- checker=self.out_check_class(),
- result_var=self.doctest_result_var)
-
- # Add an afterContext method to nose.plugins.doctests.Doctest in order
- # to restore print options to the original state after each doctest
- def afterContext(self):
- numpy.set_printoptions(**print_state)
-
- # Ignore NumPy-specific build files that shouldn't be searched for tests
- def wantFile(self, file):
- bn = os.path.basename(file)
- if bn in self.doctest_ignore:
- return False
- return npd.Doctest.wantFile(self, file)
-
-
-class Unplugger:
- """ Nose plugin to remove named plugin late in loading
-
- By default it removes the "doctest" plugin.
- """
- name = 'unplugger'
- enabled = True # always enabled
- score = 4000 # load late in order to be after builtins
-
- def __init__(self, to_unplug='doctest'):
- self.to_unplug = to_unplug
-
- def options(self, parser, env):
- pass
-
- def configure(self, options, config):
- # Pull named plugin out of plugins list
- config.plugins.plugins = [p for p in config.plugins.plugins
- if p.name != self.to_unplug]
-
-
-class KnownFailurePlugin(ErrorClassPlugin):
- '''Plugin that installs a KNOWNFAIL error class for the
- KnownFailureClass exception. When KnownFailure is raised,
- the exception will be logged in the knownfail attribute of the
- result, 'K' or 'KNOWNFAIL' (verbose) will be output, and the
- exception will not be counted as an error or failure.'''
- enabled = True
- knownfail = ErrorClass(KnownFailureException,
- label='KNOWNFAIL',
- isfailure=False)
-
- def options(self, parser, env=os.environ):
- env_opt = 'NOSE_WITHOUT_KNOWNFAIL'
- parser.add_option('--no-knownfail', action='store_true',
- dest='noKnownFail', default=env.get(env_opt, False),
- help='Disable special handling of KnownFailure '
- 'exceptions')
-
- def configure(self, options, conf):
- if not self.can_configure:
- return
- self.conf = conf
- disable = getattr(options, 'noKnownFail', False)
- if disable:
- self.enabled = False
-
-KnownFailure = KnownFailurePlugin # backwards compat
-
-
-class FPUModeCheckPlugin(Plugin):
- """
- Plugin that checks the FPU mode before and after each test,
- raising failures if the test changed the mode.
- """
-
- def prepareTestCase(self, test):
- from numpy.core._multiarray_tests import get_fpu_mode
-
- def run(result):
- old_mode = get_fpu_mode()
- test.test(result)
- new_mode = get_fpu_mode()
-
- if old_mode != new_mode:
- try:
- raise AssertionError(
- "FPU mode changed from {0:#x} to {1:#x} during the "
- "test".format(old_mode, new_mode))
- except AssertionError:
- result.addFailure(test, sys.exc_info())
-
- return run
-
-
-# Class allows us to save the results of the tests in runTests - see runTests
-# method docstring for details
-class NumpyTestProgram(nose.core.TestProgram):
- def runTests(self):
- """Run Tests. Returns true on success, false on failure, and
- sets self.success to the same value.
-
- Because nose currently discards the test result object, but we need
- to return it to the user, override TestProgram.runTests to retain
- the result
- """
- if self.testRunner is None:
- self.testRunner = nose.core.TextTestRunner(stream=self.config.stream,
- verbosity=self.config.verbosity,
- config=self.config)
- plug_runner = self.config.plugins.prepareTestRunner(self.testRunner)
- if plug_runner is not None:
- self.testRunner = plug_runner
- self.result = self.testRunner.run(self.test)
- self.success = self.result.wasSuccessful()
- return self.success
diff --git a/numpy/testing/_private/nosetester.py b/numpy/testing/_private/nosetester.py
deleted file mode 100644
index bccec8236..000000000
--- a/numpy/testing/_private/nosetester.py
+++ /dev/null
@@ -1,545 +0,0 @@
-"""
-Nose test running.
-
-This module implements ``test()`` and ``bench()`` functions for NumPy modules.
-
-"""
-import os
-import sys
-import warnings
-import numpy as np
-
-from .utils import import_nose, suppress_warnings
-
-
-__all__ = ['get_package_name', 'run_module_suite', 'NoseTester',
- '_numpy_tester', 'get_package_name', 'import_nose',
- 'suppress_warnings']
-
-
-def get_package_name(filepath):
- """
- Given a path where a package is installed, determine its name.
-
- Parameters
- ----------
- filepath : str
- Path to a file. If the determination fails, "numpy" is returned.
-
- Examples
- --------
- >>> np.testing.nosetester.get_package_name('nonsense')
- 'numpy'
-
- """
-
- fullpath = filepath[:]
- pkg_name = []
- while 'site-packages' in filepath or 'dist-packages' in filepath:
- filepath, p2 = os.path.split(filepath)
- if p2 in ('site-packages', 'dist-packages'):
- break
- pkg_name.append(p2)
-
- # if package name determination failed, just default to numpy/scipy
- if not pkg_name:
- if 'scipy' in fullpath:
- return 'scipy'
- else:
- return 'numpy'
-
- # otherwise, reverse to get correct order and return
- pkg_name.reverse()
-
- # don't include the outer egg directory
- if pkg_name[0].endswith('.egg'):
- pkg_name.pop(0)
-
- return '.'.join(pkg_name)
-
-
-def run_module_suite(file_to_run=None, argv=None):
- """
- Run a test module.
-
- Equivalent to calling ``$ nosetests <argv> <file_to_run>`` from
- the command line
-
- Parameters
- ----------
- file_to_run : str, optional
- Path to test module, or None.
- By default, run the module from which this function is called.
- argv : list of strings
- Arguments to be passed to the nose test runner. ``argv[0]`` is
- ignored. All command line arguments accepted by ``nosetests``
- will work. If it is the default value None, sys.argv is used.
-
- .. versionadded:: 1.9.0
-
- Examples
- --------
- Adding the following::
-
- if __name__ == "__main__" :
- run_module_suite(argv=sys.argv)
-
- at the end of a test module will run the tests when that module is
- called in the python interpreter.
-
- Alternatively, calling::
-
- >>> run_module_suite(file_to_run="numpy/tests/test_matlib.py") # doctest: +SKIP
-
- from an interpreter will run all the test routine in 'test_matlib.py'.
- """
- if file_to_run is None:
- f = sys._getframe(1)
- file_to_run = f.f_locals.get('__file__', None)
- if file_to_run is None:
- raise AssertionError
-
- if argv is None:
- argv = sys.argv + [file_to_run]
- else:
- argv = argv + [file_to_run]
-
- nose = import_nose()
- from .noseclasses import KnownFailurePlugin
- nose.run(argv=argv, addplugins=[KnownFailurePlugin()])
-
-
-class NoseTester:
- """
- Nose test runner.
-
- This class is made available as numpy.testing.Tester, and a test function
- is typically added to a package's __init__.py like so::
-
- from numpy.testing import Tester
- test = Tester().test
-
- Calling this test function finds and runs all tests associated with the
- package and all its sub-packages.
-
- Attributes
- ----------
- package_path : str
- Full path to the package to test.
- package_name : str
- Name of the package to test.
-
- Parameters
- ----------
- package : module, str or None, optional
- The package to test. If a string, this should be the full path to
- the package. If None (default), `package` is set to the module from
- which `NoseTester` is initialized.
- raise_warnings : None, str or sequence of warnings, optional
- This specifies which warnings to configure as 'raise' instead
- of being shown once during the test execution. Valid strings are:
-
- - "develop" : equals ``(Warning,)``
- - "release" : equals ``()``, don't raise on any warnings.
-
- Default is "release".
- depth : int, optional
- If `package` is None, then this can be used to initialize from the
- module of the caller of (the caller of (...)) the code that
- initializes `NoseTester`. Default of 0 means the module of the
- immediate caller; higher values are useful for utility routines that
- want to initialize `NoseTester` objects on behalf of other code.
-
- """
- def __init__(self, package=None, raise_warnings="release", depth=0,
- check_fpu_mode=False):
- # Back-compat: 'None' used to mean either "release" or "develop"
- # depending on whether this was a release or develop version of
- # numpy. Those semantics were fine for testing numpy, but not so
- # helpful for downstream projects like scipy that use
- # numpy.testing. (They want to set this based on whether *they* are a
- # release or develop version, not whether numpy is.) So we continue to
- # accept 'None' for back-compat, but it's now just an alias for the
- # default "release".
- if raise_warnings is None:
- raise_warnings = "release"
-
- package_name = None
- if package is None:
- f = sys._getframe(1 + depth)
- package_path = f.f_locals.get('__file__', None)
- if package_path is None:
- raise AssertionError
- package_path = os.path.dirname(package_path)
- package_name = f.f_locals.get('__name__', None)
- elif isinstance(package, type(os)):
- package_path = os.path.dirname(package.__file__)
- package_name = getattr(package, '__name__', None)
- else:
- package_path = str(package)
-
- self.package_path = package_path
-
- # Find the package name under test; this name is used to limit coverage
- # reporting (if enabled).
- if package_name is None:
- package_name = get_package_name(package_path)
- self.package_name = package_name
-
- # Set to "release" in constructor in maintenance branches.
- self.raise_warnings = raise_warnings
-
- # Whether to check for FPU mode changes
- self.check_fpu_mode = check_fpu_mode
-
- def _test_argv(self, label, verbose, extra_argv):
- ''' Generate argv for nosetest command
-
- Parameters
- ----------
- label : {'fast', 'full', '', attribute identifier}, optional
- see ``test`` docstring
- verbose : int, optional
- Verbosity value for test outputs, in the range 1-10. Default is 1.
- extra_argv : list, optional
- List with any extra arguments to pass to nosetests.
-
- Returns
- -------
- argv : list
- command line arguments that will be passed to nose
- '''
- argv = [__file__, self.package_path, '-s']
- if label and label != 'full':
- if not isinstance(label, str):
- raise TypeError('Selection label should be a string')
- if label == 'fast':
- label = 'not slow'
- argv += ['-A', label]
- argv += ['--verbosity', str(verbose)]
-
- # When installing with setuptools, and also in some other cases, the
- # test_*.py files end up marked +x executable. Nose, by default, does
- # not run files marked with +x as they might be scripts. However, in
- # our case nose only looks for test_*.py files under the package
- # directory, which should be safe.
- argv += ['--exe']
-
- if extra_argv:
- argv += extra_argv
- return argv
-
- def _show_system_info(self):
- nose = import_nose()
-
- import numpy
- print(f'NumPy version {numpy.__version__}')
- relaxed_strides = numpy.ones((10, 1), order="C").flags.f_contiguous
- print("NumPy relaxed strides checking option:", relaxed_strides)
- npdir = os.path.dirname(numpy.__file__)
- print(f'NumPy is installed in {npdir}')
-
- if 'scipy' in self.package_name:
- import scipy
- print(f'SciPy version {scipy.__version__}')
- spdir = os.path.dirname(scipy.__file__)
- print(f'SciPy is installed in {spdir}')
-
- pyversion = sys.version.replace('\n', '')
- print(f'Python version {pyversion}')
- print("nose version %d.%d.%d" % nose.__versioninfo__)
-
- def _get_custom_doctester(self):
- """ Return instantiated plugin for doctests
-
- Allows subclassing of this class to override doctester
-
- A return value of None means use the nose builtin doctest plugin
- """
- from .noseclasses import NumpyDoctest
- return NumpyDoctest()
-
- def prepare_test_args(self, label='fast', verbose=1, extra_argv=None,
- doctests=False, coverage=False, timer=False):
- """
- Run tests for module using nose.
-
- This method does the heavy lifting for the `test` method. It takes all
- the same arguments, for details see `test`.
-
- See Also
- --------
- test
-
- """
- # fail with nice error message if nose is not present
- import_nose()
- # compile argv
- argv = self._test_argv(label, verbose, extra_argv)
- # our way of doing coverage
- if coverage:
- argv += [f'--cover-package={self.package_name}', '--with-coverage',
- '--cover-tests', '--cover-erase']
-
- if timer:
- if timer is True:
- argv += ['--with-timer']
- elif isinstance(timer, int):
- argv += ['--with-timer', '--timer-top-n', str(timer)]
-
- # construct list of plugins
- import nose.plugins.builtin
- from nose.plugins import EntryPointPluginManager
- from .noseclasses import (KnownFailurePlugin, Unplugger,
- FPUModeCheckPlugin)
- plugins = [KnownFailurePlugin()]
- plugins += [p() for p in nose.plugins.builtin.plugins]
- if self.check_fpu_mode:
- plugins += [FPUModeCheckPlugin()]
- argv += ["--with-fpumodecheckplugin"]
- try:
- # External plugins (like nose-timer)
- entrypoint_manager = EntryPointPluginManager()
- entrypoint_manager.loadPlugins()
- plugins += [p for p in entrypoint_manager.plugins]
- except ImportError:
- # Relies on pkg_resources, not a hard dependency
- pass
-
- # add doctesting if required
- doctest_argv = '--with-doctest' in argv
- if doctests == False and doctest_argv:
- doctests = True
- plug = self._get_custom_doctester()
- if plug is None:
- # use standard doctesting
- if doctests and not doctest_argv:
- argv += ['--with-doctest']
- else: # custom doctesting
- if doctest_argv: # in fact the unplugger would take care of this
- argv.remove('--with-doctest')
- plugins += [Unplugger('doctest'), plug]
- if doctests:
- argv += ['--with-' + plug.name]
- return argv, plugins
-
- def test(self, label='fast', verbose=1, extra_argv=None,
- doctests=False, coverage=False, raise_warnings=None,
- timer=False):
- """
- Run tests for module using nose.
-
- Parameters
- ----------
- label : {'fast', 'full', '', attribute identifier}, optional
- Identifies the tests to run. This can be a string to pass to
- the nosetests executable with the '-A' option, or one of several
- special values. Special values are:
-
- * 'fast' - the default - which corresponds to the ``nosetests -A``
- option of 'not slow'.
- * 'full' - fast (as above) and slow tests as in the
- 'no -A' option to nosetests - this is the same as ''.
- * None or '' - run all tests.
- * attribute_identifier - string passed directly to nosetests as '-A'.
-
- verbose : int, optional
- Verbosity value for test outputs, in the range 1-10. Default is 1.
- extra_argv : list, optional
- List with any extra arguments to pass to nosetests.
- doctests : bool, optional
- If True, run doctests in module. Default is False.
- coverage : bool, optional
- If True, report coverage of NumPy code. Default is False.
- (This requires the
- `coverage module <https://pypi.org/project/coverage/>`_).
- raise_warnings : None, str or sequence of warnings, optional
- This specifies which warnings to configure as 'raise' instead
- of being shown once during the test execution. Valid strings are:
-
- * "develop" : equals ``(Warning,)``
- * "release" : equals ``()``, do not raise on any warnings.
- timer : bool or int, optional
- Timing of individual tests with ``nose-timer`` (which needs to be
- installed). If True, time tests and report on all of them.
- If an integer (say ``N``), report timing results for ``N`` slowest
- tests.
-
- Returns
- -------
- result : object
- Returns the result of running the tests as a
- ``nose.result.TextTestResult`` object.
-
- Notes
- -----
- Each NumPy module exposes `test` in its namespace to run all tests for it.
- For example, to run all tests for numpy.lib:
-
- >>> np.lib.test() #doctest: +SKIP
-
- Examples
- --------
- >>> result = np.lib.test() #doctest: +SKIP
- Running unit tests for numpy.lib
- ...
- Ran 976 tests in 3.933s
-
- OK
-
- >>> result.errors #doctest: +SKIP
- []
- >>> result.knownfail #doctest: +SKIP
- []
- """
-
- # cap verbosity at 3 because nose becomes *very* verbose beyond that
- verbose = min(verbose, 3)
-
- from . import utils
- utils.verbose = verbose
-
- argv, plugins = self.prepare_test_args(
- label, verbose, extra_argv, doctests, coverage, timer)
-
- if doctests:
- print(f'Running unit tests and doctests for {self.package_name}')
- else:
- print(f'Running unit tests for {self.package_name}')
-
- self._show_system_info()
-
- # reset doctest state on every run
- import doctest
- doctest.master = None
-
- if raise_warnings is None:
- raise_warnings = self.raise_warnings
-
- _warn_opts = dict(develop=(Warning,),
- release=())
- if isinstance(raise_warnings, str):
- raise_warnings = _warn_opts[raise_warnings]
-
- with suppress_warnings("location") as sup:
- # Reset the warning filters to the default state,
- # so that running the tests is more repeatable.
- warnings.resetwarnings()
- # Set all warnings to 'warn', this is because the default 'once'
- # has the bad property of possibly shadowing later warnings.
- warnings.filterwarnings('always')
- # Force the requested warnings to raise
- for warningtype in raise_warnings:
- warnings.filterwarnings('error', category=warningtype)
- # Filter out annoying import messages.
- sup.filter(message='Not importing directory')
- sup.filter(message="numpy.dtype size changed")
- sup.filter(message="numpy.ufunc size changed")
- sup.filter(category=np.ModuleDeprecationWarning)
- # Filter out boolean '-' deprecation messages. This allows
- # older versions of scipy to test without a flood of messages.
- sup.filter(message=".*boolean negative.*")
- sup.filter(message=".*boolean subtract.*")
- # Filter out distutils cpu warnings (could be localized to
- # distutils tests). ASV has problems with top level import,
- # so fetch module for suppression here.
- with warnings.catch_warnings():
- warnings.simplefilter("always")
- from ...distutils import cpuinfo
- sup.filter(category=UserWarning, module=cpuinfo)
- # Filter out some deprecation warnings inside nose 1.3.7 when run
- # on python 3.5b2. See
- # https://github.com/nose-devs/nose/issues/929
- # Note: it is hard to filter based on module for sup (lineno could
- # be implemented).
- warnings.filterwarnings("ignore", message=".*getargspec.*",
- category=DeprecationWarning,
- module=r"nose\.")
-
- from .noseclasses import NumpyTestProgram
-
- t = NumpyTestProgram(argv=argv, exit=False, plugins=plugins)
-
- return t.result
-
- def bench(self, label='fast', verbose=1, extra_argv=None):
- """
- Run benchmarks for module using nose.
-
- Parameters
- ----------
- label : {'fast', 'full', '', attribute identifier}, optional
- Identifies the benchmarks to run. This can be a string to pass to
- the nosetests executable with the '-A' option, or one of several
- special values. Special values are:
-
- * 'fast' - the default - which corresponds to the ``nosetests -A``
- option of 'not slow'.
- * 'full' - fast (as above) and slow benchmarks as in the
- 'no -A' option to nosetests - this is the same as ''.
- * None or '' - run all tests.
- * attribute_identifier - string passed directly to nosetests as '-A'.
-
- verbose : int, optional
- Verbosity value for benchmark outputs, in the range 1-10. Default is 1.
- extra_argv : list, optional
- List with any extra arguments to pass to nosetests.
-
- Returns
- -------
- success : bool
- Returns True if running the benchmarks works, False if an error
- occurred.
-
- Notes
- -----
- Benchmarks are like tests, but have names starting with "bench" instead
- of "test", and can be found under the "benchmarks" sub-directory of the
- module.
-
- Each NumPy module exposes `bench` in its namespace to run all benchmarks
- for it.
-
- Examples
- --------
- >>> success = np.lib.bench() #doctest: +SKIP
- Running benchmarks for numpy.lib
- ...
- using 562341 items:
- unique:
- 0.11
- unique1d:
- 0.11
- ratio: 1.0
- nUnique: 56230 == 56230
- ...
- OK
-
- >>> success #doctest: +SKIP
- True
-
- """
-
- print(f'Running benchmarks for {self.package_name}')
- self._show_system_info()
-
- argv = self._test_argv(label, verbose, extra_argv)
- argv += ['--match', r'(?:^|[\\b_\\.%s-])[Bb]ench' % os.sep]
-
- # import nose or make informative error
- nose = import_nose()
-
- # get plugin to disable doctests
- from .noseclasses import Unplugger
- add_plugins = [Unplugger('doctest')]
-
- return nose.run(argv=argv, addplugins=add_plugins)
-
-
-def _numpy_tester():
- if hasattr(np, "__version__") and ".dev0" in np.__version__:
- mode = "develop"
- else:
- mode = "release"
- return NoseTester(raise_warnings=mode, depth=1,
- check_fpu_mode=True)
diff --git a/numpy/testing/_private/parameterized.py b/numpy/testing/_private/parameterized.py
deleted file mode 100644
index 3a29a1811..000000000
--- a/numpy/testing/_private/parameterized.py
+++ /dev/null
@@ -1,432 +0,0 @@
-"""
-tl;dr: all code is licensed under simplified BSD, unless stated otherwise.
-
-Unless stated otherwise in the source files, all code is copyright 2010 David
-Wolever <david@wolever.net>. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- 2. Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND ANY EXPRESS OR
-IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
-EVENT SHALL <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-The views and conclusions contained in the software and documentation are those
-of the authors and should not be interpreted as representing official policies,
-either expressed or implied, of David Wolever.
-
-"""
-import re
-import inspect
-import warnings
-from functools import wraps
-from types import MethodType
-from collections import namedtuple
-
-from unittest import TestCase
-
-_param = namedtuple("param", "args kwargs")
-
-class param(_param):
- """ Represents a single parameter to a test case.
-
- For example::
-
- >>> p = param("foo", bar=16)
- >>> p
- param("foo", bar=16)
- >>> p.args
- ('foo', )
- >>> p.kwargs
- {'bar': 16}
-
- Intended to be used as an argument to ``@parameterized``::
-
- @parameterized([
- param("foo", bar=16),
- ])
- def test_stuff(foo, bar=16):
- pass
- """
-
- def __new__(cls, *args , **kwargs):
- return _param.__new__(cls, args, kwargs)
-
- @classmethod
- def explicit(cls, args=None, kwargs=None):
- """ Creates a ``param`` by explicitly specifying ``args`` and
- ``kwargs``::
-
- >>> param.explicit([1,2,3])
- param(*(1, 2, 3))
- >>> param.explicit(kwargs={"foo": 42})
- param(*(), **{"foo": "42"})
- """
- args = args or ()
- kwargs = kwargs or {}
- return cls(*args, **kwargs)
-
- @classmethod
- def from_decorator(cls, args):
- """ Returns an instance of ``param()`` for ``@parameterized`` argument
- ``args``::
-
- >>> param.from_decorator((42, ))
- param(args=(42, ), kwargs={})
- >>> param.from_decorator("foo")
- param(args=("foo", ), kwargs={})
- """
- if isinstance(args, param):
- return args
- elif isinstance(args, (str,)):
- args = (args, )
- try:
- return cls(*args)
- except TypeError as e:
- if "after * must be" not in str(e):
- raise
- raise TypeError(
- "Parameters must be tuples, but %r is not (hint: use '(%r, )')"
- %(args, args),
- )
-
- def __repr__(self):
- return "param(*%r, **%r)" %self
-
-
-def parameterized_argument_value_pairs(func, p):
- """Return tuples of parameterized arguments and their values.
-
- This is useful if you are writing your own doc_func
- function and need to know the values for each parameter name::
-
- >>> def func(a, foo=None, bar=42, **kwargs): pass
- >>> p = param(1, foo=7, extra=99)
- >>> parameterized_argument_value_pairs(func, p)
- [("a", 1), ("foo", 7), ("bar", 42), ("**kwargs", {"extra": 99})]
-
- If the function's first argument is named ``self`` then it will be
- ignored::
-
- >>> def func(self, a): pass
- >>> p = param(1)
- >>> parameterized_argument_value_pairs(func, p)
- [("a", 1)]
-
- Additionally, empty ``*args`` or ``**kwargs`` will be ignored::
-
- >>> def func(foo, *args): pass
- >>> p = param(1)
- >>> parameterized_argument_value_pairs(func, p)
- [("foo", 1)]
- >>> p = param(1, 16)
- >>> parameterized_argument_value_pairs(func, p)
- [("foo", 1), ("*args", (16, ))]
- """
- argspec = inspect.getargspec(func)
- arg_offset = 1 if argspec.args[:1] == ["self"] else 0
-
- named_args = argspec.args[arg_offset:]
-
- result = list(zip(named_args, p.args))
- named_args = argspec.args[len(result) + arg_offset:]
- varargs = p.args[len(result):]
-
- result.extend([
- (name, p.kwargs.get(name, default))
- for (name, default)
- in zip(named_args, argspec.defaults or [])
- ])
-
- seen_arg_names = {n for (n, _) in result}
- keywords = dict(sorted([
- (name, p.kwargs[name])
- for name in p.kwargs
- if name not in seen_arg_names
- ]))
-
- if varargs:
- result.append(("*%s" %(argspec.varargs, ), tuple(varargs)))
-
- if keywords:
- result.append(("**%s" %(argspec.keywords, ), keywords))
-
- return result
-
-def short_repr(x, n=64):
- """ A shortened repr of ``x`` which is guaranteed to be ``unicode``::
-
- >>> short_repr("foo")
- u"foo"
- >>> short_repr("123456789", n=4)
- u"12...89"
- """
-
- x_repr = repr(x)
- if isinstance(x_repr, bytes):
- try:
- x_repr = str(x_repr, "utf-8")
- except UnicodeDecodeError:
- x_repr = str(x_repr, "latin1")
- if len(x_repr) > n:
- x_repr = x_repr[:n//2] + "..." + x_repr[len(x_repr) - n//2:]
- return x_repr
-
-def default_doc_func(func, num, p):
- if func.__doc__ is None:
- return None
-
- all_args_with_values = parameterized_argument_value_pairs(func, p)
-
- # Assumes that the function passed is a bound method.
- descs = [f'{n}={short_repr(v)}' for n, v in all_args_with_values]
-
- # The documentation might be a multiline string, so split it
- # and just work with the first string, ignoring the period
- # at the end if there is one.
- first, nl, rest = func.__doc__.lstrip().partition("\n")
- suffix = ""
- if first.endswith("."):
- suffix = "."
- first = first[:-1]
- args = "%s[with %s]" %(len(first) and " " or "", ", ".join(descs))
- return "".join([first.rstrip(), args, suffix, nl, rest])
-
-def default_name_func(func, num, p):
- base_name = func.__name__
- name_suffix = "_%s" %(num, )
- if len(p.args) > 0 and isinstance(p.args[0], (str,)):
- name_suffix += "_" + parameterized.to_safe_name(p.args[0])
- return base_name + name_suffix
-
-
-# force nose for numpy purposes.
-_test_runner_override = 'nose'
-_test_runner_guess = False
-_test_runners = set(["unittest", "unittest2", "nose", "nose2", "pytest"])
-_test_runner_aliases = {
- "_pytest": "pytest",
-}
-
-def set_test_runner(name):
- global _test_runner_override
- if name not in _test_runners:
- raise TypeError(
- "Invalid test runner: %r (must be one of: %s)"
- %(name, ", ".join(_test_runners)),
- )
- _test_runner_override = name
-
-def detect_runner():
- """ Guess which test runner we're using by traversing the stack and looking
- for the first matching module. This *should* be reasonably safe, as
- it's done during test discovery where the test runner should be the
- stack frame immediately outside. """
- if _test_runner_override is not None:
- return _test_runner_override
- global _test_runner_guess
- if _test_runner_guess is False:
- stack = inspect.stack()
- for record in reversed(stack):
- frame = record[0]
- module = frame.f_globals.get("__name__").partition(".")[0]
- if module in _test_runner_aliases:
- module = _test_runner_aliases[module]
- if module in _test_runners:
- _test_runner_guess = module
- break
- else:
- _test_runner_guess = None
- return _test_runner_guess
-
-class parameterized:
- """ Parameterize a test case::
-
- class TestInt:
- @parameterized([
- ("A", 10),
- ("F", 15),
- param("10", 42, base=42)
- ])
- def test_int(self, input, expected, base=16):
- actual = int(input, base=base)
- assert_equal(actual, expected)
-
- @parameterized([
- (2, 3, 5)
- (3, 5, 8),
- ])
- def test_add(a, b, expected):
- assert_equal(a + b, expected)
- """
-
- def __init__(self, input, doc_func=None):
- self.get_input = self.input_as_callable(input)
- self.doc_func = doc_func or default_doc_func
-
- def __call__(self, test_func):
- self.assert_not_in_testcase_subclass()
-
- @wraps(test_func)
- def wrapper(test_self=None):
- test_cls = test_self and type(test_self)
-
- original_doc = wrapper.__doc__
- for num, args in enumerate(wrapper.parameterized_input):
- p = param.from_decorator(args)
- unbound_func, nose_tuple = self.param_as_nose_tuple(test_self, test_func, num, p)
- try:
- wrapper.__doc__ = nose_tuple[0].__doc__
- # Nose uses `getattr(instance, test_func.__name__)` to get
- # a method bound to the test instance (as opposed to a
- # method bound to the instance of the class created when
- # tests were being enumerated). Set a value here to make
- # sure nose can get the correct test method.
- if test_self is not None:
- setattr(test_cls, test_func.__name__, unbound_func)
- yield nose_tuple
- finally:
- if test_self is not None:
- delattr(test_cls, test_func.__name__)
- wrapper.__doc__ = original_doc
- wrapper.parameterized_input = self.get_input()
- wrapper.parameterized_func = test_func
- test_func.__name__ = "_parameterized_original_%s" %(test_func.__name__, )
- return wrapper
-
- def param_as_nose_tuple(self, test_self, func, num, p):
- nose_func = wraps(func)(lambda *args: func(*args[:-1], **args[-1]))
- nose_func.__doc__ = self.doc_func(func, num, p)
- # Track the unbound function because we need to setattr the unbound
- # function onto the class for nose to work (see comments above), and
- # Python 3 doesn't let us pull the function out of a bound method.
- unbound_func = nose_func
- if test_self is not None:
- nose_func = MethodType(nose_func, test_self)
- return unbound_func, (nose_func, ) + p.args + (p.kwargs or {}, )
-
- def assert_not_in_testcase_subclass(self):
- parent_classes = self._terrible_magic_get_defining_classes()
- if any(issubclass(cls, TestCase) for cls in parent_classes):
- raise Exception("Warning: '@parameterized' tests won't work "
- "inside subclasses of 'TestCase' - use "
- "'@parameterized.expand' instead.")
-
- def _terrible_magic_get_defining_classes(self):
- """ Returns the list of parent classes of the class currently being defined.
- Will likely only work if called from the ``parameterized`` decorator.
- This function is entirely @brandon_rhodes's fault, as he suggested
- the implementation: http://stackoverflow.com/a/8793684/71522
- """
- stack = inspect.stack()
- if len(stack) <= 4:
- return []
- frame = stack[4]
- code_context = frame[4] and frame[4][0].strip()
- if not (code_context and code_context.startswith("class ")):
- return []
- _, _, parents = code_context.partition("(")
- parents, _, _ = parents.partition(")")
- return eval("[" + parents + "]", frame[0].f_globals, frame[0].f_locals)
-
- @classmethod
- def input_as_callable(cls, input):
- if callable(input):
- return lambda: cls.check_input_values(input())
- input_values = cls.check_input_values(input)
- return lambda: input_values
-
- @classmethod
- def check_input_values(cls, input_values):
- # Explicitly convert non-list inputs to a list so that:
- # 1. A helpful exception will be raised if they aren't iterable, and
- # 2. Generators are unwrapped exactly once (otherwise `nosetests
- # --processes=n` has issues; see:
- # https://github.com/wolever/nose-parameterized/pull/31)
- if not isinstance(input_values, list):
- input_values = list(input_values)
- return [ param.from_decorator(p) for p in input_values ]
-
- @classmethod
- def expand(cls, input, name_func=None, doc_func=None, **legacy):
- """ A "brute force" method of parameterizing test cases. Creates new
- test cases and injects them into the namespace that the wrapped
- function is being defined in. Useful for parameterizing tests in
- subclasses of 'UnitTest', where Nose test generators don't work.
-
- >>> @parameterized.expand([("foo", 1, 2)])
- ... def test_add1(name, input, expected):
- ... actual = add1(input)
- ... assert_equal(actual, expected)
- ...
- >>> locals()
- ... 'test_add1_foo_0': <function ...> ...
- >>>
- """
-
- if "testcase_func_name" in legacy:
- warnings.warn("testcase_func_name= is deprecated; use name_func=",
- DeprecationWarning, stacklevel=2)
- if not name_func:
- name_func = legacy["testcase_func_name"]
-
- if "testcase_func_doc" in legacy:
- warnings.warn("testcase_func_doc= is deprecated; use doc_func=",
- DeprecationWarning, stacklevel=2)
- if not doc_func:
- doc_func = legacy["testcase_func_doc"]
-
- doc_func = doc_func or default_doc_func
- name_func = name_func or default_name_func
-
- def parameterized_expand_wrapper(f, instance=None):
- stack = inspect.stack()
- frame = stack[1]
- frame_locals = frame[0].f_locals
-
- parameters = cls.input_as_callable(input)()
- for num, p in enumerate(parameters):
- name = name_func(f, num, p)
- frame_locals[name] = cls.param_as_standalone_func(p, f, name)
- frame_locals[name].__doc__ = doc_func(f, num, p)
-
- f.__test__ = False
- return parameterized_expand_wrapper
-
- @classmethod
- def param_as_standalone_func(cls, p, func, name):
- @wraps(func)
- def standalone_func(*a):
- return func(*(a + p.args), **p.kwargs)
- standalone_func.__name__ = name
-
- # place_as is used by py.test to determine what source file should be
- # used for this test.
- standalone_func.place_as = func
-
- # Remove __wrapped__ because py.test will try to look at __wrapped__
- # to determine which parameters should be used with this test case,
- # and obviously we don't need it to do any parameterization.
- try:
- del standalone_func.__wrapped__
- except AttributeError:
- pass
- return standalone_func
-
- @classmethod
- def to_safe_name(cls, s):
- return str(re.sub("[^a-zA-Z0-9_]+", "_", s))
diff --git a/numpy/testing/_private/utils.py b/numpy/testing/_private/utils.py
index 45400856b..cca7b8063 100644
--- a/numpy/testing/_private/utils.py
+++ b/numpy/testing/_private/utils.py
@@ -16,10 +16,12 @@ from tempfile import mkdtemp, mkstemp
from unittest.case import SkipTest
from warnings import WarningMessage
import pprint
+import sysconfig
import numpy as np
-from numpy.core import(
+from numpy.core import (
intp, float32, empty, arange, array_repr, ndarray, isnat, array)
+from numpy import isfinite, isnan, isinf
import numpy.linalg.lapack_lite
from io import StringIO
@@ -29,14 +31,14 @@ __all__ = [
'assert_array_equal', 'assert_array_less', 'assert_string_equal',
'assert_array_almost_equal', 'assert_raises', 'build_err_msg',
'decorate_methods', 'jiffies', 'memusage', 'print_assert_equal',
- 'raises', 'rundocs', 'runstring', 'verbose', 'measure',
+ 'rundocs', 'runstring', 'verbose', 'measure',
'assert_', 'assert_array_almost_equal_nulp', 'assert_raises_regex',
'assert_array_max_ulp', 'assert_warns', 'assert_no_warnings',
'assert_allclose', 'IgnoreException', 'clear_and_catch_warnings',
'SkipTest', 'KnownFailureException', 'temppath', 'tempdir', 'IS_PYPY',
'HAS_REFCOUNT', "IS_WASM", 'suppress_warnings', 'assert_array_compare',
'assert_no_gc_cycles', 'break_cycles', 'HAS_LAPACK64', 'IS_PYSTON',
- '_OLD_PROMOTION'
+ '_OLD_PROMOTION', 'IS_MUSL', '_SUPPORTS_SVE'
]
@@ -56,27 +58,18 @@ HAS_LAPACK64 = numpy.linalg.lapack_lite._ilp64
_OLD_PROMOTION = lambda: np._get_promotion_state() == 'legacy'
-
-def import_nose():
- """ Import nose only when needed.
- """
- nose_is_good = True
- minimum_nose_version = (1, 0, 0)
- try:
- import nose
- except ImportError:
- nose_is_good = False
- else:
- if nose.__versioninfo__ < minimum_nose_version:
- nose_is_good = False
-
- if not nose_is_good:
- msg = ('Need nose >= %d.%d.%d for tests - see '
- 'https://nose.readthedocs.io' %
- minimum_nose_version)
- raise ImportError(msg)
-
- return nose
+IS_MUSL = False
+try:
+ from packaging.tags import sys_tags
+ _tags = list(sys_tags())
+ if 'musllinux' in _tags[0].platform:
+ IS_MUSL = True
+except ImportError:
+ # fallback to sysconfig (might be flaky)
+ # value could be None.
+ v = sysconfig.get_config_var('HOST_GNU_TYPE') or ''
+ if 'musl' in v:
+ IS_MUSL = True
def assert_(val, msg=''):
@@ -99,62 +92,6 @@ def assert_(val, msg=''):
raise AssertionError(smsg)
-def gisnan(x):
- """like isnan, but always raise an error if type not supported instead of
- returning a TypeError object.
-
- Notes
- -----
- isnan and other ufunc sometimes return a NotImplementedType object instead
- of raising any exception. This function is a wrapper to make sure an
- exception is always raised.
-
- This should be removed once this problem is solved at the Ufunc level."""
- from numpy.core import isnan
- st = isnan(x)
- if isinstance(st, type(NotImplemented)):
- raise TypeError("isnan not supported for this type")
- return st
-
-
-def gisfinite(x):
- """like isfinite, but always raise an error if type not supported instead
- of returning a TypeError object.
-
- Notes
- -----
- isfinite and other ufunc sometimes return a NotImplementedType object
- instead of raising any exception. This function is a wrapper to make sure
- an exception is always raised.
-
- This should be removed once this problem is solved at the Ufunc level."""
- from numpy.core import isfinite, errstate
- with errstate(invalid='ignore'):
- st = isfinite(x)
- if isinstance(st, type(NotImplemented)):
- raise TypeError("isfinite not supported for this type")
- return st
-
-
-def gisinf(x):
- """like isinf, but always raise an error if type not supported instead of
- returning a TypeError object.
-
- Notes
- -----
- isinf and other ufunc sometimes return a NotImplementedType object instead
- of raising any exception. This function is a wrapper to make sure an
- exception is always raised.
-
- This should be removed once this problem is solved at the Ufunc level."""
- from numpy.core import isinf, errstate
- with errstate(invalid='ignore'):
- st = isinf(x)
- if isinstance(st, type(NotImplemented)):
- raise TypeError("isinf not supported for this type")
- return st
-
-
if os.name == 'nt':
# Code "stolen" from enthought/debug/memusage.py
def GetPerformanceAttributes(object, counter, instance=None,
@@ -198,7 +135,7 @@ elif sys.platform[:5] == 'linux':
"""
try:
- with open(_proc_pid_stat, 'r') as f:
+ with open(_proc_pid_stat) as f:
l = f.readline().split(' ')
return int(l[22])
except Exception:
@@ -225,7 +162,7 @@ if sys.platform[:5] == 'linux':
if not _load_time:
_load_time.append(time.time())
try:
- with open(_proc_pid_stat, 'r') as f:
+ with open(_proc_pid_stat) as f:
l = f.readline().split(' ')
return int(l[13])
except Exception:
@@ -398,8 +335,8 @@ def assert_equal(actual, desired, err_msg='', verbose=True):
# Inf/nan/negative zero handling
try:
- isdesnan = gisnan(desired)
- isactnan = gisnan(actual)
+ isdesnan = isnan(desired)
+ isactnan = isnan(actual)
if isdesnan and isactnan:
return # both nan, so equal
@@ -409,7 +346,7 @@ def assert_equal(actual, desired, err_msg='', verbose=True):
if (array_actual.dtype.char in 'Mm' or
array_desired.dtype.char in 'Mm'):
# version 1.18
- # until this version, gisnan failed for datetime64 and timedelta64.
+ # until this version, isnan failed for datetime64 and timedelta64.
# Now it succeeds but comparison to scalar with a different type
# emits a DeprecationWarning.
# Avoid that by skipping the next check
@@ -478,7 +415,7 @@ def print_assert_equal(test_string, actual, desired):
@np._no_nep50_warning()
-def assert_almost_equal(actual,desired,decimal=7,err_msg='',verbose=True):
+def assert_almost_equal(actual, desired, decimal=7, err_msg='', verbose=True):
"""
Raises an AssertionError if two items are not equal up to desired
precision.
@@ -590,9 +527,9 @@ def assert_almost_equal(actual,desired,decimal=7,err_msg='',verbose=True):
# If one of desired/actual is not finite, handle it specially here:
# check that both are nan if any is a nan, and test for equality
# otherwise
- if not (gisfinite(desired) and gisfinite(actual)):
- if gisnan(desired) or gisnan(actual):
- if not (gisnan(desired) and gisnan(actual)):
+ if not (isfinite(desired) and isfinite(actual)):
+ if isnan(desired) or isnan(actual):
+ if not (isnan(desired) and isnan(actual)):
raise AssertionError(_build_err_msg())
else:
if not desired == actual:
@@ -605,7 +542,8 @@ def assert_almost_equal(actual,desired,decimal=7,err_msg='',verbose=True):
@np._no_nep50_warning()
-def assert_approx_equal(actual,desired,significant=7,err_msg='',verbose=True):
+def assert_approx_equal(actual, desired, significant=7, err_msg='',
+ verbose=True):
"""
Raises an AssertionError if two items are not equal up to significant
digits.
@@ -690,9 +628,9 @@ def assert_approx_equal(actual,desired,significant=7,err_msg='',verbose=True):
# If one of desired/actual is not finite, handle it specially here:
# check that both are nan if any is a nan, and test for equality
# otherwise
- if not (gisfinite(desired) and gisfinite(actual)):
- if gisnan(desired) or gisnan(actual):
- if not (gisnan(desired) and gisnan(actual)):
+ if not (isfinite(desired) and isfinite(actual)):
+ if isnan(desired) or isnan(actual):
+ if not (isnan(desired) and isnan(actual)):
raise AssertionError(msg)
else:
if not desired == actual:
@@ -709,7 +647,8 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True, header='',
precision=6, equal_nan=True, equal_inf=True,
*, strict=False):
__tracebackhide__ = True # Hide traceback for py.test
- from numpy.core import array, array2string, isnan, inf, bool_, errstate, all, max, object_
+ from numpy.core import (array2string, isnan, inf, bool_, errstate,
+ all, max, object_)
x = np.asanyarray(x)
y = np.asanyarray(y)
@@ -835,10 +774,10 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True, header='',
max_abs_error = max(error)
if getattr(error, 'dtype', object_) == object_:
remarks.append('Max absolute difference: '
- + str(max_abs_error))
+ + str(max_abs_error))
else:
remarks.append('Max absolute difference: '
- + array2string(max_abs_error))
+ + array2string(max_abs_error))
# note: this definition of relative error matches that one
# used by assert_allclose (found in np.isclose)
@@ -850,10 +789,10 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True, header='',
max_rel_error = max(error[nonzero] / abs(y[nonzero]))
if getattr(error, 'dtype', object_) == object_:
remarks.append('Max relative difference: '
- + str(max_rel_error))
+ + str(max_rel_error))
else:
remarks.append('Max relative difference: '
- + array2string(max_rel_error))
+ + array2string(max_rel_error))
err_msg += '\n' + '\n'.join(remarks)
msg = build_err_msg([ox, oy], err_msg,
@@ -899,6 +838,8 @@ def assert_array_equal(x, y, err_msg='', verbose=True, *, strict=False):
type of the array_like objects does not match. The special
handling for scalars mentioned in the Notes section is disabled.
+ .. versionadded:: 1.24.0
+
Raises
------
AssertionError
@@ -1064,15 +1005,15 @@ def assert_array_almost_equal(x, y, decimal=6, err_msg='', verbose=True):
"""
__tracebackhide__ = True # Hide traceback for py.test
- from numpy.core import number, float_, result_type, array
+ from numpy.core import number, float_, result_type
from numpy.core.numerictypes import issubdtype
from numpy.core.fromnumeric import any as npany
def compare(x, y):
try:
- if npany(gisinf(x)) or npany( gisinf(y)):
- xinfid = gisinf(x)
- yinfid = gisinf(y)
+ if npany(isinf(x)) or npany(isinf(y)):
+ xinfid = isinf(x)
+ yinfid = isinf(y)
if not (xinfid == yinfid).all():
return False
# if one item, x and y is +- inf
@@ -1112,8 +1053,6 @@ def assert_array_less(x, y, err_msg='', verbose=True):
compared, no assertion is raised if both objects have NaNs in the same
positions.
-
-
Parameters
----------
x : array_like
@@ -1128,15 +1067,13 @@ def assert_array_less(x, y, err_msg='', verbose=True):
Raises
------
AssertionError
- If actual and desired objects are not equal.
+ If x is not strictly smaller than y, element-wise.
See Also
--------
assert_array_equal: tests objects for equality
assert_array_almost_equal: test objects for equality up to precision
-
-
Examples
--------
>>> np.testing.assert_array_less([1.0, 1.0, np.nan], [1.1, 2.0, np.nan])
@@ -1303,42 +1240,21 @@ def rundocs(filename=None, raise_on_error=True):
raise AssertionError("Some doctests failed:\n%s" % "\n".join(msg))
-def raises(*args):
- """Decorator to check for raised exceptions.
-
- The decorated test function must raise one of the passed exceptions to
- pass. If you want to test many assertions about exceptions in a single
- test, you may want to use `assert_raises` instead.
-
- .. warning::
- This decorator is nose specific, do not use it if you are using a
- different test framework.
-
- Parameters
- ----------
- args : exceptions
- The test passes if any of the passed exceptions is raised.
-
- Raises
- ------
- AssertionError
-
- Examples
- --------
-
- Usage::
+def check_support_sve():
+ """
+ gh-22982
+ """
+
+ import subprocess
+ cmd = 'lscpu'
+ try:
+ return "sve" in (subprocess.Popen(cmd, stdout=subprocess.PIPE,
+ shell=True).communicate()[0]).decode('utf-8')
+ except OSError:
+ return False
- @raises(TypeError, ValueError)
- def test_raises_type_error():
- raise TypeError("This test passes")
- @raises(Exception)
- def test_that_fails_by_passing():
- pass
-
- """
- nose = import_nose()
- return nose.tools.raises(*args)
+_SUPPORTS_SVE = check_support_sve()
#
# assert_raises and assert_raises_regex are taken from unittest.
@@ -1350,8 +1266,10 @@ class _Dummy(unittest.TestCase):
def nop(self):
pass
+
_d = _Dummy('nop')
+
def assert_raises(*args, **kwargs):
"""
assert_raises(exception_class, callable, *args, **kwargs)
@@ -1378,7 +1296,7 @@ def assert_raises(*args, **kwargs):
"""
__tracebackhide__ = True # Hide traceback for py.test
- return _d.assertRaises(*args,**kwargs)
+ return _d.assertRaises(*args, **kwargs)
def assert_raises_regex(exception_class, expected_regexp, *args, **kwargs):
@@ -1529,7 +1447,7 @@ def assert_allclose(actual, desired, rtol=1e-7, atol=0, equal_nan=True,
Given two array_like objects, check that their shapes and all elements
are equal (but see the Notes for the special handling of a scalar). An
- exception is raised if the shapes mismatch or any values conflict. In
+ exception is raised if the shapes mismatch or any values conflict. In
contrast to the standard usage in numpy, NaNs are compared like numbers,
no assertion is raised if both objects have NaNs in the same positions.
@@ -2451,6 +2369,7 @@ def assert_no_gc_cycles(*args, **kwargs):
with _assert_no_gc_cycles_context(name=func.__name__):
func(*args, **kwargs)
+
def break_cycles():
"""
Break reference cycles by calling gc.collect
@@ -2546,7 +2465,7 @@ def _get_mem_available():
if sys.platform.startswith('linux'):
info = {}
- with open('/proc/meminfo', 'r') as f:
+ with open('/proc/meminfo') as f:
for line in f:
p = line.split()
info[p[0].strip(':').lower()] = int(p[1]) * 1024
@@ -2583,7 +2502,7 @@ def _no_tracing(func):
def _get_glibc_version():
try:
ver = os.confstr('CS_GNU_LIBC_VERSION').rsplit(' ')[1]
- except Exception as inst:
+ except Exception:
ver = '0.0'
return ver
@@ -2591,3 +2510,4 @@ def _get_glibc_version():
_glibcver = _get_glibc_version()
_glibc_older_than = lambda x: (_glibcver != '0.0' and _glibcver < x)
+
diff --git a/numpy/testing/overrides.py b/numpy/testing/overrides.py
index d20ed60e5..edc7132c2 100644
--- a/numpy/testing/overrides.py
+++ b/numpy/testing/overrides.py
@@ -21,10 +21,11 @@ def get_overridable_numpy_ufuncs():
"""
ufuncs = {obj for obj in _umath.__dict__.values()
if isinstance(obj, _ufunc)}
+ return ufuncs
def allows_array_ufunc_override(func):
- """Determine if a function can be overriden via `__array_ufunc__`
+ """Determine if a function can be overridden via `__array_ufunc__`
Parameters
----------
@@ -37,9 +38,9 @@ def allows_array_ufunc_override(func):
`True` if `func` is overridable via `__array_ufunc__` and
`False` otherwise.
- Note
- ----
- This function is equivalent to `isinstance(func, np.ufunc)` and
+ Notes
+ -----
+ This function is equivalent to ``isinstance(func, np.ufunc)`` and
will work correctly for ufuncs defined outside of Numpy.
"""
@@ -66,7 +67,7 @@ def get_overridable_numpy_array_functions():
return _array_functions.copy()
def allows_array_function_override(func):
- """Determine if a Numpy function can be overriden via `__array_function__`
+ """Determine if a Numpy function can be overridden via `__array_function__`
Parameters
----------
diff --git a/numpy/testing/tests/test_doctesting.py b/numpy/testing/tests/test_doctesting.py
deleted file mode 100644
index 92c2156d8..000000000
--- a/numpy/testing/tests/test_doctesting.py
+++ /dev/null
@@ -1,57 +0,0 @@
-""" Doctests for NumPy-specific nose/doctest modifications
-
-"""
-#FIXME: None of these tests is run, because 'check' is not a recognized
-# testing prefix.
-
-# try the #random directive on the output line
-def check_random_directive():
- '''
- >>> 2+2
- <BadExample object at 0x084D05AC> #random: may vary on your system
- '''
-
-# check the implicit "import numpy as np"
-def check_implicit_np():
- '''
- >>> np.array([1,2,3])
- array([1, 2, 3])
- '''
-
-# there's some extraneous whitespace around the correct responses
-def check_whitespace_enabled():
- '''
- # whitespace after the 3
- >>> 1+2
- 3
-
- # whitespace before the 7
- >>> 3+4
- 7
- '''
-
-def check_empty_output():
- """ Check that no output does not cause an error.
-
- This is related to nose bug 445; the numpy plugin changed the
- doctest-result-variable default and therefore hit this bug:
- http://code.google.com/p/python-nose/issues/detail?id=445
-
- >>> a = 10
- """
-
-def check_skip():
- """ Check skip directive
-
- The test below should not run
-
- >>> 1/0 #doctest: +SKIP
- """
-
-
-if __name__ == '__main__':
- # Run tests outside numpy test rig
- import nose
- from numpy.testing.noseclasses import NumpyDoctest
- argv = ['', __file__, '--with-numpydoctest']
- nose.core.TestProgram(argv=argv, addplugins=[NumpyDoctest()])
diff --git a/numpy/testing/tests/test_utils.py b/numpy/testing/tests/test_utils.py
index 052cc3aa1..0aaa508ee 100644
--- a/numpy/testing/tests/test_utils.py
+++ b/numpy/testing/tests/test_utils.py
@@ -8,13 +8,12 @@ import weakref
import numpy as np
from numpy.testing import (
assert_equal, assert_array_equal, assert_almost_equal,
- assert_array_almost_equal, assert_array_less, build_err_msg, raises,
+ assert_array_almost_equal, assert_array_less, build_err_msg,
assert_raises, assert_warns, assert_no_warnings, assert_allclose,
assert_approx_equal, assert_array_almost_equal_nulp, assert_array_max_ulp,
clear_and_catch_warnings, suppress_warnings, assert_string_equal, assert_,
tempdir, temppath, assert_no_gc_cycles, HAS_REFCOUNT
)
-from numpy.core.overrides import ARRAY_FUNCTION_ENABLED
class _GenericTest:
@@ -191,8 +190,6 @@ class TestArrayEqual(_GenericTest):
self._test_not_equal(a, b)
self._test_not_equal(b, a)
- @pytest.mark.skipif(
- not ARRAY_FUNCTION_ENABLED, reason='requires __array_function__')
def test_subclass_that_does_not_implement_npall(self):
class MyArray(np.ndarray):
def __array_function__(self, *args, **kwargs):
@@ -793,41 +790,6 @@ class TestArrayAssertLess:
self._assert_func(-ainf, x)
-@pytest.mark.skip(reason="The raises decorator depends on Nose")
-class TestRaises:
-
- def setup_method(self):
- class MyException(Exception):
- pass
-
- self.e = MyException
-
- def raises_exception(self, e):
- raise e
-
- def does_not_raise_exception(self):
- pass
-
- def test_correct_catch(self):
- raises(self.e)(self.raises_exception)(self.e) # raises?
-
- def test_wrong_exception(self):
- try:
- raises(self.e)(self.raises_exception)(RuntimeError) # raises?
- except RuntimeError:
- return
- else:
- raise AssertionError("should have caught RuntimeError")
-
- def test_catch_no_raise(self):
- try:
- raises(self.e)(self.does_not_raise_exception)() # raises?
- except AssertionError:
- return
- else:
- raise AssertionError("should have raised an AssertionError")
-
-
class TestWarns:
def test_warn(self):
diff --git a/numpy/testing/utils.py b/numpy/testing/utils.py
deleted file mode 100644
index 20a883304..000000000
--- a/numpy/testing/utils.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""
-Back compatibility utils module. It will import the appropriate
-set of tools
-
-"""
-import warnings
-
-# 2018-04-04, numpy 1.15.0 ImportWarning
-# 2019-09-18, numpy 1.18.0 DeprecatonWarning (changed)
-warnings.warn("Importing from numpy.testing.utils is deprecated "
- "since 1.15.0, import from numpy.testing instead.",
- DeprecationWarning, stacklevel=2)
-
-from ._private.utils import *
-from ._private.utils import _assert_valid_refcount, _gen_alignment_data
-
-__all__ = [
- 'assert_equal', 'assert_almost_equal', 'assert_approx_equal',
- 'assert_array_equal', 'assert_array_less', 'assert_string_equal',
- 'assert_array_almost_equal', 'assert_raises', 'build_err_msg',
- 'decorate_methods', 'jiffies', 'memusage', 'print_assert_equal',
- 'raises', 'rundocs', 'runstring', 'verbose', 'measure',
- 'assert_', 'assert_array_almost_equal_nulp', 'assert_raises_regex',
- 'assert_array_max_ulp', 'assert_warns', 'assert_no_warnings',
- 'assert_allclose', 'IgnoreException', 'clear_and_catch_warnings',
- 'SkipTest', 'KnownFailureException', 'temppath', 'tempdir', 'IS_PYPY',
- 'HAS_REFCOUNT', 'suppress_warnings', 'assert_array_compare',
- 'assert_no_gc_cycles'
- ]
diff --git a/numpy/tests/test_numpy_config.py b/numpy/tests/test_numpy_config.py
new file mode 100644
index 000000000..82c1ad70b
--- /dev/null
+++ b/numpy/tests/test_numpy_config.py
@@ -0,0 +1,44 @@
+"""
+Check the numpy config is valid.
+"""
+import numpy as np
+import pytest
+from unittest.mock import Mock, patch
+
+pytestmark = pytest.mark.skipif(
+ not hasattr(np.__config__, "_built_with_meson"),
+ reason="Requires Meson builds",
+)
+
+
+class TestNumPyConfigs:
+ REQUIRED_CONFIG_KEYS = [
+ "Compilers",
+ "Machine Information",
+ "Python Information",
+ ]
+
+ @patch("numpy.__config__._check_pyyaml")
+ def test_pyyaml_not_found(self, mock_yaml_importer):
+ mock_yaml_importer.side_effect = ModuleNotFoundError()
+ with pytest.warns(UserWarning):
+ np.show_config()
+
+ def test_dict_mode(self):
+ config = np.show_config(mode="dicts")
+
+ assert isinstance(config, dict)
+ assert all([key in config for key in self.REQUIRED_CONFIG_KEYS]), (
+ "Required key missing,"
+ " see index of `False` with `REQUIRED_CONFIG_KEYS`"
+ )
+
+ def test_invalid_mode(self):
+ with pytest.raises(AttributeError):
+ np.show_config(mode="foo")
+
+ def test_warn_to_add_tests(self):
+ assert len(np.__config__.DisplayModes) == 2, (
+ "New mode detected,"
+ " please add UT if applicable and increment this count"
+ )
diff --git a/numpy/tests/test_public_api.py b/numpy/tests/test_public_api.py
index c41639043..eaa89aa6f 100644
--- a/numpy/tests/test_public_api.py
+++ b/numpy/tests/test_public_api.py
@@ -34,7 +34,6 @@ def test_numpy_namespace():
# None of these objects are publicly documented to be part of the main
# NumPy namespace (some are useful though, others need to be cleaned up)
undocumented = {
- 'Tester': 'numpy.testing._private.nosetester.NoseTester',
'_add_newdoc_ufunc': 'numpy.core._multiarray_umath._add_newdoc_ufunc',
'add_docstring': 'numpy.core._multiarray_umath.add_docstring',
'add_newdoc': 'numpy.core.function_base.add_newdoc',
@@ -64,7 +63,7 @@ def test_numpy_namespace():
@pytest.mark.skipif(IS_WASM, reason="can't start subprocess")
-@pytest.mark.parametrize('name', ['testing', 'Tester'])
+@pytest.mark.parametrize('name', ['testing'])
def test_import_lazy_import(name):
"""Make sure we can actually use the modules we lazy load.
@@ -137,6 +136,7 @@ PUBLIC_MODULES = ['numpy.' + s for s in [
"doc",
"doc.constants",
"doc.ufuncs",
+ "dtypes",
"exceptions",
"f2py",
"fft",
@@ -195,6 +195,7 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [
"core.umath",
"core.umath_tests",
"distutils.armccompiler",
+ "distutils.fujitsuccompiler",
"distutils.ccompiler",
'distutils.ccompiler_opt',
"distutils.command",
@@ -248,7 +249,6 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [
"distutils.numpy_distribution",
"distutils.pathccompiler",
"distutils.unixccompiler",
- "dual",
"f2py.auxfuncs",
"f2py.capi_maps",
"f2py.cb_rules",
@@ -289,7 +289,6 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [
"random.mtrand",
"random.bit_generator",
"testing.print_coercion_tables",
- "testing.utils",
]]
@@ -353,6 +352,8 @@ def test_all_modules_are_expected():
SKIP_LIST_2 = [
'numpy.math',
'numpy.distutils.log.sys',
+ 'numpy.distutils.log.logging',
+ 'numpy.distutils.log.warnings',
'numpy.doc.constants.re',
'numpy.doc.constants.textwrap',
'numpy.lib.emath',
@@ -471,6 +472,10 @@ def test_api_importable():
@pytest.mark.xfail(
+ hasattr(np.__config__, "_built_with_meson"),
+ reason = "Meson does not yet support entry points via pyproject.toml",
+)
+@pytest.mark.xfail(
sysconfig.get_config_var("Py_DEBUG") is not None,
reason=(
"NumPy possibly built with `USE_DEBUG=True ./tools/travis-test.sh`, "
diff --git a/numpy/typing/tests/data/fail/einsumfunc.pyi b/numpy/typing/tests/data/fail/einsumfunc.pyi
index f0e3f1e95..2d1f37418 100644
--- a/numpy/typing/tests/data/fail/einsumfunc.pyi
+++ b/numpy/typing/tests/data/fail/einsumfunc.pyi
@@ -4,12 +4,9 @@ import numpy as np
AR_i: np.ndarray[Any, np.dtype[np.int64]]
AR_f: np.ndarray[Any, np.dtype[np.float64]]
AR_m: np.ndarray[Any, np.dtype[np.timedelta64]]
-AR_O: np.ndarray[Any, np.dtype[np.object_]]
AR_U: np.ndarray[Any, np.dtype[np.str_]]
np.einsum("i,i->i", AR_i, AR_m) # E: incompatible type
-np.einsum("i,i->i", AR_O, AR_O) # E: incompatible type
np.einsum("i,i->i", AR_f, AR_f, dtype=np.int32) # E: incompatible type
-np.einsum("i,i->i", AR_i, AR_i, dtype=np.timedelta64, casting="unsafe") # E: No overload variant
np.einsum("i,i->i", AR_i, AR_i, out=AR_U) # E: Value of type variable "_ArrayType" of "einsum" cannot be
np.einsum("i,i->i", AR_i, AR_i, out=AR_U, casting="unsafe") # E: No overload variant
diff --git a/numpy/typing/tests/data/fail/numerictypes.pyi b/numpy/typing/tests/data/fail/numerictypes.pyi
index a5c2814ef..ce5662d5e 100644
--- a/numpy/typing/tests/data/fail/numerictypes.pyi
+++ b/numpy/typing/tests/data/fail/numerictypes.pyi
@@ -9,5 +9,3 @@ np.maximum_sctype(1) # E: No overload variant
np.issubsctype(1, np.int64) # E: incompatible type
np.issubdtype(1, np.int64) # E: incompatible type
-
-np.find_common_type(np.int64, np.int64) # E: incompatible type
diff --git a/numpy/typing/tests/data/pass/dtype.py b/numpy/typing/tests/data/pass/dtype.py
index e849cfdd4..6ec44e6b0 100644
--- a/numpy/typing/tests/data/pass/dtype.py
+++ b/numpy/typing/tests/data/pass/dtype.py
@@ -15,7 +15,7 @@ np.dtype({"names": ["a", "b"], "formats": [int, float]})
np.dtype({"names": ["a"], "formats": [int], "titles": [object]})
np.dtype({"names": ["a"], "formats": [int], "titles": [object()]})
-np.dtype([("name", np.unicode_, 16), ("grades", np.float64, (2,)), ("age", "int32")])
+np.dtype([("name", np.str_, 16), ("grades", np.float64, (2,)), ("age", "int32")])
np.dtype(
{
diff --git a/numpy/typing/tests/data/pass/ndarray_misc.py b/numpy/typing/tests/data/pass/ndarray_misc.py
index 19a1af9e2..6beacc5d7 100644
--- a/numpy/typing/tests/data/pass/ndarray_misc.py
+++ b/numpy/typing/tests/data/pass/ndarray_misc.py
@@ -150,7 +150,7 @@ A.argpartition([0])
A.diagonal()
A.dot(1)
-A.dot(1, out=B0)
+A.dot(1, out=B2)
A.nonzero()
diff --git a/numpy/typing/tests/data/pass/numerictypes.py b/numpy/typing/tests/data/pass/numerictypes.py
index 7f1dd0945..63b6ad0e2 100644
--- a/numpy/typing/tests/data/pass/numerictypes.py
+++ b/numpy/typing/tests/data/pass/numerictypes.py
@@ -8,7 +8,7 @@ np.issctype("S8")
np.obj2sctype(list)
np.obj2sctype(list, default=None)
-np.obj2sctype(list, default=np.string_)
+np.obj2sctype(list, default=np.bytes_)
np.issubclass_(np.int32, int)
np.issubclass_(np.float64, float)
@@ -17,17 +17,12 @@ np.issubclass_(np.float64, (int, float))
np.issubsctype("int64", int)
np.issubsctype(np.array([1]), np.array([1]))
-np.issubdtype("S1", np.string_)
+np.issubdtype("S1", np.bytes_)
np.issubdtype(np.float64, np.float32)
np.sctype2char("S1")
np.sctype2char(list)
-np.find_common_type([], [np.int64, np.float32, complex])
-np.find_common_type((), (np.int64, np.float32, complex))
-np.find_common_type([np.int64, np.float32], [])
-np.find_common_type([np.float32], [np.int64, np.float64])
-
np.cast[int]
np.cast["i8"]
np.cast[np.int64]
diff --git a/numpy/typing/tests/data/pass/simple.py b/numpy/typing/tests/data/pass/simple.py
index 03ca3e83f..801168702 100644
--- a/numpy/typing/tests/data/pass/simple.py
+++ b/numpy/typing/tests/data/pass/simple.py
@@ -33,13 +33,13 @@ np.dtype(two_tuples_dtype)
three_tuples_dtype = [("R", "u1", 2)]
np.dtype(three_tuples_dtype)
-mixed_tuples_dtype = [("R", "u1"), ("G", np.unicode_, 1)]
+mixed_tuples_dtype = [("R", "u1"), ("G", np.str_, 1)]
np.dtype(mixed_tuples_dtype)
shape_tuple_dtype = [("R", "u1", (2, 2))]
np.dtype(shape_tuple_dtype)
-shape_like_dtype = [("R", "u1", (2, 2)), ("G", np.unicode_, 1)]
+shape_like_dtype = [("R", "u1", (2, 2)), ("G", np.str_, 1)]
np.dtype(shape_like_dtype)
object_dtype = [("field1", object)]
diff --git a/numpy/typing/tests/data/reveal/einsumfunc.pyi b/numpy/typing/tests/data/reveal/einsumfunc.pyi
index d5f930149..5f6415f27 100644
--- a/numpy/typing/tests/data/reveal/einsumfunc.pyi
+++ b/numpy/typing/tests/data/reveal/einsumfunc.pyi
@@ -1,5 +1,6 @@
from typing import Any
import numpy as np
+import numpy.typing as npt
AR_LIKE_b: list[bool]
AR_LIKE_u: list[np.uint32]
@@ -7,10 +8,12 @@ AR_LIKE_i: list[int]
AR_LIKE_f: list[float]
AR_LIKE_c: list[complex]
AR_LIKE_U: list[str]
+AR_o: npt.NDArray[np.object_]
-OUT_f: np.ndarray[Any, np.dtype[np.float64]]
+OUT_f: npt.NDArray[np.float64]
reveal_type(np.einsum("i,i->i", AR_LIKE_b, AR_LIKE_b)) # E: Any
+reveal_type(np.einsum("i,i->i", AR_o, AR_o)) # E: Any
reveal_type(np.einsum("i,i->i", AR_LIKE_u, AR_LIKE_u)) # E: Any
reveal_type(np.einsum("i,i->i", AR_LIKE_i, AR_LIKE_i)) # E: Any
reveal_type(np.einsum("i,i->i", AR_LIKE_f, AR_LIKE_f)) # E: Any
diff --git a/numpy/typing/tests/data/reveal/modules.pyi b/numpy/typing/tests/data/reveal/modules.pyi
index ba830eb0d..4191c564a 100644
--- a/numpy/typing/tests/data/reveal/modules.pyi
+++ b/numpy/typing/tests/data/reveal/modules.pyi
@@ -16,6 +16,8 @@ reveal_type(np.random) # E: ModuleType
reveal_type(np.rec) # E: ModuleType
reveal_type(np.testing) # E: ModuleType
reveal_type(np.version) # E: ModuleType
+reveal_type(np.exceptions) # E: ModuleType
+reveal_type(np.dtypes) # E: ModuleType
reveal_type(np.lib.format) # E: ModuleType
reveal_type(np.lib.mixins) # E: ModuleType
diff --git a/numpy/typing/tests/data/reveal/npyio.pyi b/numpy/typing/tests/data/reveal/npyio.pyi
index 68605cf94..2c62d8d21 100644
--- a/numpy/typing/tests/data/reveal/npyio.pyi
+++ b/numpy/typing/tests/data/reveal/npyio.pyi
@@ -75,13 +75,13 @@ reveal_type(np.fromregex(str_path, re.compile("test"), dtype=np.str_, encoding="
reveal_type(np.fromregex(pathlib_path, "test", np.float64)) # E: ndarray[Any, dtype[{float64}]]
reveal_type(np.fromregex(bytes_reader, "test", np.float64)) # E: ndarray[Any, dtype[{float64}]]
-reveal_type(np.genfromtxt(bytes_file)) # E: ndarray[Any, dtype[{float64}]]
+reveal_type(np.genfromtxt(bytes_file)) # E: ndarray[Any, dtype[Any]]
reveal_type(np.genfromtxt(pathlib_path, dtype=np.str_)) # E: ndarray[Any, dtype[str_]]
reveal_type(np.genfromtxt(str_path, dtype=str, skip_header=2)) # E: ndarray[Any, dtype[Any]]
-reveal_type(np.genfromtxt(str_file, comments="test")) # E: ndarray[Any, dtype[{float64}]]
-reveal_type(np.genfromtxt(str_path, delimiter="\n")) # E: ndarray[Any, dtype[{float64}]]
-reveal_type(np.genfromtxt(str_path, ndmin=2)) # E: ndarray[Any, dtype[{float64}]]
-reveal_type(np.genfromtxt(["1", "2", "3"], ndmin=2)) # E: ndarray[Any, dtype[{float64}]]
+reveal_type(np.genfromtxt(str_file, comments="test")) # E: ndarray[Any, dtype[Any]]
+reveal_type(np.genfromtxt(str_path, delimiter="\n")) # E: ndarray[Any, dtype[Any]]
+reveal_type(np.genfromtxt(str_path, ndmin=2)) # E: ndarray[Any, dtype[Any]]
+reveal_type(np.genfromtxt(["1", "2", "3"], ndmin=2)) # E: ndarray[Any, dtype[Any]]
reveal_type(np.recfromtxt(bytes_file)) # E: recarray[Any, dtype[record]]
reveal_type(np.recfromtxt(pathlib_path, usemask=True)) # E: ma.mrecords.MaskedRecords[Any, dtype[void]]
diff --git a/numpy/typing/tests/data/reveal/numerictypes.pyi b/numpy/typing/tests/data/reveal/numerictypes.pyi
index e1857557d..d4399e2b1 100644
--- a/numpy/typing/tests/data/reveal/numerictypes.pyi
+++ b/numpy/typing/tests/data/reveal/numerictypes.pyi
@@ -21,8 +21,6 @@ reveal_type(np.issubclass_(1, 1)) # E: Literal[False]
reveal_type(np.sctype2char("S8")) # E: str
reveal_type(np.sctype2char(list)) # E: str
-reveal_type(np.find_common_type([np.int64], [np.int64])) # E: dtype[Any]
-
reveal_type(np.cast[int]) # E: _CastFunc
reveal_type(np.cast["i8"]) # E: _CastFunc
reveal_type(np.cast[np.int64]) # E: _CastFunc
diff --git a/numpy/typing/tests/data/reveal/scalars.pyi b/numpy/typing/tests/data/reveal/scalars.pyi
index b7fc75acc..965aa5ace 100644
--- a/numpy/typing/tests/data/reveal/scalars.pyi
+++ b/numpy/typing/tests/data/reveal/scalars.pyi
@@ -1,4 +1,3 @@
-import sys
import numpy as np
b: np.bool_
@@ -151,8 +150,7 @@ reveal_type(round(u8, 3)) # E: {uint64}
reveal_type(round(f8)) # E: int
reveal_type(round(f8, 3)) # E: {float64}
-if sys.version_info >= (3, 9):
- reveal_type(f8.__ceil__()) # E: int
- reveal_type(f8.__floor__()) # E: int
+reveal_type(f8.__ceil__()) # E: int
+reveal_type(f8.__floor__()) # E: int
reveal_type(i8.is_integer()) # E: Literal[True]
diff --git a/numpy/typing/tests/data/reveal/testing.pyi b/numpy/typing/tests/data/reveal/testing.pyi
index 5440af800..5c35731d3 100644
--- a/numpy/typing/tests/data/reveal/testing.pyi
+++ b/numpy/typing/tests/data/reveal/testing.pyi
@@ -113,7 +113,6 @@ reveal_type(np.testing.rundocs()) # E: None
reveal_type(np.testing.rundocs("test.py")) # E: None
reveal_type(np.testing.rundocs(Path("test.py"), raise_on_error=True)) # E: None
-@np.testing.raises(RuntimeError, RuntimeWarning)
def func3(a: int) -> bool: ...
reveal_type(func3) # E: def (a: builtins.int) -> builtins.bool
@@ -175,4 +174,3 @@ reveal_type(np.testing.assert_no_gc_cycles(func3, 5)) # E: None
reveal_type(np.testing.break_cycles()) # E: None
reveal_type(np.testing.TestCase()) # E: unittest.case.TestCase
-reveal_type(np.testing.run_module_suite(file_to_run="numpy/tests/test_matlib.py")) # E: None
diff --git a/numpy/typing/tests/test_generic_alias.py b/numpy/typing/tests/test_generic_alias.py
deleted file mode 100644
index 861babd4b..000000000
--- a/numpy/typing/tests/test_generic_alias.py
+++ /dev/null
@@ -1,188 +0,0 @@
-from __future__ import annotations
-
-import sys
-import copy
-import types
-import pickle
-import weakref
-from typing import TypeVar, Any, Union, Callable
-
-import pytest
-import numpy as np
-from numpy._typing._generic_alias import _GenericAlias
-from typing_extensions import Unpack
-
-ScalarType = TypeVar("ScalarType", bound=np.generic, covariant=True)
-T1 = TypeVar("T1")
-T2 = TypeVar("T2")
-DType = _GenericAlias(np.dtype, (ScalarType,))
-NDArray = _GenericAlias(np.ndarray, (Any, DType))
-
-# NOTE: The `npt._GenericAlias` *class* isn't quite stable on python >=3.11.
-# This is not a problem during runtime (as it's 3.8-exclusive), but we still
-# need it for the >=3.9 in order to verify its semantics match
-# `types.GenericAlias` replacement. xref numpy/numpy#21526
-if sys.version_info >= (3, 9):
- DType_ref = types.GenericAlias(np.dtype, (ScalarType,))
- NDArray_ref = types.GenericAlias(np.ndarray, (Any, DType_ref))
- FuncType = Callable[["_GenericAlias | types.GenericAlias"], Any]
-else:
- DType_ref = Any
- NDArray_ref = Any
- FuncType = Callable[["_GenericAlias"], Any]
-
-GETATTR_NAMES = sorted(set(dir(np.ndarray)) - _GenericAlias._ATTR_EXCEPTIONS)
-
-BUFFER = np.array([1], dtype=np.int64)
-BUFFER.setflags(write=False)
-
-def _get_subclass_mro(base: type) -> tuple[type, ...]:
- class Subclass(base): # type: ignore[misc,valid-type]
- pass
- return Subclass.__mro__[1:]
-
-
-class TestGenericAlias:
- """Tests for `numpy._typing._generic_alias._GenericAlias`."""
-
- @pytest.mark.parametrize("name,func", [
- ("__init__", lambda n: n),
- ("__init__", lambda n: _GenericAlias(np.ndarray, Any)),
- ("__init__", lambda n: _GenericAlias(np.ndarray, (Any,))),
- ("__init__", lambda n: _GenericAlias(np.ndarray, (Any, Any))),
- ("__init__", lambda n: _GenericAlias(np.ndarray, T1)),
- ("__init__", lambda n: _GenericAlias(np.ndarray, (T1,))),
- ("__init__", lambda n: _GenericAlias(np.ndarray, (T1, T2))),
- ("__origin__", lambda n: n.__origin__),
- ("__args__", lambda n: n.__args__),
- ("__parameters__", lambda n: n.__parameters__),
- ("__mro_entries__", lambda n: n.__mro_entries__([object])),
- ("__hash__", lambda n: hash(n)),
- ("__repr__", lambda n: repr(n)),
- ("__getitem__", lambda n: n[np.float64]),
- ("__getitem__", lambda n: n[ScalarType][np.float64]),
- ("__getitem__", lambda n: n[Union[np.int64, ScalarType]][np.float64]),
- ("__getitem__", lambda n: n[Union[T1, T2]][np.float32, np.float64]),
- ("__eq__", lambda n: n == n),
- ("__ne__", lambda n: n != np.ndarray),
- ("__call__", lambda n: n((1,), np.int64, BUFFER)),
- ("__call__", lambda n: n(shape=(1,), dtype=np.int64, buffer=BUFFER)),
- ("subclassing", lambda n: _get_subclass_mro(n)),
- ("pickle", lambda n: n == pickle.loads(pickle.dumps(n))),
- ])
- def test_pass(self, name: str, func: FuncType) -> None:
- """Compare `types.GenericAlias` with its numpy-based backport.
-
- Checker whether ``func`` runs as intended and that both `GenericAlias`
- and `_GenericAlias` return the same result.
-
- """
- value = func(NDArray)
-
- if sys.version_info >= (3, 9):
- value_ref = func(NDArray_ref)
- assert value == value_ref
-
- @pytest.mark.parametrize("name,func", [
- ("__copy__", lambda n: n == copy.copy(n)),
- ("__deepcopy__", lambda n: n == copy.deepcopy(n)),
- ])
- def test_copy(self, name: str, func: FuncType) -> None:
- value = func(NDArray)
-
- # xref bpo-45167
- GE_398 = (
- sys.version_info[:2] == (3, 9) and sys.version_info >= (3, 9, 8)
- )
- if GE_398 or sys.version_info >= (3, 10, 1):
- value_ref = func(NDArray_ref)
- assert value == value_ref
-
- def test_dir(self) -> None:
- value = dir(NDArray)
- if sys.version_info < (3, 9):
- return
-
- # A number attributes only exist in `types.GenericAlias` in >= 3.11
- if sys.version_info < (3, 11, 0, "beta", 3):
- value.remove("__typing_unpacked_tuple_args__")
- if sys.version_info < (3, 11, 0, "beta", 1):
- value.remove("__unpacked__")
- assert value == dir(NDArray_ref)
-
- @pytest.mark.parametrize("name,func,dev_version", [
- ("__iter__", lambda n: len(list(n)), ("beta", 1)),
- ("__iter__", lambda n: next(iter(n)), ("beta", 1)),
- ("__unpacked__", lambda n: n.__unpacked__, ("beta", 1)),
- ("Unpack", lambda n: Unpack[n], ("beta", 1)),
-
- # The right operand should now have `__unpacked__ = True`,
- # and they are thus now longer equivalent
- ("__ne__", lambda n: n != next(iter(n)), ("beta", 1)),
-
- # >= beta3
- ("__typing_unpacked_tuple_args__",
- lambda n: n.__typing_unpacked_tuple_args__, ("beta", 3)),
-
- # >= beta4
- ("__class__", lambda n: n.__class__ == type(n), ("beta", 4)),
- ])
- def test_py311_features(
- self,
- name: str,
- func: FuncType,
- dev_version: tuple[str, int],
- ) -> None:
- """Test Python 3.11 features."""
- value = func(NDArray)
-
- if sys.version_info >= (3, 11, 0, *dev_version):
- value_ref = func(NDArray_ref)
- assert value == value_ref
-
- def test_weakref(self) -> None:
- """Test ``__weakref__``."""
- value = weakref.ref(NDArray)()
-
- if sys.version_info >= (3, 9, 1): # xref bpo-42332
- value_ref = weakref.ref(NDArray_ref)()
- assert value == value_ref
-
- @pytest.mark.parametrize("name", GETATTR_NAMES)
- def test_getattr(self, name: str) -> None:
- """Test that `getattr` wraps around the underlying type,
- aka ``__origin__``.
-
- """
- value = getattr(NDArray, name)
- value_ref1 = getattr(np.ndarray, name)
-
- if sys.version_info >= (3, 9):
- value_ref2 = getattr(NDArray_ref, name)
- assert value == value_ref1 == value_ref2
- else:
- assert value == value_ref1
-
- @pytest.mark.parametrize("name,exc_type,func", [
- ("__getitem__", TypeError, lambda n: n[()]),
- ("__getitem__", TypeError, lambda n: n[Any, Any]),
- ("__getitem__", TypeError, lambda n: n[Any][Any]),
- ("isinstance", TypeError, lambda n: isinstance(np.array(1), n)),
- ("issublass", TypeError, lambda n: issubclass(np.ndarray, n)),
- ("setattr", AttributeError, lambda n: setattr(n, "__origin__", int)),
- ("setattr", AttributeError, lambda n: setattr(n, "test", int)),
- ("getattr", AttributeError, lambda n: getattr(n, "test")),
- ])
- def test_raise(
- self,
- name: str,
- exc_type: type[BaseException],
- func: FuncType,
- ) -> None:
- """Test operations that are supposed to raise."""
- with pytest.raises(exc_type):
- func(NDArray)
-
- if sys.version_info >= (3, 9):
- with pytest.raises(exc_type):
- func(NDArray_ref)
diff --git a/numpy/typing/tests/test_runtime.py b/numpy/typing/tests/test_runtime.py
index 44d069006..c32c5db32 100644
--- a/numpy/typing/tests/test_runtime.py
+++ b/numpy/typing/tests/test_runtime.py
@@ -2,7 +2,6 @@
from __future__ import annotations
-import sys
from typing import (
get_type_hints,
Union,
@@ -24,10 +23,7 @@ class TypeTup(NamedTuple):
origin: None | type
-if sys.version_info >= (3, 9):
- NDArrayTup = TypeTup(npt.NDArray, npt.NDArray.__args__, np.ndarray)
-else:
- NDArrayTup = TypeTup(npt.NDArray, (), None)
+NDArrayTup = TypeTup(npt.NDArray, npt.NDArray.__args__, np.ndarray)
TYPES = {
"ArrayLike": TypeTup(npt.ArrayLike, npt.ArrayLike.__args__, Union),
diff --git a/numpy/typing/tests/test_typing.py b/numpy/typing/tests/test_typing.py
index 5011339b5..bcaaf5250 100644
--- a/numpy/typing/tests/test_typing.py
+++ b/numpy/typing/tests/test_typing.py
@@ -431,7 +431,7 @@ def test_extended_precision() -> None:
output_mypy = OUTPUT_MYPY
assert path in output_mypy
- with open(path, "r") as f:
+ with open(path) as f:
expression_list = f.readlines()
for _msg in output_mypy[path]:
diff --git a/pavement.py b/pavement.py
index 6d85bf96b..b6ad4358b 100644
--- a/pavement.py
+++ b/pavement.py
@@ -184,7 +184,7 @@ def write_release_task(options, filename='README'):
----------
options :
Set by ``task`` decorator.
- filename : string
+ filename : str
Filename of the modified notes. The file is written
in the release directory.
diff --git a/pyproject.toml b/pyproject.toml
index 65653353c..759b538fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,8 +8,8 @@ requires = [
# `wheel` is needed for non-isolated builds, given that `meson-python`
# doesn't list it as a runtime requirement (at least in 0.11.0) - it's
# likely to be removed as a dependency in meson-python 0.12.0.
- "wheel==0.37.0",
- "Cython>=0.29.30,<3.0",
+ "wheel==0.38.1",
+ "Cython>=0.29.34,<3.0",
# "meson-python>=0.10.0",
]
@@ -30,7 +30,7 @@ requires = [
#maintainers = [
# {name = "NumPy Developers", email="numpy-discussion@python.org"},
#]
-#requires-python = ">=3.8"
+#requires-python = ">=3.9"
#readme = "README.md"
#classifiers = [
# 'Development Status :: 5 - Production/Stable',
@@ -40,7 +40,6 @@ requires = [
# 'Programming Language :: C',
# 'Programming Language :: Python',
# 'Programming Language :: Python :: 3',
-# 'Programming Language :: Python :: 3.8',
# 'Programming Language :: Python :: 3.9',
# 'Programming Language :: Python :: 3.10',
# 'Programming Language :: Python :: 3.11',
@@ -63,6 +62,8 @@ requires = [
#'f2py3 = numpy.f2py.f2py2e:main'
#'f2py3.MINOR_VERSION = numpy.f2py.f2py2e:main'
#
+# When enabling this stanza, make sure to remove the meson-specific xfail from
+# numpy/tests/test_public_api.py
#[project.entry-points]
#'array_api': 'numpy = numpy.array_api'
#'pyinstaller40': 'hook-dirs = numpy:_pyinstaller_hooks_dir'
@@ -143,7 +144,7 @@ requires = [
[tool.cibuildwheel]
-skip = "cp36-* cp37-* pp37-* *-manylinux_i686 *_ppc64le *_s390x *-musllinux*"
+skip = "cp36-* cp37-* pp37-* *-manylinux_i686 *_ppc64le *_s390x *-musllinux_aarch64"
build-verbosity = "3"
before-build = "bash {project}/tools/wheels/cibw_before_build.sh {project}"
before-test = "pip install -r {project}/test_requirements.txt"
@@ -152,6 +153,7 @@ test-command = "bash {project}/tools/wheels/cibw_test_command.sh {project}"
[tool.cibuildwheel.linux]
manylinux-x86_64-image = "manylinux2014"
manylinux-aarch64-image = "manylinux2014"
+musllinux-x86_64-image = "musllinux_1_1"
environment = { CFLAGS="-std=c99 -fno-strict-aliasing", LDFLAGS="-Wl,--strip-debug", OPENBLAS64_="/usr/local", NPY_USE_BLAS_ILP64="1", RUNNER_OS="Linux" }
[tool.cibuildwheel.macos]
@@ -161,9 +163,9 @@ environment = { CFLAGS="-std=c99 -fno-strict-aliasing", LDFLAGS="-Wl,--strip-deb
# https://github.com/multi-build/multibuild/blame/devel/README.rst#L541-L565
# for more info
archs = "x86_64 arm64"
-test-skip = "*_arm64 *_universal2:arm64"
+test-skip = "*_universal2:arm64"
# MACOS linker doesn't support stripping symbols
-environment = { CFLAGS="-std=c99 -fno-strict-aliasing", OPENBLAS64_="/usr/local", NPY_USE_BLAS_ILP64="1", CC="clang", CXX = "clang++" }
+environment = { CFLAGS="-std=c99 -fno-strict-aliasing", OPENBLAS64_="/usr/local", NPY_USE_BLAS_ILP64="1", CC="clang", CXX = "clang++", RUNNER_OS="macOS" }
[tool.cibuildwheel.windows]
environment = { OPENBLAS64_="openblas", OPENBLAS="", NPY_USE_BLAS_ILP64="1", CFLAGS="", LDFLAGS="" }
@@ -172,9 +174,9 @@ environment = { OPENBLAS64_="openblas", OPENBLAS="", NPY_USE_BLAS_ILP64="1", CFL
select = "*-win32"
environment = { OPENBLAS64_="", OPENBLAS="openblas", NPY_USE_BLAS_ILP64="0", CFLAGS="-m32", LDFLAGS="-m32" }
-[tool.devpy]
+[tool.spin]
package = 'numpy'
-[tool.devpy.commands]
-"Build" = ["devpy.build", "devpy.test"]
-"Environments" = ["devpy.shell", "devpy.ipython", "devpy.python"]
+[tool.spin.commands]
+"Build" = ["spin.cmds.meson.build", "spin.cmds.meson.test"]
+"Environments" = ["spin.cmds.meson.shell", "spin.cmds.meson.ipython", "spin.cmds.meson.python"]
diff --git a/release_requirements.txt b/release_requirements.txt
index 43a677e05..13f763651 100644
--- a/release_requirements.txt
+++ b/release_requirements.txt
@@ -7,7 +7,7 @@ beautifulsoup4
# changelog.py
pygithub
-gitpython
+gitpython>=3.1.30
# uploading wheels
twine
diff --git a/runtests.py b/runtests.py
index fea29a10d..8b026bec7 100755
--- a/runtests.py
+++ b/runtests.py
@@ -220,7 +220,7 @@ def main(argv):
# Don't use subprocess, since we don't want to include the
# current path in PYTHONPATH.
sys.argv = extra_argv
- with open(extra_argv[0], 'r') as f:
+ with open(extra_argv[0]) as f:
script = f.read()
sys.modules['__main__'] = types.ModuleType('__main__')
ns = dict(__name__='__main__',
@@ -540,11 +540,22 @@ def build_project(args):
print("Build OK")
else:
if not args.show_build_log:
- with open(log_filename, 'r') as f:
+ with open(log_filename) as f:
print(f.read())
print("Build failed!")
sys.exit(1)
+ # Rebase
+ if sys.platform == "cygwin":
+        from pathlib import Path
+ testenv_root = Path(config_vars["platbase"])
+ dll_list = testenv_root.glob("**/*.dll")
+ rebase_cmd = ["/usr/bin/rebase", "--database", "--oblivious"]
+ rebase_cmd.extend(dll_list)
+        if subprocess.run(rebase_cmd).returncode:
+ print("Rebase failed")
+ sys.exit(1)
+
return site_dir, site_dir_noarch
def asv_compare_config(bench_path, args, h_commits):
@@ -624,7 +635,7 @@ def asv_substitute_config(in_config, out_config, **custom_vars):
vars_hash = sdbm_hash(custom_vars, os.path.getmtime(in_config))
try:
- with open(out_config, "r") as wfd:
+ with open(out_config) as wfd:
hash_line = wfd.readline().split('hash:')
if len(hash_line) > 1 and int(hash_line[1]) == vars_hash:
return True
diff --git a/setup.py b/setup.py
index 1fe7a6f4d..c924bb999 100755
--- a/setup.py
+++ b/setup.py
@@ -12,12 +12,14 @@ import textwrap
import warnings
import builtins
import re
+import tempfile
+from distutils.errors import CompileError
# Python supported version checks. Keep right after stdlib imports to ensure we
# get a sensible error for older Python versions
-if sys.version_info[:2] < (3, 8):
- raise RuntimeError("Python version >= 3.8 required.")
+if sys.version_info[:2] < (3, 9):
+ raise RuntimeError("Python version >= 3.9 required.")
import versioneer
@@ -90,7 +92,6 @@ License :: OSI Approved :: BSD License
Programming Language :: C
Programming Language :: Python
Programming Language :: Python :: 3
-Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
@@ -158,11 +159,11 @@ class concat_license_files():
def __enter__(self):
"""Concatenate files and remove LICENSES_bundled.txt"""
- with open(self.f1, 'r') as f1:
+ with open(self.f1) as f1:
self.bsd_text = f1.read()
with open(self.f1, 'a') as f1:
- with open(self.f2, 'r') as f2:
+ with open(self.f2) as f2:
self.bundled_text = f2.read()
f1.write('\n\n')
f1.write(self.bundled_text)
@@ -185,45 +186,132 @@ class sdist_checked(cmdclass['sdist']):
def get_build_overrides():
"""
- Custom build commands to add `-std=c99` to compilation
+ Custom build commands to add std flags if required to compilation
"""
from numpy.distutils.command.build_clib import build_clib
from numpy.distutils.command.build_ext import build_ext
from numpy._utils import _pep440
- def _needs_gcc_c99_flag(obj):
- if obj.compiler.compiler_type != 'unix':
- return False
+ def try_compile(compiler, file, flags = [], verbose=False):
+ bk_ver = getattr(compiler, 'verbose', False)
+ compiler.verbose = verbose
+ try:
+ compiler.compile([file], extra_postargs=flags)
+ return True, ''
+ except CompileError as e:
+ return False, str(e)
+ finally:
+ compiler.verbose = bk_ver
+
+ def flags_is_required(compiler, is_cpp, flags, code):
+ if is_cpp:
+ compiler = compiler.cxx_compiler()
+ suf = '.cpp'
+ else:
+ suf = '.c'
+ with tempfile.TemporaryDirectory() as temp_dir:
+ tmp_file = os.path.join(temp_dir, "test" + suf)
+ with open(tmp_file, "w+") as f:
+ f.write(code)
+            # First try compiling without any flags: if the required
+            # standard is already supported by default, there is no
+            # need to pass the flags at all.
+ comp = try_compile(compiler, tmp_file)
+ if not comp[0]:
+ comp = try_compile(compiler, tmp_file, flags)
+ if not comp[0]:
+                    # rerun verbosely to surface the error
+ try_compile(compiler, tmp_file, flags, True)
+ if is_cpp:
+ raise RuntimeError(
+ "Broken toolchain during testing C++ compiler. \n"
+ "A compiler with support for C++17 language "
+ "features is required.\n"
+ f"Triggered the following error: {comp[1]}."
+ )
+ else:
+ raise RuntimeError(
+ "Broken toolchain during testing C compiler. \n"
+ "A compiler with support for C99 language "
+ "features is required.\n"
+ f"Triggered the following error: {comp[1]}."
+ )
+ return True
+ return False
- cc = obj.compiler.compiler[0]
- if "gcc" not in cc:
- return False
-
- # will print something like '4.2.1\n'
- out = subprocess.run([cc, '-dumpversion'],
- capture_output=True, text=True)
- # -std=c99 is default from this version on
- if _pep440.parse(out.stdout) >= _pep440.Version('5.0'):
- return False
- return True
+ def std_cxx_flags(cmd):
+ compiler = cmd.compiler
+ flags = getattr(compiler, '__np_cache_cpp_flags', None)
+ if flags is not None:
+ return flags
+ flags = dict(
+ msvc = ['/std:c++17']
+ ).get(compiler.compiler_type, ['-std=c++17'])
+ # These flags are used to compile any C++ source within Numpy.
+ # They are chosen to have very few runtime dependencies.
+ extra_flags = dict(
+ # to update #def __cplusplus with enabled C++ version
+ msvc = ['/Zc:__cplusplus']
+ ).get(compiler.compiler_type, [
+ # The following flag is used to avoid emit any extra code
+ # from STL since extensions are build by C linker and
+ # without C++ runtime dependencies.
+ '-fno-threadsafe-statics',
+ '-D__STDC_VERSION__=0', # for compatibility with C headers
+ '-fno-exceptions', # no exception support
+ '-fno-rtti' # no runtime type information
+ ])
+ if not flags_is_required(compiler, True, flags, textwrap.dedent('''
+ #include <type_traits>
+ template<typename ...T>
+ constexpr bool test_fold = (... && std::is_const_v<T>);
+ int main()
+ {
+ if (test_fold<int, const int>) {
+ return 0;
+ }
+ else {
+ return -1;
+ }
+ }
+ ''')):
+ flags.clear()
+ flags += extra_flags
+ setattr(compiler, '__np_cache_cpp_flags', flags)
+ return flags
+
+ def std_c_flags(cmd):
+ compiler = cmd.compiler
+ flags = getattr(compiler, '__np_cache_c_flags', None)
+ if flags is not None:
+ return flags
+ flags = dict(
+ msvc = []
+ ).get(compiler.compiler_type, ['-std=c99'])
+
+ if not flags_is_required(compiler, False, flags, textwrap.dedent('''
+ inline static int test_inline() { return 0; }
+ int main(void)
+ { return test_inline(); }
+ ''')):
+ flags.clear()
+
+ setattr(compiler, '__np_cache_c_flags', flags)
+ return flags
class new_build_clib(build_clib):
def build_a_library(self, build_info, lib_name, libraries):
- from numpy.distutils.ccompiler_opt import NPY_CXX_FLAGS
- if _needs_gcc_c99_flag(self):
- build_info['extra_cflags'] = ['-std=c99']
- build_info['extra_cxxflags'] = NPY_CXX_FLAGS
+ build_info['extra_cflags'] = std_c_flags(self)
+ build_info['extra_cxxflags'] = std_cxx_flags(self)
build_clib.build_a_library(self, build_info, lib_name, libraries)
class new_build_ext(build_ext):
def build_extension(self, ext):
- if _needs_gcc_c99_flag(self):
- if '-std=c99' not in ext.extra_compile_args:
- ext.extra_compile_args.append('-std=c99')
+ ext.extra_c_compile_args += std_c_flags(self)
+ ext.extra_cxx_compile_args += std_cxx_flags(self)
build_ext.build_extension(self, ext)
return new_build_clib, new_build_ext
-
def generate_cython():
# Check Cython version
from numpy._utils import _pep440
@@ -432,7 +520,7 @@ def setup_package():
test_suite='pytest',
version=versioneer.get_version(),
cmdclass=cmdclass,
- python_requires='>=3.8',
+ python_requires='>=3.9',
zip_safe=False,
entry_points={
'console_scripts': f2py_cmds,
diff --git a/site.cfg.example b/site.cfg.example
index 4df01a210..5d36922ea 100644
--- a/site.cfg.example
+++ b/site.cfg.example
@@ -225,9 +225,10 @@
#
# UMFPACK is not used by NumPy.
#
-# https://www.cise.ufl.edu/research/sparse/umfpack/
-# https://www.cise.ufl.edu/research/sparse/amd/
+# https://github.com/DrTimothyAldenDavis/SuiteSparse/tree/dev/UMFPACK
+# https://github.com/DrTimothyAldenDavis/SuiteSparse/tree/dev/AMD
# https://scikit-umfpack.github.io/scikit-umfpack/
+# https://people.engr.tamu.edu/davis/suitesparse.html
#
#[amd]
#libraries = amd
@@ -252,3 +253,13 @@
#[djbfft]
#include_dirs = /usr/local/djbfft/include
#library_dirs = /usr/local/djbfft/lib
+
+# Fujitsu SSL2
+# ----
+#[ssl2]
+#library_dirs = /opt/FJSVstclanga/v1.1.0/lib64
+#include_dirs = /opt/FJSVstclanga/v1.1.0/clang-comp/include
+# Single-threaded version.
+#libraries = fjlapacksve
+# Multi-threaded version. Python itself must be built by Fujitsu compiler.
+#libraries = fjlapackexsve
diff --git a/test_requirements.txt b/test_requirements.txt
index 3e7d3fef7..16f448eb8 100644
--- a/test_requirements.txt
+++ b/test_requirements.txt
@@ -1,5 +1,5 @@
-cython>=0.29.30,<3.0
-wheel==0.37.0
+cython>=0.29.34,<3.0
+wheel==0.38.1
setuptools==59.2.0
hypothesis==6.24.1
pytest==6.2.5
@@ -12,3 +12,5 @@ cffi; python_version < '3.10'
# NOTE: Keep mypy in sync with environment.yml
mypy==0.981; platform_python_implementation != "PyPy"
typing_extensions>=4.2.0
+# for optional f2py encoding detection
+charset-normalizer
diff --git a/tools/allocation_tracking/README.md b/tools/allocation_tracking/README.md
deleted file mode 100644
index 6cc4c2a58..000000000
--- a/tools/allocation_tracking/README.md
+++ /dev/null
@@ -1,7 +0,0 @@
-Note that since Python 3.6 the builtin tracemalloc module can be used to
-track allocations inside numpy.
-Numpy places its CPU memory allocations into the `np.lib.tracemalloc_domain`
-domain.
-See https://docs.python.org/3/library/tracemalloc.html.
-
-The tool that used to be here has been deprecated.
diff --git a/tools/ci/cirrus_general.yml b/tools/ci/cirrus_general.yml
deleted file mode 100644
index 2ec2ad837..000000000
--- a/tools/ci/cirrus_general.yml
+++ /dev/null
@@ -1,99 +0,0 @@
-build_and_store_wheels: &BUILD_AND_STORE_WHEELS
- install_cibuildwheel_script:
- - python -m pip install cibuildwheel==2.11.2
- cibuildwheel_script:
- - cibuildwheel
- wheels_artifacts:
- path: "wheelhouse/*"
-
-
-######################################################################
-# Build linux_aarch64 natively
-######################################################################
-
-linux_aarch64_task:
- compute_engine_instance:
- image_project: cirrus-images
- image: family/docker-builder-arm64
- architecture: arm64
- platform: linux
- cpu: 2
- memory: 8G
- matrix:
- # build in a matrix because building and testing all four wheels in a
- # single task takes longer than 60 mins (the default time limit for a
- # cirrus-ci task).
- - env:
- CIBW_BUILD: cp38-*
- EXPECT_CPU_FEATURES: NEON NEON_FP16 NEON_VFPV4 ASIMD ASIMDHP ASIMDDP ASIMDFHM
- - env:
- CIBW_BUILD: cp39-*
- - env:
- CIBW_BUILD: cp310-*
- - env:
- CIBW_BUILD: cp311-*
-
- build_script: |
- apt install -y python3-venv python-is-python3 gfortran libatlas-base-dev libgfortran5 eatmydata
- git fetch origin
- ./tools/travis-before-install.sh
- which python
- echo $CIRRUS_CHANGE_MESSAGE
- <<: *BUILD_AND_STORE_WHEELS
-
-
-######################################################################
-# Upload all wheels
-######################################################################
-
-wheels_upload_task:
- # Artifacts don't seem to be persistent from task to task.
- # Rather than upload wheels at the end of each cibuildwheel run we do a
- # final upload here. This is because a run may be on different OS for
- # which bash, etc, may not be present.
- depends_on:
- - linux_aarch64
- compute_engine_instance:
- image_project: cirrus-images
- image: family/docker-builder
- platform: linux
- cpu: 1
-
- env:
- NUMPY_STAGING_UPLOAD_TOKEN: ENCRYPTED[!5a69522ae0c2af9edb2bc1cdfeaca6292fb3666d9ecd82dca0615921834a6ce3b702352835d8bde4ea2a9ed5ef8424ac!]
- NUMPY_NIGHTLY_UPLOAD_TOKEN: ENCRYPTED[!196422e6c3419a3b1d79815e1026094a215cb0f346fe34ed0f9d3ca1c19339df7398d04556491b1e0420fc1fe3713289!]
-
- upload_script: |
- apt-get install -y python3-venv python-is-python3 curl
- export IS_SCHEDULE_DISPATCH="false"
- export IS_PUSH="false"
-
- # cron job
- if [[ "$CIRRUS_CRON" == "weekly" ]]; then
- export IS_SCHEDULE_DISPATCH="true"
- fi
-
- # a manual build was started
- if [[ "$CIRRUS_BUILD_SOURCE" == "api" && "$CIRRUS_COMMIT_MESSAGE" == "API build for null" ]]; then
- export IS_SCHEDULE_DISPATCH="true"
- fi
-
- # only upload wheels to staging if it's a tag beginning with 'v' and you're
- # on a maintenance branch
- if [[ "$CIRRUS_TAG" == v* ]] && [[ "$CIRRUS_BRANCH" == maintenance* ]]; then
- export IS_PUSH="true"
- fi
-
- # The name of the zip file is derived from the `wheels_artifact` line.
- # If you change the artifact line to `myfile_artifact` then it would be
- # called myfile.zip
-
- curl https://api.cirrus-ci.com/v1/artifact/build/$CIRRUS_BUILD_ID/wheels.zip --output wheels.zip
- unzip wheels.zip
-
- source ./tools/wheels/upload_wheels.sh
- # IS_PUSH takes precedence over IS_SCHEDULE_DISPATCH
- set_upload_vars
-
- # Will be skipped if not a push/tag/scheduled build
- upload_wheels
diff --git a/tools/ci/cirrus_macosx_arm64.yml b/tools/ci/cirrus_macosx_arm64.yml
new file mode 100644
index 000000000..2343b807f
--- /dev/null
+++ b/tools/ci/cirrus_macosx_arm64.yml
@@ -0,0 +1,55 @@
+modified_clone: &MODIFIED_CLONE
+ # makes sure that for a PR the CI runs against a merged main
+ clone_script: |
+ if [ -z "$CIRRUS_PR" ]; then
+ # if you're not in a PR then clone against the branch name that was pushed to.
+ git clone --recursive --branch=$CIRRUS_BRANCH https://x-access-token:${CIRRUS_REPO_CLONE_TOKEN}@github.com/${CIRRUS_REPO_FULL_NAME}.git $CIRRUS_WORKING_DIR
+ git reset --hard $CIRRUS_CHANGE_IN_REPO
+ else
+ # it's a PR so clone the main branch then merge the changes from the PR
+ git clone https://x-access-token:${CIRRUS_REPO_CLONE_TOKEN}@github.com/${CIRRUS_REPO_FULL_NAME}.git $CIRRUS_WORKING_DIR
+ git fetch origin pull/$CIRRUS_PR/head:pull/$CIRRUS_PR
+
+ # CIRRUS_BASE_BRANCH will probably be `main` for the majority of the time
+ # However, if you do a PR against a maintenance branch we will want to
+ # merge the PR into the maintenance branch, not main
+ git checkout $CIRRUS_BASE_BRANCH
+
+ # alpine git package needs default user.name and user.email to be set before a merge
+ git -c user.email="you@example.com" merge --no-commit pull/$CIRRUS_PR
+ git submodule update --init --recursive
+ fi
+
+
+macos_arm64_test_task:
+ macos_instance:
+ image: ghcr.io/cirruslabs/macos-monterey-xcode:14
+
+ <<: *MODIFIED_CLONE
+
+ pip_cache:
+ folder: ~/.cache/pip
+
+ test_script: |
+ brew install python@3.10
+
+ export PATH=/opt/homebrew/opt/python@3.10/libexec/bin:$PATH
+ python --version
+
+ RUNNER_OS="macOS"
+ CFLAGS="-std=c99 -fno-strict-aliasing"
+ SDKROOT=/Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk
+
+ # used for installing OpenBLAS/gfortran
+ bash tools/wheels/cibw_before_build.sh $PWD
+
+ pushd ~/
+ python -m venv numpy-dev
+ source numpy-dev/bin/activate
+ popd
+
+ pip install -r build_requirements.txt
+ pip install pytest hypothesis typing_extensions
+
+ spin build
+ spin test
diff --git a/tools/ci/cirrus_wheels.yml b/tools/ci/cirrus_wheels.yml
new file mode 100644
index 000000000..60512afab
--- /dev/null
+++ b/tools/ci/cirrus_wheels.yml
@@ -0,0 +1,157 @@
+build_and_store_wheels: &BUILD_AND_STORE_WHEELS
+ install_cibuildwheel_script:
+ - python -m pip install cibuildwheel==2.12.1
+ cibuildwheel_script:
+ - cibuildwheel
+ wheels_artifacts:
+ path: "wheelhouse/*"
+
+######################################################################
+# Build linux_aarch64 natively
+######################################################################
+
+linux_aarch64_task:
+ compute_engine_instance:
+ image_project: cirrus-images
+ image: family/docker-builder-arm64
+ architecture: arm64
+ platform: linux
+ cpu: 2
+ memory: 8G
+ matrix:
+ # build in a matrix because building and testing all four wheels in a
+ # single task takes longer than 60 mins (the default time limit for a
+ # cirrus-ci task).
+ - env:
+ CIRRUS_CLONE_SUBMODULES: true
+ CIBW_BUILD: cp39-*
+ EXPECT_CPU_FEATURES: NEON NEON_FP16 NEON_VFPV4 ASIMD ASIMDHP ASIMDDP ASIMDFHM
+ - env:
+ CIRRUS_CLONE_SUBMODULES: true
+ CIBW_BUILD: cp310-*
+ - env:
+ CIRRUS_CLONE_SUBMODULES: true
+ CIBW_BUILD: cp311-*
+
+ build_script: |
+ apt install -y python3-venv python-is-python3 gfortran libatlas-base-dev libgfortran5 eatmydata
+ git fetch origin
+ ./tools/travis-before-install.sh
+ which python
+ echo $CIRRUS_CHANGE_MESSAGE
+ <<: *BUILD_AND_STORE_WHEELS
+
+
+######################################################################
+# Build macosx_arm64 natively
+######################################################################
+
+macosx_arm64_task:
+ macos_instance:
+ image: ghcr.io/cirruslabs/macos-monterey-xcode:14
+ matrix:
+ - env:
+ CIRRUS_CLONE_SUBMODULES: true
+ CIBW_BUILD: cp39-*
+ - env:
+ CIRRUS_CLONE_SUBMODULES: true
+ CIBW_BUILD: cp310-* cp311-*
+ env:
+ PATH: /opt/homebrew/opt/python@3.10/bin:/usr/local/lib:/usr/local/include:$PATH
+ CIBW_ARCHS: arm64
+ # Specifying CIBW_ENVIRONMENT_MACOS overrides pyproject.toml, so include
+ # all the settings from there, otherwise they're lost.
+ # SDKROOT needs to be set for repackaged conda-forge gfortran compilers
+ # supplied by isuruf.
+ # Find out SDKROOT via `xcrun --sdk macosx --show-sdk-path`
+ CIBW_ENVIRONMENT_MACOS: >
+ RUNNER_OS=macOS
+ SDKROOT=/Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk
+ LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+ CFLAGS="-std=c99 -fno-strict-aliasing"
+ OPENBLAS64_="/usr/local"
+ NPY_USE_BLAS_ILP64="1"
+
+ build_script:
+ - brew install python@3.10
+ - ln -s python3 /opt/homebrew/opt/python@3.10/bin/python
+ - which python
+ # needed for submodules
+ - git submodule update --init
+ # need to obtain all the tags so setup.py can determine FULLVERSION
+ - git fetch origin
+ - uname -m
+ - python -c "import platform;print(platform.python_version());print(platform.system());print(platform.machine())"
+ - clang --version
+ <<: *BUILD_AND_STORE_WHEELS
+
+
+######################################################################
+# Upload all wheels
+######################################################################
+
+wheels_upload_task:
+ # Artifacts don't seem to be persistent from task to task.
+ # Rather than upload wheels at the end of each cibuildwheel run we do a
+ # final upload here. This is because a run may be on different OS for
+ # which bash, etc, may not be present.
+ depends_on:
+ - linux_aarch64
+ - macosx_arm64
+ compute_engine_instance:
+ image_project: cirrus-images
+ image: family/docker-builder
+ platform: linux
+ cpu: 1
+
+ env:
+ NUMPY_STAGING_UPLOAD_TOKEN: ENCRYPTED[!5a69522ae0c2af9edb2bc1cdfeaca6292fb3666d9ecd82dca0615921834a6ce3b702352835d8bde4ea2a9ed5ef8424ac!]
+ NUMPY_NIGHTLY_UPLOAD_TOKEN: ENCRYPTED[!196422e6c3419a3b1d79815e1026094a215cb0f346fe34ed0f9d3ca1c19339df7398d04556491b1e0420fc1fe3713289!]
+
+ upload_script: |
+ apt-get update
+ apt-get install -y curl wget
+ export IS_SCHEDULE_DISPATCH="false"
+ export IS_PUSH="false"
+
+ # cron job
+ if [[ "$CIRRUS_CRON" == "weekly" ]]; then
+ export IS_SCHEDULE_DISPATCH="true"
+ fi
+
+ # a manual build was started
+ if [[ "$CIRRUS_BUILD_SOURCE" == "api" && "$CIRRUS_COMMIT_MESSAGE" == "API build for null" ]]; then
+ export IS_SCHEDULE_DISPATCH="true"
+ fi
+
+ # only upload wheels to staging if it's a tag beginning with 'v' and you're
+ # on a maintenance branch
+ if [[ "$CIRRUS_TAG" == v* ]] && [[ $CIRRUS_TAG != *"dev0"* ]]; then
+ export IS_PUSH="true"
+ fi
+
+ if [[ $IS_PUSH == "true" ]] || [[ $IS_SCHEDULE_DISPATCH == "true" ]]; then
+ # install miniconda in the home directory. For some reason HOME isn't set by Cirrus
+ export HOME=$PWD
+
+ # install miniconda for uploading to anaconda
+ wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
+ bash miniconda.sh -b -p $HOME/miniconda3
+ $HOME/miniconda3/bin/conda init bash
+ source $HOME/miniconda3/bin/activate
+ conda install -y anaconda-client
+
+ # The name of the zip file is derived from the `wheels_artifact` line.
+ # If you change the artifact line to `myfile_artifact` then it would be
+ # called myfile.zip
+
+ curl https://api.cirrus-ci.com/v1/artifact/build/$CIRRUS_BUILD_ID/wheels.zip --output wheels.zip
+ unzip wheels.zip
+
+ source ./tools/wheels/upload_wheels.sh
+ # IS_PUSH takes precedence over IS_SCHEDULE_DISPATCH
+ set_upload_vars
+
+ # Will be skipped if not a push/tag/scheduled build
+ upload_wheels
+ fi
diff --git a/tools/ci/push_docs_to_repo.py b/tools/ci/push_docs_to_repo.py
index e34522a29..0471e3824 100755
--- a/tools/ci/push_docs_to_repo.py
+++ b/tools/ci/push_docs_to_repo.py
@@ -19,6 +19,8 @@ parser.add_argument('--committer', default='numpy-commit-bot',
help='Name of the git committer')
parser.add_argument('--email', default='numpy-commit-bot@nomail',
help='Email of the git committer')
+parser.add_argument('--count', default=1, type=int,
+ help="minimum number of expected files, defaults to 1")
parser.add_argument(
'--force', action='store_true',
@@ -31,6 +33,11 @@ if not os.path.exists(args.dir):
print('Content directory does not exist')
sys.exit(1)
+count = len([name for name in os.listdir(args.dir) if os.path.isfile(os.path.join(args.dir, name))])
+
+if count < args.count:
+ print(f"Expected {args.count} top-directory files to upload, got {count}")
+ sys.exit(1)
def run(cmd, stdout=True):
pipe = None if stdout else subprocess.DEVNULL
diff --git a/tools/cythonize.py b/tools/cythonize.py
index 311a52c91..d66ddb98c 100755
--- a/tools/cythonize.py
+++ b/tools/cythonize.py
@@ -52,7 +52,7 @@ def process_tempita_pyx(fromfile, tofile):
import npy_tempita as tempita
assert fromfile.endswith('.pyx.in')
- with open(fromfile, "r") as f:
+ with open(fromfile) as f:
tmpl = f.read()
pyxcontent = tempita.sub(tmpl)
pyxfile = fromfile[:-len('.pyx.in')] + '.pyx'
@@ -66,7 +66,7 @@ def process_tempita_pyd(fromfile, tofile):
assert fromfile.endswith('.pxd.in')
assert tofile.endswith('.pxd')
- with open(fromfile, "r") as f:
+ with open(fromfile) as f:
tmpl = f.read()
pyxcontent = tempita.sub(tmpl)
with open(tofile, "w") as f:
@@ -77,7 +77,7 @@ def process_tempita_pxi(fromfile, tofile):
assert fromfile.endswith('.pxi.in')
assert tofile.endswith('.pxi')
- with open(fromfile, "r") as f:
+ with open(fromfile) as f:
tmpl = f.read()
pyxcontent = tempita.sub(tmpl)
with open(tofile, "w") as f:
@@ -88,7 +88,7 @@ def process_tempita_pxd(fromfile, tofile):
assert fromfile.endswith('.pxd.in')
assert tofile.endswith('.pxd')
- with open(fromfile, "r") as f:
+ with open(fromfile) as f:
tmpl = f.read()
pyxcontent = tempita.sub(tmpl)
with open(tofile, "w") as f:
@@ -109,7 +109,7 @@ def load_hashes(filename):
# Return { filename : (sha256 of input, sha256 of output) }
if os.path.isfile(filename):
hashes = {}
- with open(filename, 'r') as f:
+ with open(filename) as f:
for line in f:
filename, inhash, outhash = line.split()
hashes[filename] = (inhash, outhash)
diff --git a/tools/gitpod/Dockerfile b/tools/gitpod/Dockerfile
deleted file mode 100644
index dd5561750..000000000
--- a/tools/gitpod/Dockerfile
+++ /dev/null
@@ -1,101 +0,0 @@
-#
-# Dockerfile for NumPy development
-#
-# Usage:
-# -------
-#
-# To make a local build of the container, from the 'Docker-dev' directory:
-# docker build --rm -f "Dockerfile" -t <build-tag> "."
-#
-# To use the container use the following command. It assumes that you are in
-# the root folder of the NumPy git repository, making it available as
-# /home/numpy in the container. Whatever changes you make to that directory
-# are visible in the host and container.
-# The docker image is retrieved from the NumPy dockerhub repository
-#
-# docker run --rm -it -v $(pwd):/home/numpy numpy/numpy-dev:<image-tag>
-#
-# By default the container will activate the conda environment numpy-dev
-# which contains all the dependencies needed for NumPy development
-#
-# To build NumPy run: python setup.py build_ext --inplace
-#
-# To run the tests use: python runtests.py
-#
-# This image is based on: Ubuntu 20.04 (focal)
-# https://hub.docker.com/_/ubuntu/?tab=tags&name=focal
-# OS/ARCH: linux/amd64
-FROM gitpod/workspace-base:latest
-
-ARG MAMBAFORGE_VERSION="4.11.0-0"
-ARG CONDA_ENV=numpy-dev
-
-
-# ---- Configure environment ----
-ENV CONDA_DIR=/home/gitpod/mambaforge3 \
- SHELL=/bin/bash
-ENV PATH=${CONDA_DIR}/bin:$PATH \
- WORKSPACE=/workspace/numpy
-
-
-# -----------------------------------------------------------------------------
-# ---- Creating as root - note: make sure to change to gitpod in the end ----
-USER root
-
-# hadolint ignore=DL3008
-RUN apt-get update && \
- apt-get install -yq --no-install-recommends \
- ca-certificates \
- dirmngr \
- dvisvgm \
- gnupg \
- gpg-agent \
- texlive-latex-extra \
- vim && \
- # this needs to be done after installing dirmngr
- apt-key adv --keyserver keyserver.ubuntu.com --recv-key 23F3D4EA75716059 && \
- apt-add-repository https://cli.github.com/packages && \
- apt-get install -yq --no-install-recommends \
- gh && \
- locale-gen en_US.UTF-8 && \
- apt-get clean && \
- rm -rf /var/cache/apt/* &&\
- rm -rf /var/lib/apt/lists/* &&\
- rm -rf /tmp/*
-
-# Allows this Dockerfile to activate conda environments
-SHELL ["/bin/bash", "--login", "-o", "pipefail", "-c"]
-
-# -----------------------------------------------------------------------------
-# ---- Installing mamba ----
-RUN wget -q -O mambaforge3.sh \
- "https://github.com/conda-forge/miniforge/releases/download/$MAMBAFORGE_VERSION/Mambaforge-$MAMBAFORGE_VERSION-Linux-x86_64.sh" && \
- bash mambaforge3.sh -p ${CONDA_DIR} -b && \
- rm mambaforge3.sh
-
-# -----------------------------------------------------------------------------
-# ---- Copy needed files ----
-# basic workspace configurations
-COPY ./tools/gitpod/workspace_config /usr/local/bin/workspace_config
-
-RUN chmod a+rx /usr/local/bin/workspace_config && \
- workspace_config
-
-# Copy conda environment file into the container - this needs to exists inside
-# the container to create a conda environment from it
-COPY environment.yml /tmp/environment.yml
-
-# -----------------------------------------------------------------------------
-# ---- Create conda environment ----
-# Install NumPy dependencies
-RUN mamba env create -f /tmp/environment.yml && \
- conda activate ${CONDA_ENV} && \
- mamba install ccache -y && \
- # needed for docs rendering later on
- python -m pip install --no-cache-dir sphinx-autobuild && \
- conda clean --all -f -y && \
- rm -rf /tmp/*
-
-# -----------------------------------------------------------------------------
-# Always make sure we are not root
-USER gitpod \ No newline at end of file
diff --git a/tools/gitpod/gitpod.Dockerfile b/tools/gitpod/gitpod.Dockerfile
deleted file mode 100644
index 8dac0d597..000000000
--- a/tools/gitpod/gitpod.Dockerfile
+++ /dev/null
@@ -1,49 +0,0 @@
-# Doing a local shallow clone - keeps the container secure
-# and much slimmer than using COPY directly or making a
-# remote clone
-ARG BASE_CONTAINER="numpy/numpy-dev:latest"
-FROM gitpod/workspace-base:latest as clone
-
-COPY --chown=gitpod . /tmp/numpy_repo
-
-# the clone should be deep enough for versioneer to work
-RUN git clone --shallow-since=2021-05-22 file:////tmp/numpy_repo /tmp/numpy
-
-# -----------------------------------------------------------------------------
-# Using the numpy-dev Docker image as a base
-# This way, we ensure we have all the needed compilers and dependencies
-# while reducing the build time
-FROM ${BASE_CONTAINER} as build
-
-# -----------------------------------------------------------------------------
-USER root
-
-# -----------------------------------------------------------------------------
-# ---- ENV variables ----
-# ---- Directories needed ----
-ENV WORKSPACE=/workspace/numpy/ \
- CONDA_ENV=numpy-dev
-
-# Allows this Dockerfile to activate conda environments
-SHELL ["/bin/bash", "--login", "-o", "pipefail", "-c"]
-
-# Copy over the shallow clone
-COPY --from=clone --chown=gitpod /tmp/numpy ${WORKSPACE}
-
-# Everything happens in the /workspace/numpy directory
-WORKDIR ${WORKSPACE}
-
-# Build numpy to populate the cache used by ccache
-RUN git config --global --add safe.directory /workspace/numpy
-RUN git submodule update --init --depth=1 -- numpy/core/src/umath/svml
-RUN conda activate ${CONDA_ENV} && \
- python setup.py build_ext --inplace && \
- ccache -s
-
-# Gitpod will load the repository into /workspace/numpy. We remove the
-# directory from the image to prevent conflicts
-RUN rm -rf ${WORKSPACE}
-
-# -----------------------------------------------------------------------------
-# Always return to non privileged user
-USER gitpod
diff --git a/tools/gitpod/settings.json b/tools/gitpod/settings.json
deleted file mode 100644
index 50296336d..000000000
--- a/tools/gitpod/settings.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
- "restructuredtext.updateOnTextChanged": "true",
- "restructuredtext.updateDelay": 300,
- "restructuredtext.linter.disabledLinters": ["doc8","rst-lint", "rstcheck"],
- "python.defaultInterpreterPath": "/home/gitpod/mambaforge3/envs/numpy-dev/bin/python",
- "esbonio.sphinx.buildDir": "${workspaceRoot}/doc/build/html",
- "esbonio.sphinx.confDir": ""
-} \ No newline at end of file
diff --git a/tools/gitpod/workspace_config b/tools/gitpod/workspace_config
deleted file mode 100644
index aa859c9be..000000000
--- a/tools/gitpod/workspace_config
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-# Basic configurations for the workspace
-
-set -e
-
-# gitpod/workspace-base needs at least one file here
-touch /home/gitpod/.bashrc.d/empty
-
-# Add git aliases
-git config --global alias.co checkout
-git config --global alias.ci commit
-git config --global alias.st status
-git config --global alias.br branch
-git config --global alias.hist "log --pretty=format:'%h %ad | %s%d [%an]' --graph --date=short"
-git config --global alias.type 'cat-file -t'
-git config --global alias.dump 'cat-file -p'
-
-# Enable basic vim defaults in ~/.vimrc
-echo "filetype plugin indent on" >>~/.vimrc
-echo "set colorcolumn=80" >>~/.vimrc
-echo "set number" >>~/.vimrc
-echo "syntax enable" >>~/.vimrc
-
-# Vanity custom bash prompt - makes it more legible
-echo "PS1='\[\e]0;\u \w\a\]\[\033[01;36m\]\u\[\033[m\] > \[\033[38;5;141m\]\w\[\033[m\] \\$ '" >>~/.bashrc
-
-# Enable prompt color in the skeleton .bashrc
-# hadolint ignore=SC2016
-sed -i 's/^#force_color_prompt=yes/force_color_prompt=yes/' /etc/skel/.bashrc
-
-# .gitpod.yml is configured to install NumPy from /workspace/numpy
-echo "export PYTHONPATH=${WORKSPACE}" >>~/.bashrc
-
-# make conda activate command available from /bin/bash (login and interactive)
-if [[ ! -f "/etc/profile.d/conda.sh" ]]; then
- ln -s ${CONDA_DIR}/etc/profile.d/conda.sh /etc/profile.d/conda.sh
-fi
-echo ". ${CONDA_DIR}/etc/profile.d/conda.sh" >>~/.bashrc
-echo "conda activate numpy-dev" >>~/.bashrc
-
-# Enable prompt color in the skeleton .bashrc
-# hadolint ignore=SC2016
-sed -i 's/^#force_color_prompt=yes/force_color_prompt=yes/' /etc/skel/.bashrc
-
-# .gitpod.yml is configured to install numpy from /workspace/numpy
-echo "export PYTHONPATH=/workspace/numpy" >>~/.bashrc
-
-# Set up ccache for compilers for this Dockerfile
-# REF: https://github.com/conda-forge/compilers-feedstock/issues/31
-echo "conda activate numpy-dev" >>~/.startuprc
-echo "export CC=\"ccache \$CC\"" >>~/.startuprc
-echo "export CXX=\"ccache \$CXX\"" >>~/.startuprc
-echo "export F77=\"ccache \$F77\"" >>~/.startuprc
-echo "export F90=\"ccache \$F90\"" >>~/.startuprc
-echo "export GFORTRAN=\"ccache \$GFORTRAN\"" >>~/.startuprc
-echo "export FC=\"ccache \$FC\"" >>~/.startuprc
-echo "source ~/.startuprc" >>~/.profile
-echo "source ~/.startuprc" >>~/.bashrc
diff --git a/tools/list_installed_dll_dependencies_cygwin.sh b/tools/list_installed_dll_dependencies_cygwin.sh
index ee06ae0d0..78436fe53 100644
--- a/tools/list_installed_dll_dependencies_cygwin.sh
+++ b/tools/list_installed_dll_dependencies_cygwin.sh
@@ -14,11 +14,11 @@
py_ver=${1}
dll_list=`/bin/dash tools/list_numpy_dlls.sh ${py_ver}`
echo "Checks for existence, permissions and file type"
-ls -l ${dll_list}
-file ${dll_list}
+/usr/bin/timeout 10m /usr/bin/ls -l ${dll_list}
+/usr/bin/timeout 10m /usr/bin/file ${dll_list}
echo "Dependency checks"
-ldd ${dll_list} | grep -F -e " => not found" && exit 1
-cygcheck ${dll_list} >cygcheck_dll_list 2>cygcheck_missing_deps
+/usr/bin/timeout 10m /usr/bin/ldd ${dll_list} | grep -F -e " => not found" && exit 1
+/usr/bin/timeout 10m /usr/bin/cygcheck ${dll_list} >cygcheck_dll_list 2>cygcheck_missing_deps
grep -F -e "cygcheck: track_down: could not find " cygcheck_missing_deps && exit 1
echo "Import tests"
mkdir -p dist/
@@ -31,5 +31,5 @@ do
-e "s/^\/+(home|usr).*?site-packages\/+//" \
-e "s/.cpython-3.m?-x86(_64)?-cygwin.dll$//" \
-e "s/\//./g"`
- python${py_ver} -c "import ${ext_module}"
+ /usr/bin/timeout 2m /usr/bin/python${py_ver} -c "import ${ext_module}"
done
diff --git a/tools/openblas_support.py b/tools/openblas_support.py
index ed43804af..fd8c6a97a 100644
--- a/tools/openblas_support.py
+++ b/tools/openblas_support.py
@@ -13,13 +13,14 @@ from tempfile import mkstemp, gettempdir
from urllib.request import urlopen, Request
from urllib.error import HTTPError
-OPENBLAS_V = '0.3.21'
-OPENBLAS_LONG = 'v0.3.21'
+OPENBLAS_V = '0.3.23'
+OPENBLAS_LONG = 'v0.3.23'
BASE_LOC = 'https://anaconda.org/multibuild-wheels-staging/openblas-libs'
BASEURL = f'{BASE_LOC}/{OPENBLAS_LONG}/download'
SUPPORTED_PLATFORMS = [
'linux-aarch64',
'linux-x86_64',
+ 'musllinux-x86_64',
'linux-i686',
'linux-ppc64le',
'linux-s390x',
@@ -55,10 +56,39 @@ def get_ilp64():
def get_manylinux(arch):
default = '2014'
- ret = os.environ.get("MB_ML_VER", default)
+ ml_ver = os.environ.get("MB_ML_VER", default)
# XXX For PEP 600 this can be a glibc version
- assert ret in ('2010', '2014', '_2_24'), f'invalid MB_ML_VER {ret}'
- return ret
+ assert ml_ver in ('2010', '2014', '_2_24'), f'invalid MB_ML_VER {ml_ver}'
+ suffix = f'manylinux{ml_ver}_{arch}.tar.gz'
+ return suffix
+
+
+def get_musllinux(arch):
+ musl_ver = "1_1"
+ suffix = f'musllinux_{musl_ver}_{arch}.tar.gz'
+ return suffix
+
+
+def get_linux(arch):
+ # best way of figuring out whether manylinux or musllinux is to look
+ # at the packaging tags. If packaging isn't installed (it's not by default)
+ # fallback to sysconfig (which may be flakier)
+ try:
+ from packaging.tags import sys_tags
+ tags = list(sys_tags())
+ plat = tags[0].platform
+ except ImportError:
+ # fallback to sysconfig for figuring out if you're using musl
+ plat = 'manylinux'
+ # value could be None
+ v = sysconfig.get_config_var('HOST_GNU_TYPE') or ''
+ if 'musl' in v:
+ plat = 'musllinux'
+
+ if 'manylinux' in plat:
+ return get_manylinux(arch)
+ elif 'musllinux' in plat:
+ return get_musllinux(arch)
def download_openblas(target, plat, ilp64):
@@ -70,8 +100,10 @@ def download_openblas(target, plat, ilp64):
'(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3')}
suffix = None
if osname == "linux":
- ml_ver = get_manylinux(arch)
- suffix = f'manylinux{ml_ver}_{arch}.tar.gz'
+ suffix = get_linux(arch)
+ typ = 'tar.gz'
+ elif osname == "musllinux":
+ suffix = get_musllinux(arch)
typ = 'tar.gz'
elif plat == 'macosx-x86_64':
suffix = 'macosx_10_9_x86_64-gf_c469a42.tar.gz'
@@ -200,7 +232,7 @@ def make_init(dirname):
'''
Create a _distributor_init.py file for OpenBlas
'''
- with open(os.path.join(dirname, '_distributor_init.py'), 'wt') as fid:
+ with open(os.path.join(dirname, '_distributor_init.py'), 'w') as fid:
fid.write(textwrap.dedent("""
'''
Helper to preload windows dlls to prevent dll not found errors.
@@ -267,15 +299,11 @@ def test_setup(plats):
if not target:
raise RuntimeError(f'Could not setup {plat}')
print('success with', plat, ilp64)
- if osname == 'win':
- if not target.endswith('.a'):
- raise RuntimeError("Not .a extracted!")
- else:
- files = glob.glob(os.path.join(target, "lib", "*.a"))
- if not files:
- raise RuntimeError("No lib/*.a unpacked!")
+ files = glob.glob(os.path.join(target, "lib", "*.a"))
+ if not files:
+ raise RuntimeError("No lib/*.a unpacked!")
finally:
- if target is not None:
+ if target:
if os.path.isfile(target):
os.unlink(target)
else:
diff --git a/tools/rebase_installed_dlls_cygwin.sh b/tools/rebase_installed_dlls_cygwin.sh
index f772879d9..58d4f0c0f 100644
--- a/tools/rebase_installed_dlls_cygwin.sh
+++ b/tools/rebase_installed_dlls_cygwin.sh
@@ -2,4 +2,6 @@
# Rebase the dlls installed by NumPy
py_ver=${1}
-/usr/bin/rebase --database --oblivious `/bin/dash tools/list_numpy_dlls.sh ${py_ver}`
+numpy_dlls="`/bin/dash tools/list_numpy_dlls.sh ${py_ver}`"
+/usr/bin/rebase --verbose --database --oblivious ${numpy_dlls}
+/usr/bin/rebase --verbose --info ${numpy_dlls}
diff --git a/tools/refguide_check.py b/tools/refguide_check.py
index 1d230b385..2c69b9464 100644
--- a/tools/refguide_check.py
+++ b/tools/refguide_check.py
@@ -99,12 +99,6 @@ DOCTEST_SKIPDICT = {
'numpy.lib.DataSource': None,
'numpy.lib.Repository': None,
}
-if sys.version_info < (3, 9):
- DOCTEST_SKIPDICT.update({
- "numpy.core.ndarray": {"__class_getitem__"},
- "numpy.core.dtype": {"__class_getitem__"},
- "numpy.core.number": {"__class_getitem__"},
- })
# Skip non-numpy RST files, historical release notes
# Any single-directory exact match will skip the directory and all subdirs.
diff --git a/tools/swig/pyfragments.swg b/tools/swig/pyfragments.swg
index eac817322..6d3e6ff41 100644
--- a/tools/swig/pyfragments.swg
+++ b/tools/swig/pyfragments.swg
@@ -52,6 +52,7 @@
}
%#endif
if (!PyArray_IsScalar(obj,Integer)) return SWIG_TypeError;
+ if (!val) return SWIG_OK;
PyArray_Descr * longDescr = PyArray_DescrFromType(NPY_LONG);
PyArray_CastScalarToCtype(obj, (void*)val, longDescr);
Py_DECREF(longDescr);
@@ -102,6 +103,7 @@
}
%#endif
if (!PyArray_IsScalar(obj,Integer)) return SWIG_TypeError;
+ if (!val) return SWIG_OK;
PyArray_Descr * ulongDescr = PyArray_DescrFromType(NPY_ULONG);
PyArray_CastScalarToCtype(obj, (void*)val, ulongDescr);
Py_DECREF(ulongDescr);
diff --git a/tools/travis-test.sh b/tools/travis-test.sh
index 6216c239b..67e507eb3 100755
--- a/tools/travis-test.sh
+++ b/tools/travis-test.sh
@@ -33,7 +33,7 @@ werrors="$werrors -Werror=implicit-function-declaration"
setup_base()
{
# use default python flags but remove sign-compare
- sysflags="$($PYTHON -c "from distutils import sysconfig; \
+ sysflags="$($PYTHON -c "import sysconfig; \
print (sysconfig.get_config_var('CFLAGS'))")"
export CFLAGS="$sysflags $werrors -Wlogical-op -Wno-sign-compare"
diff --git a/tools/wheels/check_license.py b/tools/wheels/check_license.py
index 0fe7356c0..8ced317d6 100644
--- a/tools/wheels/check_license.py
+++ b/tools/wheels/check_license.py
@@ -9,7 +9,6 @@ distribution.
"""
import os
import sys
-import io
import re
import argparse
@@ -36,7 +35,7 @@ def main():
# Check license text
license_txt = os.path.join(os.path.dirname(mod.__file__), "LICENSE.txt")
- with io.open(license_txt, "r", encoding="utf-8") as f:
+ with open(license_txt, encoding="utf-8") as f:
text = f.read()
ok = check_text(text)
diff --git a/tools/wheels/cibw_before_build.sh b/tools/wheels/cibw_before_build.sh
index ad76b330f..493cceeae 100644
--- a/tools/wheels/cibw_before_build.sh
+++ b/tools/wheels/cibw_before_build.sh
@@ -15,13 +15,18 @@ fi
# Install Openblas
if [[ $RUNNER_OS == "Linux" || $RUNNER_OS == "macOS" ]] ; then
basedir=$(python tools/openblas_support.py)
- cp -r $basedir/lib/* /usr/local/lib
- cp $basedir/include/* /usr/local/include
if [[ $RUNNER_OS == "macOS" && $PLATFORM == "macosx-arm64" ]]; then
+ # /usr/local/lib doesn't exist on cirrus-ci runners
+ sudo mkdir -p /usr/local/lib /usr/local/include /usr/local/lib/cmake/openblas
sudo mkdir -p /opt/arm64-builds/lib /opt/arm64-builds/include
sudo chown -R $USER /opt/arm64-builds
cp -r $basedir/lib/* /opt/arm64-builds/lib
cp $basedir/include/* /opt/arm64-builds/include
+ sudo cp -r $basedir/lib/* /usr/local/lib
+ sudo cp $basedir/include/* /usr/local/include
+ else
+ cp -r $basedir/lib/* /usr/local/lib
+ cp $basedir/include/* /usr/local/include
fi
elif [[ $RUNNER_OS == "Windows" ]]; then
PYTHONPATH=tools python -c "import openblas_support; openblas_support.make_init('numpy')"
@@ -42,6 +47,5 @@ if [[ $RUNNER_OS == "macOS" ]]; then
fi
source $PROJECT_DIR/tools/wheels/gfortran_utils.sh
install_gfortran
- # Try a newer version of delocate that knows about /usr/local/lib
- pip install git+https://github.com/isuruf/delocate@search_usr_local#egg=delocate
+ pip install "delocate==0.10.4"
fi
diff --git a/tools/wheels/gfortran_utils.sh b/tools/wheels/gfortran_utils.sh
index 39357b3fe..9102ba127 100644
--- a/tools/wheels/gfortran_utils.sh
+++ b/tools/wheels/gfortran_utils.sh
@@ -123,7 +123,7 @@ if [ "$(uname)" == "Darwin" ]; then
curl -L -O https://github.com/isuruf/gcc/releases/download/gcc-11.3.0-2/gfortran-darwin-${arch}-${type}.tar.gz
case ${arch}-${type} in
arm64-native)
- export GFORTRAN_SHA=142290685240f4f86cdf359cb2030d586105d7e4
+ export GFORTRAN_SHA=0d5c118e5966d0fb9e7ddb49321f63cac1397ce8
;;
arm64-cross)
export GFORTRAN_SHA=527232845abc5af21f21ceacc46fb19c190fe804
@@ -148,10 +148,10 @@ if [ "$(uname)" == "Darwin" ]; then
if [[ "${type}" == "native" ]]; then
# Link these into /usr/local so that there's no need to add rpath or -L
for f in libgfortran.dylib libgfortran.5.dylib libgcc_s.1.dylib libgcc_s.1.1.dylib libquadmath.dylib libquadmath.0.dylib; do
- ln -sf /opt/gfortran-darwin-${arch}-${type}/lib/$f /usr/local/lib/$f
+ sudo ln -sf /opt/gfortran-darwin-${arch}-${type}/lib/$f /usr/local/lib/$f
done
# Add it to PATH
- ln -sf /opt/gfortran-darwin-${arch}-${type}/bin/gfortran /usr/local/bin/gfortran
+ sudo ln -sf /opt/gfortran-darwin-${arch}-${type}/bin/gfortran /usr/local/bin/gfortran
fi
}
diff --git a/tools/wheels/upload_wheels.sh b/tools/wheels/upload_wheels.sh
index 6694caf01..bc2066726 100644
--- a/tools/wheels/upload_wheels.sh
+++ b/tools/wheels/upload_wheels.sh
@@ -37,8 +37,6 @@ upload_wheels() {
if [[ -z ${TOKEN} ]]; then
echo no token set, not uploading
else
- python -m pip install \
- git+https://github.com/Anaconda-Platform/anaconda-client.git@be1e14936a8e947da94d026c990715f0596d7043
# sdists are located under dist folder when built through setup.py
if compgen -G "./dist/*.gz"; then
echo "Found sdist"