51 files changed, 375 insertions, 371 deletions
diff --git a/.gitpod.yml b/.gitpod.yml index f9c35fd9b..c46752f10 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -6,18 +6,20 @@ image: numpy/numpy-gitpod:latest tasks: - - name: Prepare development + - name: Prepare development environment init: | mkdir -p .vscode cp tools/gitpod/settings.json .vscode/settings.json + rm -f /workspace/numpy/.git/shallow.lock conda activate numpy-dev + git pull --unshallow # need to force this else the prebuild fails + git fetch --tags python setup.py build_ext --inplace echo "🛠Completed rebuilding NumPy!! 🛠" echo "📖 Building docs 📖 " cd doc make html echo "✨ Pre-build complete! You can close this terminal ✨ " - # -------------------------------------------------------- # exposing ports for liveserve @@ -60,3 +62,4 @@ github: addBadge: false # add a label once the prebuild is ready to pull requests (defaults to false) addLabel: false +
\ No newline at end of file @@ -13,7 +13,7 @@ []( https://numfocus.org) -[]( +[]( https://pypi.org/project/numpy/) []( https://anaconda.org/conda-forge/numpy) diff --git a/doc/changelog/1.12.0-changelog.rst b/doc/changelog/1.12.0-changelog.rst index 2e91f510f..052714374 100644 --- a/doc/changelog/1.12.0-changelog.rst +++ b/doc/changelog/1.12.0-changelog.rst @@ -283,7 +283,7 @@ A total of 418 pull requests were merged for this release. * `#7373 <https://github.com/numpy/numpy/pull/7373>`__: ENH: Add bitwise_and identity * `#7378 <https://github.com/numpy/numpy/pull/7378>`__: added NumPy logo and separator * `#7382 <https://github.com/numpy/numpy/pull/7382>`__: MAINT: cleanup np.average -* `#7385 <https://github.com/numpy/numpy/pull/7385>`__: DOC: note about wheels / windows wheels for pypi +* `#7385 <https://github.com/numpy/numpy/pull/7385>`__: DOC: note about wheels / windows wheels for PyPI * `#7386 <https://github.com/numpy/numpy/pull/7386>`__: Added label icon to Travis status * `#7397 <https://github.com/numpy/numpy/pull/7397>`__: BUG: incorrect type for objects whose __len__ fails * `#7398 <https://github.com/numpy/numpy/pull/7398>`__: DOC: fix typo diff --git a/doc/changelog/1.20.0-changelog.rst b/doc/changelog/1.20.0-changelog.rst index f06bd8a8d..f2af4a7de 100644 --- a/doc/changelog/1.20.0-changelog.rst +++ b/doc/changelog/1.20.0-changelog.rst @@ -714,7 +714,7 @@ A total of 716 pull requests were merged for this release. * `#17440 <https://github.com/numpy/numpy/pull/17440>`__: DOC: Cleaner template for PRs * `#17442 <https://github.com/numpy/numpy/pull/17442>`__: MAINT: fix exception chaining in format.py * `#17443 <https://github.com/numpy/numpy/pull/17443>`__: ENH: Warn on unsupported Python 3.10+ -* `#17444 <https://github.com/numpy/numpy/pull/17444>`__: ENH: Add ``Typing :: Typed`` to the PyPi classifier +* `#17444 <https://github.com/numpy/numpy/pull/17444>`__: ENH: Add ``Typing :: Typed`` to the PyPI classifier * `#17445 <https://github.com/numpy/numpy/pull/17445>`__: DOC: Fix the references for macros * `#17447 <https://github.com/numpy/numpy/pull/17447>`__: NEP: update NEP 42 with discussion of type hinting applications * `#17448 <https://github.com/numpy/numpy/pull/17448>`__: DOC: Remove CoC pages from Sphinx diff --git a/doc/neps/nep-0017-split-out-maskedarray.rst b/doc/neps/nep-0017-split-out-maskedarray.rst index 5cb1c0c39..faad68828 100644 --- a/doc/neps/nep-0017-split-out-maskedarray.rst +++ b/doc/neps/nep-0017-split-out-maskedarray.rst @@ -123,7 +123,7 @@ References and Footnotes .. [1] Subclassing ndarray, https://docs.scipy.org/doc/numpy/user/basics.subclassing.html -.. [2] PyPi: maskedarray, https://pypi.org/project/maskedarray/ +.. [2] PyPI: maskedarray, https://pypi.org/project/maskedarray/ Copyright --------- diff --git a/doc/source/dev/development_workflow.rst b/doc/source/dev/development_workflow.rst index 457bcf34a..38f047ec0 100644 --- a/doc/source/dev/development_workflow.rst +++ b/doc/source/dev/development_workflow.rst @@ -204,10 +204,10 @@ fragments in your commit message:: ``[skip travis]``: skip TravisCI jobs ``[skip azurepipelines]``: skip Azure jobs -*Note: unfortunately not all CI systems implement this feature well, or at all. +*Note*: unfortunately not all CI systems implement this feature well, or at all. CircleCI supports ``ci skip`` but has no command to skip only CircleCI. Azure chooses to still run jobs with skip commands on PRs, the jobs only get -skipped on merging to master.* +skipped on merging to master. .. 
_workflow_mailing_list: diff --git a/doc/source/reference/random/index.rst b/doc/source/reference/random/index.rst index aaabc9b39..674799d47 100644 --- a/doc/source/reference/random/index.rst +++ b/doc/source/reference/random/index.rst @@ -185,7 +185,7 @@ What's New or Different methods which are 2-10 times faster than NumPy's Box-Muller or inverse CDF implementations. * Optional ``dtype`` argument that accepts ``np.float32`` or ``np.float64`` - to produce either single or double prevision uniform random variables for + to produce either single or double precision uniform random variables for select distributions * Optional ``out`` argument that allows existing arrays to be filled for select distributions diff --git a/doc/source/reference/random/new-or-different.rst b/doc/source/reference/random/new-or-different.rst index a81543926..7a206a2ce 100644 --- a/doc/source/reference/random/new-or-different.rst +++ b/doc/source/reference/random/new-or-different.rst @@ -84,7 +84,7 @@ And in more detail: * The bit generators can be used in downstream projects via Cython. * Optional ``dtype`` argument that accepts ``np.float32`` or ``np.float64`` - to produce either single or double prevision uniform random variables for + to produce either single or double precision uniform random variables for select distributions * Uniforms (`~.Generator.random` and `~.Generator.integers`) diff --git a/doc/source/reference/simd/generated_tables/cpu_features.inc b/doc/source/reference/simd/generated_tables/cpu_features.inc index 17d1b4951..7782172d2 100644 --- a/doc/source/reference/simd/generated_tables/cpu_features.inc +++ b/doc/source/reference/simd/generated_tables/cpu_features.inc @@ -36,26 +36,28 @@ On IBM/POWER big-endian .. table:: :align: left - ======== ================ - Name Implies - ======== ================ - ``VSX`` - ``VSX2`` ``VSX`` - ``VSX3`` ``VSX`` ``VSX2`` - ======== ================ + ======== ========================= + Name Implies + ======== ========================= + ``VSX`` + ``VSX2`` ``VSX`` + ``VSX3`` ``VSX`` ``VSX2`` + ``VSX4`` ``VSX`` ``VSX2`` ``VSX3`` + ======== ========================= On IBM/POWER little-endian ~~~~~~~~~~~~~~~~~~~~~~~~~~ .. table:: :align: left - ======== ================ - Name Implies - ======== ================ - ``VSX`` ``VSX2`` - ``VSX2`` ``VSX`` - ``VSX3`` ``VSX`` ``VSX2`` - ======== ================ + ======== ========================= + Name Implies + ======== ========================= + ``VSX`` ``VSX2`` + ``VSX2`` ``VSX`` + ``VSX3`` ``VSX`` ``VSX2`` + ``VSX4`` ``VSX`` ``VSX2`` ``VSX3`` + ======== ========================= On ARMv7/A32 ~~~~~~~~~~~~ diff --git a/doc/source/release/1.10.3-notes.rst b/doc/source/release/1.10.3-notes.rst index 0d4df4ce6..9172f7663 100644 --- a/doc/source/release/1.10.3-notes.rst +++ b/doc/source/release/1.10.3-notes.rst @@ -2,4 +2,4 @@ NumPy 1.10.3 Release Notes ========================== -N/A this release did not happen due to various screwups involving PyPi. +N/A this release did not happen due to various screwups involving PyPI. diff --git a/doc/source/release/1.11.1-notes.rst b/doc/source/release/1.11.1-notes.rst index 6303c32f0..a196502cf 100644 --- a/doc/source/release/1.11.1-notes.rst +++ b/doc/source/release/1.11.1-notes.rst @@ -4,7 +4,7 @@ NumPy 1.11.1 Release Notes Numpy 1.11.1 supports Python 2.6 - 2.7 and 3.2 - 3.5. It fixes bugs and regressions found in Numpy 1.11.0 and includes several build related -improvements. Wheels for Linux, Windows, and OSX can be found on pypi. +improvements. 
Wheels for Linux, Windows, and OSX can be found on PyPI. Fixes Merged ============ diff --git a/doc/source/release/1.12.1-notes.rst b/doc/source/release/1.12.1-notes.rst index f67dab108..09a2e6738 100644 --- a/doc/source/release/1.12.1-notes.rst +++ b/doc/source/release/1.12.1-notes.rst @@ -4,7 +4,7 @@ NumPy 1.12.1 Release Notes NumPy 1.12.1 supports Python 2.7 and 3.4 - 3.6 and fixes bugs and regressions found in NumPy 1.12.0. In particular, the regression in f2py constant parsing -is fixed. Wheels for Linux, Windows, and OSX can be found on pypi, +is fixed. Wheels for Linux, Windows, and OSX can be found on PyPI, Bugs Fixed ========== diff --git a/doc/source/user/absolute_beginners.rst b/doc/source/user/absolute_beginners.rst index 2c6882905..cf11c6745 100644 --- a/doc/source/user/absolute_beginners.rst +++ b/doc/source/user/absolute_beginners.rst @@ -1588,7 +1588,7 @@ If you created this array "a" :: .. for doctests The continuous integration truncates dataframe display without this setting. - >>> pd.set_option('max_columns', 10) + >>> pd.set_option('display.max_columns', 10) You could create a Pandas dataframe :: diff --git a/environment.yml b/environment.yml index 024bee2c7..214a75352 100644 --- a/environment.yml +++ b/environment.yml @@ -7,7 +7,7 @@ name: numpy-dev channels: - conda-forge dependencies: - - python + - python=3.9 #need to pin to avoid issues with builds - cython - compilers - openblas @@ -21,13 +21,14 @@ dependencies: # For type annotations - mypy=0.931 # For building docs - - sphinx=4.1.1 + - sphinx=4.2.0 + - sphinx-panels - numpydoc=1.1.0 - ipython - scipy - pandas - matplotlib - - pydata-sphinx-theme + - pydata-sphinx-theme=0.7.2 - breathe # For linting - pycodestyle=2.7.0 diff --git a/numpy/__init__.pyi b/numpy/__init__.pyi index 451a29a02..92f98a801 100644 --- a/numpy/__init__.pyi +++ b/numpy/__init__.pyi @@ -3673,6 +3673,8 @@ class memmap(ndarray[_ShapeType, _DType_co]): ) -> Any: ... def flush(self) -> None: ... +# TODO: Add a mypy plugin for managing functions whose output type is dependant +# on the literal value of some sort of signature (e.g. `einsum` and `vectorize`) class vectorize: pyfunc: Callable[..., Any] cache: bool @@ -3689,7 +3691,7 @@ class vectorize: cache: bool = ..., signature: None | str = ..., ) -> None: ... - def __call__(self, *args: Any, **kwargs: Any) -> NDArray[Any]: ... + def __call__(self, *args: Any, **kwargs: Any) -> Any: ... class poly1d: @property diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py index 7081f9a59..1bbacad45 100644 --- a/numpy/core/_add_newdocs.py +++ b/numpy/core/_add_newdocs.py @@ -2943,7 +2943,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('any', add_newdoc('numpy.core.multiarray', 'ndarray', ('argmax', """ - a.argmax(axis=None, out=None) + a.argmax(axis=None, out=None, *, keepdims=False) Return indices of the maximum values along the given axis. @@ -2958,7 +2958,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('argmax', add_newdoc('numpy.core.multiarray', 'ndarray', ('argmin', """ - a.argmin(axis=None, out=None) + a.argmin(axis=None, out=None, *, keepdims=False) Return indices of the minimum values along the given axis. 
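Editorial aside (not part of the diff): the two docstring hunks above bring ndarray.argmax / ndarray.argmin in line with their actual signatures, which accept a keyword-only keepdims flag in recent NumPy (1.22+). A minimal sketch of what that flag does, reusing the small array from the np.mean example elsewhere in this diff; the printed values are what that assumption implies, not output taken from the patch:

    import numpy as np

    a = np.array([[5, 9, 13],
                  [14, 10, 12],
                  [11, 15, 19]])

    # Default: the reduced axis is dropped from the result shape.
    print(a.argmax(axis=1))                    # [2 0 2]           shape (3,)

    # keepdims=True keeps the reduced axis with length 1, so the index
    # array broadcasts directly against the original array.
    idx = a.argmax(axis=1, keepdims=True)      # shape (3, 1)
    print(np.take_along_axis(a, idx, axis=1))  # [[13] [14] [19]]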
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 054150b28..b11504c03 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -844,7 +844,7 @@ defdict = { docstrings.get('numpy.core.umath.trunc'), None, TD('e', f='trunc', astype={'e': 'f'}), - TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), + TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]), TD('fdg', f='trunc'), TD(O, f='npy_ObjectTrunc'), ), @@ -860,7 +860,7 @@ defdict = { docstrings.get('numpy.core.umath.floor'), None, TD('e', f='floor', astype={'e': 'f'}), - TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), + TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]), TD('fdg', f='floor'), TD(O, f='npy_ObjectFloor'), ), @@ -869,7 +869,7 @@ defdict = { docstrings.get('numpy.core.umath.rint'), None, TD('e', f='rint', astype={'e': 'f'}), - TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), + TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]), TD('fdg' + cmplx, f='rint'), TD(P, f='rint'), ), diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py index f26f306fa..0ec4f7687 100644 --- a/numpy/core/fromnumeric.py +++ b/numpy/core/fromnumeric.py @@ -3408,6 +3408,7 @@ def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, *, 0.55000000074505806 # may vary Specifying a where argument: + >>> a = np.array([[5, 9, 13], [14, 10, 12], [11, 15, 19]]) >>> np.mean(a) 12.0 diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h index 47d063178..35d82ec03 100644 --- a/numpy/core/include/numpy/ndarraytypes.h +++ b/numpy/core/include/numpy/ndarraytypes.h @@ -87,7 +87,7 @@ enum NPY_TYPES { NPY_BOOL=0, /* The number of types not including the new 1.6 types */ NPY_NTYPES_ABI_COMPATIBLE=21 }; -#ifdef _MSC_VER +#if defined(_MSC_VER) && !defined(__clang__) #pragma deprecated(NPY_CHAR) #endif diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h index 88794ca07..2bcc45e4f 100644 --- a/numpy/core/include/numpy/npy_common.h +++ b/numpy/core/include/numpy/npy_common.h @@ -131,9 +131,10 @@ #endif #endif -#if defined(_MSC_VER) - #define NPY_INLINE __inline -#elif defined(__GNUC__) +#if defined(_MSC_VER) && !defined(__clang__) + #define NPY_INLINE __inline +/* clang included here to handle clang-cl on Windows */ +#elif defined(__GNUC__) || defined(__clang__) #if defined(__STRICT_ANSI__) #define NPY_INLINE __inline__ #else @@ -180,12 +181,6 @@ defined(__MINGW32__) || defined(__MINGW64__) #include <io.h> -/* mingw based on 3.4.5 has lseek but not ftell/fseek */ -#if defined(__MINGW32__) || defined(__MINGW64__) -extern int __cdecl _fseeki64(FILE *, long long, int); -extern long long __cdecl _ftelli64(FILE *); -#endif - #define npy_fseek _fseeki64 #define npy_ftell _ftelli64 #define npy_lseek _lseeki64 diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index 84de9a059..fabec069c 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -381,7 +381,7 @@ SIMD_IMPL_INTRIN_1(sumup_@sfx@, @esfx@, v@sfx@) ***************************/ #if @fp_only@ /**begin repeat1 - * #intrin = sqrt, recip, abs, square, ceil, trunc# + * #intrin = sqrt, recip, abs, square, rint, ceil, trunc, floor# */ SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@) /**end repeat1**/ @@ -615,7 +615,7 @@ SIMD_INTRIN_DEF(sumup_@sfx@) ***************************/ #if @fp_only@ /**begin 
repeat1 - * #intrin = sqrt, recip, abs, square, ceil, trunc# + * #intrin = sqrt, recip, abs, square, rint, ceil, trunc, floor# */ SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ diff --git a/numpy/core/src/common/npy_cpu_features.c.src b/numpy/core/src/common/npy_cpu_features.c.src index 1385220f9..ff4f9f60a 100644 --- a/numpy/core/src/common/npy_cpu_features.c.src +++ b/numpy/core/src/common/npy_cpu_features.c.src @@ -61,7 +61,7 @@ npy_cpu_features_dict(void) * AVX512VPOPCNTDQ, AVX512VL, AVX512BW, AVX512DQ, AVX512VNNI, * AVX512IFMA, AVX512VBMI, AVX512VBMI2, AVX512BITALG, * AVX512_KNL, AVX512_KNM, AVX512_SKX, AVX512_CLX, AVX512_CNL, AVX512_ICL, - * VSX, VSX2, VSX3, + * VSX, VSX2, VSX3, VSX4, * VX, VXE, VXE2, * NEON, NEON_FP16, NEON_VFPV4, ASIMD, FPHP, ASIMDHP, ASIMDDP, ASIMDFHM# */ @@ -474,9 +474,15 @@ npy__cpu_init_features(void) #ifndef AT_HWCAP2 #define AT_HWCAP2 26 #endif + #ifndef PPC_FEATURE2_ARCH_2_07 + #define PPC_FEATURE2_ARCH_2_07 0x80000000 + #endif #ifndef PPC_FEATURE2_ARCH_3_00 #define PPC_FEATURE2_ARCH_3_00 0x00800000 #endif + #ifndef PPC_FEATURE2_ARCH_3_1 + #define PPC_FEATURE2_ARCH_3_1 0x00040000 + #endif #endif static void @@ -489,15 +495,18 @@ npy__cpu_init_features(void) return; hwcap = getauxval(AT_HWCAP2); - if (hwcap & PPC_FEATURE2_ARCH_3_00) + if (hwcap & PPC_FEATURE2_ARCH_3_1) { npy__cpu_have[NPY_CPU_FEATURE_VSX] = npy__cpu_have[NPY_CPU_FEATURE_VSX2] = - npy__cpu_have[NPY_CPU_FEATURE_VSX3] = 1; + npy__cpu_have[NPY_CPU_FEATURE_VSX3] = + npy__cpu_have[NPY_CPU_FEATURE_VSX4] = 1; return; } - npy__cpu_have[NPY_CPU_FEATURE_VSX2] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0; npy__cpu_have[NPY_CPU_FEATURE_VSX] = 1; + npy__cpu_have[NPY_CPU_FEATURE_VSX2] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0; + npy__cpu_have[NPY_CPU_FEATURE_VSX3] = (hwcap & PPC_FEATURE2_ARCH_3_00) != 0; + npy__cpu_have[NPY_CPU_FEATURE_VSX4] = (hwcap & PPC_FEATURE2_ARCH_3_1) != 0; // TODO: AIX, FreeBSD #else npy__cpu_have[NPY_CPU_FEATURE_VSX] = 1; @@ -507,6 +516,9 @@ npy__cpu_init_features(void) #ifdef NPY_HAVE_VSX3 npy__cpu_have[NPY_CPU_FEATURE_VSX3] = 1; #endif + #ifdef NPY_HAVE_VSX4 + npy__cpu_have[NPY_CPU_FEATURE_VSX4] = 1; + #endif #endif } diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h index 1f52a445d..3d5f2e75c 100644 --- a/numpy/core/src/common/npy_cpu_features.h +++ b/numpy/core/src/common/npy_cpu_features.h @@ -65,6 +65,8 @@ enum npy_cpu_features NPY_CPU_FEATURE_VSX2 = 201, // POWER9 NPY_CPU_FEATURE_VSX3 = 202, + // POWER10 + NPY_CPU_FEATURE_VSX4 = 203, // ARM NPY_CPU_FEATURE_NEON = 300, @@ -167,8 +169,8 @@ npy_cpu_baseline_list(void); * On x64: ['SSSE3', 'SSE41', 'POPCNT', 'SSE42', 'AVX', 'F16C', 'FMA3', 'AVX2', 'AVX512F', ...] 
* On armhf: ['NEON', 'NEON_FP16', 'NEON_VPFV4', 'ASIMD', 'ASIMDHP', 'ASIMDDP', 'ASIMDFHM'] * On aarch64: ['ASIMDHP', 'ASIMDDP', 'ASIMDFHM'] - * On ppc64: ['VSX', 'VSX2', 'VSX3'] - * On ppc64le: ['VSX3'] + * On ppc64: ['VSX', 'VSX2', 'VSX3', 'VSX4'] + * On ppc64le: ['VSX3', 'VSX4'] * On s390x: ['VX', 'VXE', VXE2] * On any other arch or if the optimization is disabled: [] */ diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h index ec15e50e1..deaf4ad11 100644 --- a/numpy/core/src/common/simd/avx2/math.h +++ b/numpy/core/src/common/simd/avx2/math.h @@ -42,7 +42,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_max_f64 _mm256_max_pd // Maximum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) { __m256 nn = _mm256_cmp_ps(b, b, _CMP_ORD_Q); @@ -76,7 +76,7 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b) #define npyv_min_f64 _mm256_min_pd // Minimum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) { __m256 nn = _mm256_cmp_ps(b, b, _CMP_ORD_Q); @@ -105,6 +105,10 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return _mm256_blendv_epi8(a, b, _mm256_cmpgt_epi64(a, b)); } +// round to nearest intger even +#define npyv_rint_f32(A) _mm256_round_ps(A, _MM_FROUND_TO_NEAREST_INT) +#define npyv_rint_f64(A) _mm256_round_pd(A, _MM_FROUND_TO_NEAREST_INT) + // ceil #define npyv_ceil_f32 _mm256_ceil_ps #define npyv_ceil_f64 _mm256_ceil_pd @@ -113,4 +117,8 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) #define npyv_trunc_f32(A) _mm256_round_ps(A, _MM_FROUND_TO_ZERO) #define npyv_trunc_f64(A) _mm256_round_pd(A, _MM_FROUND_TO_ZERO) +// floor +#define npyv_floor_f32 _mm256_floor_ps +#define npyv_floor_f64 _mm256_floor_pd + #endif // _NPY_SIMD_AVX2_MATH_H diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h index f30e50ad0..5a6cb6dcd 100644 --- a/numpy/core/src/common/simd/avx512/math.h +++ b/numpy/core/src/common/simd/avx512/math.h @@ -51,7 +51,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_max_f64 _mm512_max_pd // Maximum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) { __mmask16 nn = _mm512_cmp_ps_mask(b, b, _CMP_ORD_Q); @@ -84,7 +84,7 @@ NPY_FINLINE npyv_f64 npyv_maxp_f64(npyv_f64 a, npyv_f64 b) #define npyv_min_f64 _mm512_min_pd // Minimum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. 
NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) { __mmask16 nn = _mm512_cmp_ps_mask(b, b, _CMP_ORD_Q); @@ -112,6 +112,10 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) #define npyv_min_u64 _mm512_min_epu64 #define npyv_min_s64 _mm512_min_epi64 +// round to nearest integer even +#define npyv_rint_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_NEAREST_INT) +#define npyv_rint_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_NEAREST_INT) + // ceil #define npyv_ceil_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_POS_INF) #define npyv_ceil_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_POS_INF) @@ -120,4 +124,8 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) #define npyv_trunc_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_ZERO) #define npyv_trunc_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_ZERO) +// floor +#define npyv_floor_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_NEG_INF) +#define npyv_floor_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_NEG_INF) + #endif // _NPY_SIMD_AVX512_MATH_H diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h index a7a461721..42f022c55 100644 --- a/numpy/core/src/common/simd/intdiv.h +++ b/numpy/core/src/common/simd/intdiv.h @@ -136,7 +136,7 @@ NPY_FINLINE npy_uint64 npyv__divh128_u64(npy_uint64 high, npy_uint64 divisor) { assert(divisor > 1); npy_uint64 quotient; -#if defined(_M_X64) && defined(_MSC_VER) && _MSC_VER >= 1920 +#if defined(_M_X64) && defined(_MSC_VER) && _MSC_VER >= 1920 && !defined(__clang__) npy_uint64 remainder; quotient = _udiv128(high, 0, divisor, &remainder); (void)remainder; diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 19e5cd846..4607d6f27 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -153,6 +153,33 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return vbslq_s64(npyv_cmplt_s64(a, b), a, b); } +// round to nearest integer even +NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) +{ +#ifdef NPY_HAVE_ASIMD + return vrndnq_f32(a); +#else + // ARMv7 NEON only supports fp to int truncate conversion. + // a magic trick of adding 1.5 * 2**23 is used for rounding + // to nearest even and then substract this magic number to get + // the integer. 
+ const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23 + npyv_f32 round = vsubq_f32(vaddq_f32(a, magic), magic); + npyv_b32 overflow = vcleq_f32(vabsq_f32(a), vreinterpretq_f32_u32(vdupq_n_u32(0x4b000000))); + round = vbslq_f32(overflow, round, a); + // signed zero + round = vreinterpretq_f32_s32(vorrq_s32( + vreinterpretq_s32_f32(round), + vandq_s32(vreinterpretq_s32_f32(a), szero) + )); + return round; +#endif +} +#if NPY_SIMD_F64 + #define npyv_rint_f64 vrndnq_f64 +#endif // NPY_SIMD_F64 + // ceil #ifdef NPY_HAVE_ASIMD #define npyv_ceil_f32 vrndpq_f32 @@ -223,4 +250,36 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) #define npyv_trunc_f64 vrndq_f64 #endif // NPY_SIMD_F64 +// floor +#ifdef NPY_HAVE_ASIMD + #define npyv_floor_f32 vrndmq_f32 +#else + NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) + { + const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f)); + const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); + + npyv_s32 roundi = vcvtq_s32_f32(a); + npyv_f32 round = vcvtq_f32_s32(roundi); + npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32( + vandq_u32(vcgtq_f32(round, a), one) + )); + // respect signed zero + npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( + vreinterpretq_s32_f32(floor), + vandq_s32(vreinterpretq_s32_f32(a), szero) + )); + npyv_u32 nnan = npyv_notnan_f32(a); + npyv_u32 overflow = vorrq_u32( + vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) + ); + + return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); + } +#endif // NPY_HAVE_ASIMD +#if NPY_SIMD_F64 + #define npyv_floor_f64 vrndmq_f64 +#endif // NPY_SIMD_F64 + #endif // _NPY_SIMD_NEON_MATH_H diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h index 5daf7711e..e4b77b671 100644 --- a/numpy/core/src/common/simd/sse/math.h +++ b/numpy/core/src/common/simd/sse/math.h @@ -42,7 +42,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_max_f64 _mm_max_pd // Maximum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) { __m128 nn = _mm_cmpord_ps(b, b); @@ -95,7 +95,7 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b) #define npyv_min_f64 _mm_min_pd // Minimum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. 
NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) { __m128 nn = _mm_cmpord_ps(b, b); @@ -143,6 +143,38 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return npyv_select_s64(npyv_cmplt_s64(a, b), a, b); } +// round to nearest integer even +NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) +{ +#ifdef NPY_HAVE_SSE41 + return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); +#else + const npyv_f32 szero = _mm_set1_ps(-0.0f); + __m128i roundi = _mm_cvtps_epi32(a); + __m128i overflow = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); + __m128 r = _mm_cvtepi32_ps(roundi); + // respect sign of zero + r = _mm_or_ps(r, _mm_and_ps(a, szero)); + return npyv_select_f32(overflow, a, r); +#endif +} + +// round to nearest integer even +NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) +{ +#ifdef NPY_HAVE_SSE41 + return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT); +#else + const npyv_f64 szero = _mm_set1_pd(-0.0); + const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); + npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero)); + // round by add magic number 2^52 + npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52); + // respect signed zero, e.g. -0.5 -> -0.0 + return _mm_or_pd(round, _mm_and_pd(a, szero)); +#endif +} + // ceil #ifdef NPY_HAVE_SSE41 #define npyv_ceil_f32 _mm_ceil_ps @@ -202,4 +234,23 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) } #endif +// floor +#ifdef NPY_HAVE_SSE41 + #define npyv_floor_f32 _mm_floor_ps + #define npyv_floor_f64 _mm_floor_pd +#else + NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) + { + const npyv_f32 one = _mm_set1_ps(1.0f); + npyv_f32 round = npyv_rint_f32(a); + return _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one)); + } + NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a) + { + const npyv_f64 one = _mm_set1_pd(1.0); + npyv_f64 round = npyv_rint_f64(a); + return _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, a), one)); + } +#endif // NPY_HAVE_SSE41 + #endif // _NPY_SIMD_SSE_MATH_H diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h index d138cae8a..444bc9e54 100644 --- a/numpy/core/src/common/simd/vsx/math.h +++ b/numpy/core/src/common/simd/vsx/math.h @@ -38,7 +38,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_max_f64 vec_max // Maximum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. #define npyv_maxp_f32 vec_max #define npyv_maxp_f64 vec_max // Maximum, integer operations @@ -56,7 +56,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_min_f64 vec_min // Minimum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. 
#define npyv_minp_f32 vec_min #define npyv_minp_f64 vec_min // Minimum, integer operations @@ -69,6 +69,10 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_min_u64 vec_min #define npyv_min_s64 vec_min +// round to nearest int even +#define npyv_rint_f32 vec_rint +#define npyv_rint_f64 vec_rint + // ceil #define npyv_ceil_f32 vec_ceil #define npyv_ceil_f64 vec_ceil @@ -77,4 +81,8 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_trunc_f32 vec_trunc #define npyv_trunc_f64 vec_trunc +// floor +#define npyv_floor_f32 vec_floor +#define npyv_floor_f64 vec_floor + #endif // _NPY_SIMD_VSX_MATH_H diff --git a/numpy/core/src/multiarray/common_dtype.c b/numpy/core/src/multiarray/common_dtype.c index ca80b1ed7..3561a905a 100644 --- a/numpy/core/src/multiarray/common_dtype.c +++ b/numpy/core/src/multiarray/common_dtype.c @@ -41,7 +41,7 @@ * @param dtype2 Second DType class. * @return The common DType or NULL with an error set */ -NPY_NO_EXPORT NPY_INLINE PyArray_DTypeMeta * +NPY_NO_EXPORT PyArray_DTypeMeta * PyArray_CommonDType(PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2) { if (dtype1 == dtype2) { diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 5f054d0a9..7f084ac39 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1506,62 +1506,6 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const * */ /**begin repeat - * #func = rint, floor, trunc# - * #scalarf = npy_rint, npy_floor, npy_trunc# - */ - -/**begin repeat1 -* #TYPE = FLOAT, DOUBLE# -* #type = npy_float, npy_double# -* #typesub = f, # -*/ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - UNARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@@typesub@(in1); - } -} - - -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * #isa = avx512f, fma# - * #ISA = AVX512F, FMA# - * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS# - */ - -/**begin repeat1 - * #TYPE = FLOAT, DOUBLE# - * #type = npy_float, npy_double# - * #typesub = f, # - */ - -/**begin repeat2 - * #func = rint, floor, trunc# - * #scalarf = npy_rint, npy_floor, npy_trunc# - */ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - if (!run_unary_@isa@_@func@_@TYPE@(args, dimensions, steps)) { - UNARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@@typesub@(in1); - } - } -} - -/**end repeat2**/ -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat * Float types * #type = npy_float, npy_double, npy_longdouble# * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 3eafbdf66..e5235b464 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -186,7 +186,7 @@ NPY_NO_EXPORT void * #TYPE = FLOAT, DOUBLE# */ /**begin repeat1 - * #kind = ceil, sqrt, absolute, square, reciprocal# + * #kind = rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) @@ -274,26 +274,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( /**end repeat**/ /**begin repeat - * #func = rint, floor, trunc# - */ - -/**begin repeat1 -* #TYPE 
= FLOAT, DOUBLE# -*/ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -/**begin repeat2 - * #isa = avx512f, fma# - */ -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); -/**end repeat2**/ -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat * Float types * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# * #c = f, f, , l# diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src index 93761b98c..78e231965 100644 --- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src @@ -70,6 +70,15 @@ NPY_FINLINE double c_square_f64(double a) #define c_ceil_f32 npy_ceilf #define c_ceil_f64 npy_ceil +#define c_trunc_f32 npy_truncf +#define c_trunc_f64 npy_trunc + +#define c_floor_f32 npy_floorf +#define c_floor_f64 npy_floor + +#define c_rint_f32 npy_rintf +#define c_rint_f64 npy_rint + /******************************************************************************** ** Defining the SIMD kernels ********************************************************************************/ @@ -119,6 +128,9 @@ NPY_FINLINE double c_square_f64(double a) #if __clang_major__ < 10 // Clang before v10 #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 + #elif defined(_MSC_VER) + // clang-cl has the same bug + #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) // Clang v10+, targeting i386 or x86_64 #define WORKAROUND_CLANG_RECIPROCAL_BUG 0 @@ -139,10 +151,10 @@ NPY_FINLINE double c_square_f64(double a) */ #if @VCHK@ /**begin repeat1 - * #kind = ceil, sqrt, absolute, square, reciprocal# - * #intr = ceil, sqrt, abs, square, recip# - * #repl_0w1 = 0, 0, 0, 0, 1# - * #RECIP_WORKAROUND = 0, 0, 0, 0, WORKAROUND_CLANG_RECIPROCAL_BUG# + * #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal# + * #intr = rint, floor, ceil, trunc, sqrt, abs, square, recip# + * #repl_0w1 = 0*7, 1# + * #RECIP_WORKAROUND = 0*7, WORKAROUND_CLANG_RECIPROCAL_BUG# */ /**begin repeat2 * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG# @@ -250,9 +262,9 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@ * #VCHK = NPY_SIMD, NPY_SIMD_F64# */ /**begin repeat1 - * #kind = ceil, sqrt, absolute, square, reciprocal# - * #intr = ceil, sqrt, abs, square, recip# - * #clear = 0, 0, 1, 0, 0# + * #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal# + * #intr = rint, floor, ceil, trunc, sqrt, abs, square, recip# + * #clear = 0, 0, 0, 0, 0, 1, 0, 0# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 8b833ee56..b477027b3 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -123,47 +123,6 @@ run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp c /**end repeat**/ /**begin repeat - * #ISA = FMA, AVX512F# - * #isa = fma, avx512f# - * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# - * #REGISTER_SIZE = 32, 64# - */ - -/* prototypes */ - -/**begin repeat1 - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - */ - -/**begin repeat2 - * #func = rint, floor, trunc# - */ - -#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS 
-static NPY_INLINE NPY_GCC_TARGET_@ISA@ void -@ISA@_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n, const npy_intp stride); -#endif - -static NPY_INLINE int -run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS - if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), sizeof(@type@), @REGISTER_SIZE@)) { - @ISA@_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]); - return 1; - } - else - return 0; -#endif - return 0; -} - -/**end repeat2**/ -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat * Float types * #type = npy_float, npy_double, npy_longdouble# * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# @@ -1119,144 +1078,6 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, co /**end repeat**/ /**begin repeat - * #ISA = FMA, AVX512F# - * #isa = fma, avx512# - * #vsize = 256, 512# - * #BYTES = 32, 64# - * #cvtps_epi32 = _mm256_cvtps_epi32, # - * #mask = __m256, __mmask16# - * #vsub = , _mask# - * #vtype = __m256, __m512# - * #cvtps_epi32 = _mm256_cvtps_epi32, # - * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps# - * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# - */ - -/**begin repeat1 - * #func = rint, floor, trunc# - * #vectorf = rint, floor, trunc# - */ - -#if defined @CHK@ -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void -@ISA@_@func@_FLOAT(npy_float* op, - npy_float* ip, - const npy_intp array_size, - const npy_intp steps) -{ - const npy_intp stride = steps/(npy_intp)sizeof(npy_float); - const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float); - npy_intp num_remaining_elements = array_size; - @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f); - @mask@ load_mask = @isa@_get_full_load_mask_ps(); - /* - * Note: while generally indices are npy_intp, we ensure that our maximum index - * will fit in an int32 as a precondition for this function via - * IS_OUTPUT_BLOCKABLE_UNARY - */ - - npy_int32 indexarr[16]; - for (npy_int32 ii = 0; ii < 16; ii++) { - indexarr[ii] = ii*stride; - } - @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]); - - while (num_remaining_elements > 0) { - if (num_remaining_elements < num_lanes) { - load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, - num_lanes); - } - @vtype@ x; - if (stride == 1) { - x = @isa@_masked_load_ps(load_mask, ip); - } - else { - x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask); - } - @vtype@ out = @isa@_@vectorf@_ps(x); - @masked_store@(op, @cvtps_epi32@(load_mask), out); - - ip += num_lanes*stride; - op += num_lanes; - num_remaining_elements -= num_lanes; - } -} -#endif -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * #ISA = FMA, AVX512F# - * #isa = fma, avx512# - * #vsize = 256, 512# - * #BYTES = 32, 64# - * #cvtps_epi32 = _mm256_cvtps_epi32, # - * #mask = __m256i, __mmask8# - * #vsub = , _mask# - * #vtype = __m256d, __m512d# - * #vindextype = __m128i, __m256i# - * #vindexsize = 128, 256# - * #vindexload = _mm_loadu_si128, _mm256_loadu_si256# - * #cvtps_epi32 = _mm256_cvtpd_epi32, # - * #castmask = _mm256_castsi256_pd, # - * #masked_store = _mm256_maskstore_pd, _mm512_mask_storeu_pd# - * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# - */ - -/**begin repeat1 - * #func = rint, floor, trunc# - * #vectorf = rint, floor, trunc# - */ - -#if defined @CHK@ -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void 
-@ISA@_@func@_DOUBLE(npy_double* op, - npy_double* ip, - const npy_intp array_size, - const npy_intp steps) -{ - const npy_intp stride = steps/(npy_intp)sizeof(npy_double); - const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double); - npy_intp num_remaining_elements = array_size; - @mask@ load_mask = @isa@_get_full_load_mask_pd(); - @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f); - - /* - * Note: while generally indices are npy_intp, we ensure that our maximum index - * will fit in an int32 as a precondition for this function via - * IS_OUTPUT_BLOCKABLE_UNARY - */ - npy_int32 indexarr[8]; - for (npy_int32 ii = 0; ii < 8; ii++) { - indexarr[ii] = ii*stride; - } - @vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]); - - while (num_remaining_elements > 0) { - if (num_remaining_elements < num_lanes) { - load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements, - num_lanes); - } - @vtype@ x; - if (stride == 1) { - x = @isa@_masked_load_pd(load_mask, ip); - } - else { - x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask)); - } - @vtype@ out = @isa@_@vectorf@_pd(x); - @masked_store@(op, load_mask, out); - - ip += num_lanes*stride; - op += num_lanes; - num_remaining_elements -= num_lanes; - } -} -#endif -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat * #TYPE = CFLOAT, CDOUBLE# * #type = npy_float, npy_double# * #num_lanes = 16, 8# @@ -1535,3 +1356,4 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) #undef VECTOR_SIZE_BYTES #endif /* NPY_HAVE_SSE2_INTRINSICS */ #endif + diff --git a/numpy/core/tests/test_cpu_features.py b/numpy/core/tests/test_cpu_features.py index 706cf7a7e..1a76897e2 100644 --- a/numpy/core/tests/test_cpu_features.py +++ b/numpy/core/tests/test_cpu_features.py @@ -140,8 +140,8 @@ class Test_X86_Features(AbstractTest): is_power = re.match("^(powerpc|ppc)64", machine, re.IGNORECASE) @pytest.mark.skipif(not is_linux or not is_power, reason="Only for Linux and Power") class Test_POWER_Features(AbstractTest): - features = ["VSX", "VSX2", "VSX3"] - features_map = dict(VSX2="ARCH_2_07", VSX3="ARCH_3_00") + features = ["VSX", "VSX2", "VSX3", "VSX4"] + features_map = dict(VSX2="ARCH_2_07", VSX3="ARCH_3_00", VSX4="ARCH_3_1") def load_flags(self): self.load_flags_auxv() diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index 12a67c44d..605baefe6 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -330,16 +330,18 @@ class _SIMD_FP(_Test_Utility): square = self.square(vdata) assert square == data_square - @pytest.mark.parametrize("intrin, func", [("self.ceil", math.ceil), - ("self.trunc", math.trunc)]) + @pytest.mark.parametrize("intrin, func", [("ceil", math.ceil), + ("trunc", math.trunc), ("floor", math.floor), ("rint", round)]) def test_rounding(self, intrin, func): """ Test intrinsics: + npyv_rint_##SFX npyv_ceil_##SFX npyv_trunc_##SFX + npyv_floor##SFX """ intrin_name = intrin - intrin = eval(intrin) + intrin = getattr(self, intrin) pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan() # special cases round_cases = ((nan, nan), (pinf, pinf), (ninf, ninf)) @@ -347,20 +349,25 @@ class _SIMD_FP(_Test_Utility): data_round = [desired]*self.nlanes _round = intrin(self.setall(case)) assert _round == pytest.approx(data_round, nan_ok=True) + for x in range(0, 2**20, 256**2): for w in (-1.05, -1.10, -1.15, 1.05, 1.10, 1.15): - data = [x*w+a for a in range(self.nlanes)] - vdata = self.load(data) + data = self.load([(x+a)*w for a in range(self.nlanes)]) data_round = 
[func(x) for x in data] - _round = intrin(vdata) + _round = intrin(data) assert _round == data_round + # signed zero - if "ceil" in intrin_name or "trunc" in intrin_name: - for w in (-0.25, -0.30, -0.45): - _round = self._to_unsigned(intrin(self.setall(w))) - data_round = self._to_unsigned(self.setall(-0.0)) - assert _round == data_round - + if intrin_name == "floor": + data_szero = (-0.0,) + else: + data_szero = (-0.0, -0.25, -0.30, -0.45, -0.5) + + for w in data_szero: + _round = self._to_unsigned(intrin(self.setall(w))) + data_round = self._to_unsigned(self.setall(-0.0)) + assert _round == data_round + def test_max(self): """ Test intrinsics: diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py index f1d024b94..854584998 100644 --- a/numpy/distutils/ccompiler_opt.py +++ b/numpy/distutils/ccompiler_opt.py @@ -294,6 +294,9 @@ class _Config: VSX2 = dict(interest=2, implies="VSX", implies_detect=False), ## Power9/ISA 3.00 VSX3 = dict(interest=3, implies="VSX2", implies_detect=False), + ## Power10/ISA 3.1 + VSX4 = dict(interest=4, implies="VSX3", implies_detect=False, + extra_checks="VSX4_MMA"), # IBM/Z ## VX(z13) support VX = dict(interest=1, headers="vecintrin.h"), @@ -471,12 +474,16 @@ class _Config: ), VSX3 = dict( flags="-mcpu=power9 -mtune=power9", implies_detect=False + ), + VSX4 = dict( + flags="-mcpu=power10 -mtune=power10", implies_detect=False ) ) if self.cc_is_clang: partial["VSX"]["flags"] = "-maltivec -mvsx" partial["VSX2"]["flags"] = "-mpower8-vector" partial["VSX3"]["flags"] = "-mpower9-vector" + partial["VSX4"]["flags"] = "-mpower10-vector" return partial diff --git a/numpy/distutils/checks/cpu_vsx4.c b/numpy/distutils/checks/cpu_vsx4.c new file mode 100644 index 000000000..a6acc7384 --- /dev/null +++ b/numpy/distutils/checks/cpu_vsx4.c @@ -0,0 +1,14 @@ +#ifndef __VSX__ + #error "VSX is not supported" +#endif +#include <altivec.h> + +typedef __vector unsigned int v_uint32x4; + +int main(void) +{ + v_uint32x4 v1 = (v_uint32x4){2, 4, 8, 16}; + v_uint32x4 v2 = (v_uint32x4){2, 2, 2, 2}; + v_uint32x4 v3 = vec_mod(v1, v2); + return (int)vec_extractm(v3); +} diff --git a/numpy/distutils/checks/extra_vsx4_mma.c b/numpy/distutils/checks/extra_vsx4_mma.c new file mode 100644 index 000000000..a70b2a9f6 --- /dev/null +++ b/numpy/distutils/checks/extra_vsx4_mma.c @@ -0,0 +1,21 @@ +#ifndef __VSX__ + #error "VSX is not supported" +#endif +#include <altivec.h> + +typedef __vector float fv4sf_t; +typedef __vector unsigned char vec_t; + +int main(void) +{ + __vector_quad acc0; + float a[4] = {0,1,2,3}; + float b[4] = {0,1,2,3}; + vec_t *va = (vec_t *) a; + vec_t *vb = (vec_t *) b; + __builtin_mma_xvf32ger(&acc0, va[0], vb[0]); + fv4sf_t result[4]; + __builtin_mma_disassemble_acc((void *)result, &acc0); + fv4sf_t c0 = result[0]; + return (int)((float*)&c0)[0]; +} diff --git a/numpy/distutils/command/build.py b/numpy/distutils/command/build.py index dc1ab3b9b..80830d559 100644 --- a/numpy/distutils/command/build.py +++ b/numpy/distutils/command/build.py @@ -47,8 +47,8 @@ class build(old_build): - not part of dispatch-able features(--cpu-dispatch) - not supported by compiler or platform """ - self.simd_test = "BASELINE SSE2 SSE42 XOP FMA4 (FMA3 AVX2) AVX512F" \ - " AVX512_SKX VSX VSX2 VSX3 NEON ASIMD VX VXE VXE2" + self.simd_test = "BASELINE SSE2 SSE42 XOP FMA4 (FMA3 AVX2) AVX512F " \ + "AVX512_SKX VSX VSX2 VSX3 VSX4 NEON ASIMD VX VXE VXE2" def finalize_options(self): build_scripts = self.build_scripts diff --git a/numpy/distutils/tests/test_ccompiler_opt.py 
b/numpy/distutils/tests/test_ccompiler_opt.py index 6f9970c75..1ca8bc09b 100644 --- a/numpy/distutils/tests/test_ccompiler_opt.py +++ b/numpy/distutils/tests/test_ccompiler_opt.py @@ -405,7 +405,7 @@ class _Test_CCompilerOpt: # in msvc, avx512_knl avx512_knm aren't supported x86_msvc=".* xop fma4 .* avx512f .* avx512_skx .*", armhf=".* asimd asimdhp asimddp .*", - ppc64="vsx vsx2 vsx3.*", + ppc64="vsx vsx2 vsx3 vsx4.*", s390x="vx vxe vxe2.*" ) # min @@ -544,13 +544,13 @@ class _Test_CCompilerOpt: """ /*@targets sse sse2 sse41 avx avx2 avx512f - vsx vsx2 vsx3 + vsx vsx2 vsx3 vsx4 neon neon_fp16 asimdhp asimddp vx vxe vxe2 */ """, baseline="avx vsx2 asimd vx vxe", - x86="avx512f avx2", armhf="asimddp asimdhp", ppc64="vsx3", + x86="avx512f avx2", armhf="asimddp asimdhp", ppc64="vsx4 vsx3", s390x="vxe2" ) # test skipping non-dispatch features @@ -558,7 +558,7 @@ class _Test_CCompilerOpt: """ /*@targets sse41 avx avx2 avx512f - vsx2 vsx3 + vsx2 vsx3 vsx4 asimd asimdhp asimddp vx vxe vxe2 */ @@ -571,13 +571,13 @@ class _Test_CCompilerOpt: """ /*@targets sse2 sse41 avx2 avx512f - vsx2 vsx3 + vsx2 vsx3 vsx4 neon asimdhp asimddp vx vxe vxe2 */ """, baseline="", - trap_files=".*(avx2|avx512f|vsx3|asimddp|vxe2).c", + trap_files=".*(avx2|avx512f|vsx3|vsx4|asimddp|vxe2).c", x86="sse41 sse2", ppc64="vsx2", armhf="asimdhp neon", s390x="vxe vx" ) diff --git a/numpy/f2py/capi_maps.py b/numpy/f2py/capi_maps.py index 581f946e5..b4fab71f9 100644 --- a/numpy/f2py/capi_maps.py +++ b/numpy/f2py/capi_maps.py @@ -504,7 +504,8 @@ def sign2map(a, var): varname,ctype,atype init,init.r,init.i,pytype vardebuginfo,vardebugshowvalue,varshowvalue - varrfromat + varrformat + intent """ out_a = a diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index d4abde425..d5b130b72 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -4907,7 +4907,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing='xy'): >>> x = np.linspace(-5, 5, 101) >>> y = np.linspace(-5, 5, 101) - >>> # full coorindate arrays + >>> # full coordinate arrays >>> xx, yy = np.meshgrid(x, y) >>> zz = np.sqrt(xx**2 + yy**2) >>> xx.shape, yy.shape, zz.shape diff --git a/numpy/lib/polynomial.py b/numpy/lib/polynomial.py index f824c4c5e..6aa708861 100644 --- a/numpy/lib/polynomial.py +++ b/numpy/lib/polynomial.py @@ -686,7 +686,7 @@ def polyfit(x, y, deg, rcond=None, full=False, w=None, cov=False): "to scale the covariance matrix") # note, this used to be: fac = resids / (len(x) - order - 2.0) # it was deciced that the "- 2" (originally justified by "Bayesian - # uncertainty analysis") is not was the user expects + # uncertainty analysis") is not what the user expects # (see gh-11196 and gh-11197) fac = resids / (len(x) - order) if y.ndim == 1: diff --git a/numpy/ma/core.py b/numpy/ma/core.py index e0e5403a9..9c9dfac68 100644 --- a/numpy/ma/core.py +++ b/numpy/ma/core.py @@ -3542,15 +3542,17 @@ class MaskedArray(ndarray): def harden_mask(self): """ - Force the mask to hard. + Force the mask to hard, preventing unmasking by assignment. Whether the mask of a masked array is hard or soft is determined by its `~ma.MaskedArray.hardmask` property. `harden_mask` sets - `~ma.MaskedArray.hardmask` to ``True``. + `~ma.MaskedArray.hardmask` to ``True`` (and returns the modified + self). See Also -------- ma.MaskedArray.hardmask + ma.MaskedArray.soften_mask """ self._hardmask = True @@ -3558,15 +3560,17 @@ class MaskedArray(ndarray): def soften_mask(self): """ - Force the mask to soft. 
+ Force the mask to soft (default), allowing unmasking by assignment. Whether the mask of a masked array is hard or soft is determined by its `~ma.MaskedArray.hardmask` property. `soften_mask` sets - `~ma.MaskedArray.hardmask` to ``False``. + `~ma.MaskedArray.hardmask` to ``False`` (and returns the modified + self). See Also -------- ma.MaskedArray.hardmask + ma.MaskedArray.harden_mask """ self._hardmask = False @@ -3574,16 +3578,55 @@ class MaskedArray(ndarray): @property def hardmask(self): - """ Hardness of the mask """ + """ + Specifies whether values can be unmasked through assignments. + + By default, assigning definite values to masked array entries will + unmask them. When `hardmask` is ``True``, the mask will not change + through assignments. + + See Also + -------- + ma.MaskedArray.harden_mask + ma.MaskedArray.soften_mask + + Examples + -------- + >>> x = np.arange(10) + >>> m = np.ma.masked_array(x, x>5) + >>> assert not m.hardmask + + Since `m` has a soft mask, assigning an element value unmasks that + element: + + >>> m[8] = 42 + >>> m + masked_array(data=[0, 1, 2, 3, 4, 5, --, --, 42, --], + mask=[False, False, False, False, False, False, + True, True, False, True], + fill_value=999999) + + After hardening, the mask is not affected by assignments: + + >>> hardened = np.ma.harden_mask(m) + >>> assert m.hardmask and hardened is m + >>> m[:] = 23 + >>> m + masked_array(data=[23, 23, 23, 23, 23, 23, --, --, 23, --], + mask=[False, False, False, False, False, False, + True, True, False, True], + fill_value=999999) + + """ return self._hardmask def unshare_mask(self): """ - Copy the mask and set the sharedmask flag to False. + Copy the mask and set the `sharedmask` flag to ``False``. Whether the mask is shared between masked arrays can be seen from - the `sharedmask` property. `unshare_mask` ensures the mask is not shared. - A copy of the mask is only made if it was shared. + the `sharedmask` property. `unshare_mask` ensures the mask is not + shared. A copy of the mask is only made if it was shared. See Also -------- diff --git a/numpy/random/mtrand.pyx b/numpy/random/mtrand.pyx index 8bf74aa5d..38b5484bc 100644 --- a/numpy/random/mtrand.pyx +++ b/numpy/random/mtrand.pyx @@ -4341,7 +4341,7 @@ cdef class RandomState: The drawn samples, of shape ``(size, k)``. 
Raises - ------- + ------ ValueError If any value in ``alpha`` is less than or equal to zero diff --git a/numpy/testing/_private/extbuild.py b/numpy/testing/_private/extbuild.py index 940e2f7d7..b7a071e7f 100644 --- a/numpy/testing/_private/extbuild.py +++ b/numpy/testing/_private/extbuild.py @@ -8,8 +8,6 @@ import os import pathlib import sys import sysconfig -from numpy.distutils.ccompiler import new_compiler -from distutils.errors import CompileError __all__ = ['build_and_import_extension', 'compile_extension_module'] @@ -53,6 +51,7 @@ def build_and_import_extension( >>> assert not mod.test_bytes(u'abc') >>> assert mod.test_bytes(b'abc') """ + from distutils.errors import CompileError body = prologue + _make_methods(functions, modname) init = """PyObject *mod = PyModule_Create(&moduledef); @@ -221,6 +220,7 @@ def _c_compile(cfile, outputfilename, include_dirs=[], libraries=[], def build(cfile, outputfilename, compile_extra, link_extra, include_dirs, libraries, library_dirs): "cd into the directory where the cfile is, use distutils to build" + from numpy.distutils.ccompiler import new_compiler compiler = new_compiler(force=1, verbose=2) compiler.customize('') diff --git a/numpy/typing/tests/data/reveal/lib_function_base.pyi b/numpy/typing/tests/data/reveal/lib_function_base.pyi index c559eb295..eebe9fbfd 100644 --- a/numpy/typing/tests/data/reveal/lib_function_base.pyi +++ b/numpy/typing/tests/data/reveal/lib_function_base.pyi @@ -26,7 +26,7 @@ reveal_type(vectorized_func.signature) # E: Union[None, builtins.str] reveal_type(vectorized_func.otypes) # E: Union[None, builtins.str] reveal_type(vectorized_func.excluded) # E: set[Union[builtins.int, builtins.str]] reveal_type(vectorized_func.__doc__) # E: Union[None, builtins.str] -reveal_type(vectorized_func([1])) # E: ndarray[Any, dtype[Any]] +reveal_type(vectorized_func([1])) # E: Any reveal_type(np.vectorize(int)) # E: vectorize reveal_type(np.vectorize( # E: vectorize int, otypes="i", doc="doc", excluded=(), cache=True, signature=None @@ -291,7 +291,7 @@ def parse_setuppy_commands(): - `pip install .` (from a git repo or downloaded source release) - - `pip install numpy` (last NumPy release on PyPi) + - `pip install numpy` (last NumPy release on PyPI) """)) return True @@ -303,7 +303,7 @@ def parse_setuppy_commands(): To install NumPy from here with reliable uninstall, we recommend that you use `pip install .`. To install the latest NumPy release - from PyPi, use `pip install numpy`. + from PyPI, use `pip install numpy`. For help with build/installation issues, please ask on the numpy-discussion mailing list. If you are sure that you have run @@ -371,7 +371,7 @@ def get_docs_url(): if 'dev' in VERSION: return "https://numpy.org/devdocs" else: - # For releases, this URL ends up on pypi. + # For releases, this URL ends up on PyPI. # By pinning the version, users looking at old PyPI releases can get # to the associated docs easily. 
return "https://numpy.org/doc/{}.{}".format(MAJOR, MINOR) diff --git a/tools/gitpod/Dockerfile b/tools/gitpod/Dockerfile index e2e0e1bc9..592a5ee0a 100644 --- a/tools/gitpod/Dockerfile +++ b/tools/gitpod/Dockerfile @@ -27,7 +27,7 @@ # OS/ARCH: linux/amd64 FROM gitpod/workspace-base:latest -ARG MAMBAFORGE_VERSION="4.10.0-0" +ARG MAMBAFORGE_VERSION="4.11.0-0" ARG CONDA_ENV=numpy-dev diff --git a/tools/openblas_support.py b/tools/openblas_support.py index 4eb72dbc9..c89cb9284 100644 --- a/tools/openblas_support.py +++ b/tools/openblas_support.py @@ -13,8 +13,9 @@ from tempfile import mkstemp, gettempdir from urllib.request import urlopen, Request from urllib.error import HTTPError -OPENBLAS_V = '0.3.18' -OPENBLAS_LONG = 'v0.3.18' +# 0.3.19 fails AVX512_SKX tests, issue 20654, comments in PR 20660 +OPENBLAS_V = '0.3.19.dev' +OPENBLAS_LONG = 'v0.3.19-22-g5188aede' BASE_LOC = 'https://anaconda.org/multibuild-wheels-staging/openblas-libs' BASEURL = f'{BASE_LOC}/{OPENBLAS_LONG}/download' SUPPORTED_PLATFORMS = [ |
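Editorial aside (not part of the diff): the NEON npyv_rint_f32 fallback added above describes rounding to nearest-even by adding and then subtracting the magic constant 1.5 * 2**23. At that magnitude a float32 has no fractional bits left, so the addition itself performs the round-to-nearest-even step in hardware. A minimal NumPy sketch of the same trick, as an illustration only and not the SIMD code path:

    import numpy as np

    magic = np.float32(1.5 * 2**23)        # 12582912.0; float32 spacing here is 1.0

    x = np.array([0.5, 1.5, 2.5, -1.2, 3.7], dtype=np.float32)
    rounded = (x + magic) - magic          # stays in float32 throughout

    print(rounded)        # [ 0.  2.  2. -1.  4.]  -- ties round to even
    print(np.rint(x))     # identical for |x| well below 2**23

The actual kernel in the patch additionally handles overflow and the sign of zero, which this sketch omits.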
