20 files changed, 1246 insertions, 944 deletions
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index cd5d8484a..076ac32c7 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -37,7 +37,7 @@ jobs: echo "::set-output name=message::$COMMIT_MSG" build_wheels: - name: Build wheel for cp${{ matrix.python }}-${{ matrix.platform }} + name: Build wheel for ${{ matrix.python }}-${{ matrix.platform }} needs: get_commit_message if: >- contains(needs.get_commit_message.outputs.message, '[wheel build]') || @@ -51,35 +51,49 @@ jobs: include: # manylinux builds - os: ubuntu-20.04 - python: "38" + python: "cp38" platform: manylinux_x86_64 - os: ubuntu-20.04 - python: "39" + python: "cp39" platform: manylinux_x86_64 - os: ubuntu-20.04 - python: "310" + python: "cp310" + platform: manylinux_x86_64 + # manylinux pypy builds + - os: ubuntu-20.04 + python: "pp38" platform: manylinux_x86_64 # MacOS builds - os: macos-10.15 - python: "38" + python: "cp38" platform: macosx_* - os: macos-10.15 - python: "39" + python: "cp39" platform: macosx_* - os: macos-10.15 - python: "310" + python: "cp310" platform: macosx_* + # MacOS PyPy builds + # Disabled for now because of a PyPy bug + # that prevents successful compilation + #- os: macos-10.15 + # python: "pp38" + # platform: macosx_x86_64 # Windows builds - os: windows-2019 - python: "38" + python: "cp38" + platform: win_amd64 + - os: windows-2019 + python: "cp39" platform: win_amd64 - os: windows-2019 - python: "39" + python: "cp310" platform: win_amd64 + # Windows PyPy builds - os: windows-2019 - python: "310" + python: "pp38" platform: win_amd64 steps: @@ -94,10 +108,10 @@ jobs: fetch-depth: 0 - name: Build wheels - uses: pypa/cibuildwheel@v2.1.3 + uses: pypa/cibuildwheel@v2.3.0 env: NPY_USE_BLAS_ILP64: 1 - CIBW_BUILD: cp${{ matrix.python }}-${{ matrix.platform }} + CIBW_BUILD: ${{ matrix.python }}-${{ matrix.platform }} CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014 CIBW_ENVIRONMENT_LINUX: CFLAGS='-std=c99 -fno-strict-aliasing' LDFLAGS='-Wl,--strip-debug' diff --git a/doc/neps/nep-0031-uarray.rst b/doc/neps/nep-0031-uarray.rst index b4ec94077..b746c267d 100644 --- a/doc/neps/nep-0031-uarray.rst +++ b/doc/neps/nep-0031-uarray.rst @@ -302,7 +302,7 @@ This is different from monkeypatching in a few different ways: so there is at least the loose sense of an API contract. Monkeypatching does not provide this ability. * There is the ability of locally switching the backend. -* It has been `suggested <http://numpy-discussion.10968.n7.nabble.com/NEP-31-Context-local-and-global-overrides-of-the-NumPy-API-tp47452p47472.html>`_ +* It has been `suggested <https://mail.python.org/archives/list/numpy-discussion@python.org/message/PS7EN3CRT6XERNTCN56MAYOXFFFEC55G/>`_ that the reason that 1.17 hasn't landed in the Anaconda defaults channel is due to the incompatibility between monkeypatching and ``__array_function__``, as monkeypatching would bypass the protocol completely. @@ -640,9 +640,9 @@ References and Footnotes .. [4] NEP 13 — A Mechanism for Overriding Ufuncs: https://numpy.org/neps/nep-0013-ufunc-overrides.html -.. [5] Reply to Adding to the non-dispatched implementation of NumPy methods: http://numpy-discussion.10968.n7.nabble.com/Adding-to-the-non-dispatched-implementation-of-NumPy-methods-tp46816p46874.html +.. [5] Reply to Adding to the non-dispatched implementation of NumPy methods: https://mail.python.org/archives/list/numpy-discussion@python.org/thread/5GUDMALWDIRHITG5YUOCV343J66QSX3U/#5GUDMALWDIRHITG5YUOCV343J66QSX3U -.. 
[6] Custom Dtype/Units discussion: http://numpy-discussion.10968.n7.nabble.com/Custom-Dtype-Units-discussion-td43262.html +.. [6] Custom Dtype/Units discussion: https://mail.python.org/archives/list/numpy-discussion@python.org/thread/RZYCVT6C3F7UDV6NA6FEV4MC5FKS6RDA/#RZYCVT6C3F7UDV6NA6FEV4MC5FKS6RDA .. [7] The epic dtype cleanup plan: https://github.com/numpy/numpy/issues/2899 diff --git a/doc/neps/nep-0038-SIMD-optimizations.rst b/doc/neps/nep-0038-SIMD-optimizations.rst index 927228447..2123c4f95 100644 --- a/doc/neps/nep-0038-SIMD-optimizations.rst +++ b/doc/neps/nep-0038-SIMD-optimizations.rst @@ -8,7 +8,7 @@ NEP 38 — Using SIMD optimization instructions for performance :Status: Accepted :Type: Standards :Created: 2019-11-25 -:Resolution: http://numpy-discussion.10968.n7.nabble.com/NEP-38-Universal-SIMD-intrinsics-td47854.html +:Resolution: https://mail.python.org/archives/list/numpy-discussion@python.org/thread/PVWJ74UVBRZ5ZWF6MDU7EUSJXVNILAQB/#PVWJ74UVBRZ5ZWF6MDU7EUSJXVNILAQB Abstract diff --git a/doc/neps/nep-0049.rst b/doc/neps/nep-0049.rst index 3bd1d102c..0f0fd23c9 100644 --- a/doc/neps/nep-0049.rst +++ b/doc/neps/nep-0049.rst @@ -55,8 +55,8 @@ is to create a flexible enough interface without burdening normative users. .. _`issue 5312`: https://github.com/numpy/numpy/issues/5312 .. _`from 2017`: https://github.com/numpy/numpy/issues/5312#issuecomment-315234656 .. _`in 2005`: https://numpy-discussion.scipy.narkive.com/MvmMkJcK/numpy-arrays-data-allocation-and-simd-alignement -.. _`here`: http://numpy-discussion.10968.n7.nabble.com/Aligned-configurable-memory-allocation-td39712.html -.. _`and here`: http://numpy-discussion.10968.n7.nabble.com/Numpy-s-policy-for-releasing-memory-td1533.html +.. _`here`: https://mail.python.org/archives/list/numpy-discussion@python.org/thread/YPC5BGPUMKT2MLBP6O3FMPC35LFM2CCH/#YPC5BGPUMKT2MLBP6O3FMPC35LFM2CCH +.. _`and here`: https://mail.python.org/archives/list/numpy-discussion@python.org/thread/IQK3EPIIRE3V4BPNAMJ2ZST3NUG2MK2A/#IQK3EPIIRE3V4BPNAMJ2ZST3NUG2MK2A .. _`issue 14177`: https://github.com/numpy/numpy/issues/14177 .. _`filprofiler`: https://github.com/pythonspeed/filprofiler/blob/master/design/allocator-overrides.md .. _`electric fence`: https://github.com/boundarydevices/efence diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst index a18211cca..24bb6665d 100644 --- a/doc/source/reference/index.rst +++ b/doc/source/reference/index.rst @@ -26,7 +26,7 @@ For learning how to use NumPy, see the :ref:`complete documentation <numpy_docs_ distutils distutils_guide c-api/index - simd/simd-optimizations + simd/index swig diff --git a/doc/source/reference/simd/build-options.rst b/doc/source/reference/simd/build-options.rst new file mode 100644 index 000000000..80ef2c639 --- /dev/null +++ b/doc/source/reference/simd/build-options.rst @@ -0,0 +1,375 @@ +***************** +CPU build options +***************** + +Description +----------- + +The following options are mainly used to change the default behavior of optimizations +that target certain CPU features: + +- ``--cpu-baseline``: minimal set of required CPU features. + Default value is ``min`` which provides the minimum CPU features that can + safely run on a wide range of platforms within the processor family. + + .. note:: + + During the runtime, NumPy modules will fail to load if any of specified features + are not supported by the target CPU (raises Python runtime error). + +- ``--cpu-dispatch``: dispatched set of additional CPU features. 
+  Default value is ``max -xop -fma4`` which enables all CPU
+  features, except for the AMD legacy features (in the case of x86).
+
+  .. note::
+
+      At runtime, NumPy modules will skip any specified features
+      that are not available in the target CPU.
+
+These options are accessible through :py:mod:`distutils` commands
+`distutils.command.build`, `distutils.command.build_clib` and
+`distutils.command.build_ext`.
+They accept a set of :ref:`CPU features <opt-supported-features>`,
+groups of features that gather several features, or
+:ref:`special options <opt-special-options>` that
+perform a series of procedures.
+
+.. note::
+
+    If ``build_clib`` or ``build_ext`` are not specified by the user,
+    the arguments of ``build`` will be used instead, which also hold the default values.
+
+To customize both ``build_ext`` and ``build_clib``::
+
+    cd /path/to/numpy
+    python setup.py build --cpu-baseline="avx2 fma3" install --user
+
+To customize only ``build_ext``::
+
+    cd /path/to/numpy
+    python setup.py build_ext --cpu-baseline="avx2 fma3" install --user
+
+To customize only ``build_clib``::
+
+    cd /path/to/numpy
+    python setup.py build_clib --cpu-baseline="avx2 fma3" install --user
+
+You can also customize the CPU/build options through the PIP command::
+
+    pip install --no-use-pep517 --global-option=build \
+    --global-option="--cpu-baseline=avx2 fma3" \
+    --global-option="--cpu-dispatch=max" ./
+
+Quick Start
+-----------
+
+In general, the default settings tend not to impose CPU features that
+may be unavailable on some older processors. Raising the ceiling of the
+baseline features will often improve performance and may also reduce
+binary size.
+
+The following are the most common scenarios that may require changing
+the default settings:
+
+I am building NumPy for my local use
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+And I do not intend to export the build to other users or target a
+different CPU than the host's.
+
+Set `native` for the baseline, or manually specify the CPU features in case
+`native` isn't supported by your platform::
+
+    python setup.py build --cpu-baseline="native" bdist
+
+Building NumPy with extra CPU features isn't necessary in this case,
+since all supported features are already defined within the baseline features::
+
+    python setup.py build --cpu-baseline=native --cpu-dispatch=none bdist
+
+.. note::
+
+    A fatal error will be raised if `native` isn't supported by the host platform.
+
+I do not want to support the old processors of the `x86` architecture
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since most CPUs nowadays support at least the `AVX` and `F16C` features, you can use::
+
+    python setup.py build --cpu-baseline="avx f16c" bdist
+
+.. note::
+
+    ``--cpu-baseline`` combines all implied features, so there's no need
+    to add the SSE features explicitly.
+
+I'm facing the same case as above but with the `ppc64` architecture
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Then raise the ceiling of the baseline features to Power8::
+
+    python setup.py build --cpu-baseline="vsx2" bdist
+
+Having issues with `AVX512` features?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You may have reservations about including `AVX512` or
+any other CPU feature and want to exclude it from the dispatched features::
+
+    python setup.py build --cpu-dispatch="max -avx512f -avx512cd \
+    -avx512_knl -avx512_knm -avx512_skx -avx512_clx -avx512_cnl -avx512_icl" \
+    bdist
+
+.. _opt-supported-features:
+
+Supported Features
+------------------
+
+A feature name can express a single feature or a group of features.
+The following tables list the supported features, sorted from the lowest
+to the highest interest:
+
+.. note::
+
+    The following features may not be supported by all compilers;
+    some compilers may also produce a different set of implied features
+    when it comes to features like ``AVX512``, ``AVX2``, and ``FMA3``.
+    See :ref:`opt-platform-differences` for more details.
+
+.. include:: generated_tables/cpu_features.inc
+
+.. _opt-special-options:
+
+Special Options
+---------------
+
+- ``NONE``: enables no features.
+
+- ``NATIVE``: enables all CPU features supported by the host CPU;
+  this operation is based on the compiler flags (``-march=native``, ``-xHost``, ``/QxHost``).
+
+- ``MIN``: enables the minimum set of CPU features that can safely run on a wide range of platforms:
+
+  .. table::
+      :align: left
+
+      ====================================== =======================================
+      For Arch                               Implies
+      ====================================== =======================================
+      x86 (32-bit mode)                      ``SSE`` ``SSE2``
+      x86_64                                 ``SSE`` ``SSE2`` ``SSE3``
+      IBM/POWER (big-endian mode)            ``NONE``
+      IBM/POWER (little-endian mode)         ``VSX`` ``VSX2``
+      ARMHF                                  ``NONE``
+      ARM64 a.k.a. AARCH64                   ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
+                                             ``ASIMD``
+      ====================================== =======================================
+
+- ``MAX``: enables all CPU features supported by the compiler and platform.
+
+- Operators ``-``/``+``: remove or add features; useful with the options ``MAX``, ``MIN`` and ``NATIVE``.
+
+Behaviors
+---------
+
+- CPU features and other options are case-insensitive, for example::
+
+    python setup.py build --cpu-dispatch="SSE41 avx2 FMA3"
+
+- The order of the requested optimizations doesn't matter::
+
+    python setup.py build --cpu-dispatch="SSE41 AVX2 FMA3"
+    # equivalent to
+    python setup.py build --cpu-dispatch="FMA3 AVX2 SSE41"
+
+- Commas, spaces or ``+`` can be used as separators, for example::
+
+    python setup.py build --cpu-dispatch="avx2 avx512f"
+    # or
+    python setup.py build --cpu-dispatch=avx2,avx512f
+    # or
+    python setup.py build --cpu-dispatch="avx2+avx512f"
+
+  All of these work, but the arguments must be enclosed in quotes or escaped
+  with a backslash if any spaces are used.
+
+- ``--cpu-baseline`` combines all implied CPU features, for example::
+
+    python setup.py build --cpu-baseline=sse42
+    # equivalent to
+    python setup.py build --cpu-baseline="sse sse2 sse3 ssse3 sse41 popcnt sse42"
+
+- ``--cpu-baseline`` will be treated as "native" if the compiler's native flag
+  ``-march=native``, ``-xHost`` or ``/QxHost`` is enabled through the environment
+  variable `CFLAGS`::
+
+    export CFLAGS="-march=native"
+    python setup.py install --user
+    # is equivalent to
+    python setup.py build --cpu-baseline=native install --user
+
+- ``--cpu-baseline`` skips any specified features that aren't supported
+  by the target platform or compiler rather than raising fatal errors.
+
+  .. note::
+
+      Since ``--cpu-baseline`` combines all implied features, the maximum
+      supported set of implied features will be enabled rather than all of
+      them being skipped. For example::
+
+        # Requesting `AVX2,FMA3` but the compiler only supports **SSE** features
+        python setup.py build --cpu-baseline="avx2 fma3"
+        # is equivalent to
+        python setup.py build --cpu-baseline="sse sse2 sse3 ssse3 sse41 popcnt sse42"
+
+- ``--cpu-dispatch`` does not combine any of the implied CPU features,
+  so you must add them yourself unless you want to disable one or all of them::
+
+    # Only dispatches AVX2 and FMA3
+    python setup.py build --cpu-dispatch=avx2,fma3
+    # Dispatches AVX and SSE features
+    python setup.py build --cpu-dispatch=ssse3,sse41,sse42,avx,avx2,fma3
+
+- ``--cpu-dispatch`` skips any specified baseline features, as well as any features
+  not supported by the target platform or compiler, without raising fatal errors.
+
+In all cases, you should check the final report in the build log
+to verify the enabled features. See :ref:`opt-build-report` for more details.
+
+.. _opt-platform-differences:
+
+Platform differences
+--------------------
+
+Some exceptional conditions force us to link some features together when it comes to
+certain compilers or architectures, making it impossible to build them separately.
+
+These conditions can be divided into two parts, as follows:
+
+**Architectural compatibility**
+
+The need to align certain CPU features that are assured to be supported by
+successive generations of the same architecture; some cases:
+
+- On ppc64le, ``VSX(ISA 2.06)`` and ``VSX2(ISA 2.07)`` imply one another, since the
+  first generation that supports little-endian mode is Power-8 `(ISA 2.07)`.
+- On AArch64, ``NEON NEON_FP16 NEON_VFPV4 ASIMD`` imply one another, since they are
+  part of the hardware baseline.
+
+For example::
+
+    # On ARMv8/A64, specifying NEON is going to enable Advanced SIMD
+    # and all predecessor extensions
+    python setup.py build --cpu-baseline=neon
+    # which is equivalent to
+    python setup.py build --cpu-baseline="neon neon_fp16 neon_vfpv4 asimd"
+
+.. note::
+
+    Please take a close look at :ref:`opt-supported-features`
+    in order to determine the features that imply one another.
+
+**Compilation compatibility**
+
+Some compilers don't provide independent support for all CPU features. For instance,
+**Intel**'s compiler doesn't provide separate flags for ``AVX2`` and ``FMA3``.
+That makes sense, since all Intel CPUs that come with ``AVX2`` also support ``FMA3``,
+but this approach is incompatible with other **x86** CPUs from **AMD** or **VIA**.
+
+For example::
+
+    # Specifying AVX2 will force-enable FMA3 on Intel compilers
+    python setup.py build --cpu-baseline=avx2
+    # which is equivalent to
+    python setup.py build --cpu-baseline="avx2 fma3"
+
+The following tables only show the differences that some compilers impose on the
+general context shown in the :ref:`opt-supported-features` tables:
+
+.. note::
+
+    Feature names with strikethrough represent unsupported CPU features.
+
+.. raw:: html
+
+    <style>
+    .enabled-feature {color:green; font-weight:bold;}
+    .disabled-feature {color:red; text-decoration: line-through;}
+    </style>
+
+.. role:: enabled
+    :class: enabled-feature
+
+.. role:: disabled
+    :class: disabled-feature
+
+.. include:: generated_tables/compilers-diff.inc
+
+.. _opt-build-report:
+
+Build report
+------------
+
+In most cases, the CPU build options do not produce fatal errors that halt the build.
+Most of the errors that may appear in the build log are heavy warnings caused by the
+compiler lacking some expected CPU features.
+
+So we strongly recommend checking the final report log, to be aware of which CPU features
+are enabled and which are not.
+
+You can find the final report of CPU optimizations at the end of the build log,
+and here is how it looks on x86_64/gcc:
+
+.. raw:: html
+
+    <style>#build-report .highlight-bash pre{max-height:450px; overflow-y: scroll;}</style>
+
+.. literalinclude:: log_example.txt
+    :language: bash
+
+As you can see, there is a separate report for each of ``build_ext`` and ``build_clib``.
+Each report includes several sections, and each section has several values, representing the following:
+
+**Platform**:
+
+- :enabled:`Architecture`: The architecture name of the target CPU. It should be one of
+  ``x86``, ``x64``, ``ppc64``, ``ppc64le``, ``armhf``, ``aarch64`` or ``unknown``.
+
+- :enabled:`Compiler`: The compiler name. It should be one of
+  gcc, clang, msvc, icc, iccw or unix-like.
+
+**CPU baseline**:
+
+- :enabled:`Requested`: The specific features and options passed to ``--cpu-baseline``, as-is.
+- :enabled:`Enabled`: The final set of enabled CPU features.
+- :enabled:`Flags`: The compiler flags that were used for all NumPy `C/C++` sources
+  during the compilation, except for temporary sources that were used for generating
+  the binary objects of dispatched features.
+- :enabled:`Extra checks`: List of internal checks that activate certain functionality
+  or intrinsics related to the enabled features; useful for debugging when it comes
+  to developing SIMD kernels.
+
+**CPU dispatch**:
+
+- :enabled:`Requested`: The specific features and options passed to ``--cpu-dispatch``, as-is.
+- :enabled:`Enabled`: The final set of enabled CPU features.
+- :enabled:`Generated`: At the beginning of the next row of this property,
+  the features for which optimizations have been generated are shown in the
+  form of several sections with similar properties, explained as follows:
+
+  - :enabled:`One or multiple dispatched features`: The implied CPU features.
+  - :enabled:`Flags`: The compiler flags that were used for these features.
+  - :enabled:`Extra checks`: Similar to the baseline, but for these dispatched features.
+  - :enabled:`Detect`: Set of CPU features that need to be detected at runtime in order to
+    execute the generated optimizations.
+  - The lines that come after the above property and end with a ':' on a separate line
+    represent the paths of the C/C++ sources that define the generated optimizations.
+
+Runtime Trace
+-------------
+To be completed.
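Until the ``Runtime Trace`` section is filled in, here is a minimal sketch of how the outcome of these build options can be inspected from Python at runtime. It relies on the Python-level module attributes ``__cpu_baseline__`` and ``__cpu_dispatch__`` mentioned in the dispatcher documentation, plus a ``__cpu_features__`` dict assumed to be exposed by ``numpy.core._multiarray_umath``; treat the exact attribute locations as an assumption rather than a documented API::

    # Sketch: report which CPU features NumPy was built with and which
    # dispatched features the running CPU actually supports.
    # The attribute names/locations below are assumptions based on the
    # dispatcher documentation, not a guaranteed public interface.
    from numpy.core._multiarray_umath import (
        __cpu_baseline__,   # features compiled in unconditionally
        __cpu_dispatch__,   # features compiled as dispatched objects
        __cpu_features__,   # dict mapping feature name -> runtime support
    )

    print("baseline:", " ".join(__cpu_baseline__) or "none")
    for feature in __cpu_dispatch__:
        state = "will run" if __cpu_features__.get(feature, False) else "skipped"
        print(f"dispatch: {feature:<12} {state}")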
diff --git a/doc/source/reference/simd/gen_features.py b/doc/source/reference/simd/gen_features.py new file mode 100644 index 000000000..d74d54016 --- /dev/null +++ b/doc/source/reference/simd/gen_features.py @@ -0,0 +1,194 @@ +""" +Generate CPU features tables from CCompilerOpt +""" +from os import sys, path +from numpy.distutils.ccompiler_opt import CCompilerOpt + +class FakeCCompilerOpt(CCompilerOpt): + # disable caching no need for it + conf_nocache = True + + def __init__(self, arch, cc, *args, **kwargs): + self.fake_info = (arch, cc, '') + CCompilerOpt.__init__(self, None, **kwargs) + + def dist_compile(self, sources, flags, **kwargs): + return sources + + def dist_info(self): + return self.fake_info + + @staticmethod + def dist_log(*args, stderr=False): + # avoid printing + pass + + def feature_test(self, name, force_flags=None, macros=[]): + # To speed up + return True + +class Features: + def __init__(self, arch, cc): + self.copt = FakeCCompilerOpt(arch, cc, cpu_baseline="max") + + def names(self): + return self.copt.cpu_baseline_names() + + def serialize(self, features_names): + result = [] + for f in self.copt.feature_sorted(features_names): + gather = self.copt.feature_supported.get(f, {}).get("group", []) + implies = self.copt.feature_sorted(self.copt.feature_implies(f)) + result.append((f, implies, gather)) + return result + + def table(self, **kwargs): + return self.gen_table(self.serialize(self.names()), **kwargs) + + def table_diff(self, vs, **kwargs): + fnames = set(self.names()) + fnames_vs = set(vs.names()) + common = fnames.intersection(fnames_vs) + extra = fnames.difference(fnames_vs) + notavl = fnames_vs.difference(fnames) + iextra = {} + inotavl = {} + idiff = set() + for f in common: + implies = self.copt.feature_implies(f) + implies_vs = vs.copt.feature_implies(f) + e = implies.difference(implies_vs) + i = implies_vs.difference(implies) + if not i and not e: + continue + if e: + iextra[f] = e + if i: + inotavl[f] = e + idiff.add(f) + + def fbold(f): + if f in extra: + return f':enabled:`{f}`' + if f in notavl: + return f':disabled:`{f}`' + return f + + def fbold_implies(f, i): + if i in iextra.get(f, {}): + return f':enabled:`{i}`' + if f in notavl or i in inotavl.get(f, {}): + return f':disabled:`{i}`' + return i + + diff_all = self.serialize(idiff.union(extra)) + diff_all += vs.serialize(notavl) + content = self.gen_table( + diff_all, fstyle=fbold, fstyle_implies=fbold_implies, **kwargs + ) + return content + + def gen_table(self, serialized_features, fstyle=None, fstyle_implies=None, + **kwargs): + + if fstyle is None: + fstyle = lambda ft: f'``{ft}``' + if fstyle_implies is None: + fstyle_implies = lambda origin, ft: fstyle(ft) + + rows = [] + have_gather = False + for f, implies, gather in serialized_features: + if gather: + have_gather = True + name = fstyle(f) + implies = ' '.join([fstyle_implies(f, i) for i in implies]) + gather = ' '.join([fstyle_implies(f, i) for i in gather]) + rows.append((name, implies, gather)) + if not rows: + return '' + fields = ["Name", "Implies", "Gathers"] + if not have_gather: + del fields[2] + rows = [(name, implies) for name, implies, _ in rows] + return self.gen_rst_table(fields, rows, **kwargs) + + def gen_rst_table(self, field_names, rows, tab_size=4): + assert(not rows or len(field_names) == len(rows[0])) + rows.append(field_names) + fld_len = len(field_names) + cls_len = [max(len(c[i]) for c in rows) for i in range(fld_len)] + del rows[-1] + cformat = ' '.join('{:<%d}' % i for i in cls_len) + border = 
cformat.format(*['='*i for i in cls_len]) + + rows = [cformat.format(*row) for row in rows] + # header + rows = [border, cformat.format(*field_names), border] + rows + # footer + rows += [border] + # add left margin + rows = [(' ' * tab_size) + r for r in rows] + return '\n'.join(rows) + +def wrapper_section(title, content, tab_size=4): + tab = ' '*tab_size + if content: + return ( + f"{title}\n{'~'*len(title)}" + f"\n.. table::\n{tab}:align: left\n\n" + f"{content}\n\n" + ) + return '' + +def wrapper_tab(title, table, tab_size=4): + tab = ' '*tab_size + if table: + ('\n' + tab).join(( + '.. tab:: ' + title, + tab + '.. table::', + tab + 'align: left', + table + '\n\n' + )) + return '' + + +if __name__ == '__main__': + + pretty_names = { + "PPC64": "IBM/POWER big-endian", + "PPC64LE": "IBM/POWER little-endian", + "ARMHF": "ARMv7/A32", + "AARCH64": "ARMv8/A64", + "ICC": "Intel Compiler", + # "ICCW": "Intel Compiler msvc-like", + "MSVC": "Microsoft Visual C/C++" + } + gen_path = path.join( + path.dirname(path.realpath(__file__)), "generated_tables" + ) + with open(path.join(gen_path, 'cpu_features.inc'), 'wt') as fd: + fd.write(f'.. generated via {__file__}\n\n') + for arch in ( + ("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64") + ): + title = "On " + pretty_names.get(arch, arch) + table = Features(arch, 'gcc').table() + fd.write(wrapper_section(title, table)) + + with open(path.join(gen_path, 'compilers-diff.inc'), 'wt') as fd: + fd.write(f'.. generated via {__file__}\n\n') + for arch, cc_names in ( + ("x86", ("clang", "ICC", "MSVC")), + ("PPC64", ("clang",)), + ("PPC64LE", ("clang",)), + ("ARMHF", ("clang",)), + ("AARCH64", ("clang",)) + ): + arch_pname = pretty_names.get(arch, arch) + for cc in cc_names: + title = f"On {arch_pname}::{pretty_names.get(cc, cc)}" + table = Features(arch, cc).table_diff(Features(arch, "gcc")) + fd.write(wrapper_section(title, table)) + + diff --git a/doc/source/reference/simd/generated_tables/compilers-diff.inc b/doc/source/reference/simd/generated_tables/compilers-diff.inc new file mode 100644 index 000000000..4b9009a68 --- /dev/null +++ b/doc/source/reference/simd/generated_tables/compilers-diff.inc @@ -0,0 +1,33 @@ +.. generated via /home/seiko/work/repos/numpy/doc/source/reference/simd/./gen_features.py + +On x86::Intel Compiler +~~~~~~~~~~~~~~~~~~~~~~ +.. table:: + :align: left + + ================ ========================================================================================================================================== + Name Implies + ================ ========================================================================================================================================== + FMA3 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2` + AVX2 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3` + AVX512F SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD` + :disabled:`XOP` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` + :disabled:`FMA4` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` + ================ ========================================================================================================================================== + +On x86::Microsoft Visual C/C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
table:: + :align: left + + ====================== ============================================================================================================================================================================================================================================================= ============================================================================= + Name Implies Gathers + ====================== ============================================================================================================================================================================================================================================================= ============================================================================= + FMA3 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2` + AVX2 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3` + AVX512F SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD` :enabled:`AVX512_SKX` + AVX512CD SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F :enabled:`AVX512_SKX` + :disabled:`AVX512_KNL` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512ER` :disabled:`AVX512PF` + :disabled:`AVX512_KNM` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512_KNL` :disabled:`AVX5124FMAPS` :disabled:`AVX5124VNNIW` :disabled:`AVX512VPOPCNTDQ` + ====================== ============================================================================================================================================================================================================================================================= ============================================================================= + diff --git a/doc/source/reference/simd/generated_tables/cpu_features.inc b/doc/source/reference/simd/generated_tables/cpu_features.inc new file mode 100644 index 000000000..a7eae5652 --- /dev/null +++ b/doc/source/reference/simd/generated_tables/cpu_features.inc @@ -0,0 +1,93 @@ +.. generated via /home/seiko/work/repos/numpy/doc/source/reference/simd/./gen_features.py + +On x86 +~~~~~~ +.. 
table:: + :align: left + + ============== =========================================================================================================================================================================== ===================================================== + Name Implies Gathers + ============== =========================================================================================================================================================================== ===================================================== + ``SSE`` ``SSE2`` + ``SSE2`` ``SSE`` + ``SSE3`` ``SSE`` ``SSE2`` + ``SSSE3`` ``SSE`` ``SSE2`` ``SSE3`` + ``SSE41`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` + ``POPCNT`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` + ``SSE42`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` + ``AVX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` + ``XOP`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` + ``FMA4`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` + ``F16C`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` + ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` + ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` + ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` + ``AVX512CD`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` + ``AVX512_KNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512ER`` ``AVX512PF`` + ``AVX512_KNM`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_KNL`` ``AVX5124FMAPS`` ``AVX5124VNNIW`` ``AVX512VPOPCNTDQ`` + ``AVX512_SKX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512VL`` ``AVX512BW`` ``AVX512DQ`` + ``AVX512_CLX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512VNNI`` + ``AVX512_CNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512IFMA`` ``AVX512VBMI`` + ``AVX512_ICL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512_CLX`` ``AVX512_CNL`` ``AVX512VBMI2`` ``AVX512BITALG`` ``AVX512VPOPCNTDQ`` + ============== =========================================================================================================================================================================== ===================================================== + +On IBM/POWER big-endian +~~~~~~~~~~~~~~~~~~~~~~~ +.. table:: + :align: left + + ======== ================ + Name Implies + ======== ================ + ``VSX`` + ``VSX2`` ``VSX`` + ``VSX3`` ``VSX`` ``VSX2`` + ======== ================ + +On IBM/POWER little-endian +~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. table:: + :align: left + + ======== ================ + Name Implies + ======== ================ + ``VSX`` ``VSX2`` + ``VSX2`` ``VSX`` + ``VSX3`` ``VSX`` ``VSX2`` + ======== ================ + +On ARMv7/A32 +~~~~~~~~~~~~ +.. 
table:: + :align: left + + ============== =========================================================== + Name Implies + ============== =========================================================== + ``NEON`` + ``NEON_FP16`` ``NEON`` + ``NEON_VFPV4`` ``NEON`` ``NEON_FP16`` + ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` + ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` + ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` + ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP`` + ============== =========================================================== + +On ARMv8/A64 +~~~~~~~~~~~~ +.. table:: + :align: left + + ============== =========================================================== + Name Implies + ============== =========================================================== + ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` + ``NEON_FP16`` ``NEON`` ``NEON_VFPV4`` ``ASIMD`` + ``NEON_VFPV4`` ``NEON`` ``NEON_FP16`` ``ASIMD`` + ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` + ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` + ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` + ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP`` + ============== =========================================================== + diff --git a/doc/source/reference/simd/how-it-works.rst b/doc/source/reference/simd/how-it-works.rst new file mode 100644 index 000000000..a2882f484 --- /dev/null +++ b/doc/source/reference/simd/how-it-works.rst @@ -0,0 +1,349 @@ +********************************** +How does the CPU dispatcher work? +********************************** + +NumPy dispatcher is based on multi-source compiling, which means taking +a certain source and compiling it multiple times with different compiler +flags and also with different **C** definitions that affect the code +paths. This enables certain instruction-sets for each compiled object +depending on the required optimizations and ends with linking the +returned objects together. + +.. figure:: ../figures/opt-infra.png + +This mechanism should support all compilers and it doesn't require any +compiler-specific extension, but at the same time it adds a few steps to +normal compilation that are explained as follows. + +1- Configuration +~~~~~~~~~~~~~~~~ + +Configuring the required optimization by the user before starting to build the +source files via the two command arguments as explained above: + +- ``--cpu-baseline``: minimal set of required optimizations. + +- ``--cpu-dispatch``: dispatched set of additional optimizations. + + +2- Discovering the environment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this part, we check the compiler and platform architecture +and cache some of the intermediary results to speed up rebuilding. + +3- Validating the requested optimizations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By testing them against the compiler, and seeing what the compiler can +support according to the requested optimizations. + +4- Generating the main configuration header +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The generated header ``_cpu_dispatch.h`` contains all the definitions and +headers of instruction-sets for the required optimizations that have been +validated during the previous step. + +It also contains extra C definitions that are used for defining NumPy's +Python-level module attributes ``__cpu_baseline__`` and ``__cpu_dispatch__``. + +**What is in this header?** + +The example header was dynamically generated by gcc on an X86 machine. 
+The compiler supports ``--cpu-baseline="sse sse2 sse3"`` and +``--cpu-dispatch="ssse3 sse41"``, and the result is below. + +.. code:: c + + // The header should be located at numpy/numpy/core/src/common/_cpu_dispatch.h + /**NOTE + ** C definitions prefixed with "NPY_HAVE_" represent + ** the required optimzations. + ** + ** C definitions prefixed with 'NPY__CPU_TARGET_' are protected and + ** shouldn't be used by any NumPy C sources. + */ + /******* baseline features *******/ + /** SSE **/ + #define NPY_HAVE_SSE 1 + #include <xmmintrin.h> + /** SSE2 **/ + #define NPY_HAVE_SSE2 1 + #include <emmintrin.h> + /** SSE3 **/ + #define NPY_HAVE_SSE3 1 + #include <pmmintrin.h> + + /******* dispatch-able features *******/ + #ifdef NPY__CPU_TARGET_SSSE3 + /** SSSE3 **/ + #define NPY_HAVE_SSSE3 1 + #include <tmmintrin.h> + #endif + #ifdef NPY__CPU_TARGET_SSE41 + /** SSE41 **/ + #define NPY_HAVE_SSE41 1 + #include <smmintrin.h> + #endif + +**Baseline features** are the minimal set of required optimizations configured +via ``--cpu-baseline``. They have no preprocessor guards and they're +always on, which means they can be used in any source. + +Does this mean NumPy's infrastructure passes the compiler's flags of +baseline features to all sources? + +Definitely, yes. But the :ref:`dispatch-able sources <dispatchable-sources>` are +treated differently. + +What if the user specifies certain **baseline features** during the +build but at runtime the machine doesn't support even these +features? Will the compiled code be called via one of these definitions, or +maybe the compiler itself auto-generated/vectorized certain piece of code +based on the provided command line compiler flags? + +During the loading of the NumPy module, there's a validation step +which detects this behavior. It will raise a Python runtime error to inform the +user. This is to prevent the CPU reaching an illegal instruction error causing +a segfault. + +**Dispatch-able features** are our dispatched set of additional optimizations +that were configured via ``--cpu-dispatch``. They are not activated by +default and are always guarded by other C definitions prefixed with +``NPY__CPU_TARGET_``. C definitions ``NPY__CPU_TARGET_`` are only +enabled within **dispatch-able sources**. + +.. _dispatchable-sources: + +5- Dispatch-able sources and configuration statements +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Dispatch-able sources are special **C** files that can be compiled multiple +times with different compiler flags and also with different **C** +definitions. These affect code paths to enable certain +instruction-sets for each compiled object according to "**the +configuration statements**" that must be declared between a **C** +comment\ ``(/**/)`` and start with a special mark **@targets** at the +top of each dispatch-able source. At the same time, dispatch-able +sources will be treated as normal **C** sources if the optimization was +disabled by the command argument ``--disable-optimization`` . + +**What are configuration statements?** + +Configuration statements are sort of keywords combined together to +determine the required optimization for the dispatch-able source. + +Example: + +.. 
code:: c + + /*@targets avx2 avx512f vsx2 vsx3 asimd asimdhp */ + // C code + +The keywords mainly represent the additional optimizations configured +through ``--cpu-dispatch``, but it can also represent other options such as: + +- Target groups: pre-configured configuration statements used for + managing the required optimizations from outside the dispatch-able source. + +- Policies: collections of options used for changing the default + behaviors or forcing the compilers to perform certain things. + +- "baseline": a unique keyword represents the minimal optimizations + that configured through ``--cpu-baseline`` + +**Numpy's infrastructure handles dispatch-able sources in four steps**: + +- **(A) Recognition**: Just like source templates and F2PY, the + dispatch-able sources requires a special extension ``*.dispatch.c`` + to mark C dispatch-able source files, and for C++ + ``*.dispatch.cpp`` or ``*.dispatch.cxx`` + **NOTE**: C++ not supported yet. + +- **(B) Parsing and validating**: In this step, the + dispatch-able sources that had been filtered by the previous step + are parsed and validated by the configuration statements for each one + of them one by one in order to determine the required optimizations. + +- **(C) Wrapping**: This is the approach taken by NumPy's + infrastructure, which has proved to be sufficiently flexible in order + to compile a single source multiple times with different **C** + definitions and flags that affect the code paths. The process is + achieved by creating a temporary **C** source for each required + optimization that related to the additional optimization, which + contains the declarations of the **C** definitions and includes the + involved source via the **C** directive **#include**. For more + clarification take a look at the following code for AVX512F : + + .. code:: c + + /* + * this definition is used by NumPy utilities as suffixes for the + * exported symbols + */ + #define NPY__CPU_TARGET_CURRENT AVX512F + /* + * The following definitions enable + * definitions of the dispatch-able features that are defined within the main + * configuration header. These are definitions for the implied features. + */ + #define NPY__CPU_TARGET_SSE + #define NPY__CPU_TARGET_SSE2 + #define NPY__CPU_TARGET_SSE3 + #define NPY__CPU_TARGET_SSSE3 + #define NPY__CPU_TARGET_SSE41 + #define NPY__CPU_TARGET_POPCNT + #define NPY__CPU_TARGET_SSE42 + #define NPY__CPU_TARGET_AVX + #define NPY__CPU_TARGET_F16C + #define NPY__CPU_TARGET_FMA3 + #define NPY__CPU_TARGET_AVX2 + #define NPY__CPU_TARGET_AVX512F + // our dispatch-able source + #include "/the/absuolate/path/of/hello.dispatch.c" + +- **(D) Dispatch-able configuration header**: The infrastructure + generates a config header for each dispatch-able source, this header + mainly contains two abstract **C** macros used for identifying the + generated objects, so they can be used for runtime dispatching + certain symbols from the generated objects by any **C** source. It is + also used for forward declarations. + + The generated header takes the name of the dispatch-able source after + excluding the extension and replace it with ``.h``, for example + assume we have a dispatch-able source called ``hello.dispatch.c`` and + contains the following: + + .. 
code:: c + + // hello.dispatch.c + /*@targets baseline sse42 avx512f */ + #include <stdio.h> + #include "numpy/utils.h" // NPY_CAT, NPY_TOSTR + + #ifndef NPY__CPU_TARGET_CURRENT + // wrapping the dispatch-able source only happens to the additional optimizations + // but if the keyword 'baseline' provided within the configuration statements, + // the infrastructure will add extra compiling for the dispatch-able source by + // passing it as-is to the compiler without any changes. + #define CURRENT_TARGET(X) X + #define NPY__CPU_TARGET_CURRENT baseline // for printing only + #else + // since we reach to this point, that's mean we're dealing with + // the additional optimizations, so it could be SSE42 or AVX512F + #define CURRENT_TARGET(X) NPY_CAT(NPY_CAT(X, _), NPY__CPU_TARGET_CURRENT) + #endif + // Macro 'CURRENT_TARGET' adding the current target as suffux to the exported symbols, + // to avoid linking duplications, NumPy already has a macro called + // 'NPY_CPU_DISPATCH_CURFX' similar to it, located at + // numpy/numpy/core/src/common/npy_cpu_dispatch.h + // NOTE: we tend to not adding suffixes to the baseline exported symbols + void CURRENT_TARGET(simd_whoami)(const char *extra_info) + { + printf("I'm " NPY_TOSTR(NPY__CPU_TARGET_CURRENT) ", %s\n", extra_info); + } + + Now assume you attached **hello.dispatch.c** to the source tree, then + the infrastructure should generate a temporary config header called + **hello.dispatch.h** that can be reached by any source in the source + tree, and it should contain the following code : + + .. code:: c + + #ifndef NPY__CPU_DISPATCH_EXPAND_ + // To expand the macro calls in this header + #define NPY__CPU_DISPATCH_EXPAND_(X) X + #endif + // Undefining the following macros, due to the possibility of including config headers + // multiple times within the same source and since each config header represents + // different required optimizations according to the specified configuration + // statements in the dispatch-able source that derived from it. + #undef NPY__CPU_DISPATCH_BASELINE_CALL + #undef NPY__CPU_DISPATCH_CALL + // nothing strange here, just a normal preprocessor callback + // enabled only if 'baseline' specified within the configuration statements + #define NPY__CPU_DISPATCH_BASELINE_CALL(CB, ...) \ + NPY__CPU_DISPATCH_EXPAND_(CB(__VA_ARGS__)) + // 'NPY__CPU_DISPATCH_CALL' is an abstract macro is used for dispatching + // the required optimizations that specified within the configuration statements. + // + // @param CHK, Expected a macro that can be used to detect CPU features + // in runtime, which takes a CPU feature name without string quotes and + // returns the testing result in a shape of boolean value. + // NumPy already has macro called "NPY_CPU_HAVE", which fits this requirement. + // + // @param CB, a callback macro that expected to be called multiple times depending + // on the required optimizations, the callback should receive the following arguments: + // 1- The pending calls of @param CHK filled up with the required CPU features, + // that need to be tested first in runtime before executing call belong to + // the compiled object. + // 2- The required optimization name, same as in 'NPY__CPU_TARGET_CURRENT' + // 3- Extra arguments in the macro itself + // + // By default the callback calls are sorted depending on the highest interest + // unless the policy "$keep_sort" was in place within the configuration statements + // see "Dive into the CPU dispatcher" for more clarification. + #define NPY__CPU_DISPATCH_CALL(CHK, CB, ...) 
\ + NPY__CPU_DISPATCH_EXPAND_(CB((CHK(AVX512F)), AVX512F, __VA_ARGS__)) \ + NPY__CPU_DISPATCH_EXPAND_(CB((CHK(SSE)&&CHK(SSE2)&&CHK(SSE3)&&CHK(SSSE3)&&CHK(SSE41)), SSE41, __VA_ARGS__)) + + An example of using the config header in light of the above: + + .. code:: c + + // NOTE: The following macros are only defined for demonstration purposes only. + // NumPy already has a collections of macros located at + // numpy/numpy/core/src/common/npy_cpu_dispatch.h, that covers all dispatching + // and declarations scenarios. + + #include "numpy/npy_cpu_features.h" // NPY_CPU_HAVE + #include "numpy/utils.h" // NPY_CAT, NPY_EXPAND + + // An example for setting a macro that calls all the exported symbols at once + // after checking if they're supported by the running machine. + #define DISPATCH_CALL_ALL(FN, ARGS) \ + NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_ALL_CB, FN, ARGS) \ + NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_ALL_CB, FN, ARGS) + // The preprocessor callbacks. + // The same suffixes as we define it in the dispatch-able source. + #define DISPATCH_CALL_ALL_CB(CHECK, TARGET_NAME, FN, ARGS) \ + if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; } + #define DISPATCH_CALL_BASELINE_ALL_CB(FN, ARGS) \ + FN NPY_EXPAND(ARGS); + + // An example for setting a macro that calls the exported symbols of highest + // interest optimization, after checking if they're supported by the running machine. + #define DISPATCH_CALL_HIGH(FN, ARGS) \ + if (0) {} \ + NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_HIGH_CB, FN, ARGS) \ + NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_HIGH_CB, FN, ARGS) + // The preprocessor callbacks + // The same suffixes as we define it in the dispatch-able source. + #define DISPATCH_CALL_HIGH_CB(CHECK, TARGET_NAME, FN, ARGS) \ + else if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; } + #define DISPATCH_CALL_BASELINE_HIGH_CB(FN, ARGS) \ + else { FN NPY_EXPAND(ARGS); } + + // NumPy has a macro called 'NPY_CPU_DISPATCH_DECLARE' can be used + // for forward declrations any kind of prototypes based on + // 'NPY__CPU_DISPATCH_CALL' and 'NPY__CPU_DISPATCH_BASELINE_CALL'. + // However in this example, we just handle it manually. + void simd_whoami(const char *extra_info); + void simd_whoami_AVX512F(const char *extra_info); + void simd_whoami_SSE41(const char *extra_info); + + void trigger_me(void) + { + // bring the auto-gernreated config header + // which contains config macros 'NPY__CPU_DISPATCH_CALL' and + // 'NPY__CPU_DISPATCH_BASELINE_CALL'. + // it highely recomaned to include the config header before exectuing + // the dispatching macros in case if there's another header in the scope. + #include "hello.dispatch.h" + DISPATCH_CALL_ALL(simd_whoami, ("all")) + DISPATCH_CALL_HIGH(simd_whoami, ("the highest interest")) + // An example of including multiple config headers in the same source + // #include "hello2.dispatch.h" + // DISPATCH_CALL_HIGH(another_function, ("the highest interest")) + } diff --git a/doc/source/reference/simd/index.rst b/doc/source/reference/simd/index.rst new file mode 100644 index 000000000..230e2dc15 --- /dev/null +++ b/doc/source/reference/simd/index.rst @@ -0,0 +1,43 @@ +.. _numpysimd: +.. currentmodule:: numpysimd + +*********************** +CPU/SIMD Optimizations +*********************** + +NumPy comes with a flexible working mechanism that allows it to harness the SIMD +features that CPUs own, in order to provide faster and more stable performance +on all popular platforms. 
Currently, NumPy supports the X86, IBM/Power, ARM7 and ARM8 +architectures. + +The optimization process in NumPy is carried out in three layers: + +- Code is *written* using the universal intrinsics which is a set of types, macros and + functions that are mapped to each supported instruction-sets by using guards that + will enable use of the them only when the compiler recognizes them. + This allow us to generate multiple kernels for the same functionality, + in which each generated kernel represents a set of instructions that related one + or multiple certain CPU features. The first kernel represents the minimum (baseline) + CPU features, and the other kernels represent the additional (dispatched) CPU features. + +- At *compile* time, CPU build options are used to define the minimum and + additional features to support, based on user choice and compiler support. The + appropriate intrinsics are overlaid with the platform / architecture intrinsics, + and multiple kernels are compiled. + +- At *runtime import*, the CPU is probed for the set of supported CPU + features. A mechanism is used to grab the pointer to the most appropriate + kernel, and this will be the one called for the function. + +.. note:: + + NumPy community had a deep discussion before implementing this work, + please check `NEP-38`_ for more clarification. + +.. toctree:: + + build-options + how-it-works + +.. _`NEP-38`: https://numpy.org/neps/nep-0038-SIMD-optimizations.html + diff --git a/doc/source/reference/simd/log_example.txt b/doc/source/reference/simd/log_example.txt new file mode 100644 index 000000000..b0c732433 --- /dev/null +++ b/doc/source/reference/simd/log_example.txt @@ -0,0 +1,79 @@ +########### EXT COMPILER OPTIMIZATION ########### +Platform : + Architecture: x64 + Compiler : gcc + +CPU baseline : + Requested : 'min' + Enabled : SSE SSE2 SSE3 + Flags : -msse -msse2 -msse3 + Extra checks: none + +CPU dispatch : + Requested : 'max -xop -fma4' + Enabled : SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F AVX512CD AVX512_KNL AVX512_KNM AVX512_SKX AVX512_CLX AVX512_CNL AVX512_ICL + Generated : + : + SSE41 : SSE SSE2 SSE3 SSSE3 + Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 + Extra checks: none + Detect : SSE SSE2 SSE3 SSSE3 SSE41 + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c + : numpy/core/src/umath/_umath_tests.dispatch.c + : + SSE42 : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT + Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 + Extra checks: none + Detect : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 + : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c + : + AVX2 : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C + Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mavx2 + Extra checks: none + Detect : AVX F16C AVX2 + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithm_fp.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c + : numpy/core/src/umath/_umath_tests.dispatch.c + : + (FMA3 AVX2) : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C + Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mfma -mavx2 + Extra checks: none + Detect : AVX F16C FMA3 AVX2 + : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_exponent_log.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_trigonometric.dispatch.c + : + AVX512F : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 + 
Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mfma -mavx2 -mavx512f + Extra checks: AVX512F_REDUCE + Detect : AVX512F + : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithm_fp.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_exponent_log.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_trigonometric.dispatch.c + : + AVX512_SKX : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F AVX512CD + Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mfma -mavx2 -mavx512f -mavx512cd -mavx512vl -mavx512bw -mavx512dq + Extra checks: AVX512BW_MASK AVX512DQ_MASK + Detect : AVX512_SKX + : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_exponent_log.dispatch.c +CCompilerOpt.cache_flush[804] : write cache to path -> /home/seiko/work/repos/numpy/build/temp.linux-x86_64-3.9/ccompiler_opt_cache_ext.py + +########### CLIB COMPILER OPTIMIZATION ########### +Platform : + Architecture: x64 + Compiler : gcc + +CPU baseline : + Requested : 'min' + Enabled : SSE SSE2 SSE3 + Flags : -msse -msse2 -msse3 + Extra checks: none + +CPU dispatch : + Requested : 'max -xop -fma4' + Enabled : SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F AVX512CD AVX512_KNL AVX512_KNM AVX512_SKX AVX512_CLX AVX512_CNL AVX512_ICL + Generated : none diff --git a/doc/source/reference/simd/simd-optimizations-tables-diff.inc b/doc/source/reference/simd/simd-optimizations-tables-diff.inc deleted file mode 100644 index 41fa96703..000000000 --- a/doc/source/reference/simd/simd-optimizations-tables-diff.inc +++ /dev/null @@ -1,37 +0,0 @@ -.. generated via source/reference/simd/simd-optimizations.py - -x86::Intel Compiler - CPU feature names -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. table:: - :align: left - - =========== ================================================================================================================== - Name Implies - =========== ================================================================================================================== - ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **AVX2** - ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **FMA3** - ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` **AVX512CD** - =========== ================================================================================================================== - -.. note:: - The following features aren't supported by x86::Intel Compiler: - **XOP FMA4** - -x86::Microsoft Visual C/C++ - CPU feature names -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. 
table:: - :align: left - - ============ ================================================================================================================================= - Name Implies - ============ ================================================================================================================================= - ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **AVX2** - ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **FMA3** - ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` **AVX512CD** **AVX512_SKX** - ``AVX512CD`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` **AVX512_SKX** - ============ ================================================================================================================================= - -.. note:: - The following features aren't supported by x86::Microsoft Visual C/C++: - **AVX512_KNL AVX512_KNM** - diff --git a/doc/source/reference/simd/simd-optimizations-tables.inc b/doc/source/reference/simd/simd-optimizations-tables.inc deleted file mode 100644 index f038a91e1..000000000 --- a/doc/source/reference/simd/simd-optimizations-tables.inc +++ /dev/null @@ -1,103 +0,0 @@ -.. generated via source/reference/simd/simd-optimizations.py - -x86 - CPU feature names -~~~~~~~~~~~~~~~~~~~~~~~ -.. table:: - :align: left - - ============ ================================================================================================================= - Name Implies - ============ ================================================================================================================= - ``SSE`` ``SSE2`` - ``SSE2`` ``SSE`` - ``SSE3`` ``SSE`` ``SSE2`` - ``SSSE3`` ``SSE`` ``SSE2`` ``SSE3`` - ``SSE41`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` - ``POPCNT`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` - ``SSE42`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` - ``AVX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` - ``XOP`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` - ``FMA4`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` - ``F16C`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` - ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` - ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` - ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` - ``AVX512CD`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` - ============ ================================================================================================================= - -x86 - Group names -~~~~~~~~~~~~~~~~~ -.. 
table:: - :align: left - - ============== ===================================================== =========================================================================================================================================================================== - Name Gather Implies - ============== ===================================================== =========================================================================================================================================================================== - ``AVX512_KNL`` ``AVX512ER`` ``AVX512PF`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` - ``AVX512_KNM`` ``AVX5124FMAPS`` ``AVX5124VNNIW`` ``AVX512VPOPCNTDQ`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_KNL`` - ``AVX512_SKX`` ``AVX512VL`` ``AVX512BW`` ``AVX512DQ`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` - ``AVX512_CLX`` ``AVX512VNNI`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` - ``AVX512_CNL`` ``AVX512IFMA`` ``AVX512VBMI`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` - ``AVX512_ICL`` ``AVX512VBMI2`` ``AVX512BITALG`` ``AVX512VPOPCNTDQ`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512_CLX`` ``AVX512_CNL`` - ============== ===================================================== =========================================================================================================================================================================== - -IBM/POWER big-endian - CPU feature names -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. table:: - :align: left - - ======== ================ - Name Implies - ======== ================ - ``VSX`` - ``VSX2`` ``VSX`` - ``VSX3`` ``VSX`` ``VSX2`` - ======== ================ - -IBM/POWER little-endian - CPU feature names -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. table:: - :align: left - - ======== ================ - Name Implies - ======== ================ - ``VSX`` ``VSX2`` - ``VSX2`` ``VSX`` - ``VSX3`` ``VSX`` ``VSX2`` - ======== ================ - -ARMv7/A32 - CPU feature names -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. table:: - :align: left - - ============== =========================================================== - Name Implies - ============== =========================================================== - ``NEON`` - ``NEON_FP16`` ``NEON`` - ``NEON_VFPV4`` ``NEON`` ``NEON_FP16`` - ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` - ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` - ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` - ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP`` - ============== =========================================================== - -ARMv8/A64 - CPU feature names -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. 
table:: - :align: left - - ============== =========================================================== - Name Implies - ============== =========================================================== - ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` - ``NEON_FP16`` ``NEON`` ``NEON_VFPV4`` ``ASIMD`` - ``NEON_VFPV4`` ``NEON`` ``NEON_FP16`` ``ASIMD`` - ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` - ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` - ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` - ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP`` - ============== =========================================================== - diff --git a/doc/source/reference/simd/simd-optimizations.py b/doc/source/reference/simd/simd-optimizations.py deleted file mode 100644 index a78302db5..000000000 --- a/doc/source/reference/simd/simd-optimizations.py +++ /dev/null @@ -1,190 +0,0 @@ -""" -Generate CPU features tables from CCompilerOpt -""" -from os import sys, path -gen_path = path.dirname(path.realpath(__file__)) -#sys.path.append(path.abspath(path.join(gen_path, *([".."]*4), "numpy", "distutils"))) -#from ccompiler_opt import CCompilerOpt -from numpy.distutils.ccompiler_opt import CCompilerOpt - -class FakeCCompilerOpt(CCompilerOpt): - fake_info = ("arch", "compiler", "extra_args") - # disable caching no need for it - conf_nocache = True - def __init__(self, *args, **kwargs): - no_cc = None - CCompilerOpt.__init__(self, no_cc, **kwargs) - def dist_compile(self, sources, flags, **kwargs): - return sources - def dist_info(self): - return FakeCCompilerOpt.fake_info - @staticmethod - def dist_log(*args, stderr=False): - # avoid printing - pass - def feature_test(self, name, force_flags=None): - # To speed up - return True - - def gen_features_table(self, features, ignore_groups=True, - field_names=["Name", "Implies"], - fstyle=None, fstyle_implies=None, **kwargs): - rows = [] - if fstyle is None: - fstyle = lambda ft: f'``{ft}``' - if fstyle_implies is None: - fstyle_implies = lambda origin, ft: fstyle(ft) - for f in self.feature_sorted(features): - is_group = "group" in self.feature_supported.get(f, {}) - if ignore_groups and is_group: - continue - implies = self.feature_sorted(self.feature_implies(f)) - implies = ' '.join([fstyle_implies(f, i) for i in implies]) - rows.append([fstyle(f), implies]) - if rows: - return self.gen_rst_table(field_names, rows, **kwargs) - - def gen_gfeatures_table(self, features, - field_names=["Name", "Gather", "Implies"], - fstyle=None, fstyle_implies=None, **kwargs): - rows = [] - if fstyle is None: - fstyle = lambda ft: f'``{ft}``' - if fstyle_implies is None: - fstyle_implies = lambda origin, ft: fstyle(ft) - for f in self.feature_sorted(features): - gather = self.feature_supported.get(f, {}).get("group", None) - if not gather: - continue - implies = self.feature_sorted(self.feature_implies(f)) - implies = ' '.join([fstyle_implies(f, i) for i in implies]) - gather = ' '.join([fstyle_implies(f, i) for i in gather]) - rows.append([fstyle(f), gather, implies]) - if rows: - return self.gen_rst_table(field_names, rows, **kwargs) - - def gen_rst_table(self, field_names, rows, tab_size=4): - assert(not rows or len(field_names) == len(rows[0])) - rows.append(field_names) - fld_len = len(field_names) - cls_len = [max(len(c[i]) for c in rows) for i in range(fld_len)] - del rows[-1] - cformat = ' '.join('{:<%d}' % i for i in cls_len) - border = cformat.format(*['='*i for i in cls_len]) - - rows = [cformat.format(*row) for row in rows] - # 
header - rows = [border, cformat.format(*field_names), border] + rows - # footer - rows += [border] - # add left margin - rows = [(' ' * tab_size) + r for r in rows] - return '\n'.join(rows) - -def features_table_sections(name, ftable=None, gtable=None, tab_size=4): - tab = ' '*tab_size - content = '' - if ftable: - title = f"{name} - CPU feature names" - content = ( - f"{title}\n{'~'*len(title)}" - f"\n.. table::\n{tab}:align: left\n\n" - f"{ftable}\n\n" - ) - if gtable: - title = f"{name} - Group names" - content += ( - f"{title}\n{'~'*len(title)}" - f"\n.. table::\n{tab}:align: left\n\n" - f"{gtable}\n\n" - ) - return content - -def features_table(arch, cc="gcc", pretty_name=None, **kwargs): - FakeCCompilerOpt.fake_info = (arch, cc, '') - ccopt = FakeCCompilerOpt(cpu_baseline="max") - features = ccopt.cpu_baseline_names() - ftable = ccopt.gen_features_table(features, **kwargs) - gtable = ccopt.gen_gfeatures_table(features, **kwargs) - - if not pretty_name: - pretty_name = arch + '/' + cc - return features_table_sections(pretty_name, ftable, gtable, **kwargs) - -def features_table_diff(arch, cc, cc_vs="gcc", pretty_name=None, **kwargs): - FakeCCompilerOpt.fake_info = (arch, cc, '') - ccopt = FakeCCompilerOpt(cpu_baseline="max") - fnames = ccopt.cpu_baseline_names() - features = {f:ccopt.feature_implies(f) for f in fnames} - - FakeCCompilerOpt.fake_info = (arch, cc_vs, '') - ccopt_vs = FakeCCompilerOpt(cpu_baseline="max") - fnames_vs = ccopt_vs.cpu_baseline_names() - features_vs = {f:ccopt_vs.feature_implies(f) for f in fnames_vs} - - common = set(fnames).intersection(fnames_vs) - extra_avl = set(fnames).difference(fnames_vs) - not_avl = set(fnames_vs).difference(fnames) - diff_impl_f = {f:features[f].difference(features_vs[f]) for f in common} - diff_impl = {k for k, v in diff_impl_f.items() if v} - - fbold = lambda ft: f'**{ft}**' if ft in extra_avl else f'``{ft}``' - fbold_implies = lambda origin, ft: ( - f'**{ft}**' if ft in diff_impl_f.get(origin, {}) else f'``{ft}``' - ) - diff_all = diff_impl.union(extra_avl) - ftable = ccopt.gen_features_table( - diff_all, fstyle=fbold, fstyle_implies=fbold_implies, **kwargs - ) - gtable = ccopt.gen_gfeatures_table( - diff_all, fstyle=fbold, fstyle_implies=fbold_implies, **kwargs - ) - if not pretty_name: - pretty_name = arch + '/' + cc - content = features_table_sections(pretty_name, ftable, gtable, **kwargs) - - if not_avl: - not_avl = ccopt_vs.feature_sorted(not_avl) - not_avl = ' '.join(not_avl) - content += ( - ".. note::\n" - f" The following features aren't supported by {pretty_name}:\n" - f" **{not_avl}**\n\n" - ) - return content - -if __name__ == '__main__': - pretty_names = { - "PPC64": "IBM/POWER big-endian", - "PPC64LE": "IBM/POWER little-endian", - "ARMHF": "ARMv7/A32", - "AARCH64": "ARMv8/A64", - "ICC": "Intel Compiler", - # "ICCW": "Intel Compiler msvc-like", - "MSVC": "Microsoft Visual C/C++" - } - with open(path.join(gen_path, 'simd-optimizations-tables.inc'), 'wt') as fd: - fd.write(f'.. generated via {__file__}\n\n') - for arch in ( - ("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64") - ): - pretty_name = pretty_names.get(arch, arch) - table = features_table(arch=arch, pretty_name=pretty_name) - assert(table) - fd.write(table) - - with open(path.join(gen_path, 'simd-optimizations-tables-diff.inc'), 'wt') as fd: - fd.write(f'.. 
generated via {__file__}\n\n') - for arch, cc_names in ( - ("x86", ("clang", "ICC", "MSVC")), - ("PPC64", ("clang",)), - ("PPC64LE", ("clang",)), - ("ARMHF", ("clang",)), - ("AARCH64", ("clang",)) - ): - arch_pname = pretty_names.get(arch, arch) - for cc in cc_names: - pretty_name = f"{arch_pname}::{pretty_names.get(cc, cc)}" - table = features_table_diff(arch=arch, cc=cc, pretty_name=pretty_name) - if table: - fd.write(table) diff --git a/doc/source/reference/simd/simd-optimizations.rst b/doc/source/reference/simd/simd-optimizations.rst index 9de6d1734..a18108266 100644 --- a/doc/source/reference/simd/simd-optimizations.rst +++ b/doc/source/reference/simd/simd-optimizations.rst @@ -1,527 +1,12 @@ -****************** -SIMD Optimizations -****************** +:orphan: -NumPy provides a set of macros that define `Universal Intrinsics`_ to -abstract out typical platform-specific intrinsics so SIMD code needs to be -written only once. There are three layers: +.. raw:: html -- Code is *written* using the universal intrinsic macros, with guards that - will enable use of the macros only when the compiler recognizes them. - In NumPy, these are used to construct multiple ufunc loops. Current policy is - to create three loops: One loop is the default and uses no intrinsics. One - uses the minimum intrinsics required on the architecture. And the third is - written using the maximum set of intrinsics possible. -- At *compile* time, a distutils command is used to define the minimum and - maximum features to support, based on user choice and compiler support. The - appropriate macros are overlaid with the platform / architecture intrinsics, - and the three loops are compiled. -- At *runtime import*, the CPU is probed for the set of supported intrinsic - features. A mechanism is used to grab the pointer to the most appropriate - function, and this will be the one called for the function. + <html> + <head> + <meta http-equiv="refresh" content="0; url=index.html"/> + </head> + </html> - -Build options for compilation -============================= - -- ``--cpu-baseline``: minimal set of required optimizations. Default - value is ``min`` which provides the minimum CPU features that can - safely run on a wide range of platforms within the processor family. - -- ``--cpu-dispatch``: dispatched set of additional optimizations. - The default value is ``max -xop -fma4`` which enables all CPU - features, except for AMD legacy features(in case of X86). - -The command arguments are available in ``build``, ``build_clib``, and -``build_ext``. -if ``build_clib`` or ``build_ext`` are not specified by the user, the arguments of -``build`` will be used instead, which also holds the default values. - -Optimization names can be CPU features or groups of features that gather -several features or :ref:`special options <special-options>` to perform a series of procedures. - - -The following tables show the current supported optimizations sorted from the lowest to the highest interest. - -.. include:: simd-optimizations-tables.inc - ----- - -.. _tables-diff: - -While the above tables are based on the GCC Compiler, the following tables showing the differences in the -other compilers: - -.. include:: simd-optimizations-tables-diff.inc - -.. 
_special-options: - -Special options -~~~~~~~~~~~~~~~ - -- ``NONE``: enable no features - -- ``NATIVE``: Enables all CPU features that supported by the current - machine, this operation is based on the compiler flags (``-march=native, -xHost, /QxHost``) - -- ``MIN``: Enables the minimum CPU features that can safely run on a wide range of platforms: - - .. table:: - :align: left - - ====================================== ======================================= - For Arch Returns - ====================================== ======================================= - ``x86`` ``SSE`` ``SSE2`` - ``x86`` ``64-bit mode`` ``SSE`` ``SSE2`` ``SSE3`` - ``IBM/POWER`` ``big-endian mode`` ``NONE`` - ``IBM/POWER`` ``little-endian mode`` ``VSX`` ``VSX2`` - ``ARMHF`` ``NONE`` - ``ARM64`` ``AARCH64`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` - ``ASIMD`` - ====================================== ======================================= - -- ``MAX``: Enables all supported CPU features by the Compiler and platform. - -- ``Operators-/+``: remove or add features, useful with options ``MAX``, ``MIN`` and ``NATIVE``. - -NOTES -~~~~~~~~~~~~~ -- CPU features and other options are case-insensitive. - -- The order of the requested optimizations doesn't matter. - -- Either commas or spaces can be used as a separator, e.g. ``--cpu-dispatch``\ = - "avx2 avx512f" or ``--cpu-dispatch``\ = "avx2, avx512f" both work, but the - arguments must be enclosed in quotes. - -- The operand ``+`` is only added for nominal reasons, For example: - ``--cpu-baseline= "min avx2"`` is equivalent to ``--cpu-baseline="min + avx2"``. - ``--cpu-baseline="min,avx2"`` is equivalent to ``--cpu-baseline`="min,+avx2"`` - -- If the CPU feature is not supported by the user platform or - compiler, it will be skipped rather than raising a fatal error. - -- Any specified CPU feature to ``--cpu-dispatch`` will be skipped if - it's part of CPU baseline features - -- The ``--cpu-baseline`` argument force-enables implied features, - e.g. ``--cpu-baseline``\ ="sse42" is equivalent to - ``--cpu-baseline``\ ="sse sse2 sse3 ssse3 sse41 popcnt sse42" - -- The value of ``--cpu-baseline`` will be treated as "native" if - compiler native flag ``-march=native`` or ``-xHost`` or ``QxHost`` is - enabled through environment variable ``CFLAGS`` - -- The validation process for the requested optimizations when it comes to - ``--cpu-baseline`` isn't strict. For example, if the user requested - ``AVX2`` but the compiler doesn't support it then we just skip it and return - the maximum optimization that the compiler can handle depending on the - implied features of ``AVX2``, let us assume ``AVX``. - -- The user should always check the final report through the build log - to verify the enabled features. - -Special cases -~~~~~~~~~~~~~ - -**Interrelated CPU features**: Some exceptional conditions force us to link some features together when it come to certain compilers or architectures, resulting in the impossibility of building them separately. -These conditions can be divided into two parts, as follows: - -- **Architectural compatibility**: The need to align certain CPU features that are assured - to be supported by successive generations of the same architecture, for example: - - - On ppc64le `VSX(ISA 2.06)` and `VSX2(ISA 2.07)` both imply one another since the - first generation that supports little-endian mode is Power-8`(ISA 2.07)` - - On AArch64 `NEON` `FP16` `VFPV4` `ASIMD` implies each other since they are part of the - hardware baseline. 
- -- **Compilation compatibility**: Not all **C/C++** compilers provide independent support for all CPU - features. For example, **Intel**'s compiler doesn't provide separated flags for `AVX2` and `FMA3`, - it makes sense since all Intel CPUs that comes with `AVX2` also support `FMA3` and vice versa, - but this approach is incompatible with other **x86** CPUs from **AMD** or **VIA**. - Therefore, there are differences in the depiction of CPU features between the C/C++ compilers, - as shown in the :ref:`tables above <tables-diff>`. - - -Behaviors and Errors -~~~~~~~~~~~~~~~~~~~~ - - - -Usage and Examples -~~~~~~~~~~~~~~~~~~ - -Report and Trace -~~~~~~~~~~~~~~~~ - -Understanding CPU Dispatching, How the NumPy dispatcher works? -============================================================== - -NumPy dispatcher is based on multi-source compiling, which means taking -a certain source and compiling it multiple times with different compiler -flags and also with different **C** definitions that affect the code -paths to enable certain instruction-sets for each compiled object -depending on the required optimizations, then combining the returned -objects together. - -.. figure:: ../figures/opt-infra.png - -This mechanism should support all compilers and it doesn't require any -compiler-specific extension, but at the same time it is adds a few steps to -normal compilation that are explained as follows: - -1- Configuration -~~~~~~~~~~~~~~~~ - -Configuring the required optimization by the user before starting to build the -source files via the two command arguments as explained above: - -- ``--cpu-baseline``: minimal set of required optimizations. - -- ``--cpu-dispatch``: dispatched set of additional optimizations. - - -2- Discovering the environment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In this part, we check the compiler and platform architecture -and cache some of the intermediary results to speed up rebuilding. - -3- Validating the requested optimizations -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -By testing them against the compiler, and seeing what the compiler can -support according to the requested optimizations. - -4- Generating the main configuration header -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The generated header ``_cpu_dispatch.h`` contains all the definitions and -headers of instruction-sets for the required optimizations that have been -validated during the previous step. - -It also contains extra C definitions that are used for defining NumPy's -Python-level module attributes ``__cpu_baseline__`` and ``__cpu_dispaٍtch__``. - -**What is in this header?** - -The example header was dynamically generated by gcc on an X86 machine. -The compiler supports ``--cpu-baseline="sse sse2 sse3"`` and -``--cpu-dispatch="ssse3 sse41"``, and the result is below. - -.. code:: c - - // The header should be located at numpy/numpy/core/src/common/_cpu_dispatch.h - /**NOTE - ** C definitions prefixed with "NPY_HAVE_" represent - ** the required optimzations. - ** - ** C definitions prefixed with 'NPY__CPU_TARGET_' are protected and - ** shouldn't be used by any NumPy C sources. 
- */ - /******* baseline features *******/ - /** SSE **/ - #define NPY_HAVE_SSE 1 - #include <xmmintrin.h> - /** SSE2 **/ - #define NPY_HAVE_SSE2 1 - #include <emmintrin.h> - /** SSE3 **/ - #define NPY_HAVE_SSE3 1 - #include <pmmintrin.h> - - /******* dispatch-able features *******/ - #ifdef NPY__CPU_TARGET_SSSE3 - /** SSSE3 **/ - #define NPY_HAVE_SSSE3 1 - #include <tmmintrin.h> - #endif - #ifdef NPY__CPU_TARGET_SSE41 - /** SSE41 **/ - #define NPY_HAVE_SSE41 1 - #include <smmintrin.h> - #endif - -**Baseline features** are the minimal set of required optimizations configured -via ``--cpu-baseline``. They have no preprocessor guards and they're -always on, which means they can be used in any source. - -Does this mean NumPy's infrastructure passes the compiler's flags of -baseline features to all sources? - -Definitely, yes. But the :ref:`dispatch-able sources <dispatchable-sources>` are -treated differently. - -What if the user specifies certain **baseline features** during the -build but at runtime the machine doesn't support even these -features? Will the compiled code be called via one of these definitions, or -maybe the compiler itself auto-generated/vectorized certain piece of code -based on the provided command line compiler flags? - -During the loading of the NumPy module, there's a validation step -which detects this behavior. It will raise a Python runtime error to inform the -user. This is to prevent the CPU reaching an illegal instruction error causing -a segfault. - -**Dispatch-able features** are our dispatched set of additional optimizations -that were configured via ``--cpu-dispatch``. They are not activated by -default and are always guarded by other C definitions prefixed with -``NPY__CPU_TARGET_``. C definitions ``NPY__CPU_TARGET_`` are only -enabled within **dispatch-able sources**. - -.. _dispatchable-sources: - -5- Dispatch-able sources and configuration statements -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Dispatch-able sources are special **C** files that can be compiled multiple -times with different compiler flags and also with different **C** -definitions. These affect code paths to enable certain -instruction-sets for each compiled object according to "**the -configuration statements**" that must be declared between a **C** -comment\ ``(/**/)`` and start with a special mark **@targets** at the -top of each dispatch-able source. At the same time, dispatch-able -sources will be treated as normal **C** sources if the optimization was -disabled by the command argument ``--disable-optimization`` . - -**What are configuration statements?** - -Configuration statements are sort of keywords combined together to -determine the required optimization for the dispatch-able source. - -Example: - -.. code:: c - - /*@targets avx2 avx512f vsx2 vsx3 asimd asimdhp */ - // C code - -The keywords mainly represent the additional optimizations configured -through ``--cpu-dispatch``, but it can also represent other options such as: - -- Target groups: pre-configured configuration statements used for - managing the required optimizations from outside the dispatch-able source. - -- Policies: collections of options used for changing the default - behaviors or forcing the compilers to perform certain things. 
- -- "baseline": a unique keyword represents the minimal optimizations - that configured through ``--cpu-baseline`` - -**Numpy's infrastructure handles dispatch-able sources in four steps**: - -- **(A) Recognition**: Just like source templates and F2PY, the - dispatch-able sources requires a special extension ``*.dispatch.c`` - to mark C dispatch-able source files, and for C++ - ``*.dispatch.cpp`` or ``*.dispatch.cxx`` - **NOTE**: C++ not supported yet. - -- **(B) Parsing and validating**: In this step, the - dispatch-able sources that had been filtered by the previous step - are parsed and validated by the configuration statements for each one - of them one by one in order to determine the required optimizations. - -- **(C) Wrapping**: This is the approach taken by NumPy's - infrastructure, which has proved to be sufficiently flexible in order - to compile a single source multiple times with different **C** - definitions and flags that affect the code paths. The process is - achieved by creating a temporary **C** source for each required - optimization that related to the additional optimization, which - contains the declarations of the **C** definitions and includes the - involved source via the **C** directive **#include**. For more - clarification take a look at the following code for AVX512F : - - .. code:: c - - /* - * this definition is used by NumPy utilities as suffixes for the - * exported symbols - */ - #define NPY__CPU_TARGET_CURRENT AVX512F - /* - * The following definitions enable - * definitions of the dispatch-able features that are defined within the main - * configuration header. These are definitions for the implied features. - */ - #define NPY__CPU_TARGET_SSE - #define NPY__CPU_TARGET_SSE2 - #define NPY__CPU_TARGET_SSE3 - #define NPY__CPU_TARGET_SSSE3 - #define NPY__CPU_TARGET_SSE41 - #define NPY__CPU_TARGET_POPCNT - #define NPY__CPU_TARGET_SSE42 - #define NPY__CPU_TARGET_AVX - #define NPY__CPU_TARGET_F16C - #define NPY__CPU_TARGET_FMA3 - #define NPY__CPU_TARGET_AVX2 - #define NPY__CPU_TARGET_AVX512F - // our dispatch-able source - #include "/the/absuolate/path/of/hello.dispatch.c" - -- **(D) Dispatch-able configuration header**: The infrastructure - generates a config header for each dispatch-able source, this header - mainly contains two abstract **C** macros used for identifying the - generated objects, so they can be used for runtime dispatching - certain symbols from the generated objects by any **C** source. It is - also used for forward declarations. - - The generated header takes the name of the dispatch-able source after - excluding the extension and replace it with '**.h**', for example - assume we have a dispatch-able source called **hello.dispatch.c** and - contains the following: - - .. code:: c - - // hello.dispatch.c - /*@targets baseline sse42 avx512f */ - #include <stdio.h> - #include "numpy/utils.h" // NPY_CAT, NPY_TOSTR - - #ifndef NPY__CPU_TARGET_CURRENT - // wrapping the dispatch-able source only happens to the additional optimizations - // but if the keyword 'baseline' provided within the configuration statements, - // the infrastructure will add extra compiling for the dispatch-able source by - // passing it as-is to the compiler without any changes. 
- #define CURRENT_TARGET(X) X - #define NPY__CPU_TARGET_CURRENT baseline // for printing only - #else - // since we reach to this point, that's mean we're dealing with - // the additional optimizations, so it could be SSE42 or AVX512F - #define CURRENT_TARGET(X) NPY_CAT(NPY_CAT(X, _), NPY__CPU_TARGET_CURRENT) - #endif - // Macro 'CURRENT_TARGET' adding the current target as suffux to the exported symbols, - // to avoid linking duplications, NumPy already has a macro called - // 'NPY_CPU_DISPATCH_CURFX' similar to it, located at - // numpy/numpy/core/src/common/npy_cpu_dispatch.h - // NOTE: we tend to not adding suffixes to the baseline exported symbols - void CURRENT_TARGET(simd_whoami)(const char *extra_info) - { - printf("I'm " NPY_TOSTR(NPY__CPU_TARGET_CURRENT) ", %s\n", extra_info); - } - - Now assume you attached **hello.dispatch.c** to the source tree, then - the infrastructure should generate a temporary config header called - **hello.dispatch.h** that can be reached by any source in the source - tree, and it should contain the following code : - - .. code:: c - - #ifndef NPY__CPU_DISPATCH_EXPAND_ - // To expand the macro calls in this header - #define NPY__CPU_DISPATCH_EXPAND_(X) X - #endif - // Undefining the following macros, due to the possibility of including config headers - // multiple times within the same source and since each config header represents - // different required optimizations according to the specified configuration - // statements in the dispatch-able source that derived from it. - #undef NPY__CPU_DISPATCH_BASELINE_CALL - #undef NPY__CPU_DISPATCH_CALL - // nothing strange here, just a normal preprocessor callback - // enabled only if 'baseline' specified within the configuration statements - #define NPY__CPU_DISPATCH_BASELINE_CALL(CB, ...) \ - NPY__CPU_DISPATCH_EXPAND_(CB(__VA_ARGS__)) - // 'NPY__CPU_DISPATCH_CALL' is an abstract macro is used for dispatching - // the required optimizations that specified within the configuration statements. - // - // @param CHK, Expected a macro that can be used to detect CPU features - // in runtime, which takes a CPU feature name without string quotes and - // returns the testing result in a shape of boolean value. - // NumPy already has macro called "NPY_CPU_HAVE", which fits this requirement. - // - // @param CB, a callback macro that expected to be called multiple times depending - // on the required optimizations, the callback should receive the following arguments: - // 1- The pending calls of @param CHK filled up with the required CPU features, - // that need to be tested first in runtime before executing call belong to - // the compiled object. - // 2- The required optimization name, same as in 'NPY__CPU_TARGET_CURRENT' - // 3- Extra arguments in the macro itself - // - // By default the callback calls are sorted depending on the highest interest - // unless the policy "$keep_sort" was in place within the configuration statements - // see "Dive into the CPU dispatcher" for more clarification. - #define NPY__CPU_DISPATCH_CALL(CHK, CB, ...) \ - NPY__CPU_DISPATCH_EXPAND_(CB((CHK(AVX512F)), AVX512F, __VA_ARGS__)) \ - NPY__CPU_DISPATCH_EXPAND_(CB((CHK(SSE)&&CHK(SSE2)&&CHK(SSE3)&&CHK(SSSE3)&&CHK(SSE41)), SSE41, __VA_ARGS__)) - - An example of using the config header in light of the above: - - .. code:: c - - // NOTE: The following macros are only defined for demonstration purposes only. 
- // NumPy already has a collection of macros located at - // numpy/numpy/core/src/common/npy_cpu_dispatch.h, that covers all dispatching - // and declarations scenarios. - - #include "numpy/npy_cpu_features.h" // NPY_CPU_HAVE - #include "numpy/utils.h" // NPY_CAT, NPY_EXPAND - - // An example for setting a macro that calls all the exported symbols at once - // after checking if they're supported by the running machine. - #define DISPATCH_CALL_ALL(FN, ARGS) \ - NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_ALL_CB, FN, ARGS) \ - NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_ALL_CB, FN, ARGS) - // The preprocessor callbacks. - // The same suffixes as we defined in the dispatch-able source. - #define DISPATCH_CALL_ALL_CB(CHECK, TARGET_NAME, FN, ARGS) \ - if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; } - #define DISPATCH_CALL_BASELINE_ALL_CB(FN, ARGS) \ - FN NPY_EXPAND(ARGS); - - // An example for setting a macro that calls the exported symbols of the highest - // interest optimization, after checking if they're supported by the running machine. - #define DISPATCH_CALL_HIGH(FN, ARGS) \ - if (0) {} \ - NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_HIGH_CB, FN, ARGS) \ - NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_HIGH_CB, FN, ARGS) - // The preprocessor callbacks - // The same suffixes as we defined in the dispatch-able source. - #define DISPATCH_CALL_HIGH_CB(CHECK, TARGET_NAME, FN, ARGS) \ - else if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; } - #define DISPATCH_CALL_BASELINE_HIGH_CB(FN, ARGS) \ - else { FN NPY_EXPAND(ARGS); } - - // NumPy has a macro called 'NPY_CPU_DISPATCH_DECLARE' that can be used - // for forward declarations of any kind of prototypes based on - // 'NPY__CPU_DISPATCH_CALL' and 'NPY__CPU_DISPATCH_BASELINE_CALL'. - // However, in this example, we just handle it manually. - void simd_whoami(const char *extra_info); - void simd_whoami_AVX512F(const char *extra_info); - void simd_whoami_SSE41(const char *extra_info); - - void trigger_me(void) - { - // bring in the auto-generated config header - // which contains the config macros 'NPY__CPU_DISPATCH_CALL' and - // 'NPY__CPU_DISPATCH_BASELINE_CALL'. - // it is highly recommended to include the config header before executing - // the dispatching macros in case there's another header in scope. - #include "hello.dispatch.h" - DISPATCH_CALL_ALL(simd_whoami, ("all")) - DISPATCH_CALL_HIGH(simd_whoami, ("the highest interest")) - // An example of including multiple config headers in the same source - // #include "hello2.dispatch.h" - // DISPATCH_CALL_HIGH(another_function, ("the highest interest")) - } - - -Dive into the CPU dispatcher -============================ - -The baseline -~~~~~~~~~~~~ - -Dispatcher -~~~~~~~~~~ - -Groups and Policies -~~~~~~~~~~~~~~~~~~~ - -Examples -~~~~~~~~ - -Report and Trace -~~~~~~~~~~~~~~~~ - - -.. _`Universal Intrinsics`: https://numpy.org/neps/nep-0038-SIMD-optimizations.html +The location of this document has been changed. If you are not +redirected in a few seconds, `click here <index.html>`_. diff --git a/doc/source/user/basics.creation.rst b/doc/source/user/basics.creation.rst index 84ff1c30e..523a05379 100644 --- a/doc/source/user/basics.creation.rst +++ b/doc/source/user/basics.creation.rst @@ -74,10 +74,11 @@ assign a new type that satisfies all of the array elements involved in the computation, here ``uint32`` and ``int32`` can both be represented in as ``int64``.
-The default NumPy behavior is to create arrays in either 64-bit signed -integers or double precision floating point numbers, ``int64`` and -``float``, respectively. If you expect your arrays to be a certain type, -then you need to specify the ``dtype`` while you create the array. +The default NumPy behavior is to create arrays in either 32- or 64-bit signed +integers (platform dependent, matching the C ``long`` size) or double precision +floating point numbers, ``int32``/``int64`` and ``float``, respectively. If you expect your +integer arrays to be a specific type, then you need to specify the ``dtype`` while +you create the array. 2) Intrinsic NumPy array creation functions =========================================== diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c index 4c6b09b80..934434370 100644 --- a/numpy/core/src/umath/dispatching.c +++ b/numpy/core/src/umath/dispatching.c @@ -592,17 +592,19 @@ legacy_promote_using_legacy_type_resolver(PyUFuncObject *ufunc, Py_INCREF(operation_DTypes[i]); Py_DECREF(out_descrs[i]); } - if (ufunc->type_resolver == &PyUFunc_SimpleBinaryComparisonTypeResolver) { - /* - * In this one case, the deprecation means that we actually override - * the signature. - */ - for (int i = 0; i < nargs; i++) { - if (signature[i] != NULL && signature[i] != operation_DTypes[i]) { - Py_INCREF(operation_DTypes[i]); - Py_SETREF(signature[i], operation_DTypes[i]); - *out_cacheable = 0; - } + /* + * The PyUFunc_SimpleBinaryComparisonTypeResolver has a deprecation + * warning (ignoring `dtype=`) and cannot be cached. + * All datetime ones *should* have a warning, but currently don't; + * they also ignore any passed signature. So they cannot be cached + * either, and they mutate the signature, which is of course wrong, + * but not doing it would confuse the code later.
+ */ + for (int i = 0; i < nargs; i++) { + if (signature[i] != NULL && signature[i] != operation_DTypes[i]) { + Py_INCREF(operation_DTypes[i]); + Py_SETREF(signature[i], operation_DTypes[i]); + *out_cacheable = 0; } } return 0; diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 9107323b0..1b310b471 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -2737,7 +2737,7 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc, } PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc, - ops, signature, operation_DTypes, NPY_FALSE, NPY_FALSE, NPY_TRUE); + ops, signature, operation_DTypes, NPY_FALSE, NPY_TRUE, NPY_TRUE); /* Output can currently get cleared, others XDECREF in case of error */ Py_XDECREF(operation_DTypes[1]); if (out != NULL) { @@ -5194,60 +5194,18 @@ PyUFunc_FromFuncAndDataAndSignatureAndIdentity(PyUFuncGenericFunction *func, voi return NULL; } } - - PyObject *promoter = NULL; - if (ufunc->ntypes == 1) { - npy_bool all_object = NPY_TRUE; - for (int i = 0; i < ufunc->nargs; i++) { - if (ufunc->types[i] != NPY_OBJECT) { - all_object = NPY_FALSE; - break; - } - } - if (all_object) { - promoter = PyCapsule_New(&object_only_ufunc_promoter, - "numpy._ufunc_promoter", NULL); - if (promoter == NULL) { - Py_DECREF(ufunc); - return NULL; - } - } - } - if (promoter == NULL && ufunc->nin > 1) { - promoter = PyCapsule_New(&default_ufunc_promoter, - "numpy._ufunc_promoter", NULL); - if (promoter == NULL) { - Py_DECREF(ufunc); - return NULL; - } - } - if (promoter != NULL) { - /* Always install default promoter using the common DType */ - PyObject *dtype_tuple = PyTuple_New(ufunc->nargs); - if (dtype_tuple == NULL) { - Py_DECREF(promoter); - Py_DECREF(ufunc); - return NULL; - } - for (int i = 0; i < ufunc->nargs; i++) { - Py_INCREF(Py_None); - PyTuple_SET_ITEM(dtype_tuple, i, Py_None); - } - PyObject *info = PyTuple_Pack(2, dtype_tuple, promoter); - Py_DECREF(dtype_tuple); - Py_DECREF(promoter); - if (info == NULL) { - Py_DECREF(ufunc); - return NULL; - } - - int res = PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0); - Py_DECREF(info); - if (res < 0) { - Py_DECREF(ufunc); - return NULL; - } - } + /* + * TODO: I tried adding a default promoter here (either all object for + * some special cases, or all homogeneous). Those are reasonable + * defaults, but they short-cut a deprecated SciPy loop, where the + * homogeneous loop `ddd->d` was deprecated, but an inhomogeneous + * one `dld->d` should be picked. + * The default promoter *is* a reasonable default, but it switched that + * behaviour. + * Another problem appeared due to buggy type-resolution for + * datetimes, which meant that `timedelta.sum(dtype="f8")` returned + * datetimes (and not floats or an error), arguably wrong, but... + */ return (PyObject *)ufunc; } diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py index c6a3d4e79..baae77a35 100644 --- a/numpy/core/tests/test_datetime.py +++ b/numpy/core/tests/test_datetime.py @@ -2029,11 +2029,17 @@ class TestDateTime: assert_equal(np.maximum.reduce(a), np.timedelta64(7, 's')) + def test_timedelta_correct_mean(self): + # test mainly because a bug previously allowed + # `timedelta.sum(dtype="f8")` to ignore the dtype request.
+ a = np.arange(1000, dtype="m8[s]") + assert_array_equal(a.mean(), a.sum() / len(a)) + def test_datetime_no_subtract_reducelike(self): # subtracting two datetime64 works, but we cannot reduce it, since # the result of that subtraction will have a different dtype. arr = np.array(["2021-12-02", "2019-05-12"], dtype="M8[ms]") - msg = r"ufunc 'subtract' did not contain a loop with signature " + msg = r"the resolved dtypes are not compatible" with pytest.raises(TypeError, match=msg): np.subtract.reduce(arr)
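The two test changes above pin down the user-visible behaviour of this patch. The following is a minimal standalone sketch of what they assert, assuming a NumPy build that includes this change (the matched error text, "the resolved dtypes are not compatible", is the message introduced here):

.. code:: python

    import numpy as np
    from numpy.testing import assert_array_equal

    # timedelta64 mean must agree with sum()/len(); a bug that let
    # `timedelta.sum(dtype="f8")` ignore the dtype request used to hide this.
    a = np.arange(1000, dtype="m8[s]")
    assert_array_equal(a.mean(), a.sum() / len(a))

    # Subtracting two datetime64 values works, but subtract cannot be used
    # as a reduction: the result dtype (timedelta64) differs from the inputs.
    arr = np.array(["2021-12-02", "2019-05-12"], dtype="M8[ms]")
    try:
        np.subtract.reduce(arr)
    except TypeError as exc:
        print("reduce rejected:", exc)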

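The SIMD documentation relocated earlier in this diff describes the ``--cpu-baseline`` / ``--cpu-dispatch`` build options and the Python-level ``__cpu_baseline__`` and ``__cpu_dispatch__`` attributes. As a hedged illustration of how that selection can be inspected at runtime (the attribute location in the private module ``numpy.core._multiarray_umath`` is an assumption based on NumPy sources of this era, not something promised by the documentation above):

.. code:: python

    # Inspect which SIMD features this NumPy build was compiled with and which
    # of the dispatched targets the running CPU can actually use.
    from numpy.core._multiarray_umath import (
        __cpu_baseline__,   # baseline features (--cpu-baseline), always enabled
        __cpu_dispatch__,   # additional dispatched targets (--cpu-dispatch)
        __cpu_features__,   # mapping: feature name -> detected on this CPU
    )

    print("baseline:", __cpu_baseline__)
    print("dispatch:", __cpu_dispatch__)
    usable = [f for f in __cpu_dispatch__ if __cpu_features__.get(f, False)]
    print("dispatched targets usable on this machine:", usable)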