-rw-r--r--  .github/workflows/wheels.yml                                   38
-rw-r--r--  doc/neps/nep-0031-uarray.rst                                    6
-rw-r--r--  doc/neps/nep-0038-SIMD-optimizations.rst                        2
-rw-r--r--  doc/neps/nep-0049.rst                                           4
-rw-r--r--  doc/source/reference/index.rst                                  2
-rw-r--r--  doc/source/reference/simd/build-options.rst                   375
-rw-r--r--  doc/source/reference/simd/gen_features.py                     194
-rw-r--r--  doc/source/reference/simd/generated_tables/compilers-diff.inc  33
-rw-r--r--  doc/source/reference/simd/generated_tables/cpu_features.inc    93
-rw-r--r--  doc/source/reference/simd/how-it-works.rst                    349
-rw-r--r--  doc/source/reference/simd/index.rst                            43
-rw-r--r--  doc/source/reference/simd/log_example.txt                      79
-rw-r--r--  doc/source/reference/simd/simd-optimizations-tables-diff.inc   37
-rw-r--r--  doc/source/reference/simd/simd-optimizations-tables.inc       103
-rw-r--r--  doc/source/reference/simd/simd-optimizations.py               190
-rw-r--r--  doc/source/reference/simd/simd-optimizations.rst              533
-rw-r--r--  doc/source/user/basics.creation.rst                             9
-rw-r--r--  numpy/core/src/umath/dispatching.c                             24
-rw-r--r--  numpy/core/src/umath/ufunc_object.c                            68
-rw-r--r--  numpy/core/tests/test_datetime.py                               8
20 files changed, 1246 insertions, 944 deletions
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index cd5d8484a..076ac32c7 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -37,7 +37,7 @@ jobs:
echo "::set-output name=message::$COMMIT_MSG"
build_wheels:
- name: Build wheel for cp${{ matrix.python }}-${{ matrix.platform }}
+ name: Build wheel for ${{ matrix.python }}-${{ matrix.platform }}
needs: get_commit_message
if: >-
contains(needs.get_commit_message.outputs.message, '[wheel build]') ||
@@ -51,35 +51,49 @@ jobs:
include:
# manylinux builds
- os: ubuntu-20.04
- python: "38"
+ python: "cp38"
platform: manylinux_x86_64
- os: ubuntu-20.04
- python: "39"
+ python: "cp39"
platform: manylinux_x86_64
- os: ubuntu-20.04
- python: "310"
+ python: "cp310"
+ platform: manylinux_x86_64
+ # manylinux pypy builds
+ - os: ubuntu-20.04
+ python: "pp38"
platform: manylinux_x86_64
# MacOS builds
- os: macos-10.15
- python: "38"
+ python: "cp38"
platform: macosx_*
- os: macos-10.15
- python: "39"
+ python: "cp39"
platform: macosx_*
- os: macos-10.15
- python: "310"
+ python: "cp310"
platform: macosx_*
+ # MacOS PyPy builds
+ # Disabled for now because of a PyPy bug
+ # that prevents successful compilation
+ #- os: macos-10.15
+ # python: "pp38"
+ # platform: macosx_x86_64
# Windows builds
- os: windows-2019
- python: "38"
+ python: "cp38"
+ platform: win_amd64
+ - os: windows-2019
+ python: "cp39"
platform: win_amd64
- os: windows-2019
- python: "39"
+ python: "cp310"
platform: win_amd64
+ # Windows PyPy builds
- os: windows-2019
- python: "310"
+ python: "pp38"
platform: win_amd64
steps:
@@ -94,10 +108,10 @@ jobs:
fetch-depth: 0
- name: Build wheels
- uses: pypa/cibuildwheel@v2.1.3
+ uses: pypa/cibuildwheel@v2.3.0
env:
NPY_USE_BLAS_ILP64: 1
- CIBW_BUILD: cp${{ matrix.python }}-${{ matrix.platform }}
+ CIBW_BUILD: ${{ matrix.python }}-${{ matrix.platform }}
CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
CIBW_ENVIRONMENT_LINUX: CFLAGS='-std=c99 -fno-strict-aliasing'
LDFLAGS='-Wl,--strip-debug'
diff --git a/doc/neps/nep-0031-uarray.rst b/doc/neps/nep-0031-uarray.rst
index b4ec94077..b746c267d 100644
--- a/doc/neps/nep-0031-uarray.rst
+++ b/doc/neps/nep-0031-uarray.rst
@@ -302,7 +302,7 @@ This is different from monkeypatching in a few different ways:
so there is at least the loose sense of an API contract. Monkeypatching
does not provide this ability.
* There is the ability of locally switching the backend.
-* It has been `suggested <http://numpy-discussion.10968.n7.nabble.com/NEP-31-Context-local-and-global-overrides-of-the-NumPy-API-tp47452p47472.html>`_
+* It has been `suggested <https://mail.python.org/archives/list/numpy-discussion@python.org/message/PS7EN3CRT6XERNTCN56MAYOXFFFEC55G/>`_
that the reason that 1.17 hasn't landed in the Anaconda defaults channel is
due to the incompatibility between monkeypatching and ``__array_function__``,
as monkeypatching would bypass the protocol completely.
@@ -640,9 +640,9 @@ References and Footnotes
.. [4] NEP 13 — A Mechanism for Overriding Ufuncs: https://numpy.org/neps/nep-0013-ufunc-overrides.html
-.. [5] Reply to Adding to the non-dispatched implementation of NumPy methods: http://numpy-discussion.10968.n7.nabble.com/Adding-to-the-non-dispatched-implementation-of-NumPy-methods-tp46816p46874.html
+.. [5] Reply to Adding to the non-dispatched implementation of NumPy methods: https://mail.python.org/archives/list/numpy-discussion@python.org/thread/5GUDMALWDIRHITG5YUOCV343J66QSX3U/#5GUDMALWDIRHITG5YUOCV343J66QSX3U
-.. [6] Custom Dtype/Units discussion: http://numpy-discussion.10968.n7.nabble.com/Custom-Dtype-Units-discussion-td43262.html
+.. [6] Custom Dtype/Units discussion: https://mail.python.org/archives/list/numpy-discussion@python.org/thread/RZYCVT6C3F7UDV6NA6FEV4MC5FKS6RDA/#RZYCVT6C3F7UDV6NA6FEV4MC5FKS6RDA
.. [7] The epic dtype cleanup plan: https://github.com/numpy/numpy/issues/2899
diff --git a/doc/neps/nep-0038-SIMD-optimizations.rst b/doc/neps/nep-0038-SIMD-optimizations.rst
index 927228447..2123c4f95 100644
--- a/doc/neps/nep-0038-SIMD-optimizations.rst
+++ b/doc/neps/nep-0038-SIMD-optimizations.rst
@@ -8,7 +8,7 @@ NEP 38 — Using SIMD optimization instructions for performance
:Status: Accepted
:Type: Standards
:Created: 2019-11-25
-:Resolution: http://numpy-discussion.10968.n7.nabble.com/NEP-38-Universal-SIMD-intrinsics-td47854.html
+:Resolution: https://mail.python.org/archives/list/numpy-discussion@python.org/thread/PVWJ74UVBRZ5ZWF6MDU7EUSJXVNILAQB/#PVWJ74UVBRZ5ZWF6MDU7EUSJXVNILAQB
Abstract
diff --git a/doc/neps/nep-0049.rst b/doc/neps/nep-0049.rst
index 3bd1d102c..0f0fd23c9 100644
--- a/doc/neps/nep-0049.rst
+++ b/doc/neps/nep-0049.rst
@@ -55,8 +55,8 @@ is to create a flexible enough interface without burdening normative users.
.. _`issue 5312`: https://github.com/numpy/numpy/issues/5312
.. _`from 2017`: https://github.com/numpy/numpy/issues/5312#issuecomment-315234656
.. _`in 2005`: https://numpy-discussion.scipy.narkive.com/MvmMkJcK/numpy-arrays-data-allocation-and-simd-alignement
-.. _`here`: http://numpy-discussion.10968.n7.nabble.com/Aligned-configurable-memory-allocation-td39712.html
-.. _`and here`: http://numpy-discussion.10968.n7.nabble.com/Numpy-s-policy-for-releasing-memory-td1533.html
+.. _`here`: https://mail.python.org/archives/list/numpy-discussion@python.org/thread/YPC5BGPUMKT2MLBP6O3FMPC35LFM2CCH/#YPC5BGPUMKT2MLBP6O3FMPC35LFM2CCH
+.. _`and here`: https://mail.python.org/archives/list/numpy-discussion@python.org/thread/IQK3EPIIRE3V4BPNAMJ2ZST3NUG2MK2A/#IQK3EPIIRE3V4BPNAMJ2ZST3NUG2MK2A
.. _`issue 14177`: https://github.com/numpy/numpy/issues/14177
.. _`filprofiler`: https://github.com/pythonspeed/filprofiler/blob/master/design/allocator-overrides.md
.. _`electric fence`: https://github.com/boundarydevices/efence
diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst
index a18211cca..24bb6665d 100644
--- a/doc/source/reference/index.rst
+++ b/doc/source/reference/index.rst
@@ -26,7 +26,7 @@ For learning how to use NumPy, see the :ref:`complete documentation <numpy_docs_
distutils
distutils_guide
c-api/index
- simd/simd-optimizations
+ simd/index
swig
diff --git a/doc/source/reference/simd/build-options.rst b/doc/source/reference/simd/build-options.rst
new file mode 100644
index 000000000..80ef2c639
--- /dev/null
+++ b/doc/source/reference/simd/build-options.rst
@@ -0,0 +1,375 @@
+*****************
+CPU build options
+*****************
+
+Description
+-----------
+
+The following options are mainly used to change the default behavior of optimizations
+that target certain CPU features:
+
+- ``--cpu-baseline``: minimal set of required CPU features.
+ Default value is ``min`` which provides the minimum CPU features that can
+ safely run on a wide range of platforms within the processor family.
+
+ .. note::
+
+    At runtime, NumPy modules will fail to load if any of the specified features
+    are not supported by the target CPU (a Python runtime error is raised).
+
+- ``--cpu-dispatch``: dispatched set of additional CPU features.
+ Default value is ``max -xop -fma4`` which enables all CPU
+ features, except for AMD legacy features (in case of X86).
+
+ .. note::
+
+    At runtime, NumPy modules will skip any specified features
+    that are not available in the target CPU.
+
+These options are accessible through :py:mod:`distutils` commands
+`distutils.command.build`, `distutils.command.build_clib` and
+`distutils.command.build_ext`.
+They accept a set of :ref:`CPU features <opt-supported-features>`,
+groups of features that gather several features, or
+:ref:`special options <opt-special-options>` that
+perform a series of procedures.
+
+.. note::
+
+ If ``build_clib`` or ``build_ext`` are not specified by the user,
+ the arguments of ``build`` will be used instead, which also holds the default values.
+
+To customize both ``build_ext`` and ``build_clib``::
+
+ cd /path/to/numpy
+ python setup.py build --cpu-baseline="avx2 fma3" install --user
+
+To customize only ``build_ext``::
+
+ cd /path/to/numpy
+ python setup.py build_ext --cpu-baseline="avx2 fma3" install --user
+
+To customize only ``build_clib``::
+
+ cd /path/to/numpy
+ python setup.py build_clib --cpu-baseline="avx2 fma3" install --user
+
+You can also customize CPU/build options through PIP command::
+
+ pip install --no-use-pep517 --global-option=build \
+ --global-option="--cpu-baseline=avx2 fma3" \
+ --global-option="--cpu-dispatch=max" ./
+
+Quick Start
+-----------
+
+In general, the default settings tend not to impose certain CPU features that
+may not be available on some older processors. Raising the ceiling of the
+baseline features will often improve performance and may also reduce
+binary size.
+
+
+The following are the most common scenarios that may require changing
+the default settings:
+
+
+I am building NumPy for my local use
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+And I do not intend to export the build to other users or target a
+different CPU than what the host has.
+
+Set `native` for the baseline, or manually specify the CPU features in case option
+`native` isn't supported by your platform::
+
+ python setup.py build --cpu-baseline="native" bdist
+
+Building NumPy with extra CPU features isn't necessary for this case,
+since all supported features are already defined within the baseline features::
+
+ python setup.py build --cpu-baseline=native --cpu-dispatch=none bdist
+
+.. note::
+
+ A fatal error will be raised if `native` isn't supported by the host platform.
+
+I do not want to support the old processors of the `x86` architecture
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since most of the CPUs nowadays support at least `AVX`, `F16C` features, you can use::
+
+ python setup.py build --cpu-baseline="avx f16c" bdist
+
+.. note::
+
+   ``--cpu-baseline`` forces the combining of all implied features, so there's no need
+   to add SSE features.
+
+
+I'm facing the same case above but with `ppc64` architecture
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Then raise the ceiling of the baseline features to Power8::
+
+ python setup.py build --cpu-baseline="vsx2" bdist
+
+Having issues with `AVX512` features?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You may have some reservations about including `AVX512` or
+any other CPU feature and want to exclude it from the dispatched features::
+
+ python setup.py build --cpu-dispatch="max -avx512f -avx512cd \
+ -avx512_knl -avx512_knm -avx512_skx -avx512_clx -avx512_cnl -avx512_icl" \
+ bdist
+
+.. _opt-supported-features:
+
+Supported Features
+------------------
+
+The names of the features can express one feature or a group of features,
+as shown in the following tables, which are sorted from the lowest to the
+highest interest:
+
+.. note::
+
+    The following features may not be supported by all compilers;
+    also, some compilers may produce a different set of implied features
+    when it comes to features like ``AVX512``, ``AVX2``, and ``FMA3``.
+ See :ref:`opt-platform-differences` for more details.
+
+.. include:: generated_tables/cpu_features.inc
+
+.. _opt-special-options:
+
+Special Options
+---------------
+
+- ``NONE``: enable no features.
+
+- ``NATIVE``: Enables all CPU features supported by the host CPU;
+  this operation is based on the compiler flags (``-march=native``, ``-xHost``, ``/QxHost``).
+
+- ``MIN``: Enables the minimum CPU features that can safely run on a wide range of platforms:
+
+ .. table::
+ :align: left
+
+ ====================================== =======================================
+ For Arch Implies
+ ====================================== =======================================
+ x86 (32-bit mode) ``SSE`` ``SSE2``
+ x86_64 ``SSE`` ``SSE2`` ``SSE3``
+ IBM/POWER (big-endian mode) ``NONE``
+ IBM/POWER (little-endian mode) ``VSX`` ``VSX2``
+ ARMHF ``NONE``
+ ARM64 A.K. AARCH64 ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
+ ``ASIMD``
+ ====================================== =======================================
+
+- ``MAX``: Enables all CPU features supported by the compiler and platform.
+
+- ``Operators-/+``: remove or add features, useful with options ``MAX``, ``MIN`` and ``NATIVE``.
+
+Behaviors
+---------
+
+- CPU features and other options are case-insensitive, for example::
+
+ python setup.py build --cpu-dispatch="SSE41 avx2 FMA3"
+
+- The order of the requested optimizations doesn't matter::
+
+ python setup.py build --cpu-dispatch="SSE41 AVX2 FMA3"
+ # equivalent to
+ python setup.py build --cpu-dispatch="FMA3 AVX2 SSE41"
+
+- Either commas or spaces or '+' can be used as a separator,
+ for example::
+
+ python setup.py build --cpu-dispatch="avx2 avx512f"
+ # or
+ python setup.py build --cpu-dispatch=avx2,avx512f
+ # or
+ python setup.py build --cpu-dispatch="avx2+avx512f"
+
+  all work, but arguments should be enclosed in quotes or escaped
+  with a backslash if any spaces are used.
+
+- ``--cpu-baseline`` combines all implied CPU features, for example::
+
+ python setup.py build --cpu-baseline=sse42
+ # equivalent to
+ python setup.py build --cpu-baseline="sse sse2 sse3 ssse3 sse41 popcnt sse42"
+
+- ``--cpu-baseline`` will be treated as "native" if the compiler's native flag
+  ``-march=native``, ``-xHost`` or ``/QxHost`` is enabled through the environment variable
+  `CFLAGS`::
+
+ export CFLAGS="-march=native"
+ python setup.py install --user
+ # is equivalent to
+ python setup.py build --cpu-baseline=native install --user
+
+- ``--cpu-baseline`` escapes any specified features that aren't supported
+ by the target platform or compiler rather than raising fatal errors.
+
+ .. note::
+
+     Since ``--cpu-baseline`` combines all implied features, the maximum
+     supported subset of implied features will be enabled rather than escaping all of them.
+ For example::
+
+      # Requesting `AVX2,FMA3` but the compiler only supports **SSE** features
+ python setup.py build --cpu-baseline="avx2 fma3"
+ # is equivalent to
+ python setup.py build --cpu-baseline="sse sse2 sse3 ssse3 sse41 popcnt sse42"
+
+- ``--cpu-dispatch`` does not combine any of the implied CPU features,
+  so you must add them unless you want to disable one or all of them::
+
+ # Only dispatches AVX2 and FMA3
+ python setup.py build --cpu-dispatch=avx2,fma3
+ # Dispatches AVX and SSE features
+  python setup.py build --cpu-dispatch=ssse3,sse41,sse42,avx,avx2,fma3
+
+- ``--cpu-dispatch`` escapes any specified baseline features and also escapes
+  any features not supported by the target platform or compiler without raising
+ fatal errors.
+
+Finally, you should always check the final report through the build log
+to verify the enabled features. See :ref:`opt-build-report` for more details.
+
+.. _opt-platform-differences:
+
+Platform differences
+--------------------
+
+Some exceptional conditions force us to link some features together when it comes to
+certain compilers or architectures, making it impossible to build them separately.
+
+These conditions can be divided into two parts, as follows:
+
+**Architectural compatibility**
+
+Certain CPU features need to be aligned because they are assured to be supported by
+successive generations of the same architecture. Some cases:
+
+- On ppc64le, ``VSX(ISA 2.06)`` and ``VSX2(ISA 2.07)`` both imply one another since the
+  first generation that supports little-endian mode is Power-8 (ISA 2.07).
+- On AArch64, ``NEON NEON_FP16 NEON_VFPV4 ASIMD`` imply one another since they are part of the
+  hardware baseline.
+
+For example::
+
+    # On ARMv8/A64, specifying NEON is going to enable Advanced SIMD
+ # and all predecessor extensions
+ python setup.py build --cpu-baseline=neon
+    # which is equivalent to
+ python setup.py build --cpu-baseline="neon neon_fp16 neon_vfpv4 asimd"
+
+.. note::
+
+   Please take a close look at :ref:`opt-supported-features`
+   in order to determine the features that imply one another.
+
+**Compilation compatibility**
+
+Some compilers don't provide independent support for all CPU features. For instance,
+**Intel**'s compiler doesn't provide separate flags for ``AVX2`` and ``FMA3``;
+this makes sense since all Intel CPUs that come with ``AVX2`` also support ``FMA3``,
+but this approach is incompatible with other **x86** CPUs from **AMD** or **VIA**.
+
+For example::
+
+    # Specifying AVX2 will force-enable FMA3 on Intel compilers
+ python setup.py build --cpu-baseline=avx2
+    # which is equivalent to
+ python setup.py build --cpu-baseline="avx2 fma3"
+
+
+The following tables only show the differences imposed by some compilers from the
+general context shown in the :ref:`opt-supported-features` tables:
+
+.. note::
+
+   Feature names with a strikethrough represent the unsupported CPU features.
+
+.. raw:: html
+
+ <style>
+ .enabled-feature {color:green; font-weight:bold;}
+ .disabled-feature {color:red; text-decoration: line-through;}
+ </style>
+
+.. role:: enabled
+ :class: enabled-feature
+
+.. role:: disabled
+ :class: disabled-feature
+
+.. include:: generated_tables/compilers-diff.inc
+
+.. _opt-build-report:
+
+Build report
+------------
+
+In most cases, the CPU build options do not produce any fatal errors that halt the build.
+Most of the errors that may appear in the build log serve as heavy warnings caused by the
+compiler lacking some expected CPU features.
+
+So we strongly recommend checking the final report log, to be aware of which CPU features
+are enabled and which are not.
+
+You can find the final report of CPU optimizations at the end of the build log,
+and here is how it looks on x86_64/gcc:
+
+.. raw:: html
+
+ <style>#build-report .highlight-bash pre{max-height:450px; overflow-y: scroll;}</style>
+
+.. literalinclude:: log_example.txt
+ :language: bash
+
+As you see, there is a separate report for each of ``build_ext`` and ``build_clib``
+that includes several sections, and each section has several values, representing the following:
+
+**Platform**:
+
+- :enabled:`Architecture`: The architecture name of target CPU. It should be one of
+ ``x86``, ``x64``, ``ppc64``, ``ppc64le``, ``armhf``, ``aarch64`` or ``unknown``.
+
+- :enabled:`Compiler`: The compiler name. It should be one of
+ gcc, clang, msvc, icc, iccw or unix-like.
+
+**CPU baseline**:
+
+- :enabled:`Requested`: The specific features and options to ``--cpu-baseline`` as-is.
+- :enabled:`Enabled`: The final set of enabled CPU features.
+- :enabled:`Flags`: The compiler flags that were used for compiling all NumPy `C/C++` sources
+  during the compilation, except for temporary sources that have been used for generating
+  the binary objects of dispatched features.
+- :enabled:`Extra checks`: list of internal checks that activate certain functionality
+ or intrinsics related to the enabled features, useful for debugging when it comes
+ to developing SIMD kernels.
+
+**CPU dispatch**:
+
+- :enabled:`Requested`: The specific features and options to ``--cpu-dispatch`` as-is.
+- :enabled:`Enabled`: The final set of enabled CPU features.
+- :enabled:`Generated`: At the beginning of the next row of this property,
+ the features for which optimizations have been generated are shown in the
+ form of several sections with similar properties explained as follows:
+
+  - :enabled:`One or multiple dispatched features`: The implied CPU features.
+  - :enabled:`Flags`: The compiler flags that were used for these features.
+  - :enabled:`Extra checks`: Similar to the baseline, but for these dispatched features.
+  - :enabled:`Detect`: Set of CPU features that need to be detected at runtime in order to
+    execute the generated optimizations.
+  - The lines that come after the above property and end with a ':' on a separate line
+    represent the paths of the C/C++ sources that define the generated optimizations.
+
+Runtime Trace
+-------------
+To be completed.
diff --git a/doc/source/reference/simd/gen_features.py b/doc/source/reference/simd/gen_features.py
new file mode 100644
index 000000000..d74d54016
--- /dev/null
+++ b/doc/source/reference/simd/gen_features.py
@@ -0,0 +1,194 @@
+"""
+Generate CPU features tables from CCompilerOpt
+"""
+from os import sys, path
+from numpy.distutils.ccompiler_opt import CCompilerOpt
+
+class FakeCCompilerOpt(CCompilerOpt):
+ # disable caching no need for it
+ conf_nocache = True
+
+ def __init__(self, arch, cc, *args, **kwargs):
+ self.fake_info = (arch, cc, '')
+ CCompilerOpt.__init__(self, None, **kwargs)
+
+ def dist_compile(self, sources, flags, **kwargs):
+ return sources
+
+ def dist_info(self):
+ return self.fake_info
+
+ @staticmethod
+ def dist_log(*args, stderr=False):
+ # avoid printing
+ pass
+
+ def feature_test(self, name, force_flags=None, macros=[]):
+ # To speed up
+ return True
+
+class Features:
+ def __init__(self, arch, cc):
+ self.copt = FakeCCompilerOpt(arch, cc, cpu_baseline="max")
+
+ def names(self):
+ return self.copt.cpu_baseline_names()
+
+ def serialize(self, features_names):
+ result = []
+ for f in self.copt.feature_sorted(features_names):
+ gather = self.copt.feature_supported.get(f, {}).get("group", [])
+ implies = self.copt.feature_sorted(self.copt.feature_implies(f))
+ result.append((f, implies, gather))
+ return result
+
+ def table(self, **kwargs):
+ return self.gen_table(self.serialize(self.names()), **kwargs)
+
+ def table_diff(self, vs, **kwargs):
+ fnames = set(self.names())
+ fnames_vs = set(vs.names())
+ common = fnames.intersection(fnames_vs)
+ extra = fnames.difference(fnames_vs)
+ notavl = fnames_vs.difference(fnames)
+ iextra = {}
+ inotavl = {}
+ idiff = set()
+ for f in common:
+ implies = self.copt.feature_implies(f)
+ implies_vs = vs.copt.feature_implies(f)
+ e = implies.difference(implies_vs)
+ i = implies_vs.difference(implies)
+ if not i and not e:
+ continue
+ if e:
+ iextra[f] = e
+ if i:
+                inotavl[f] = i
+ idiff.add(f)
+
+ def fbold(f):
+ if f in extra:
+ return f':enabled:`{f}`'
+ if f in notavl:
+ return f':disabled:`{f}`'
+ return f
+
+ def fbold_implies(f, i):
+ if i in iextra.get(f, {}):
+ return f':enabled:`{i}`'
+ if f in notavl or i in inotavl.get(f, {}):
+ return f':disabled:`{i}`'
+ return i
+
+ diff_all = self.serialize(idiff.union(extra))
+ diff_all += vs.serialize(notavl)
+ content = self.gen_table(
+ diff_all, fstyle=fbold, fstyle_implies=fbold_implies, **kwargs
+ )
+ return content
+
+ def gen_table(self, serialized_features, fstyle=None, fstyle_implies=None,
+ **kwargs):
+
+ if fstyle is None:
+ fstyle = lambda ft: f'``{ft}``'
+ if fstyle_implies is None:
+ fstyle_implies = lambda origin, ft: fstyle(ft)
+
+ rows = []
+ have_gather = False
+ for f, implies, gather in serialized_features:
+ if gather:
+ have_gather = True
+ name = fstyle(f)
+ implies = ' '.join([fstyle_implies(f, i) for i in implies])
+ gather = ' '.join([fstyle_implies(f, i) for i in gather])
+ rows.append((name, implies, gather))
+ if not rows:
+ return ''
+ fields = ["Name", "Implies", "Gathers"]
+ if not have_gather:
+ del fields[2]
+ rows = [(name, implies) for name, implies, _ in rows]
+ return self.gen_rst_table(fields, rows, **kwargs)
+
+ def gen_rst_table(self, field_names, rows, tab_size=4):
+ assert(not rows or len(field_names) == len(rows[0]))
+ rows.append(field_names)
+ fld_len = len(field_names)
+ cls_len = [max(len(c[i]) for c in rows) for i in range(fld_len)]
+ del rows[-1]
+ cformat = ' '.join('{:<%d}' % i for i in cls_len)
+ border = cformat.format(*['='*i for i in cls_len])
+
+ rows = [cformat.format(*row) for row in rows]
+ # header
+ rows = [border, cformat.format(*field_names), border] + rows
+ # footer
+ rows += [border]
+ # add left margin
+ rows = [(' ' * tab_size) + r for r in rows]
+ return '\n'.join(rows)
+
+def wrapper_section(title, content, tab_size=4):
+ tab = ' '*tab_size
+ if content:
+ return (
+ f"{title}\n{'~'*len(title)}"
+ f"\n.. table::\n{tab}:align: left\n\n"
+ f"{content}\n\n"
+ )
+ return ''
+
+def wrapper_tab(title, table, tab_size=4):
+ tab = ' '*tab_size
+ if table:
+        return ('\n' + tab).join((
+            '.. tab:: ' + title,
+            tab + '.. table::',
+            tab + ':align: left',
+            table + '\n\n'
+        ))
+ return ''
+
+
+if __name__ == '__main__':
+
+ pretty_names = {
+ "PPC64": "IBM/POWER big-endian",
+ "PPC64LE": "IBM/POWER little-endian",
+ "ARMHF": "ARMv7/A32",
+ "AARCH64": "ARMv8/A64",
+ "ICC": "Intel Compiler",
+ # "ICCW": "Intel Compiler msvc-like",
+ "MSVC": "Microsoft Visual C/C++"
+ }
+ gen_path = path.join(
+ path.dirname(path.realpath(__file__)), "generated_tables"
+ )
+ with open(path.join(gen_path, 'cpu_features.inc'), 'wt') as fd:
+ fd.write(f'.. generated via {__file__}\n\n')
+ for arch in (
+ ("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64")
+ ):
+ title = "On " + pretty_names.get(arch, arch)
+ table = Features(arch, 'gcc').table()
+ fd.write(wrapper_section(title, table))
+
+ with open(path.join(gen_path, 'compilers-diff.inc'), 'wt') as fd:
+ fd.write(f'.. generated via {__file__}\n\n')
+ for arch, cc_names in (
+ ("x86", ("clang", "ICC", "MSVC")),
+ ("PPC64", ("clang",)),
+ ("PPC64LE", ("clang",)),
+ ("ARMHF", ("clang",)),
+ ("AARCH64", ("clang",))
+ ):
+ arch_pname = pretty_names.get(arch, arch)
+ for cc in cc_names:
+ title = f"On {arch_pname}::{pretty_names.get(cc, cc)}"
+ table = Features(arch, cc).table_diff(Features(arch, "gcc"))
+ fd.write(wrapper_section(title, table))
+
+
diff --git a/doc/source/reference/simd/generated_tables/compilers-diff.inc b/doc/source/reference/simd/generated_tables/compilers-diff.inc
new file mode 100644
index 000000000..4b9009a68
--- /dev/null
+++ b/doc/source/reference/simd/generated_tables/compilers-diff.inc
@@ -0,0 +1,33 @@
+.. generated via /home/seiko/work/repos/numpy/doc/source/reference/simd/./gen_features.py
+
+On x86::Intel Compiler
+~~~~~~~~~~~~~~~~~~~~~~
+.. table::
+ :align: left
+
+ ================ ==========================================================================================================================================
+ Name Implies
+ ================ ==========================================================================================================================================
+ FMA3 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2`
+ AVX2 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3`
+ AVX512F SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD`
+ :disabled:`XOP` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX`
+ :disabled:`FMA4` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX`
+ ================ ==========================================================================================================================================
+
+On x86::Microsoft Visual C/C++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. table::
+ :align: left
+
+ ====================== ============================================================================================================================================================================================================================================================= =============================================================================
+ Name Implies Gathers
+ ====================== ============================================================================================================================================================================================================================================================= =============================================================================
+ FMA3 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2`
+ AVX2 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3`
+ AVX512F SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD` :enabled:`AVX512_SKX`
+ AVX512CD SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F :enabled:`AVX512_SKX`
+ :disabled:`AVX512_KNL` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512ER` :disabled:`AVX512PF`
+ :disabled:`AVX512_KNM` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512_KNL` :disabled:`AVX5124FMAPS` :disabled:`AVX5124VNNIW` :disabled:`AVX512VPOPCNTDQ`
+ ====================== ============================================================================================================================================================================================================================================================= =============================================================================
+
diff --git a/doc/source/reference/simd/generated_tables/cpu_features.inc b/doc/source/reference/simd/generated_tables/cpu_features.inc
new file mode 100644
index 000000000..a7eae5652
--- /dev/null
+++ b/doc/source/reference/simd/generated_tables/cpu_features.inc
@@ -0,0 +1,93 @@
+.. generated via /home/seiko/work/repos/numpy/doc/source/reference/simd/./gen_features.py
+
+On x86
+~~~~~~
+.. table::
+ :align: left
+
+ ============== =========================================================================================================================================================================== =====================================================
+ Name Implies Gathers
+ ============== =========================================================================================================================================================================== =====================================================
+ ``SSE`` ``SSE2``
+ ``SSE2`` ``SSE``
+ ``SSE3`` ``SSE`` ``SSE2``
+ ``SSSE3`` ``SSE`` ``SSE2`` ``SSE3``
+ ``SSE41`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3``
+ ``POPCNT`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41``
+ ``SSE42`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT``
+ ``AVX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42``
+ ``XOP`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
+ ``FMA4`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
+ ``F16C`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
+ ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``
+ ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``
+ ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2``
+ ``AVX512CD`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F``
+ ``AVX512_KNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512ER`` ``AVX512PF``
+ ``AVX512_KNM`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_KNL`` ``AVX5124FMAPS`` ``AVX5124VNNIW`` ``AVX512VPOPCNTDQ``
+ ``AVX512_SKX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512VL`` ``AVX512BW`` ``AVX512DQ``
+ ``AVX512_CLX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512VNNI``
+ ``AVX512_CNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512IFMA`` ``AVX512VBMI``
+ ``AVX512_ICL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512_CLX`` ``AVX512_CNL`` ``AVX512VBMI2`` ``AVX512BITALG`` ``AVX512VPOPCNTDQ``
+ ============== =========================================================================================================================================================================== =====================================================
+
+On IBM/POWER big-endian
+~~~~~~~~~~~~~~~~~~~~~~~
+.. table::
+ :align: left
+
+ ======== ================
+ Name Implies
+ ======== ================
+ ``VSX``
+ ``VSX2`` ``VSX``
+ ``VSX3`` ``VSX`` ``VSX2``
+ ======== ================
+
+On IBM/POWER little-endian
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. table::
+ :align: left
+
+ ======== ================
+ Name Implies
+ ======== ================
+ ``VSX`` ``VSX2``
+ ``VSX2`` ``VSX``
+ ``VSX3`` ``VSX`` ``VSX2``
+ ======== ================
+
+On ARMv7/A32
+~~~~~~~~~~~~
+.. table::
+ :align: left
+
+ ============== ===========================================================
+ Name Implies
+ ============== ===========================================================
+ ``NEON``
+ ``NEON_FP16`` ``NEON``
+ ``NEON_VFPV4`` ``NEON`` ``NEON_FP16``
+ ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
+ ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
+ ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
+ ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP``
+ ============== ===========================================================
+
+On ARMv8/A64
+~~~~~~~~~~~~
+.. table::
+ :align: left
+
+ ============== ===========================================================
+ Name Implies
+ ============== ===========================================================
+ ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
+ ``NEON_FP16`` ``NEON`` ``NEON_VFPV4`` ``ASIMD``
+ ``NEON_VFPV4`` ``NEON`` ``NEON_FP16`` ``ASIMD``
+ ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
+ ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
+ ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
+ ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP``
+ ============== ===========================================================
+
diff --git a/doc/source/reference/simd/how-it-works.rst b/doc/source/reference/simd/how-it-works.rst
new file mode 100644
index 000000000..a2882f484
--- /dev/null
+++ b/doc/source/reference/simd/how-it-works.rst
@@ -0,0 +1,349 @@
+**********************************
+How does the CPU dispatcher work?
+**********************************
+
+NumPy dispatcher is based on multi-source compiling, which means taking
+a certain source and compiling it multiple times with different compiler
+flags and also with different **C** definitions that affect the code
+paths. This enables certain instruction-sets for each compiled object
+depending on the required optimizations and ends with linking the
+returned objects together.
+
+.. figure:: ../figures/opt-infra.png
+
+This mechanism should support all compilers and it doesn't require any
+compiler-specific extension, but at the same time it adds a few steps to
+normal compilation that are explained as follows.
+
+1- Configuration
+~~~~~~~~~~~~~~~~
+
+The required optimizations are configured by the user before starting to build the
+source files, via the two command arguments explained above:
+
+- ``--cpu-baseline``: minimal set of required optimizations.
+
+- ``--cpu-dispatch``: dispatched set of additional optimizations.
+
+
+2- Discovering the environment
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In this part, we check the compiler and platform architecture
+and cache some of the intermediary results to speed up rebuilding.
+
+3- Validating the requested optimizations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The requested optimizations are tested against the compiler to see what
+it can support.
+
+4- Generating the main configuration header
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The generated header ``_cpu_dispatch.h`` contains all the definitions and
+headers of instruction-sets for the required optimizations that have been
+validated during the previous step.
+
+It also contains extra C definitions that are used for defining NumPy's
+Python-level module attributes ``__cpu_baseline__`` and ``__cpu_dispatch__``.
+
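+These two attributes make it possible to verify, from Python, which features a
+given build was configured with. The following is a minimal sketch, assuming a
+NumPy build from this series, which exposes the attributes on the
+``numpy.core._multiarray_umath`` module:
+
+.. code:: python
+
+    from numpy.core._multiarray_umath import (
+        __cpu_baseline__,   # list of baseline feature names, e.g. ['SSE', 'SSE2', 'SSE3']
+        __cpu_dispatch__,   # list of dispatched feature names
+    )
+    print("baseline:  ", __cpu_baseline__)
+    print("dispatched:", __cpu_dispatch__)
+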
+**What is in this header?**
+
+The example header was dynamically generated by gcc on an X86 machine
+with ``--cpu-baseline="sse sse2 sse3"`` and ``--cpu-dispatch="ssse3 sse41"``;
+the compiler supports both sets, and the result is below.
+
+.. code:: c
+
+ // The header should be located at numpy/numpy/core/src/common/_cpu_dispatch.h
+ /**NOTE
+ ** C definitions prefixed with "NPY_HAVE_" represent
+    ** the required optimizations.
+ **
+ ** C definitions prefixed with 'NPY__CPU_TARGET_' are protected and
+ ** shouldn't be used by any NumPy C sources.
+ */
+ /******* baseline features *******/
+ /** SSE **/
+ #define NPY_HAVE_SSE 1
+ #include <xmmintrin.h>
+ /** SSE2 **/
+ #define NPY_HAVE_SSE2 1
+ #include <emmintrin.h>
+ /** SSE3 **/
+ #define NPY_HAVE_SSE3 1
+ #include <pmmintrin.h>
+
+ /******* dispatch-able features *******/
+ #ifdef NPY__CPU_TARGET_SSSE3
+ /** SSSE3 **/
+ #define NPY_HAVE_SSSE3 1
+ #include <tmmintrin.h>
+ #endif
+ #ifdef NPY__CPU_TARGET_SSE41
+ /** SSE41 **/
+ #define NPY_HAVE_SSE41 1
+ #include <smmintrin.h>
+ #endif
+
+**Baseline features** are the minimal set of required optimizations configured
+via ``--cpu-baseline``. They have no preprocessor guards and they're
+always on, which means they can be used in any source.
+
+Does this mean NumPy's infrastructure passes the compiler's flags of
+baseline features to all sources?
+
+Definitely, yes. But the :ref:`dispatch-able sources <dispatchable-sources>` are
+treated differently.
+
+What if the user specifies certain **baseline features** during the
+build but at runtime the machine doesn't support even these
+features? Will the compiled code be called via one of these definitions, or
+maybe the compiler itself auto-generated/vectorized a certain piece of code
+based on the provided command-line compiler flags?
+
+During the loading of the NumPy module, there's a validation step
+which detects this behavior. It will raise a Python runtime error to inform the
+user. This is to prevent the CPU from reaching an illegal instruction and
+causing a segfault.
+
+**Dispatch-able features** are our dispatched set of additional optimizations
+that were configured via ``--cpu-dispatch``. They are not activated by
+default and are always guarded by other C definitions prefixed with
+``NPY__CPU_TARGET_``. C definitions ``NPY__CPU_TARGET_`` are only
+enabled within **dispatch-able sources**.
+
+.. _dispatchable-sources:
+
+5- Dispatch-able sources and configuration statements
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Dispatch-able sources are special **C** files that can be compiled multiple
+times with different compiler flags and also with different **C**
+definitions. These affect code paths to enable certain
+instruction-sets for each compiled object according to "**the
+configuration statements**" that must be declared between a **C**
+comment\ ``(/**/)`` and start with a special mark **@targets** at the
+top of each dispatch-able source. At the same time, dispatch-able
+sources will be treated as normal **C** sources if the optimization was
+disabled by the command argument ``--disable-optimization``.
+
+**What are configuration statements?**
+
+Configuration statements are sort of keywords combined together to
+determine the required optimization for the dispatch-able source.
+
+Example:
+
+.. code:: c
+
+ /*@targets avx2 avx512f vsx2 vsx3 asimd asimdhp */
+ // C code
+
+The keywords mainly represent the additional optimizations configured
+through ``--cpu-dispatch``, but they can also represent other options such as:
+
+- Target groups: pre-configured configuration statements used for
+ managing the required optimizations from outside the dispatch-able source.
+
+- Policies: collections of options used for changing the default
+ behaviors or forcing the compilers to perform certain things.
+
+- "baseline": a unique keyword represents the minimal optimizations
+ that configured through ``--cpu-baseline``
+
+**NumPy's infrastructure handles dispatch-able sources in four steps**:
+
+- **(A) Recognition**: Just like source templates and F2PY, the
+  dispatch-able sources require a special extension ``*.dispatch.c``
+  to mark C dispatch-able source files, and for C++
+  ``*.dispatch.cpp`` or ``*.dispatch.cxx``
+  (**NOTE**: C++ is not supported yet). A build-script sketch is given
+  at the end of this page.
+
+- **(B) Parsing and validating**: In this step, the
+  dispatch-able sources that were filtered by the previous step
+  are parsed, and their configuration statements are validated one by
+  one in order to determine the required optimizations.
+
+- **(C) Wrapping**: This is the approach taken by NumPy's
+ infrastructure, which has proved to be sufficiently flexible in order
+ to compile a single source multiple times with different **C**
+ definitions and flags that affect the code paths. The process is
+  achieved by creating a temporary **C** source for each required
+  optimization related to the additional optimizations, which
+  contains the declarations of the **C** definitions and includes the
+  involved source via the **C** directive **#include**. For more
+  clarification, take a look at the following code for AVX512F:
+
+ .. code:: c
+
+ /*
+ * this definition is used by NumPy utilities as suffixes for the
+ * exported symbols
+ */
+ #define NPY__CPU_TARGET_CURRENT AVX512F
+ /*
+ * The following definitions enable
+ * definitions of the dispatch-able features that are defined within the main
+ * configuration header. These are definitions for the implied features.
+ */
+ #define NPY__CPU_TARGET_SSE
+ #define NPY__CPU_TARGET_SSE2
+ #define NPY__CPU_TARGET_SSE3
+ #define NPY__CPU_TARGET_SSSE3
+ #define NPY__CPU_TARGET_SSE41
+ #define NPY__CPU_TARGET_POPCNT
+ #define NPY__CPU_TARGET_SSE42
+ #define NPY__CPU_TARGET_AVX
+ #define NPY__CPU_TARGET_F16C
+ #define NPY__CPU_TARGET_FMA3
+ #define NPY__CPU_TARGET_AVX2
+ #define NPY__CPU_TARGET_AVX512F
+ // our dispatch-able source
+      #include "/the/absolute/path/of/hello.dispatch.c"
+
+- **(D) Dispatch-able configuration header**: The infrastructure
+  generates a config header for each dispatch-able source. This header
+  mainly contains two abstract **C** macros used for identifying the
+  generated objects, so that any **C** source can use them to dispatch
+  certain symbols from the generated objects at runtime. It is
+  also used for forward declarations.
+
+  The generated header takes the name of the dispatch-able source after
+  excluding the extension and replacing it with ``.h``. For example,
+  assume we have a dispatch-able source called ``hello.dispatch.c`` that
+  contains the following:
+
+ .. code:: c
+
+ // hello.dispatch.c
+ /*@targets baseline sse42 avx512f */
+ #include <stdio.h>
+ #include "numpy/utils.h" // NPY_CAT, NPY_TOSTR
+
+ #ifndef NPY__CPU_TARGET_CURRENT
+      // wrapping the dispatch-able source only happens for the additional optimizations,
+      // but if the keyword 'baseline' is provided within the configuration statements,
+      // the infrastructure will add an extra compilation of the dispatch-able source by
+      // passing it as-is to the compiler without any changes.
+ #define CURRENT_TARGET(X) X
+ #define NPY__CPU_TARGET_CURRENT baseline // for printing only
+ #else
+      // since we've reached this point, that means we're dealing with
+ // the additional optimizations, so it could be SSE42 or AVX512F
+ #define CURRENT_TARGET(X) NPY_CAT(NPY_CAT(X, _), NPY__CPU_TARGET_CURRENT)
+ #endif
+      // Macro 'CURRENT_TARGET' adds the current target as a suffix to the exported symbols,
+ // to avoid linking duplications, NumPy already has a macro called
+ // 'NPY_CPU_DISPATCH_CURFX' similar to it, located at
+ // numpy/numpy/core/src/common/npy_cpu_dispatch.h
+      // NOTE: we tend not to add suffixes to the baseline exported symbols
+ void CURRENT_TARGET(simd_whoami)(const char *extra_info)
+ {
+ printf("I'm " NPY_TOSTR(NPY__CPU_TARGET_CURRENT) ", %s\n", extra_info);
+ }
+
+  Now assume you have attached **hello.dispatch.c** to the source tree; the
+  infrastructure should then generate a temporary config header called
+  **hello.dispatch.h** that can be reached by any source in the source
+  tree, and it should contain the following code:
+
+ .. code:: c
+
+ #ifndef NPY__CPU_DISPATCH_EXPAND_
+ // To expand the macro calls in this header
+ #define NPY__CPU_DISPATCH_EXPAND_(X) X
+ #endif
+ // Undefining the following macros, due to the possibility of including config headers
+ // multiple times within the same source and since each config header represents
+ // different required optimizations according to the specified configuration
+ // statements in the dispatch-able source that derived from it.
+ #undef NPY__CPU_DISPATCH_BASELINE_CALL
+ #undef NPY__CPU_DISPATCH_CALL
+ // nothing strange here, just a normal preprocessor callback
+ // enabled only if 'baseline' specified within the configuration statements
+ #define NPY__CPU_DISPATCH_BASELINE_CALL(CB, ...) \
+ NPY__CPU_DISPATCH_EXPAND_(CB(__VA_ARGS__))
+      // 'NPY__CPU_DISPATCH_CALL' is an abstract macro used for dispatching
+      // the required optimizations specified within the configuration statements.
+ //
+      // @param CHK, expected to be a macro that can be used to detect CPU features
+      // at runtime; it takes a CPU feature name without string quotes and
+      // returns the test result as a boolean value.
+      // NumPy already has a macro called "NPY_CPU_HAVE", which fits this requirement.
+ //
+      // @param CB, a callback macro that is expected to be called multiple times depending
+      // on the required optimizations; the callback should receive the following arguments:
+      // 1- The pending calls of @param CHK filled up with the required CPU features,
+      //    which need to be tested first at runtime before executing the call belonging to
+      //    the compiled object.
+ // 2- The required optimization name, same as in 'NPY__CPU_TARGET_CURRENT'
+ // 3- Extra arguments in the macro itself
+ //
+ // By default the callback calls are sorted depending on the highest interest
+ // unless the policy "$keep_sort" was in place within the configuration statements
+ // see "Dive into the CPU dispatcher" for more clarification.
+ #define NPY__CPU_DISPATCH_CALL(CHK, CB, ...) \
+ NPY__CPU_DISPATCH_EXPAND_(CB((CHK(AVX512F)), AVX512F, __VA_ARGS__)) \
+ NPY__CPU_DISPATCH_EXPAND_(CB((CHK(SSE)&&CHK(SSE2)&&CHK(SSE3)&&CHK(SSSE3)&&CHK(SSE41)), SSE41, __VA_ARGS__))
+
+ An example of using the config header in light of the above:
+
+ .. code:: c
+
+      // NOTE: The following macros are defined for demonstration purposes only.
+      // NumPy already has a collection of macros located at
+      // numpy/numpy/core/src/common/npy_cpu_dispatch.h that covers all dispatching
+      // and declaration scenarios.
+
+ #include "numpy/npy_cpu_features.h" // NPY_CPU_HAVE
+ #include "numpy/utils.h" // NPY_CAT, NPY_EXPAND
+
+ // An example for setting a macro that calls all the exported symbols at once
+ // after checking if they're supported by the running machine.
+ #define DISPATCH_CALL_ALL(FN, ARGS) \
+ NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_ALL_CB, FN, ARGS) \
+ NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_ALL_CB, FN, ARGS)
+ // The preprocessor callbacks.
+ // The same suffixes as we define it in the dispatch-able source.
+ #define DISPATCH_CALL_ALL_CB(CHECK, TARGET_NAME, FN, ARGS) \
+ if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; }
+ #define DISPATCH_CALL_BASELINE_ALL_CB(FN, ARGS) \
+ FN NPY_EXPAND(ARGS);
+
+ // An example for setting a macro that calls the exported symbols of highest
+ // interest optimization, after checking if they're supported by the running machine.
+ #define DISPATCH_CALL_HIGH(FN, ARGS) \
+ if (0) {} \
+ NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_HIGH_CB, FN, ARGS) \
+ NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_HIGH_CB, FN, ARGS)
+ // The preprocessor callbacks
+ // The same suffixes as we define it in the dispatch-able source.
+ #define DISPATCH_CALL_HIGH_CB(CHECK, TARGET_NAME, FN, ARGS) \
+ else if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; }
+ #define DISPATCH_CALL_BASELINE_HIGH_CB(FN, ARGS) \
+ else { FN NPY_EXPAND(ARGS); }
+
+      // NumPy has a macro called 'NPY_CPU_DISPATCH_DECLARE' that can be used
+      // for forward declaration of any kind of prototypes, based on
+ // 'NPY__CPU_DISPATCH_CALL' and 'NPY__CPU_DISPATCH_BASELINE_CALL'.
+ // However in this example, we just handle it manually.
+ void simd_whoami(const char *extra_info);
+ void simd_whoami_AVX512F(const char *extra_info);
+ void simd_whoami_SSE41(const char *extra_info);
+
+ void trigger_me(void)
+ {
+      // bring in the auto-generated config header
+ // which contains config macros 'NPY__CPU_DISPATCH_CALL' and
+ // 'NPY__CPU_DISPATCH_BASELINE_CALL'.
+      // it is highly recommended to include the config header before executing
+      // the dispatching macros, in case there's another header in the scope.
+ #include "hello.dispatch.h"
+ DISPATCH_CALL_ALL(simd_whoami, ("all"))
+ DISPATCH_CALL_HIGH(simd_whoami, ("the highest interest"))
+ // An example of including multiple config headers in the same source
+ // #include "hello2.dispatch.h"
+ // DISPATCH_CALL_HIGH(another_function, ("the highest interest"))
+ }
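+
+Finally, regarding step **(A) Recognition** above, the following is a hypothetical
+``setup.py`` sketch of how a dispatch-able source might be attached to a
+``numpy.distutils``-based build. The package name ``mypkg`` and the source files
+are made up for illustration; the ``.dispatch.c`` suffix is what the
+infrastructure keys on:
+
+.. code:: python
+
+    # setup.py (hypothetical example)
+    from numpy.distutils.misc_util import Configuration
+    from numpy.distutils.core import setup
+
+    def configuration(parent_package='', top_path=None):
+        config = Configuration('mypkg', parent_package, top_path)
+        # 'hello.dispatch.c' carries the /*@targets ... */ configuration
+        # statements; the '.dispatch.c' suffix marks it for multi-compilation.
+        config.add_extension('hello', sources=['hello.c', 'hello.dispatch.c'])
+        return config
+
+    if __name__ == '__main__':
+        setup(configuration=configuration)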
diff --git a/doc/source/reference/simd/index.rst b/doc/source/reference/simd/index.rst
new file mode 100644
index 000000000..230e2dc15
--- /dev/null
+++ b/doc/source/reference/simd/index.rst
@@ -0,0 +1,43 @@
+.. _numpysimd:
+.. currentmodule:: numpysimd
+
+***********************
+CPU/SIMD Optimizations
+***********************
+
+NumPy comes with a flexible working mechanism that allows it to harness the SIMD
+features that CPUs own, in order to provide faster and more stable performance
+on all popular platforms. Currently, NumPy supports the X86, IBM/Power, ARMv7 and ARMv8
+architectures.
+
+The optimization process in NumPy is carried out in three layers:
+
+- Code is *written* using the universal intrinsics, which are a set of types, macros and
+  functions that are mapped to each supported instruction-set by using guards that
+  enable their use only when the compiler recognizes them.
+  This allows us to generate multiple kernels for the same functionality,
+  in which each generated kernel represents a set of instructions related to one
+  or more CPU features. The first kernel represents the minimum (baseline)
+  CPU features, and the other kernels represent the additional (dispatched) CPU features.
+
+- At *compile* time, CPU build options are used to define the minimum and
+ additional features to support, based on user choice and compiler support. The
+ appropriate intrinsics are overlaid with the platform / architecture intrinsics,
+ and multiple kernels are compiled.
+
+- At *runtime import*, the CPU is probed for the set of supported CPU
+  features. A mechanism is used to grab the pointer to the most appropriate
+  kernel, and this will be the one called for the function (see the sketch
+  below this list).
+
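+The result of that runtime probe can be inspected from Python. Here is a minimal
+sketch, assuming a NumPy build from this series, which exposes the probe result
+as a dictionary on the ``numpy.core._multiarray_umath`` module:
+
+.. code:: python
+
+    from numpy.core._multiarray_umath import __cpu_features__
+
+    # __cpu_features__ maps every known CPU feature name to True/False,
+    # depending on what the running CPU actually supports.
+    print([name for name, supported in __cpu_features__.items() if supported])
+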
+.. note::
+
+   The NumPy community had a deep discussion before implementing this work;
+   please check `NEP-38`_ for more clarification.
+
+.. toctree::
+
+ build-options
+ how-it-works
+
+.. _`NEP-38`: https://numpy.org/neps/nep-0038-SIMD-optimizations.html
+
diff --git a/doc/source/reference/simd/log_example.txt b/doc/source/reference/simd/log_example.txt
new file mode 100644
index 000000000..b0c732433
--- /dev/null
+++ b/doc/source/reference/simd/log_example.txt
@@ -0,0 +1,79 @@
+########### EXT COMPILER OPTIMIZATION ###########
+Platform :
+ Architecture: x64
+ Compiler : gcc
+
+CPU baseline :
+ Requested : 'min'
+ Enabled : SSE SSE2 SSE3
+ Flags : -msse -msse2 -msse3
+ Extra checks: none
+
+CPU dispatch :
+ Requested : 'max -xop -fma4'
+ Enabled : SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F AVX512CD AVX512_KNL AVX512_KNM AVX512_SKX AVX512_CLX AVX512_CNL AVX512_ICL
+ Generated :
+ :
+ SSE41 : SSE SSE2 SSE3 SSSE3
+ Flags : -msse -msse2 -msse3 -mssse3 -msse4.1
+ Extra checks: none
+ Detect : SSE SSE2 SSE3 SSSE3 SSE41
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c
+ : numpy/core/src/umath/_umath_tests.dispatch.c
+ :
+ SSE42 : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT
+ Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2
+ Extra checks: none
+ Detect : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42
+ : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c
+ :
+ AVX2 : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C
+ Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mavx2
+ Extra checks: none
+ Detect : AVX F16C AVX2
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithm_fp.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c
+ : numpy/core/src/umath/_umath_tests.dispatch.c
+ :
+ (FMA3 AVX2) : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C
+ Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mfma -mavx2
+ Extra checks: none
+ Detect : AVX F16C FMA3 AVX2
+ : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_exponent_log.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_trigonometric.dispatch.c
+ :
+ AVX512F : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2
+ Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mfma -mavx2 -mavx512f
+ Extra checks: AVX512F_REDUCE
+ Detect : AVX512F
+ : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithm_fp.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_exponent_log.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_trigonometric.dispatch.c
+ :
+ AVX512_SKX : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F AVX512CD
+ Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mfma -mavx2 -mavx512f -mavx512cd -mavx512vl -mavx512bw -mavx512dq
+ Extra checks: AVX512BW_MASK AVX512DQ_MASK
+ Detect : AVX512_SKX
+ : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_exponent_log.dispatch.c
+CCompilerOpt.cache_flush[804] : write cache to path -> /home/seiko/work/repos/numpy/build/temp.linux-x86_64-3.9/ccompiler_opt_cache_ext.py
+
+########### CLIB COMPILER OPTIMIZATION ###########
+Platform :
+ Architecture: x64
+ Compiler : gcc
+
+CPU baseline :
+ Requested : 'min'
+ Enabled : SSE SSE2 SSE3
+ Flags : -msse -msse2 -msse3
+ Extra checks: none
+
+CPU dispatch :
+ Requested : 'max -xop -fma4'
+ Enabled : SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F AVX512CD AVX512_KNL AVX512_KNM AVX512_SKX AVX512_CLX AVX512_CNL AVX512_ICL
+ Generated : none
diff --git a/doc/source/reference/simd/simd-optimizations-tables-diff.inc b/doc/source/reference/simd/simd-optimizations-tables-diff.inc
deleted file mode 100644
index 41fa96703..000000000
--- a/doc/source/reference/simd/simd-optimizations-tables-diff.inc
+++ /dev/null
@@ -1,37 +0,0 @@
-.. generated via source/reference/simd/simd-optimizations.py
-
-x86::Intel Compiler - CPU feature names
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- =========== ==================================================================================================================
- Name Implies
- =========== ==================================================================================================================
- ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **AVX2**
- ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **FMA3**
- ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` **AVX512CD**
- =========== ==================================================================================================================
-
-.. note::
- The following features aren't supported by x86::Intel Compiler:
- **XOP FMA4**
-
-x86::Microsoft Visual C/C++ - CPU feature names
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- ============ =================================================================================================================================
- Name Implies
- ============ =================================================================================================================================
- ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **AVX2**
- ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **FMA3**
- ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` **AVX512CD** **AVX512_SKX**
- ``AVX512CD`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` **AVX512_SKX**
- ============ =================================================================================================================================
-
-.. note::
- The following features aren't supported by x86::Microsoft Visual C/C++:
- **AVX512_KNL AVX512_KNM**
-
diff --git a/doc/source/reference/simd/simd-optimizations-tables.inc b/doc/source/reference/simd/simd-optimizations-tables.inc
deleted file mode 100644
index f038a91e1..000000000
--- a/doc/source/reference/simd/simd-optimizations-tables.inc
+++ /dev/null
@@ -1,103 +0,0 @@
-.. generated via source/reference/simd/simd-optimizations.py
-
-x86 - CPU feature names
-~~~~~~~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- ============ =================================================================================================================
- Name Implies
- ============ =================================================================================================================
- ``SSE`` ``SSE2``
- ``SSE2`` ``SSE``
- ``SSE3`` ``SSE`` ``SSE2``
- ``SSSE3`` ``SSE`` ``SSE2`` ``SSE3``
- ``SSE41`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3``
- ``POPCNT`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41``
- ``SSE42`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT``
- ``AVX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42``
- ``XOP`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
- ``FMA4`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
- ``F16C`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
- ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``
- ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``
- ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2``
- ``AVX512CD`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F``
- ============ =================================================================================================================
-
-x86 - Group names
-~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- ============== ===================================================== ===========================================================================================================================================================================
- Name Gather Implies
- ============== ===================================================== ===========================================================================================================================================================================
- ``AVX512_KNL`` ``AVX512ER`` ``AVX512PF`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD``
- ``AVX512_KNM`` ``AVX5124FMAPS`` ``AVX5124VNNIW`` ``AVX512VPOPCNTDQ`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_KNL``
- ``AVX512_SKX`` ``AVX512VL`` ``AVX512BW`` ``AVX512DQ`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD``
- ``AVX512_CLX`` ``AVX512VNNI`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX``
- ``AVX512_CNL`` ``AVX512IFMA`` ``AVX512VBMI`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX``
- ``AVX512_ICL`` ``AVX512VBMI2`` ``AVX512BITALG`` ``AVX512VPOPCNTDQ`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512_CLX`` ``AVX512_CNL``
- ============== ===================================================== ===========================================================================================================================================================================
-
-IBM/POWER big-endian - CPU feature names
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- ======== ================
- Name Implies
- ======== ================
- ``VSX``
- ``VSX2`` ``VSX``
- ``VSX3`` ``VSX`` ``VSX2``
- ======== ================
-
-IBM/POWER little-endian - CPU feature names
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- ======== ================
- Name Implies
- ======== ================
- ``VSX`` ``VSX2``
- ``VSX2`` ``VSX``
- ``VSX3`` ``VSX`` ``VSX2``
- ======== ================
-
-ARMv7/A32 - CPU feature names
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- ============== ===========================================================
- Name Implies
- ============== ===========================================================
- ``NEON``
- ``NEON_FP16`` ``NEON``
- ``NEON_VFPV4`` ``NEON`` ``NEON_FP16``
- ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
- ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
- ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
- ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP``
- ============== ===========================================================
-
-ARMv8/A64 - CPU feature names
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- ============== ===========================================================
- Name Implies
- ============== ===========================================================
- ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
- ``NEON_FP16`` ``NEON`` ``NEON_VFPV4`` ``ASIMD``
- ``NEON_VFPV4`` ``NEON`` ``NEON_FP16`` ``ASIMD``
- ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
- ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
- ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
- ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP``
- ============== ===========================================================
-
diff --git a/doc/source/reference/simd/simd-optimizations.py b/doc/source/reference/simd/simd-optimizations.py
deleted file mode 100644
index a78302db5..000000000
--- a/doc/source/reference/simd/simd-optimizations.py
+++ /dev/null
@@ -1,190 +0,0 @@
-"""
-Generate CPU features tables from CCompilerOpt
-"""
-from os import sys, path
-gen_path = path.dirname(path.realpath(__file__))
-#sys.path.append(path.abspath(path.join(gen_path, *([".."]*4), "numpy", "distutils")))
-#from ccompiler_opt import CCompilerOpt
-from numpy.distutils.ccompiler_opt import CCompilerOpt
-
-class FakeCCompilerOpt(CCompilerOpt):
- fake_info = ("arch", "compiler", "extra_args")
- # disable caching no need for it
- conf_nocache = True
- def __init__(self, *args, **kwargs):
- no_cc = None
- CCompilerOpt.__init__(self, no_cc, **kwargs)
- def dist_compile(self, sources, flags, **kwargs):
- return sources
- def dist_info(self):
- return FakeCCompilerOpt.fake_info
- @staticmethod
- def dist_log(*args, stderr=False):
- # avoid printing
- pass
- def feature_test(self, name, force_flags=None):
- # To speed up
- return True
-
- def gen_features_table(self, features, ignore_groups=True,
- field_names=["Name", "Implies"],
- fstyle=None, fstyle_implies=None, **kwargs):
- rows = []
- if fstyle is None:
- fstyle = lambda ft: f'``{ft}``'
- if fstyle_implies is None:
- fstyle_implies = lambda origin, ft: fstyle(ft)
- for f in self.feature_sorted(features):
- is_group = "group" in self.feature_supported.get(f, {})
- if ignore_groups and is_group:
- continue
- implies = self.feature_sorted(self.feature_implies(f))
- implies = ' '.join([fstyle_implies(f, i) for i in implies])
- rows.append([fstyle(f), implies])
- if rows:
- return self.gen_rst_table(field_names, rows, **kwargs)
-
- def gen_gfeatures_table(self, features,
- field_names=["Name", "Gather", "Implies"],
- fstyle=None, fstyle_implies=None, **kwargs):
- rows = []
- if fstyle is None:
- fstyle = lambda ft: f'``{ft}``'
- if fstyle_implies is None:
- fstyle_implies = lambda origin, ft: fstyle(ft)
- for f in self.feature_sorted(features):
- gather = self.feature_supported.get(f, {}).get("group", None)
- if not gather:
- continue
- implies = self.feature_sorted(self.feature_implies(f))
- implies = ' '.join([fstyle_implies(f, i) for i in implies])
- gather = ' '.join([fstyle_implies(f, i) for i in gather])
- rows.append([fstyle(f), gather, implies])
- if rows:
- return self.gen_rst_table(field_names, rows, **kwargs)
-
- def gen_rst_table(self, field_names, rows, tab_size=4):
- assert(not rows or len(field_names) == len(rows[0]))
- rows.append(field_names)
- fld_len = len(field_names)
- cls_len = [max(len(c[i]) for c in rows) for i in range(fld_len)]
- del rows[-1]
- cformat = ' '.join('{:<%d}' % i for i in cls_len)
- border = cformat.format(*['='*i for i in cls_len])
-
- rows = [cformat.format(*row) for row in rows]
- # header
- rows = [border, cformat.format(*field_names), border] + rows
- # footer
- rows += [border]
- # add left margin
- rows = [(' ' * tab_size) + r for r in rows]
- return '\n'.join(rows)
-
-def features_table_sections(name, ftable=None, gtable=None, tab_size=4):
- tab = ' '*tab_size
- content = ''
- if ftable:
- title = f"{name} - CPU feature names"
- content = (
- f"{title}\n{'~'*len(title)}"
- f"\n.. table::\n{tab}:align: left\n\n"
- f"{ftable}\n\n"
- )
- if gtable:
- title = f"{name} - Group names"
- content += (
- f"{title}\n{'~'*len(title)}"
- f"\n.. table::\n{tab}:align: left\n\n"
- f"{gtable}\n\n"
- )
- return content
-
-def features_table(arch, cc="gcc", pretty_name=None, **kwargs):
- FakeCCompilerOpt.fake_info = (arch, cc, '')
- ccopt = FakeCCompilerOpt(cpu_baseline="max")
- features = ccopt.cpu_baseline_names()
- ftable = ccopt.gen_features_table(features, **kwargs)
- gtable = ccopt.gen_gfeatures_table(features, **kwargs)
-
- if not pretty_name:
- pretty_name = arch + '/' + cc
- return features_table_sections(pretty_name, ftable, gtable, **kwargs)
-
-def features_table_diff(arch, cc, cc_vs="gcc", pretty_name=None, **kwargs):
- FakeCCompilerOpt.fake_info = (arch, cc, '')
- ccopt = FakeCCompilerOpt(cpu_baseline="max")
- fnames = ccopt.cpu_baseline_names()
- features = {f:ccopt.feature_implies(f) for f in fnames}
-
- FakeCCompilerOpt.fake_info = (arch, cc_vs, '')
- ccopt_vs = FakeCCompilerOpt(cpu_baseline="max")
- fnames_vs = ccopt_vs.cpu_baseline_names()
- features_vs = {f:ccopt_vs.feature_implies(f) for f in fnames_vs}
-
- common = set(fnames).intersection(fnames_vs)
- extra_avl = set(fnames).difference(fnames_vs)
- not_avl = set(fnames_vs).difference(fnames)
- diff_impl_f = {f:features[f].difference(features_vs[f]) for f in common}
- diff_impl = {k for k, v in diff_impl_f.items() if v}
-
- fbold = lambda ft: f'**{ft}**' if ft in extra_avl else f'``{ft}``'
- fbold_implies = lambda origin, ft: (
- f'**{ft}**' if ft in diff_impl_f.get(origin, {}) else f'``{ft}``'
- )
- diff_all = diff_impl.union(extra_avl)
- ftable = ccopt.gen_features_table(
- diff_all, fstyle=fbold, fstyle_implies=fbold_implies, **kwargs
- )
- gtable = ccopt.gen_gfeatures_table(
- diff_all, fstyle=fbold, fstyle_implies=fbold_implies, **kwargs
- )
- if not pretty_name:
- pretty_name = arch + '/' + cc
- content = features_table_sections(pretty_name, ftable, gtable, **kwargs)
-
- if not_avl:
- not_avl = ccopt_vs.feature_sorted(not_avl)
- not_avl = ' '.join(not_avl)
- content += (
- ".. note::\n"
- f" The following features aren't supported by {pretty_name}:\n"
- f" **{not_avl}**\n\n"
- )
- return content
-
-if __name__ == '__main__':
- pretty_names = {
- "PPC64": "IBM/POWER big-endian",
- "PPC64LE": "IBM/POWER little-endian",
- "ARMHF": "ARMv7/A32",
- "AARCH64": "ARMv8/A64",
- "ICC": "Intel Compiler",
- # "ICCW": "Intel Compiler msvc-like",
- "MSVC": "Microsoft Visual C/C++"
- }
- with open(path.join(gen_path, 'simd-optimizations-tables.inc'), 'wt') as fd:
- fd.write(f'.. generated via {__file__}\n\n')
- for arch in (
- ("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64")
- ):
- pretty_name = pretty_names.get(arch, arch)
- table = features_table(arch=arch, pretty_name=pretty_name)
- assert(table)
- fd.write(table)
-
- with open(path.join(gen_path, 'simd-optimizations-tables-diff.inc'), 'wt') as fd:
- fd.write(f'.. generated via {__file__}\n\n')
- for arch, cc_names in (
- ("x86", ("clang", "ICC", "MSVC")),
- ("PPC64", ("clang",)),
- ("PPC64LE", ("clang",)),
- ("ARMHF", ("clang",)),
- ("AARCH64", ("clang",))
- ):
- arch_pname = pretty_names.get(arch, arch)
- for cc in cc_names:
- pretty_name = f"{arch_pname}::{pretty_names.get(cc, cc)}"
- table = features_table_diff(arch=arch, cc=cc, pretty_name=pretty_name)
- if table:
- fd.write(table)
diff --git a/doc/source/reference/simd/simd-optimizations.rst b/doc/source/reference/simd/simd-optimizations.rst
index 9de6d1734..a18108266 100644
--- a/doc/source/reference/simd/simd-optimizations.rst
+++ b/doc/source/reference/simd/simd-optimizations.rst
@@ -1,527 +1,12 @@
-******************
-SIMD Optimizations
-******************
+:orphan:
-NumPy provides a set of macros that define `Universal Intrinsics`_ to
-abstract out typical platform-specific intrinsics so SIMD code needs to be
-written only once. There are three layers:
+.. raw:: html
-- Code is *written* using the universal intrinsic macros, with guards that
- will enable use of the macros only when the compiler recognizes them.
- In NumPy, these are used to construct multiple ufunc loops. Current policy is
- to create three loops: One loop is the default and uses no intrinsics. One
- uses the minimum intrinsics required on the architecture. And the third is
- written using the maximum set of intrinsics possible.
-- At *compile* time, a distutils command is used to define the minimum and
- maximum features to support, based on user choice and compiler support. The
- appropriate macros are overlaid with the platform / architecture intrinsics,
- and the three loops are compiled.
-- At *runtime import*, the CPU is probed for the set of supported intrinsic
- features. A mechanism is used to grab the pointer to the most appropriate
- function, and this will be the one called for the function.
+ <html>
+ <head>
+ <meta http-equiv="refresh" content="0; url=index.html"/>
+ </head>
+ </html>
-
-Build options for compilation
-=============================
-
-- ``--cpu-baseline``: minimal set of required optimizations. Default
- value is ``min`` which provides the minimum CPU features that can
- safely run on a wide range of platforms within the processor family.
-
-- ``--cpu-dispatch``: dispatched set of additional optimizations.
-  The default value is ``max -xop -fma4``, which enables all CPU
-  features except for AMD legacy features (in the case of x86).
-
-The command arguments are available in ``build``, ``build_clib``, and
-``build_ext``.
-If ``build_clib`` or ``build_ext`` are not specified by the user, the arguments of
-``build`` (which also holds the default values) will be used instead.
-
-Optimization names can be CPU features, groups that gather several features,
-or :ref:`special options <special-options>` that perform a series of procedures.
-
-
-The following tables show the currently supported optimizations, sorted from the lowest to the highest interest.
-
-.. include:: simd-optimizations-tables.inc
-
-----
-
-.. _tables-diff:
-
-While the above tables are based on the GCC compiler, the following tables show the differences for
-other compilers:
-
-.. include:: simd-optimizations-tables-diff.inc
-
-.. _special-options:
-
-Special options
-~~~~~~~~~~~~~~~
-
-- ``NONE``: enable no features
-
-- ``NATIVE``: Enables all CPU features supported by the current
-  machine; this operation is based on the compiler flags (``-march=native``, ``-xHost``, ``/QxHost``)
-
-- ``MIN``: Enables the minimum CPU features that can safely run on a wide range of platforms:
-
- .. table::
- :align: left
-
- ====================================== =======================================
- For Arch Returns
- ====================================== =======================================
- ``x86`` ``SSE`` ``SSE2``
- ``x86`` ``64-bit mode`` ``SSE`` ``SSE2`` ``SSE3``
- ``IBM/POWER`` ``big-endian mode`` ``NONE``
- ``IBM/POWER`` ``little-endian mode`` ``VSX`` ``VSX2``
- ``ARMHF`` ``NONE``
- ``ARM64`` ``AARCH64`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
- ``ASIMD``
- ====================================== =======================================
-
-- ``MAX``: Enables all CPU features supported by the compiler and platform.
-
-- ``Operators-/+``: remove or add features, useful with options ``MAX``, ``MIN`` and ``NATIVE``.
-
-NOTES
-~~~~~~~~~~~~~
-- CPU features and other options are case-insensitive.
-
-- The order of the requested optimizations doesn't matter.
-
-- Either commas or spaces can be used as a separator, e.g. ``--cpu-dispatch``\ =
- "avx2 avx512f" or ``--cpu-dispatch``\ = "avx2, avx512f" both work, but the
- arguments must be enclosed in quotes.
-
-- The operand ``+`` is only added for nominal reasons. For example:
-  ``--cpu-baseline="min avx2"`` is equivalent to ``--cpu-baseline="min + avx2"``, and
-  ``--cpu-baseline="min,avx2"`` is equivalent to ``--cpu-baseline="min,+avx2"``.
-
-- If the CPU feature is not supported by the user platform or
- compiler, it will be skipped rather than raising a fatal error.
-
-- Any CPU feature specified in ``--cpu-dispatch`` will be skipped if
-  it's already part of the CPU baseline features.
-
-- The ``--cpu-baseline`` argument force-enables implied features,
- e.g. ``--cpu-baseline``\ ="sse42" is equivalent to
- ``--cpu-baseline``\ ="sse sse2 sse3 ssse3 sse41 popcnt sse42"
-
-- The value of ``--cpu-baseline`` will be treated as "native" if
-  the compiler's native flag ``-march=native``, ``-xHost``, or ``/QxHost`` is
-  enabled through the environment variable ``CFLAGS``.
-
-- The validation process for the requested optimizations in
-  ``--cpu-baseline`` isn't strict. For example, if the user requests
-  ``AVX2`` but the compiler doesn't support it, then we just skip it and fall
-  back to the maximum optimization the compiler can handle, based on the
-  implied features of ``AVX2`` (for example, ``AVX``).
-
-- The user should always check the final report through the build log
-  to verify the enabled features; a runtime cross-check is sketched just
-  after this list.
-
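A minimal runtime counterpart to checking the build log, sketched under the assumption that
the attributes below live on ``numpy.core._multiarray_umath`` (the attribute names themselves
are mentioned later in this document):

.. code:: python

    # Sketch only: baseline features are always enabled; dispatched features
    # are enabled only when detected on the running machine.
    from numpy.core._multiarray_umath import __cpu_baseline__, __cpu_dispatch__

    print("baseline:", " ".join(__cpu_baseline__))
    print("dispatch:", " ".join(__cpu_dispatch__))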
-Special cases
-~~~~~~~~~~~~~
-
-**Interrelated CPU features**: Some exceptional conditions force us to link certain features together for particular compilers or architectures, making it impossible to build them separately.
-These conditions can be divided into two parts, as follows:
-
-- **Architectural compatibility**: The need to align certain CPU features that are assured
- to be supported by successive generations of the same architecture, for example:
-
-  - On ppc64le, `VSX(ISA 2.06)` and `VSX2(ISA 2.07)` imply one another, since the
-    first generation that supports little-endian mode is Power-8 `(ISA 2.07)`.
-  - On AArch64, `NEON`, `FP16`, `VFPV4`, and `ASIMD` imply one another, since they are part of the
-    hardware baseline.
-
-- **Compilation compatibility**: Not all **C/C++** compilers provide independent support for all CPU
-  features. For example, **Intel**'s compiler doesn't provide separate flags for `AVX2` and `FMA3`.
-  This makes sense, since all Intel CPUs that come with `AVX2` also support `FMA3` and vice versa,
-  but this approach is incompatible with other **x86** CPUs from **AMD** or **VIA**.
- Therefore, there are differences in the depiction of CPU features between the C/C++ compilers,
- as shown in the :ref:`tables above <tables-diff>`.
-
-
-Behaviors and Errors
-~~~~~~~~~~~~~~~~~~~~
-
-
-
-Usage and Examples
-~~~~~~~~~~~~~~~~~~
-
-Report and Trace
-~~~~~~~~~~~~~~~~
-
-Understanding CPU dispatching: how the NumPy dispatcher works
-==============================================================
-
-The NumPy dispatcher is based on multi-source compiling: a given source is
-compiled multiple times with different compiler flags and different **C**
-definitions that affect its code paths, enabling certain instruction-sets
-for each compiled object depending on the required optimizations. The
-resulting objects are then combined together.
-
-.. figure:: ../figures/opt-infra.png
-
-This mechanism should support all compilers and doesn't require any
-compiler-specific extensions, but at the same time it adds a few steps to
-normal compilation, explained as follows:
-
-1- Configuration
-~~~~~~~~~~~~~~~~
-
-The user configures the required optimizations before building the
-source files, via the two command arguments explained above:
-
-- ``--cpu-baseline``: minimal set of required optimizations.
-
-- ``--cpu-dispatch``: dispatched set of additional optimizations.
-
-
-2- Discovering the environment
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-In this part, we check the compiler and platform architecture
-and cache some of the intermediary results to speed up rebuilding.
-
-3- Validating the requested optimizations
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The requested optimizations are tested against the compiler to determine
-which of them the compiler can actually support.
-
-4- Generating the main configuration header
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The generated header ``_cpu_dispatch.h`` contains all the definitions and
-headers of instruction-sets for the required optimizations that have been
-validated during the previous step.
-
-It also contains extra C definitions that are used for defining NumPy's
-Python-level module attributes ``__cpu_baseline__`` and ``__cpu_dispatch__``.
-
-**What is in this header?**
-
-The example header was dynamically generated by gcc on an X86 machine.
-The compiler supports ``--cpu-baseline="sse sse2 sse3"`` and
-``--cpu-dispatch="ssse3 sse41"``, and the result is below.
-
-.. code:: c
-
- // The header should be located at numpy/numpy/core/src/common/_cpu_dispatch.h
- /**NOTE
- ** C definitions prefixed with "NPY_HAVE_" represent
-    ** the required optimizations.
- **
- ** C definitions prefixed with 'NPY__CPU_TARGET_' are protected and
- ** shouldn't be used by any NumPy C sources.
- */
- /******* baseline features *******/
- /** SSE **/
- #define NPY_HAVE_SSE 1
- #include <xmmintrin.h>
- /** SSE2 **/
- #define NPY_HAVE_SSE2 1
- #include <emmintrin.h>
- /** SSE3 **/
- #define NPY_HAVE_SSE3 1
- #include <pmmintrin.h>
-
- /******* dispatch-able features *******/
- #ifdef NPY__CPU_TARGET_SSSE3
- /** SSSE3 **/
- #define NPY_HAVE_SSSE3 1
- #include <tmmintrin.h>
- #endif
- #ifdef NPY__CPU_TARGET_SSE41
- /** SSE41 **/
- #define NPY_HAVE_SSE41 1
- #include <smmintrin.h>
- #endif
-
-**Baseline features** are the minimal set of required optimizations configured
-via ``--cpu-baseline``. They have no preprocessor guards and they're
-always on, which means they can be used in any source.
-
-Does this mean NumPy's infrastructure passes the compiler's flags of
-baseline features to all sources?
-
-Definitely, yes. But the :ref:`dispatch-able sources <dispatchable-sources>` are
-treated differently.
-
-What if the user specifies certain **baseline features** during the
-build but at runtime the machine doesn't support even these
-features? Will the compiled code be called via one of these definitions, or
-maybe the compiler itself auto-generated/vectorized a certain piece of code
-based on the provided command line compiler flags?
-
-During the loading of the NumPy module, there's a validation step
-which detects this situation. It raises a Python runtime error to inform the
-user, in order to prevent the CPU from reaching an illegal instruction and
-causing a segfault.
-
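A minimal guard illustrating this behavior, sketched under the assumption that the
load-time failure surfaces as either an ``ImportError`` or a ``RuntimeError`` (the exact
exception type isn't specified above):

.. code:: python

    # Sketch only: importing NumPy on a machine that lacks the compiled
    # baseline features is expected to fail during module load.
    try:
        import numpy
    except (ImportError, RuntimeError) as exc:
        raise SystemExit(f"NumPy refused to load: {exc}")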
-**Dispatch-able features** are our dispatched set of additional optimizations
-that were configured via ``--cpu-dispatch``. They are not activated by
-default and are always guarded by C definitions prefixed with
-``NPY__CPU_TARGET_``, which are only enabled within **dispatch-able sources**.
-
-.. _dispatchable-sources:
-
-5- Dispatch-able sources and configuration statements
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Dispatch-able sources are special **C** files that can be compiled multiple
-times with different compiler flags and also with different **C**
-definitions. These affect code paths to enable certain
-instruction-sets for each compiled object according to "**the
-configuration statements**" that must be declared between a **C**
-comment\ ``(/**/)`` and start with a special mark **@targets** at the
-top of each dispatch-able source. At the same time, dispatch-able
-sources will be treated as normal **C** sources if the optimization was
-disabled by the command argument ``--disable-optimization``.
-
-**What are configuration statements?**
-
-Configuration statements are a kind of keyword set, combined together to
-determine the required optimizations for the dispatch-able source.
-
-Example:
-
-.. code:: c
-
- /*@targets avx2 avx512f vsx2 vsx3 asimd asimdhp */
- // C code
-
-The keywords mainly represent the additional optimizations configured
-through ``--cpu-dispatch``, but they can also represent other options such as:
-
-- Target groups: pre-configured configuration statements used for
- managing the required optimizations from outside the dispatch-able source.
-
-- Policies: collections of options used for changing the default
- behaviors or forcing the compilers to perform certain things.
-
-- "baseline": a unique keyword represents the minimal optimizations
- that configured through ``--cpu-baseline``
-
-**NumPy's infrastructure handles dispatch-able sources in four steps**:
-
-- **(A) Recognition**: Just like source templates and F2PY, the
-  dispatch-able sources require a special extension ``*.dispatch.c``
-  to mark C dispatch-able source files, and for C++
-  ``*.dispatch.cpp`` or ``*.dispatch.cxx``.
-  **NOTE**: C++ is not supported yet.
-
-- **(B) Parsing and validating**: In this step, the
-  dispatch-able sources that were filtered by the previous step
-  are parsed, and the configuration statements of each one are
-  validated in order to determine the required optimizations.
-
-- **(C) Wrapping**: This is the approach taken by NumPy's
-  infrastructure, which has proved to be sufficiently flexible to
-  compile a single source multiple times with different **C**
-  definitions and flags that affect the code paths. The process is
-  achieved by creating a temporary **C** source for each required
-  optimization related to the additional optimizations; it
-  contains the declarations of the **C** definitions and includes the
-  involved source via the **C** directive **#include**. For more
-  clarification, take a look at the following code for AVX512F:
-
- .. code:: c
-
- /*
- * this definition is used by NumPy utilities as suffixes for the
- * exported symbols
- */
- #define NPY__CPU_TARGET_CURRENT AVX512F
- /*
-      * The following definitions enable the dispatch-able features that are
-      * defined within the main configuration header. These are the
-      * definitions of the implied features.
- */
- #define NPY__CPU_TARGET_SSE
- #define NPY__CPU_TARGET_SSE2
- #define NPY__CPU_TARGET_SSE3
- #define NPY__CPU_TARGET_SSSE3
- #define NPY__CPU_TARGET_SSE41
- #define NPY__CPU_TARGET_POPCNT
- #define NPY__CPU_TARGET_SSE42
- #define NPY__CPU_TARGET_AVX
- #define NPY__CPU_TARGET_F16C
- #define NPY__CPU_TARGET_FMA3
- #define NPY__CPU_TARGET_AVX2
- #define NPY__CPU_TARGET_AVX512F
- // our dispatch-able source
-     #include "/the/absolute/path/of/hello.dispatch.c"
-
-- **(D) Dispatch-able configuration header**: The infrastructure
-  generates a config header for each dispatch-able source. This header
-  mainly contains two abstract **C** macros used for identifying the
-  generated objects, so that any **C** source can dispatch certain
-  symbols from the generated objects at runtime. It is
-  also used for forward declarations.
-
-  The generated header takes the name of the dispatch-able source after
-  excluding the extension and replacing it with '**.h**'. For example,
-  assume we have a dispatch-able source called **hello.dispatch.c** that
-  contains the following:
-
- .. code:: c
-
- // hello.dispatch.c
- /*@targets baseline sse42 avx512f */
- #include <stdio.h>
- #include "numpy/utils.h" // NPY_CAT, NPY_TOSTR
-
- #ifndef NPY__CPU_TARGET_CURRENT
-     // Wrapping the dispatch-able source only happens for the additional optimizations,
-     // but if the keyword 'baseline' is provided within the configuration statements,
-     // the infrastructure will add an extra compilation of the dispatch-able source,
-     // passing it as-is to the compiler without any changes.
- #define CURRENT_TARGET(X) X
- #define NPY__CPU_TARGET_CURRENT baseline // for printing only
- #else
-     // since we reached this point, that means we're dealing with
-     // the additional optimizations, so it could be SSE42 or AVX512F
- #define CURRENT_TARGET(X) NPY_CAT(NPY_CAT(X, _), NPY__CPU_TARGET_CURRENT)
- #endif
-     // Macro 'CURRENT_TARGET' adds the current target as a suffix to the exported symbols,
-     // to avoid linking duplications. NumPy already has a similar macro called
-     // 'NPY_CPU_DISPATCH_CURFX', located at
-     // numpy/numpy/core/src/common/npy_cpu_dispatch.h
-     // NOTE: we tend not to add suffixes to the baseline exported symbols
- void CURRENT_TARGET(simd_whoami)(const char *extra_info)
- {
- printf("I'm " NPY_TOSTR(NPY__CPU_TARGET_CURRENT) ", %s\n", extra_info);
- }
-
- Now assume you attached **hello.dispatch.c** to the source tree, then
- the infrastructure should generate a temporary config header called
- **hello.dispatch.h** that can be reached by any source in the source
-  tree, and it should contain the following code:
-
- .. code:: c
-
- #ifndef NPY__CPU_DISPATCH_EXPAND_
- // To expand the macro calls in this header
- #define NPY__CPU_DISPATCH_EXPAND_(X) X
- #endif
- // Undefining the following macros, due to the possibility of including config headers
- // multiple times within the same source and since each config header represents
- // different required optimizations according to the specified configuration
- // statements in the dispatch-able source that derived from it.
- #undef NPY__CPU_DISPATCH_BASELINE_CALL
- #undef NPY__CPU_DISPATCH_CALL
- // nothing strange here, just a normal preprocessor callback
- // enabled only if 'baseline' specified within the configuration statements
- #define NPY__CPU_DISPATCH_BASELINE_CALL(CB, ...) \
- NPY__CPU_DISPATCH_EXPAND_(CB(__VA_ARGS__))
-     // 'NPY__CPU_DISPATCH_CALL' is an abstract macro used for dispatching
-     // the required optimizations that are specified within the configuration statements.
- //
-     // @param CHK, expected to be a macro that can be used to detect CPU features
-     // at runtime; it takes a CPU feature name without string quotes and
-     // returns the test result as a boolean value.
-     // NumPy already has a macro called "NPY_CPU_HAVE", which fits this requirement.
- //
-     // @param CB, a callback macro that is expected to be called multiple times depending
-     // on the required optimizations; the callback should receive the following arguments:
-     // 1- The pending calls of @param CHK filled up with the required CPU features,
-     //    that need to be tested first at runtime before executing a call belonging to
-     //    the compiled object.
- // 2- The required optimization name, same as in 'NPY__CPU_TARGET_CURRENT'
- // 3- Extra arguments in the macro itself
- //
-     // By default the callback calls are sorted from the highest interest downward,
-     // unless the policy "$keep_sort" is in place within the configuration statements;
-     // see "Dive into the CPU dispatcher" for more clarification.
- #define NPY__CPU_DISPATCH_CALL(CHK, CB, ...) \
- NPY__CPU_DISPATCH_EXPAND_(CB((CHK(AVX512F)), AVX512F, __VA_ARGS__)) \
- NPY__CPU_DISPATCH_EXPAND_(CB((CHK(SSE)&&CHK(SSE2)&&CHK(SSE3)&&CHK(SSSE3)&&CHK(SSE41)), SSE41, __VA_ARGS__))
-
- An example of using the config header in light of the above:
-
- .. code:: c
-
-     // NOTE: The following macros are defined for demonstration purposes only.
-     // NumPy already has a collection of macros located at
-     // numpy/numpy/core/src/common/npy_cpu_dispatch.h, which covers all dispatching
-     // and declaration scenarios.
-
- #include "numpy/npy_cpu_features.h" // NPY_CPU_HAVE
- #include "numpy/utils.h" // NPY_CAT, NPY_EXPAND
-
- // An example for setting a macro that calls all the exported symbols at once
- // after checking if they're supported by the running machine.
- #define DISPATCH_CALL_ALL(FN, ARGS) \
- NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_ALL_CB, FN, ARGS) \
- NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_ALL_CB, FN, ARGS)
- // The preprocessor callbacks.
- // The same suffixes as we define it in the dispatch-able source.
- #define DISPATCH_CALL_ALL_CB(CHECK, TARGET_NAME, FN, ARGS) \
- if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; }
- #define DISPATCH_CALL_BASELINE_ALL_CB(FN, ARGS) \
- FN NPY_EXPAND(ARGS);
-
- // An example for setting a macro that calls the exported symbols of highest
- // interest optimization, after checking if they're supported by the running machine.
- #define DISPATCH_CALL_HIGH(FN, ARGS) \
- if (0) {} \
- NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_HIGH_CB, FN, ARGS) \
- NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_HIGH_CB, FN, ARGS)
- // The preprocessor callbacks
- // The same suffixes as we define it in the dispatch-able source.
- #define DISPATCH_CALL_HIGH_CB(CHECK, TARGET_NAME, FN, ARGS) \
- else if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; }
- #define DISPATCH_CALL_BASELINE_HIGH_CB(FN, ARGS) \
- else { FN NPY_EXPAND(ARGS); }
-
-     // NumPy has a macro called 'NPY_CPU_DISPATCH_DECLARE' that can be used
-     // for forward declarations of any kind of prototypes based on
-     // 'NPY__CPU_DISPATCH_CALL' and 'NPY__CPU_DISPATCH_BASELINE_CALL'.
-     // However, in this example we just handle it manually.
- void simd_whoami(const char *extra_info);
- void simd_whoami_AVX512F(const char *extra_info);
- void simd_whoami_SSE41(const char *extra_info);
-
- void trigger_me(void)
- {
-         // bring in the auto-generated config header
-         // which contains the config macros 'NPY__CPU_DISPATCH_CALL' and
-         // 'NPY__CPU_DISPATCH_BASELINE_CALL'.
-         // It is highly recommended to include the config header before executing
-         // the dispatching macros, in case there's another header in the scope.
- #include "hello.dispatch.h"
- DISPATCH_CALL_ALL(simd_whoami, ("all"))
- DISPATCH_CALL_HIGH(simd_whoami, ("the highest interest"))
- // An example of including multiple config headers in the same source
- // #include "hello2.dispatch.h"
- // DISPATCH_CALL_HIGH(another_function, ("the highest interest"))
- }
-
-
-Dive into the CPU dispatcher
-============================
-
-The baseline
-~~~~~~~~~~~~
-
-Dispatcher
-~~~~~~~~~~
-
-Groups and Policies
-~~~~~~~~~~~~~~~~~~~
-
-Examples
-~~~~~~~~
-
-Report and Trace
-~~~~~~~~~~~~~~~~
-
-
-.. _`Universal Intrinsics`: https://numpy.org/neps/nep-0038-SIMD-optimizations.html
+The location of this document has been changed. If you are not
+redirected in a few seconds, `click here <index.html>`_.
diff --git a/doc/source/user/basics.creation.rst b/doc/source/user/basics.creation.rst
index 84ff1c30e..523a05379 100644
--- a/doc/source/user/basics.creation.rst
+++ b/doc/source/user/basics.creation.rst
@@ -74,10 +74,11 @@ assign a new type that satisfies all of the array elements involved in
 the computation, here ``uint32`` and ``int32`` can both be represented
 as ``int64``.
-The default NumPy behavior is to create arrays in either 64-bit signed
-integers or double precision floating point numbers, ``int64`` and
-``float``, respectively. If you expect your arrays to be a certain type,
-then you need to specify the ``dtype`` while you create the array.
+The default NumPy behavior is to create arrays in either 32 or 64-bit signed
+integers (platform dependent and matches C ``long`` size) or double precision
+floating point numbers, ``int32``/``int64`` and ``float``, respectively. If you expect your
+integer arrays to be a specific type, then you need to specify the dtype while
+you create the array.
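A short sketch of the behavior described above; the default integer width is
platform dependent, so the exact output may differ:

.. code:: python

    import numpy as np

    print(np.array([1, 2, 3]).dtype)       # int32 or int64, platform dependent
    print(np.array([1.0, 2.0]).dtype)      # float64
    # Specify the dtype explicitly if a particular type is expected:
    print(np.array([1, 2, 3], dtype=np.uint8).dtype)   # uint8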
2) Intrinsic NumPy array creation functions
===========================================
diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c
index 4c6b09b80..934434370 100644
--- a/numpy/core/src/umath/dispatching.c
+++ b/numpy/core/src/umath/dispatching.c
@@ -592,17 +592,19 @@ legacy_promote_using_legacy_type_resolver(PyUFuncObject *ufunc,
Py_INCREF(operation_DTypes[i]);
Py_DECREF(out_descrs[i]);
}
- if (ufunc->type_resolver == &PyUFunc_SimpleBinaryComparisonTypeResolver) {
- /*
- * In this one case, the deprecation means that we actually override
- * the signature.
- */
- for (int i = 0; i < nargs; i++) {
- if (signature[i] != NULL && signature[i] != operation_DTypes[i]) {
- Py_INCREF(operation_DTypes[i]);
- Py_SETREF(signature[i], operation_DTypes[i]);
- *out_cacheable = 0;
- }
+ /*
+ * The PyUFunc_SimpleBinaryComparisonTypeResolver has a deprecation
+ * warning (ignoring `dtype=`) and cannot be cached.
+     * All datetime ones *should* have a warning, but currently don't;
+     * they also ignore all signature passing. So they can also not be
+     * cached, and they mutate the signature, which is of course wrong,
+     * but not doing it would confuse the code later.
+ */
+ for (int i = 0; i < nargs; i++) {
+ if (signature[i] != NULL && signature[i] != operation_DTypes[i]) {
+ Py_INCREF(operation_DTypes[i]);
+ Py_SETREF(signature[i], operation_DTypes[i]);
+ *out_cacheable = 0;
}
}
return 0;
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 9107323b0..1b310b471 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -2737,7 +2737,7 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc,
}
PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc,
- ops, signature, operation_DTypes, NPY_FALSE, NPY_FALSE, NPY_TRUE);
+ ops, signature, operation_DTypes, NPY_FALSE, NPY_TRUE, NPY_TRUE);
/* Output can currently get cleared, others XDECREF in case of error */
Py_XDECREF(operation_DTypes[1]);
if (out != NULL) {
@@ -5194,60 +5194,18 @@ PyUFunc_FromFuncAndDataAndSignatureAndIdentity(PyUFuncGenericFunction *func, voi
return NULL;
}
}
-
- PyObject *promoter = NULL;
- if (ufunc->ntypes == 1) {
- npy_bool all_object = NPY_TRUE;
- for (int i = 0; i < ufunc->nargs; i++) {
- if (ufunc->types[i] != NPY_OBJECT) {
- all_object = NPY_FALSE;
- break;
- }
- }
- if (all_object) {
- promoter = PyCapsule_New(&object_only_ufunc_promoter,
- "numpy._ufunc_promoter", NULL);
- if (promoter == NULL) {
- Py_DECREF(ufunc);
- return NULL;
- }
- }
- }
- if (promoter == NULL && ufunc->nin > 1) {
- promoter = PyCapsule_New(&default_ufunc_promoter,
- "numpy._ufunc_promoter", NULL);
- if (promoter == NULL) {
- Py_DECREF(ufunc);
- return NULL;
- }
- }
- if (promoter != NULL) {
- /* Always install default promoter using the common DType */
- PyObject *dtype_tuple = PyTuple_New(ufunc->nargs);
- if (dtype_tuple == NULL) {
- Py_DECREF(promoter);
- Py_DECREF(ufunc);
- return NULL;
- }
- for (int i = 0; i < ufunc->nargs; i++) {
- Py_INCREF(Py_None);
- PyTuple_SET_ITEM(dtype_tuple, i, Py_None);
- }
- PyObject *info = PyTuple_Pack(2, dtype_tuple, promoter);
- Py_DECREF(dtype_tuple);
- Py_DECREF(promoter);
- if (info == NULL) {
- Py_DECREF(ufunc);
- return NULL;
- }
-
- int res = PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0);
- Py_DECREF(info);
- if (res < 0) {
- Py_DECREF(ufunc);
- return NULL;
- }
- }
+ /*
+ * TODO: I tried adding a default promoter here (either all object for
+ * some special cases, or all homogeneous). Those are reasonable
+ * defaults, but short-cut a deprecated SciPy loop, where the
+ * homogeneous loop `ddd->d` was deprecated, but an inhomogeneous
+ * one `dld->d` should be picked.
+ * The default promoter *is* a reasonable default, but switched that
+ * behaviour.
+ * Another problem appeared due to buggy type-resolution for
+ * datetimes, this meant that `timedelta.sum(dtype="f8")` returned
+ * datetimes (and not floats or error), arguably wrong, but...
+ */
return (PyObject *)ufunc;
}
diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py
index c6a3d4e79..baae77a35 100644
--- a/numpy/core/tests/test_datetime.py
+++ b/numpy/core/tests/test_datetime.py
@@ -2029,11 +2029,17 @@ class TestDateTime:
assert_equal(np.maximum.reduce(a),
np.timedelta64(7, 's'))
+ def test_timedelta_correct_mean(self):
+        # Test mainly because this worked only via a bug that allowed
+        # `timedelta.sum(dtype="f8")` to ignore the dtype request.
+ a = np.arange(1000, dtype="m8[s]")
+ assert_array_equal(a.mean(), a.sum() / len(a))
+
def test_datetime_no_subtract_reducelike(self):
# subtracting two datetime64 works, but we cannot reduce it, since
# the result of that subtraction will have a different dtype.
arr = np.array(["2021-12-02", "2019-05-12"], dtype="M8[ms]")
- msg = r"ufunc 'subtract' did not contain a loop with signature "
+ msg = r"the resolved dtypes are not compatible"
with pytest.raises(TypeError, match=msg):
np.subtract.reduce(arr)