20 files changed, 1246 insertions, 944 deletions
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index cd5d8484a..076ac32c7 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -37,7 +37,7 @@ jobs: echo "::set-output name=message::$COMMIT_MSG" build_wheels: - name: Build wheel for cp${{ matrix.python }}-${{ matrix.platform }} + name: Build wheel for ${{ matrix.python }}-${{ matrix.platform }} needs: get_commit_message if: >- contains(needs.get_commit_message.outputs.message, '[wheel build]') || @@ -51,35 +51,49 @@ jobs: include: # manylinux builds - os: ubuntu-20.04 - python: "38" + python: "cp38" platform: manylinux_x86_64 - os: ubuntu-20.04 - python: "39" + python: "cp39" platform: manylinux_x86_64 - os: ubuntu-20.04 - python: "310" + python: "cp310" + platform: manylinux_x86_64 + # manylinux pypy builds + - os: ubuntu-20.04 + python: "pp38" platform: manylinux_x86_64 # MacOS builds - os: macos-10.15 - python: "38" + python: "cp38" platform: macosx_* - os: macos-10.15 - python: "39" + python: "cp39" platform: macosx_* - os: macos-10.15 - python: "310" + python: "cp310" platform: macosx_* + # MacOS PyPy builds + # Disabled for now because of a PyPy bug + # that prevents successful compilation + #- os: macos-10.15 + # python: "pp38" + # platform: macosx_x86_64 # Windows builds - os: windows-2019 - python: "38" + python: "cp38" + platform: win_amd64 + - os: windows-2019 + python: "cp39" platform: win_amd64 - os: windows-2019 - python: "39" + python: "cp310" platform: win_amd64 + # Windows PyPy builds - os: windows-2019 - python: "310" + python: "pp38" platform: win_amd64 steps: @@ -94,10 +108,10 @@ jobs: fetch-depth: 0 - name: Build wheels - uses: pypa/cibuildwheel@v2.1.3 + uses: pypa/cibuildwheel@v2.3.0 env: NPY_USE_BLAS_ILP64: 1 - CIBW_BUILD: cp${{ matrix.python }}-${{ matrix.platform }} + CIBW_BUILD: ${{ matrix.python }}-${{ matrix.platform }} CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014 CIBW_ENVIRONMENT_LINUX: CFLAGS='-std=c99 -fno-strict-aliasing' LDFLAGS='-Wl,--strip-debug' diff --git a/doc/neps/nep-0031-uarray.rst b/doc/neps/nep-0031-uarray.rst index b4ec94077..b746c267d 100644 --- a/doc/neps/nep-0031-uarray.rst +++ b/doc/neps/nep-0031-uarray.rst @@ -302,7 +302,7 @@ This is different from monkeypatching in a few different ways: so there is at least the loose sense of an API contract. Monkeypatching does not provide this ability. * There is the ability of locally switching the backend. -* It has been `suggested <http://numpy-discussion.10968.n7.nabble.com/NEP-31-Context-local-and-global-overrides-of-the-NumPy-API-tp47452p47472.html>`_ +* It has been `suggested <https://mail.python.org/archives/list/numpy-discussion@python.org/message/PS7EN3CRT6XERNTCN56MAYOXFFFEC55G/>`_ that the reason that 1.17 hasn't landed in the Anaconda defaults channel is due to the incompatibility between monkeypatching and ``__array_function__``, as monkeypatching would bypass the protocol completely. @@ -640,9 +640,9 @@ References and Footnotes .. [4] NEP 13 — A Mechanism for Overriding Ufuncs: https://numpy.org/neps/nep-0013-ufunc-overrides.html -.. [5] Reply to Adding to the non-dispatched implementation of NumPy methods: http://numpy-discussion.10968.n7.nabble.com/Adding-to-the-non-dispatched-implementation-of-NumPy-methods-tp46816p46874.html +.. [5] Reply to Adding to the non-dispatched implementation of NumPy methods: https://mail.python.org/archives/list/numpy-discussion@python.org/thread/5GUDMALWDIRHITG5YUOCV343J66QSX3U/#5GUDMALWDIRHITG5YUOCV343J66QSX3U -.. 
[6] Custom Dtype/Units discussion: http://numpy-discussion.10968.n7.nabble.com/Custom-Dtype-Units-discussion-td43262.html +.. [6] Custom Dtype/Units discussion: https://mail.python.org/archives/list/numpy-discussion@python.org/thread/RZYCVT6C3F7UDV6NA6FEV4MC5FKS6RDA/#RZYCVT6C3F7UDV6NA6FEV4MC5FKS6RDA .. [7] The epic dtype cleanup plan: https://github.com/numpy/numpy/issues/2899 diff --git a/doc/neps/nep-0038-SIMD-optimizations.rst b/doc/neps/nep-0038-SIMD-optimizations.rst index 927228447..2123c4f95 100644 --- a/doc/neps/nep-0038-SIMD-optimizations.rst +++ b/doc/neps/nep-0038-SIMD-optimizations.rst @@ -8,7 +8,7 @@ NEP 38 — Using SIMD optimization instructions for performance :Status: Accepted :Type: Standards :Created: 2019-11-25 -:Resolution: http://numpy-discussion.10968.n7.nabble.com/NEP-38-Universal-SIMD-intrinsics-td47854.html +:Resolution: https://mail.python.org/archives/list/numpy-discussion@python.org/thread/PVWJ74UVBRZ5ZWF6MDU7EUSJXVNILAQB/#PVWJ74UVBRZ5ZWF6MDU7EUSJXVNILAQB Abstract diff --git a/doc/neps/nep-0049.rst b/doc/neps/nep-0049.rst index 3bd1d102c..0f0fd23c9 100644 --- a/doc/neps/nep-0049.rst +++ b/doc/neps/nep-0049.rst @@ -55,8 +55,8 @@ is to create a flexible enough interface without burdening normative users. .. _`issue 5312`: https://github.com/numpy/numpy/issues/5312 .. _`from 2017`: https://github.com/numpy/numpy/issues/5312#issuecomment-315234656 .. _`in 2005`: https://numpy-discussion.scipy.narkive.com/MvmMkJcK/numpy-arrays-data-allocation-and-simd-alignement -.. _`here`: http://numpy-discussion.10968.n7.nabble.com/Aligned-configurable-memory-allocation-td39712.html -.. _`and here`: http://numpy-discussion.10968.n7.nabble.com/Numpy-s-policy-for-releasing-memory-td1533.html +.. _`here`: https://mail.python.org/archives/list/numpy-discussion@python.org/thread/YPC5BGPUMKT2MLBP6O3FMPC35LFM2CCH/#YPC5BGPUMKT2MLBP6O3FMPC35LFM2CCH +.. _`and here`: https://mail.python.org/archives/list/numpy-discussion@python.org/thread/IQK3EPIIRE3V4BPNAMJ2ZST3NUG2MK2A/#IQK3EPIIRE3V4BPNAMJ2ZST3NUG2MK2A .. _`issue 14177`: https://github.com/numpy/numpy/issues/14177 .. _`filprofiler`: https://github.com/pythonspeed/filprofiler/blob/master/design/allocator-overrides.md .. _`electric fence`: https://github.com/boundarydevices/efence diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst index a18211cca..24bb6665d 100644 --- a/doc/source/reference/index.rst +++ b/doc/source/reference/index.rst @@ -26,7 +26,7 @@ For learning how to use NumPy, see the :ref:`complete documentation <numpy_docs_ distutils distutils_guide c-api/index - simd/simd-optimizations + simd/index swig diff --git a/doc/source/reference/simd/build-options.rst b/doc/source/reference/simd/build-options.rst new file mode 100644 index 000000000..80ef2c639 --- /dev/null +++ b/doc/source/reference/simd/build-options.rst @@ -0,0 +1,375 @@ +***************** +CPU build options +***************** + +Description +----------- + +The following options are mainly used to change the default behavior of optimizations +that target certain CPU features: + +- ``--cpu-baseline``: minimal set of required CPU features. + Default value is ``min`` which provides the minimum CPU features that can + safely run on a wide range of platforms within the processor family. + + .. note:: + + During the runtime, NumPy modules will fail to load if any of specified features + are not supported by the target CPU (raises Python runtime error). + +- ``--cpu-dispatch``: dispatched set of additional CPU features. 
+  Default value is ``max -xop -fma4`` which enables all CPU
+  features, except for the AMD legacy features (in the case of x86).
+
+  .. note::
+
+      At runtime, NumPy modules will skip any specified features
+      that are not available in the target CPU.
+
+These options are accessible through :py:mod:`distutils` commands
+`distutils.command.build`, `distutils.command.build_clib` and
+`distutils.command.build_ext`.
+They accept a set of :ref:`CPU features <opt-supported-features>`,
+groups of features that gather several features, or
+:ref:`special options <opt-special-options>` that
+perform a series of procedures.
+
+.. note::
+
+    If ``build_clib`` or ``build_ext`` are not specified by the user,
+    the arguments of ``build`` will be used instead, which also hold the default values.
+
+To customize both ``build_ext`` and ``build_clib``::
+
+    cd /path/to/numpy
+    python setup.py build --cpu-baseline="avx2 fma3" install --user
+
+To customize only ``build_ext``::
+
+    cd /path/to/numpy
+    python setup.py build_ext --cpu-baseline="avx2 fma3" install --user
+
+To customize only ``build_clib``::
+
+    cd /path/to/numpy
+    python setup.py build_clib --cpu-baseline="avx2 fma3" install --user
+
+You can also customize the CPU/build options through the PIP command::
+
+    pip install --no-use-pep517 --global-option=build \
+    --global-option="--cpu-baseline=avx2 fma3" \
+    --global-option="--cpu-dispatch=max" ./
+
+Quick Start
+-----------
+
+In general, the default settings tend not to impose CPU features that
+may be unavailable on some older processors. Raising the ceiling of the
+baseline features will often improve performance and may also reduce
+binary size.
+
+The following are the most common scenarios that may require changing
+the default settings:
+
+I am building NumPy for my local use
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+And I do not intend to export the build to other users or target a
+different CPU than the host's.
+
+Set `native` for the baseline, or manually specify the CPU features in case
+`native` isn't supported by your platform::
+
+    python setup.py build --cpu-baseline="native" bdist
+
+Building NumPy with extra CPU features isn't necessary in this case,
+since all supported features are already defined within the baseline features::
+
+    python setup.py build --cpu-baseline=native --cpu-dispatch=none bdist
+
+.. note::
+
+    A fatal error will be raised if `native` isn't supported by the host platform.
+
+I do not want to support the old processors of the `x86` architecture
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since most CPUs nowadays support at least the `AVX` and `F16C` features, you can use::
+
+    python setup.py build --cpu-baseline="avx f16c" bdist
+
+.. note::
+
+    ``--cpu-baseline`` combines all implied features, so there's no need
+    to add the SSE features explicitly.
+
+I'm facing the same case as above but with the `ppc64` architecture
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Then raise the ceiling of the baseline features to Power8::
+
+    python setup.py build --cpu-baseline="vsx2" bdist
+
+Having issues with `AVX512` features?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You may have reservations about including `AVX512` or
+any other CPU feature and want to exclude it from the dispatched features::
+
+    python setup.py build --cpu-dispatch="max -avx512f -avx512cd \
+    -avx512_knl -avx512_knm -avx512_skx -avx512_clx -avx512_cnl -avx512_icl" \
+    bdist
+
+.. _opt-supported-features:
+
+Supported Features
+------------------
+
+A feature name can express a single feature or a group of features.
+The following tables list the supported features, sorted from the lowest
+to the highest interest:
+
+.. note::
+
+    The following features may not be supported by all compilers;
+    some compilers may also produce a different set of implied features
+    when it comes to features like ``AVX512``, ``AVX2``, and ``FMA3``.
+    See :ref:`opt-platform-differences` for more details.
+
+.. include:: generated_tables/cpu_features.inc
+
+.. _opt-special-options:
+
+Special Options
+---------------
+
+- ``NONE``: enables no features.
+
+- ``NATIVE``: enables all CPU features supported by the host CPU;
+  this operation is based on the compiler flags (``-march=native``, ``-xHost``, ``/QxHost``).
+
+- ``MIN``: enables the minimum set of CPU features that can safely run on a wide range of platforms:
+
+  .. table::
+      :align: left
+
+      ====================================== =======================================
+      For Arch                               Implies
+      ====================================== =======================================
+      x86 (32-bit mode)                      ``SSE`` ``SSE2``
+      x86_64                                 ``SSE`` ``SSE2`` ``SSE3``
+      IBM/POWER (big-endian mode)            ``NONE``
+      IBM/POWER (little-endian mode)         ``VSX`` ``VSX2``
+      ARMHF                                  ``NONE``
+      ARM64 a.k.a. AARCH64                   ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
+                                             ``ASIMD``
+      ====================================== =======================================
+
+- ``MAX``: enables all CPU features supported by the compiler and platform.
+
+- Operators ``-``/``+``: remove or add features; useful with the options ``MAX``, ``MIN`` and ``NATIVE``.
+
+Behaviors
+---------
+
+- CPU features and other options are case-insensitive, for example::
+
+    python setup.py build --cpu-dispatch="SSE41 avx2 FMA3"
+
+- The order of the requested optimizations doesn't matter::
+
+    python setup.py build --cpu-dispatch="SSE41 AVX2 FMA3"
+    # equivalent to
+    python setup.py build --cpu-dispatch="FMA3 AVX2 SSE41"
+
+- Commas, spaces or ``+`` can be used as separators, for example::
+
+    python setup.py build --cpu-dispatch="avx2 avx512f"
+    # or
+    python setup.py build --cpu-dispatch=avx2,avx512f
+    # or
+    python setup.py build --cpu-dispatch="avx2+avx512f"
+
+  All of these work, but the arguments must be enclosed in quotes or escaped
+  with a backslash if any spaces are used.
+
+- ``--cpu-baseline`` combines all implied CPU features, for example::
+
+    python setup.py build --cpu-baseline=sse42
+    # equivalent to
+    python setup.py build --cpu-baseline="sse sse2 sse3 ssse3 sse41 popcnt sse42"
+
+- ``--cpu-baseline`` will be treated as "native" if the compiler's native flag
+  ``-march=native``, ``-xHost`` or ``/QxHost`` is enabled through the environment
+  variable `CFLAGS`::
+
+    export CFLAGS="-march=native"
+    python setup.py install --user
+    # is equivalent to
+    python setup.py build --cpu-baseline=native install --user
+
+- ``--cpu-baseline`` skips any specified features that aren't supported
+  by the target platform or compiler rather than raising fatal errors.
+
+  .. note::
+
+      Since ``--cpu-baseline`` combines all implied features, the maximum
+      supported set of implied features will be enabled rather than all of
+      them being skipped. For example::
+
+        # Requesting `AVX2,FMA3` but the compiler only supports **SSE** features
+        python setup.py build --cpu-baseline="avx2 fma3"
+        # is equivalent to
+        python setup.py build --cpu-baseline="sse sse2 sse3 ssse3 sse41 popcnt sse42"
+
+- ``--cpu-dispatch`` does not combine any of the implied CPU features,
+  so you must add them yourself unless you want to disable one or all of them::
+
+    # Only dispatches AVX2 and FMA3
+    python setup.py build --cpu-dispatch=avx2,fma3
+    # Dispatches AVX and SSE features
+    python setup.py build --cpu-dispatch=ssse3,sse41,sse42,avx,avx2,fma3
+
+- ``--cpu-dispatch`` skips any specified baseline features, as well as any features
+  not supported by the target platform or compiler, without raising fatal errors.
+
+In all cases, you should check the final report in the build log
+to verify the enabled features. See :ref:`opt-build-report` for more details.
+
+.. _opt-platform-differences:
+
+Platform differences
+--------------------
+
+Some exceptional conditions force us to link some features together when it comes to
+certain compilers or architectures, making it impossible to build them separately.
+
+These conditions can be divided into two parts, as follows:
+
+**Architectural compatibility**
+
+The need to align certain CPU features that are assured to be supported by
+successive generations of the same architecture; some cases:
+
+- On ppc64le, ``VSX(ISA 2.06)`` and ``VSX2(ISA 2.07)`` imply one another, since the
+  first generation that supports little-endian mode is Power-8 `(ISA 2.07)`.
+- On AArch64, ``NEON NEON_FP16 NEON_VFPV4 ASIMD`` imply one another, since they are
+  part of the hardware baseline.
+
+For example::
+
+    # On ARMv8/A64, specifying NEON is going to enable Advanced SIMD
+    # and all predecessor extensions
+    python setup.py build --cpu-baseline=neon
+    # which is equivalent to
+    python setup.py build --cpu-baseline="neon neon_fp16 neon_vfpv4 asimd"
+
+.. note::
+
+    Please take a close look at :ref:`opt-supported-features`
+    in order to determine the features that imply one another.
+
+**Compilation compatibility**
+
+Some compilers don't provide independent support for all CPU features. For instance,
+**Intel**'s compiler doesn't provide separate flags for ``AVX2`` and ``FMA3``.
+That makes sense, since all Intel CPUs that come with ``AVX2`` also support ``FMA3``,
+but this approach is incompatible with other **x86** CPUs from **AMD** or **VIA**.
+
+For example::
+
+    # Specifying AVX2 will force-enable FMA3 on Intel compilers
+    python setup.py build --cpu-baseline=avx2
+    # which is equivalent to
+    python setup.py build --cpu-baseline="avx2 fma3"
+
+The following tables only show the differences that some compilers impose on the
+general context shown in the :ref:`opt-supported-features` tables:
+
+.. note::
+
+    Feature names with strikethrough represent unsupported CPU features.
+
+.. raw:: html
+
+    <style>
+    .enabled-feature {color:green; font-weight:bold;}
+    .disabled-feature {color:red; text-decoration: line-through;}
+    </style>
+
+.. role:: enabled
+    :class: enabled-feature
+
+.. role:: disabled
+    :class: disabled-feature
+
+.. include:: generated_tables/compilers-diff.inc
+
+.. _opt-build-report:
+
+Build report
+------------
+
+In most cases, the CPU build options do not produce fatal errors that halt the build.
+Most of the errors that may appear in the build log are heavy warnings caused by the
+compiler lacking some expected CPU features.
+
+So we strongly recommend checking the final report log, to be aware of which CPU features
+are enabled and which are not.
+
+You can find the final report of CPU optimizations at the end of the build log,
+and here is how it looks on x86_64/gcc:
+
+.. raw:: html
+
+    <style>#build-report .highlight-bash pre{max-height:450px; overflow-y: scroll;}</style>
+
+.. literalinclude:: log_example.txt
+    :language: bash
+
+As you can see, there is a separate report for each of ``build_ext`` and ``build_clib``.
+Each report includes several sections, and each section has several values, representing the following:
+
+**Platform**:
+
+- :enabled:`Architecture`: The architecture name of the target CPU. It should be one of
+  ``x86``, ``x64``, ``ppc64``, ``ppc64le``, ``armhf``, ``aarch64`` or ``unknown``.
+
+- :enabled:`Compiler`: The compiler name. It should be one of
+  gcc, clang, msvc, icc, iccw or unix-like.
+
+**CPU baseline**:
+
+- :enabled:`Requested`: The specific features and options passed to ``--cpu-baseline``, as-is.
+- :enabled:`Enabled`: The final set of enabled CPU features.
+- :enabled:`Flags`: The compiler flags that were used for all NumPy `C/C++` sources
+  during the compilation, except for temporary sources that were used for generating
+  the binary objects of dispatched features.
+- :enabled:`Extra checks`: List of internal checks that activate certain functionality
+  or intrinsics related to the enabled features; useful for debugging when it comes
+  to developing SIMD kernels.
+
+**CPU dispatch**:
+
+- :enabled:`Requested`: The specific features and options passed to ``--cpu-dispatch``, as-is.
+- :enabled:`Enabled`: The final set of enabled CPU features.
+- :enabled:`Generated`: At the beginning of the next row of this property,
+  the features for which optimizations have been generated are shown in the
+  form of several sections with similar properties, explained as follows:
+
+  - :enabled:`One or multiple dispatched features`: The implied CPU features.
+  - :enabled:`Flags`: The compiler flags that were used for these features.
+  - :enabled:`Extra checks`: Similar to the baseline, but for these dispatched features.
+  - :enabled:`Detect`: Set of CPU features that need to be detected at runtime in order to
+    execute the generated optimizations.
+  - The lines that come after the above property and end with a ':' on a separate line
+    represent the paths of the C/C++ sources that define the generated optimizations.
+
+Runtime Trace
+-------------
+To be completed.
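Until the ``Runtime Trace`` section is filled in, here is a minimal sketch of how the outcome of these build options can be inspected from Python at runtime. It relies on the Python-level module attributes ``__cpu_baseline__`` and ``__cpu_dispatch__`` mentioned in the dispatcher documentation, plus a ``__cpu_features__`` dict assumed to be exposed by ``numpy.core._multiarray_umath``; treat the exact attribute locations as an assumption rather than a documented API::

    # Sketch: report which CPU features NumPy was built with and which
    # dispatched features the running CPU actually supports.
    # The attribute names/locations below are assumptions based on the
    # dispatcher documentation, not a guaranteed public interface.
    from numpy.core._multiarray_umath import (
        __cpu_baseline__,   # features compiled in unconditionally
        __cpu_dispatch__,   # features compiled as dispatched objects
        __cpu_features__,   # dict mapping feature name -> runtime support
    )

    print("baseline:", " ".join(__cpu_baseline__) or "none")
    for feature in __cpu_dispatch__:
        state = "will run" if __cpu_features__.get(feature, False) else "skipped"
        print(f"dispatch: {feature:<12} {state}")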
diff --git a/doc/source/reference/simd/gen_features.py b/doc/source/reference/simd/gen_features.py new file mode 100644 index 000000000..d74d54016 --- /dev/null +++ b/doc/source/reference/simd/gen_features.py @@ -0,0 +1,194 @@ +""" +Generate CPU features tables from CCompilerOpt +""" +from os import sys, path +from numpy.distutils.ccompiler_opt import CCompilerOpt + +class FakeCCompilerOpt(CCompilerOpt): + # disable caching no need for it + conf_nocache = True + + def __init__(self, arch, cc, *args, **kwargs): + self.fake_info = (arch, cc, '') + CCompilerOpt.__init__(self, None, **kwargs) + + def dist_compile(self, sources, flags, **kwargs): + return sources + + def dist_info(self): + return self.fake_info + + @staticmethod + def dist_log(*args, stderr=False): + # avoid printing + pass + + def feature_test(self, name, force_flags=None, macros=[]): + # To speed up + return True + +class Features: + def __init__(self, arch, cc): + self.copt = FakeCCompilerOpt(arch, cc, cpu_baseline="max") + + def names(self): + return self.copt.cpu_baseline_names() + + def serialize(self, features_names): + result = [] + for f in self.copt.feature_sorted(features_names): + gather = self.copt.feature_supported.get(f, {}).get("group", []) + implies = self.copt.feature_sorted(self.copt.feature_implies(f)) + result.append((f, implies, gather)) + return result + + def table(self, **kwargs): + return self.gen_table(self.serialize(self.names()), **kwargs) + + def table_diff(self, vs, **kwargs): + fnames = set(self.names()) + fnames_vs = set(vs.names()) + common = fnames.intersection(fnames_vs) + extra = fnames.difference(fnames_vs) + notavl = fnames_vs.difference(fnames) + iextra = {} + inotavl = {} + idiff = set() + for f in common: + implies = self.copt.feature_implies(f) + implies_vs = vs.copt.feature_implies(f) + e = implies.difference(implies_vs) + i = implies_vs.difference(implies) + if not i and not e: + continue + if e: + iextra[f] = e + if i: + inotavl[f] = e + idiff.add(f) + + def fbold(f): + if f in extra: + return f':enabled:`{f}`' + if f in notavl: + return f':disabled:`{f}`' + return f + + def fbold_implies(f, i): + if i in iextra.get(f, {}): + return f':enabled:`{i}`' + if f in notavl or i in inotavl.get(f, {}): + return f':disabled:`{i}`' + return i + + diff_all = self.serialize(idiff.union(extra)) + diff_all += vs.serialize(notavl) + content = self.gen_table( + diff_all, fstyle=fbold, fstyle_implies=fbold_implies, **kwargs + ) + return content + + def gen_table(self, serialized_features, fstyle=None, fstyle_implies=None, + **kwargs): + + if fstyle is None: + fstyle = lambda ft: f'``{ft}``' + if fstyle_implies is None: + fstyle_implies = lambda origin, ft: fstyle(ft) + + rows = [] + have_gather = False + for f, implies, gather in serialized_features: + if gather: + have_gather = True + name = fstyle(f) + implies = ' '.join([fstyle_implies(f, i) for i in implies]) + gather = ' '.join([fstyle_implies(f, i) for i in gather]) + rows.append((name, implies, gather)) + if not rows: + return '' + fields = ["Name", "Implies", "Gathers"] + if not have_gather: + del fields[2] + rows = [(name, implies) for name, implies, _ in rows] + return self.gen_rst_table(fields, rows, **kwargs) + + def gen_rst_table(self, field_names, rows, tab_size=4): + assert(not rows or len(field_names) == len(rows[0])) + rows.append(field_names) + fld_len = len(field_names) + cls_len = [max(len(c[i]) for c in rows) for i in range(fld_len)] + del rows[-1] + cformat = ' '.join('{:<%d}' % i for i in cls_len) + border = 
cformat.format(*['='*i for i in cls_len]) + + rows = [cformat.format(*row) for row in rows] + # header + rows = [border, cformat.format(*field_names), border] + rows + # footer + rows += [border] + # add left margin + rows = [(' ' * tab_size) + r for r in rows] + return '\n'.join(rows) + +def wrapper_section(title, content, tab_size=4): + tab = ' '*tab_size + if content: + return ( + f"{title}\n{'~'*len(title)}" + f"\n.. table::\n{tab}:align: left\n\n" + f"{content}\n\n" + ) + return '' + +def wrapper_tab(title, table, tab_size=4): + tab = ' '*tab_size + if table: + ('\n' + tab).join(( + '.. tab:: ' + title, + tab + '.. table::', + tab + 'align: left', + table + '\n\n' + )) + return '' + + +if __name__ == '__main__': + + pretty_names = { + "PPC64": "IBM/POWER big-endian", + "PPC64LE": "IBM/POWER little-endian", + "ARMHF": "ARMv7/A32", + "AARCH64": "ARMv8/A64", + "ICC": "Intel Compiler", + # "ICCW": "Intel Compiler msvc-like", + "MSVC": "Microsoft Visual C/C++" + } + gen_path = path.join( + path.dirname(path.realpath(__file__)), "generated_tables" + ) + with open(path.join(gen_path, 'cpu_features.inc'), 'wt') as fd: + fd.write(f'.. generated via {__file__}\n\n') + for arch in ( + ("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64") + ): + title = "On " + pretty_names.get(arch, arch) + table = Features(arch, 'gcc').table() + fd.write(wrapper_section(title, table)) + + with open(path.join(gen_path, 'compilers-diff.inc'), 'wt') as fd: + fd.write(f'.. generated via {__file__}\n\n') + for arch, cc_names in ( + ("x86", ("clang", "ICC", "MSVC")), + ("PPC64", ("clang",)), + ("PPC64LE", ("clang",)), + ("ARMHF", ("clang",)), + ("AARCH64", ("clang",)) + ): + arch_pname = pretty_names.get(arch, arch) + for cc in cc_names: + title = f"On {arch_pname}::{pretty_names.get(cc, cc)}" + table = Features(arch, cc).table_diff(Features(arch, "gcc")) + fd.write(wrapper_section(title, table)) + + diff --git a/doc/source/reference/simd/generated_tables/compilers-diff.inc b/doc/source/reference/simd/generated_tables/compilers-diff.inc new file mode 100644 index 000000000..4b9009a68 --- /dev/null +++ b/doc/source/reference/simd/generated_tables/compilers-diff.inc @@ -0,0 +1,33 @@ +.. generated via /home/seiko/work/repos/numpy/doc/source/reference/simd/./gen_features.py + +On x86::Intel Compiler +~~~~~~~~~~~~~~~~~~~~~~ +.. table:: + :align: left + + ================ ========================================================================================================================================== + Name Implies + ================ ========================================================================================================================================== + FMA3 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2` + AVX2 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3` + AVX512F SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD` + :disabled:`XOP` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` + :disabled:`FMA4` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` + ================ ========================================================================================================================================== + +On x86::Microsoft Visual C/C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
table:: + :align: left + + ====================== ============================================================================================================================================================================================================================================================= ============================================================================= + Name Implies Gathers + ====================== ============================================================================================================================================================================================================================================================= ============================================================================= + FMA3 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2` + AVX2 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3` + AVX512F SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD` :enabled:`AVX512_SKX` + AVX512CD SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F :enabled:`AVX512_SKX` + :disabled:`AVX512_KNL` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512ER` :disabled:`AVX512PF` + :disabled:`AVX512_KNM` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512_KNL` :disabled:`AVX5124FMAPS` :disabled:`AVX5124VNNIW` :disabled:`AVX512VPOPCNTDQ` + ====================== ============================================================================================================================================================================================================================================================= ============================================================================= + diff --git a/doc/source/reference/simd/generated_tables/cpu_features.inc b/doc/source/reference/simd/generated_tables/cpu_features.inc new file mode 100644 index 000000000..a7eae5652 --- /dev/null +++ b/doc/source/reference/simd/generated_tables/cpu_features.inc @@ -0,0 +1,93 @@ +.. generated via /home/seiko/work/repos/numpy/doc/source/reference/simd/./gen_features.py + +On x86 +~~~~~~ +.. 
table:: + :align: left + + ============== =========================================================================================================================================================================== ===================================================== + Name Implies Gathers + ============== =========================================================================================================================================================================== ===================================================== + ``SSE`` ``SSE2`` + ``SSE2`` ``SSE`` + ``SSE3`` ``SSE`` ``SSE2`` + ``SSSE3`` ``SSE`` ``SSE2`` ``SSE3`` + ``SSE41`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` + ``POPCNT`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` + ``SSE42`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` + ``AVX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` + ``XOP`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` + ``FMA4`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` + ``F16C`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` + ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` + ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` + ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` + ``AVX512CD`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` + ``AVX512_KNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512ER`` ``AVX512PF`` + ``AVX512_KNM`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_KNL`` ``AVX5124FMAPS`` ``AVX5124VNNIW`` ``AVX512VPOPCNTDQ`` + ``AVX512_SKX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512VL`` ``AVX512BW`` ``AVX512DQ`` + ``AVX512_CLX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512VNNI`` + ``AVX512_CNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512IFMA`` ``AVX512VBMI`` + ``AVX512_ICL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512_CLX`` ``AVX512_CNL`` ``AVX512VBMI2`` ``AVX512BITALG`` ``AVX512VPOPCNTDQ`` + ============== =========================================================================================================================================================================== ===================================================== + +On IBM/POWER big-endian +~~~~~~~~~~~~~~~~~~~~~~~ +.. table:: + :align: left + + ======== ================ + Name Implies + ======== ================ + ``VSX`` + ``VSX2`` ``VSX`` + ``VSX3`` ``VSX`` ``VSX2`` + ======== ================ + +On IBM/POWER little-endian +~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. table:: + :align: left + + ======== ================ + Name Implies + ======== ================ + ``VSX`` ``VSX2`` + ``VSX2`` ``VSX`` + ``VSX3`` ``VSX`` ``VSX2`` + ======== ================ + +On ARMv7/A32 +~~~~~~~~~~~~ +.. 
table:: + :align: left + + ============== =========================================================== + Name Implies + ============== =========================================================== + ``NEON`` + ``NEON_FP16`` ``NEON`` + ``NEON_VFPV4`` ``NEON`` ``NEON_FP16`` + ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` + ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` + ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` + ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP`` + ============== =========================================================== + +On ARMv8/A64 +~~~~~~~~~~~~ +.. table:: + :align: left + + ============== =========================================================== + Name Implies + ============== =========================================================== + ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` + ``NEON_FP16`` ``NEON`` ``NEON_VFPV4`` ``ASIMD`` + ``NEON_VFPV4`` ``NEON`` ``NEON_FP16`` ``ASIMD`` + ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` + ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` + ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` + ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP`` + ============== =========================================================== + diff --git a/doc/source/reference/simd/how-it-works.rst b/doc/source/reference/simd/how-it-works.rst new file mode 100644 index 000000000..a2882f484 --- /dev/null +++ b/doc/source/reference/simd/how-it-works.rst @@ -0,0 +1,349 @@ +********************************** +How does the CPU dispatcher work? +********************************** + +NumPy dispatcher is based on multi-source compiling, which means taking +a certain source and compiling it multiple times with different compiler +flags and also with different **C** definitions that affect the code +paths. This enables certain instruction-sets for each compiled object +depending on the required optimizations and ends with linking the +returned objects together. + +.. figure:: ../figures/opt-infra.png + +This mechanism should support all compilers and it doesn't require any +compiler-specific extension, but at the same time it adds a few steps to +normal compilation that are explained as follows. + +1- Configuration +~~~~~~~~~~~~~~~~ + +Configuring the required optimization by the user before starting to build the +source files via the two command arguments as explained above: + +- ``--cpu-baseline``: minimal set of required optimizations. + +- ``--cpu-dispatch``: dispatched set of additional optimizations. + + +2- Discovering the environment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this part, we check the compiler and platform architecture +and cache some of the intermediary results to speed up rebuilding. + +3- Validating the requested optimizations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By testing them against the compiler, and seeing what the compiler can +support according to the requested optimizations. + +4- Generating the main configuration header +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The generated header ``_cpu_dispatch.h`` contains all the definitions and +headers of instruction-sets for the required optimizations that have been +validated during the previous step. + +It also contains extra C definitions that are used for defining NumPy's +Python-level module attributes ``__cpu_baseline__`` and ``__cpu_dispatch__``. + +**What is in this header?** + +The example header was dynamically generated by gcc on an X86 machine. 
+The compiler supports ``--cpu-baseline="sse sse2 sse3"`` and +``--cpu-dispatch="ssse3 sse41"``, and the result is below. + +.. code:: c + + // The header should be located at numpy/numpy/core/src/common/_cpu_dispatch.h + /**NOTE + ** C definitions prefixed with "NPY_HAVE_" represent + ** the required optimzations. + ** + ** C definitions prefixed with 'NPY__CPU_TARGET_' are protected and + ** shouldn't be used by any NumPy C sources. + */ + /******* baseline features *******/ + /** SSE **/ + #define NPY_HAVE_SSE 1 + #include <xmmintrin.h> + /** SSE2 **/ + #define NPY_HAVE_SSE2 1 + #include <emmintrin.h> + /** SSE3 **/ + #define NPY_HAVE_SSE3 1 + #include <pmmintrin.h> + + /******* dispatch-able features *******/ + #ifdef NPY__CPU_TARGET_SSSE3 + /** SSSE3 **/ + #define NPY_HAVE_SSSE3 1 + #include <tmmintrin.h> + #endif + #ifdef NPY__CPU_TARGET_SSE41 + /** SSE41 **/ + #define NPY_HAVE_SSE41 1 + #include <smmintrin.h> + #endif + +**Baseline features** are the minimal set of required optimizations configured +via ``--cpu-baseline``. They have no preprocessor guards and they're +always on, which means they can be used in any source. + +Does this mean NumPy's infrastructure passes the compiler's flags of +baseline features to all sources? + +Definitely, yes. But the :ref:`dispatch-able sources <dispatchable-sources>` are +treated differently. + +What if the user specifies certain **baseline features** during the +build but at runtime the machine doesn't support even these +features? Will the compiled code be called via one of these definitions, or +maybe the compiler itself auto-generated/vectorized certain piece of code +based on the provided command line compiler flags? + +During the loading of the NumPy module, there's a validation step +which detects this behavior. It will raise a Python runtime error to inform the +user. This is to prevent the CPU reaching an illegal instruction error causing +a segfault. + +**Dispatch-able features** are our dispatched set of additional optimizations +that were configured via ``--cpu-dispatch``. They are not activated by +default and are always guarded by other C definitions prefixed with +``NPY__CPU_TARGET_``. C definitions ``NPY__CPU_TARGET_`` are only +enabled within **dispatch-able sources**. + +.. _dispatchable-sources: + +5- Dispatch-able sources and configuration statements +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Dispatch-able sources are special **C** files that can be compiled multiple +times with different compiler flags and also with different **C** +definitions. These affect code paths to enable certain +instruction-sets for each compiled object according to "**the +configuration statements**" that must be declared between a **C** +comment\ ``(/**/)`` and start with a special mark **@targets** at the +top of each dispatch-able source. At the same time, dispatch-able +sources will be treated as normal **C** sources if the optimization was +disabled by the command argument ``--disable-optimization`` . + +**What are configuration statements?** + +Configuration statements are sort of keywords combined together to +determine the required optimization for the dispatch-able source. + +Example: + +.. 
code:: c + + /*@targets avx2 avx512f vsx2 vsx3 asimd asimdhp */ + // C code + +The keywords mainly represent the additional optimizations configured +through ``--cpu-dispatch``, but it can also represent other options such as: + +- Target groups: pre-configured configuration statements used for + managing the required optimizations from outside the dispatch-able source. + +- Policies: collections of options used for changing the default + behaviors or forcing the compilers to perform certain things. + +- "baseline": a unique keyword represents the minimal optimizations + that configured through ``--cpu-baseline`` + +**Numpy's infrastructure handles dispatch-able sources in four steps**: + +- **(A) Recognition**: Just like source templates and F2PY, the + dispatch-able sources requires a special extension ``*.dispatch.c`` + to mark C dispatch-able source files, and for C++ + ``*.dispatch.cpp`` or ``*.dispatch.cxx`` + **NOTE**: C++ not supported yet. + +- **(B) Parsing and validating**: In this step, the + dispatch-able sources that had been filtered by the previous step + are parsed and validated by the configuration statements for each one + of them one by one in order to determine the required optimizations. + +- **(C) Wrapping**: This is the approach taken by NumPy's + infrastructure, which has proved to be sufficiently flexible in order + to compile a single source multiple times with different **C** + definitions and flags that affect the code paths. The process is + achieved by creating a temporary **C** source for each required + optimization that related to the additional optimization, which + contains the declarations of the **C** definitions and includes the + involved source via the **C** directive **#include**. For more + clarification take a look at the following code for AVX512F : + + .. code:: c + + /* + * this definition is used by NumPy utilities as suffixes for the + * exported symbols + */ + #define NPY__CPU_TARGET_CURRENT AVX512F + /* + * The following definitions enable + * definitions of the dispatch-able features that are defined within the main + * configuration header. These are definitions for the implied features. + */ + #define NPY__CPU_TARGET_SSE + #define NPY__CPU_TARGET_SSE2 + #define NPY__CPU_TARGET_SSE3 + #define NPY__CPU_TARGET_SSSE3 + #define NPY__CPU_TARGET_SSE41 + #define NPY__CPU_TARGET_POPCNT + #define NPY__CPU_TARGET_SSE42 + #define NPY__CPU_TARGET_AVX + #define NPY__CPU_TARGET_F16C + #define NPY__CPU_TARGET_FMA3 + #define NPY__CPU_TARGET_AVX2 + #define NPY__CPU_TARGET_AVX512F + // our dispatch-able source + #include "/the/absuolate/path/of/hello.dispatch.c" + +- **(D) Dispatch-able configuration header**: The infrastructure + generates a config header for each dispatch-able source, this header + mainly contains two abstract **C** macros used for identifying the + generated objects, so they can be used for runtime dispatching + certain symbols from the generated objects by any **C** source. It is + also used for forward declarations. + + The generated header takes the name of the dispatch-able source after + excluding the extension and replace it with ``.h``, for example + assume we have a dispatch-able source called ``hello.dispatch.c`` and + contains the following: + + .. 
code:: c + + // hello.dispatch.c + /*@targets baseline sse42 avx512f */ + #include <stdio.h> + #include "numpy/utils.h" // NPY_CAT, NPY_TOSTR + + #ifndef NPY__CPU_TARGET_CURRENT + // wrapping the dispatch-able source only happens to the additional optimizations + // but if the keyword 'baseline' provided within the configuration statements, + // the infrastructure will add extra compiling for the dispatch-able source by + // passing it as-is to the compiler without any changes. + #define CURRENT_TARGET(X) X + #define NPY__CPU_TARGET_CURRENT baseline // for printing only + #else + // since we reach to this point, that's mean we're dealing with + // the additional optimizations, so it could be SSE42 or AVX512F + #define CURRENT_TARGET(X) NPY_CAT(NPY_CAT(X, _), NPY__CPU_TARGET_CURRENT) + #endif + // Macro 'CURRENT_TARGET' adding the current target as suffux to the exported symbols, + // to avoid linking duplications, NumPy already has a macro called + // 'NPY_CPU_DISPATCH_CURFX' similar to it, located at + // numpy/numpy/core/src/common/npy_cpu_dispatch.h + // NOTE: we tend to not adding suffixes to the baseline exported symbols + void CURRENT_TARGET(simd_whoami)(const char *extra_info) + { + printf("I'm " NPY_TOSTR(NPY__CPU_TARGET_CURRENT) ", %s\n", extra_info); + } + + Now assume you attached **hello.dispatch.c** to the source tree, then + the infrastructure should generate a temporary config header called + **hello.dispatch.h** that can be reached by any source in the source + tree, and it should contain the following code : + + .. code:: c + + #ifndef NPY__CPU_DISPATCH_EXPAND_ + // To expand the macro calls in this header + #define NPY__CPU_DISPATCH_EXPAND_(X) X + #endif + // Undefining the following macros, due to the possibility of including config headers + // multiple times within the same source and since each config header represents + // different required optimizations according to the specified configuration + // statements in the dispatch-able source that derived from it. + #undef NPY__CPU_DISPATCH_BASELINE_CALL + #undef NPY__CPU_DISPATCH_CALL + // nothing strange here, just a normal preprocessor callback + // enabled only if 'baseline' specified within the configuration statements + #define NPY__CPU_DISPATCH_BASELINE_CALL(CB, ...) \ + NPY__CPU_DISPATCH_EXPAND_(CB(__VA_ARGS__)) + // 'NPY__CPU_DISPATCH_CALL' is an abstract macro is used for dispatching + // the required optimizations that specified within the configuration statements. + // + // @param CHK, Expected a macro that can be used to detect CPU features + // in runtime, which takes a CPU feature name without string quotes and + // returns the testing result in a shape of boolean value. + // NumPy already has macro called "NPY_CPU_HAVE", which fits this requirement. + // + // @param CB, a callback macro that expected to be called multiple times depending + // on the required optimizations, the callback should receive the following arguments: + // 1- The pending calls of @param CHK filled up with the required CPU features, + // that need to be tested first in runtime before executing call belong to + // the compiled object. + // 2- The required optimization name, same as in 'NPY__CPU_TARGET_CURRENT' + // 3- Extra arguments in the macro itself + // + // By default the callback calls are sorted depending on the highest interest + // unless the policy "$keep_sort" was in place within the configuration statements + // see "Dive into the CPU dispatcher" for more clarification. + #define NPY__CPU_DISPATCH_CALL(CHK, CB, ...) 
\ + NPY__CPU_DISPATCH_EXPAND_(CB((CHK(AVX512F)), AVX512F, __VA_ARGS__)) \ + NPY__CPU_DISPATCH_EXPAND_(CB((CHK(SSE)&&CHK(SSE2)&&CHK(SSE3)&&CHK(SSSE3)&&CHK(SSE41)), SSE41, __VA_ARGS__)) + + An example of using the config header in light of the above: + + .. code:: c + + // NOTE: The following macros are only defined for demonstration purposes only. + // NumPy already has a collections of macros located at + // numpy/numpy/core/src/common/npy_cpu_dispatch.h, that covers all dispatching + // and declarations scenarios. + + #include "numpy/npy_cpu_features.h" // NPY_CPU_HAVE + #include "numpy/utils.h" // NPY_CAT, NPY_EXPAND + + // An example for setting a macro that calls all the exported symbols at once + // after checking if they're supported by the running machine. + #define DISPATCH_CALL_ALL(FN, ARGS) \ + NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_ALL_CB, FN, ARGS) \ + NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_ALL_CB, FN, ARGS) + // The preprocessor callbacks. + // The same suffixes as we define it in the dispatch-able source. + #define DISPATCH_CALL_ALL_CB(CHECK, TARGET_NAME, FN, ARGS) \ + if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; } + #define DISPATCH_CALL_BASELINE_ALL_CB(FN, ARGS) \ + FN NPY_EXPAND(ARGS); + + // An example for setting a macro that calls the exported symbols of highest + // interest optimization, after checking if they're supported by the running machine. + #define DISPATCH_CALL_HIGH(FN, ARGS) \ + if (0) {} \ + NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_HIGH_CB, FN, ARGS) \ + NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_HIGH_CB, FN, ARGS) + // The preprocessor callbacks + // The same suffixes as we define it in the dispatch-able source. + #define DISPATCH_CALL_HIGH_CB(CHECK, TARGET_NAME, FN, ARGS) \ + else if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; } + #define DISPATCH_CALL_BASELINE_HIGH_CB(FN, ARGS) \ + else { FN NPY_EXPAND(ARGS); } + + // NumPy has a macro called 'NPY_CPU_DISPATCH_DECLARE' can be used + // for forward declrations any kind of prototypes based on + // 'NPY__CPU_DISPATCH_CALL' and 'NPY__CPU_DISPATCH_BASELINE_CALL'. + // However in this example, we just handle it manually. + void simd_whoami(const char *extra_info); + void simd_whoami_AVX512F(const char *extra_info); + void simd_whoami_SSE41(const char *extra_info); + + void trigger_me(void) + { + // bring the auto-gernreated config header + // which contains config macros 'NPY__CPU_DISPATCH_CALL' and + // 'NPY__CPU_DISPATCH_BASELINE_CALL'. + // it highely recomaned to include the config header before exectuing + // the dispatching macros in case if there's another header in the scope. + #include "hello.dispatch.h" + DISPATCH_CALL_ALL(simd_whoami, ("all")) + DISPATCH_CALL_HIGH(simd_whoami, ("the highest interest")) + // An example of including multiple config headers in the same source + // #include "hello2.dispatch.h" + // DISPATCH_CALL_HIGH(another_function, ("the highest interest")) + } diff --git a/doc/source/reference/simd/index.rst b/doc/source/reference/simd/index.rst new file mode 100644 index 000000000..230e2dc15 --- /dev/null +++ b/doc/source/reference/simd/index.rst @@ -0,0 +1,43 @@ +.. _numpysimd: +.. currentmodule:: numpysimd + +*********************** +CPU/SIMD Optimizations +*********************** + +NumPy comes with a flexible working mechanism that allows it to harness the SIMD +features that CPUs own, in order to provide faster and more stable performance +on all popular platforms. 
Currently, NumPy supports the X86, IBM/Power, ARM7 and ARM8 +architectures. + +The optimization process in NumPy is carried out in three layers: + +- Code is *written* using the universal intrinsics which is a set of types, macros and + functions that are mapped to each supported instruction-sets by using guards that + will enable use of the them only when the compiler recognizes them. + This allow us to generate multiple kernels for the same functionality, + in which each generated kernel represents a set of instructions that related one + or multiple certain CPU features. The first kernel represents the minimum (baseline) + CPU features, and the other kernels represent the additional (dispatched) CPU features. + +- At *compile* time, CPU build options are used to define the minimum and + additional features to support, based on user choice and compiler support. The + appropriate intrinsics are overlaid with the platform / architecture intrinsics, + and multiple kernels are compiled. + +- At *runtime import*, the CPU is probed for the set of supported CPU + features. A mechanism is used to grab the pointer to the most appropriate + kernel, and this will be the one called for the function. + +.. note:: + + NumPy community had a deep discussion before implementing this work, + please check `NEP-38`_ for more clarification. + +.. toctree:: + + build-options + how-it-works + +.. _`NEP-38`: https://numpy.org/neps/nep-0038-SIMD-optimizations.html + diff --git a/doc/source/reference/simd/log_example.txt b/doc/source/reference/simd/log_example.txt new file mode 100644 index 000000000..b0c732433 --- /dev/null +++ b/doc/source/reference/simd/log_example.txt @@ -0,0 +1,79 @@ +########### EXT COMPILER OPTIMIZATION ########### +Platform : + Architecture: x64 + Compiler : gcc + +CPU baseline : + Requested : 'min' + Enabled : SSE SSE2 SSE3 + Flags : -msse -msse2 -msse3 + Extra checks: none + +CPU dispatch : + Requested : 'max -xop -fma4' + Enabled : SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F AVX512CD AVX512_KNL AVX512_KNM AVX512_SKX AVX512_CLX AVX512_CNL AVX512_ICL + Generated : + : + SSE41 : SSE SSE2 SSE3 SSSE3 + Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 + Extra checks: none + Detect : SSE SSE2 SSE3 SSSE3 SSE41 + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c + : numpy/core/src/umath/_umath_tests.dispatch.c + : + SSE42 : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT + Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 + Extra checks: none + Detect : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 + : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c + : + AVX2 : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C + Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mavx2 + Extra checks: none + Detect : AVX F16C AVX2 + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithm_fp.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c + : numpy/core/src/umath/_umath_tests.dispatch.c + : + (FMA3 AVX2) : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C + Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mfma -mavx2 + Extra checks: none + Detect : AVX F16C FMA3 AVX2 + : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_exponent_log.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_trigonometric.dispatch.c + : + AVX512F : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 + 
Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mfma -mavx2 -mavx512f + Extra checks: AVX512F_REDUCE + Detect : AVX512F + : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithm_fp.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_exponent_log.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_trigonometric.dispatch.c + : + AVX512_SKX : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F AVX512CD + Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mfma -mavx2 -mavx512f -mavx512cd -mavx512vl -mavx512bw -mavx512dq + Extra checks: AVX512BW_MASK AVX512DQ_MASK + Detect : AVX512_SKX + : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c + : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_exponent_log.dispatch.c +CCompilerOpt.cache_flush[804] : write cache to path -> /home/seiko/work/repos/numpy/build/temp.linux-x86_64-3.9/ccompiler_opt_cache_ext.py + +########### CLIB COMPILER OPTIMIZATION ########### +Platform : + Architecture: x64 + Compiler : gcc + +CPU baseline : + Requested : 'min' + Enabled : SSE SSE2 SSE3 + Flags : -msse -msse2 -msse3 + Extra checks: none + +CPU dispatch : + Requested : 'max -xop -fma4' + Enabled : SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F AVX512CD AVX512_KNL AVX512_KNM AVX512_SKX AVX512_CLX AVX512_CNL AVX512_ICL + Generated : none diff --git a/doc/source/reference/simd/simd-optimizations-tables-diff.inc b/doc/source/reference/simd/simd-optimizations-tables-diff.inc deleted file mode 100644 index 41fa96703..000000000 --- a/doc/source/reference/simd/simd-optimizations-tables-diff.inc +++ /dev/null @@ -1,37 +0,0 @@ -.. generated via source/reference/simd/simd-optimizations.py - -x86::Intel Compiler - CPU feature names -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. table:: - :align: left - - =========== ================================================================================================================== - Name Implies - =========== ================================================================================================================== - ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **AVX2** - ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **FMA3** - ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` **AVX512CD** - =========== ================================================================================================================== - -.. note:: - The following features aren't supported by x86::Intel Compiler: - **XOP FMA4** - -x86::Microsoft Visual C/C++ - CPU feature names -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. 
table:: - :align: left - - ============ ================================================================================================================================= - Name Implies - ============ ================================================================================================================================= - ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **AVX2** - ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **FMA3** - ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` **AVX512CD** **AVX512_SKX** - ``AVX512CD`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` **AVX512_SKX** - ============ ================================================================================================================================= - -.. note:: - The following features aren't supported by x86::Microsoft Visual C/C++: - **AVX512_KNL AVX512_KNM** - diff --git a/doc/source/reference/simd/simd-optimizations-tables.inc b/doc/source/reference/simd/simd-optimizations-tables.inc deleted file mode 100644 index f038a91e1..000000000 --- a/doc/source/reference/simd/simd-optimizations-tables.inc +++ /dev/null @@ -1,103 +0,0 @@ -.. generated via source/reference/simd/simd-optimizations.py - -x86 - CPU feature names -~~~~~~~~~~~~~~~~~~~~~~~ -.. table:: - :align: left - - ============ ================================================================================================================= - Name Implies - ============ ================================================================================================================= - ``SSE`` ``SSE2`` - ``SSE2`` ``SSE`` - ``SSE3`` ``SSE`` ``SSE2`` - ``SSSE3`` ``SSE`` ``SSE2`` ``SSE3`` - ``SSE41`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` - ``POPCNT`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` - ``SSE42`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` - ``AVX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` - ``XOP`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` - ``FMA4`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` - ``F16C`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` - ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` - ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` - ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` - ``AVX512CD`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` - ============ ================================================================================================================= - -x86 - Group names -~~~~~~~~~~~~~~~~~ -.. 
table:: - :align: left - - ============== ===================================================== =========================================================================================================================================================================== - Name Gather Implies - ============== ===================================================== =========================================================================================================================================================================== - ``AVX512_KNL`` ``AVX512ER`` ``AVX512PF`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` - ``AVX512_KNM`` ``AVX5124FMAPS`` ``AVX5124VNNIW`` ``AVX512VPOPCNTDQ`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_KNL`` - ``AVX512_SKX`` ``AVX512VL`` ``AVX512BW`` ``AVX512DQ`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` - ``AVX512_CLX`` ``AVX512VNNI`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` - ``AVX512_CNL`` ``AVX512IFMA`` ``AVX512VBMI`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` - ``AVX512_ICL`` ``AVX512VBMI2`` ``AVX512BITALG`` ``AVX512VPOPCNTDQ`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512_CLX`` ``AVX512_CNL`` - ============== ===================================================== =========================================================================================================================================================================== - -IBM/POWER big-endian - CPU feature names -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. table:: - :align: left - - ======== ================ - Name Implies - ======== ================ - ``VSX`` - ``VSX2`` ``VSX`` - ``VSX3`` ``VSX`` ``VSX2`` - ======== ================ - -IBM/POWER little-endian - CPU feature names -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. table:: - :align: left - - ======== ================ - Name Implies - ======== ================ - ``VSX`` ``VSX2`` - ``VSX2`` ``VSX`` - ``VSX3`` ``VSX`` ``VSX2`` - ======== ================ - -ARMv7/A32 - CPU feature names -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. table:: - :align: left - - ============== =========================================================== - Name Implies - ============== =========================================================== - ``NEON`` - ``NEON_FP16`` ``NEON`` - ``NEON_VFPV4`` ``NEON`` ``NEON_FP16`` - ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` - ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` - ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` - ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP`` - ============== =========================================================== - -ARMv8/A64 - CPU feature names -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. 
table:: - :align: left - - ============== =========================================================== - Name Implies - ============== =========================================================== - ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` - ``NEON_FP16`` ``NEON`` ``NEON_VFPV4`` ``ASIMD`` - ``NEON_VFPV4`` ``NEON`` ``NEON_FP16`` ``ASIMD`` - ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` - ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` - ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` - ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP`` - ============== =========================================================== - diff --git a/doc/source/reference/simd/simd-optimizations.py b/doc/source/reference/simd/simd-optimizations.py deleted file mode 100644 index a78302db5..000000000 --- a/doc/source/reference/simd/simd-optimizations.py +++ /dev/null @@ -1,190 +0,0 @@ -""" -Generate CPU features tables from CCompilerOpt -""" -from os import sys, path -gen_path = path.dirname(path.realpath(__file__)) -#sys.path.append(path.abspath(path.join(gen_path, *([".."]*4), "numpy", "distutils"))) -#from ccompiler_opt import CCompilerOpt -from numpy.distutils.ccompiler_opt import CCompilerOpt - -class FakeCCompilerOpt(CCompilerOpt): - fake_info = ("arch", "compiler", "extra_args") - # disable caching no need for it - conf_nocache = True - def __init__(self, *args, **kwargs): - no_cc = None - CCompilerOpt.__init__(self, no_cc, **kwargs) - def dist_compile(self, sources, flags, **kwargs): - return sources - def dist_info(self): - return FakeCCompilerOpt.fake_info - @staticmethod - def dist_log(*args, stderr=False): - # avoid printing - pass - def feature_test(self, name, force_flags=None): - # To speed up - return True - - def gen_features_table(self, features, ignore_groups=True, - field_names=["Name", "Implies"], - fstyle=None, fstyle_implies=None, **kwargs): - rows = [] - if fstyle is None: - fstyle = lambda ft: f'``{ft}``' - if fstyle_implies is None: - fstyle_implies = lambda origin, ft: fstyle(ft) - for f in self.feature_sorted(features): - is_group = "group" in self.feature_supported.get(f, {}) - if ignore_groups and is_group: - continue - implies = self.feature_sorted(self.feature_implies(f)) - implies = ' '.join([fstyle_implies(f, i) for i in implies]) - rows.append([fstyle(f), implies]) - if rows: - return self.gen_rst_table(field_names, rows, **kwargs) - - def gen_gfeatures_table(self, features, - field_names=["Name", "Gather", "Implies"], - fstyle=None, fstyle_implies=None, **kwargs): - rows = [] - if fstyle is None: - fstyle = lambda ft: f'``{ft}``' - if fstyle_implies is None: - fstyle_implies = lambda origin, ft: fstyle(ft) - for f in self.feature_sorted(features): - gather = self.feature_supported.get(f, {}).get("group", None) - if not gather: - continue - implies = self.feature_sorted(self.feature_implies(f)) - implies = ' '.join([fstyle_implies(f, i) for i in implies]) - gather = ' '.join([fstyle_implies(f, i) for i in gather]) - rows.append([fstyle(f), gather, implies]) - if rows: - return self.gen_rst_table(field_names, rows, **kwargs) - - def gen_rst_table(self, field_names, rows, tab_size=4): - assert(not rows or len(field_names) == len(rows[0])) - rows.append(field_names) - fld_len = len(field_names) - cls_len = [max(len(c[i]) for c in rows) for i in range(fld_len)] - del rows[-1] - cformat = ' '.join('{:<%d}' % i for i in cls_len) - border = cformat.format(*['='*i for i in cls_len]) - - rows = [cformat.format(*row) for row in rows] - # 
header - rows = [border, cformat.format(*field_names), border] + rows - # footer - rows += [border] - # add left margin - rows = [(' ' * tab_size) + r for r in rows] - return '\n'.join(rows) - -def features_table_sections(name, ftable=None, gtable=None, tab_size=4): - tab = ' '*tab_size - content = '' - if ftable: - title = f"{name} - CPU feature names" - content = ( - f"{title}\n{'~'*len(title)}" - f"\n.. table::\n{tab}:align: left\n\n" - f"{ftable}\n\n" - ) - if gtable: - title = f"{name} - Group names" - content += ( - f"{title}\n{'~'*len(title)}" - f"\n.. table::\n{tab}:align: left\n\n" - f"{gtable}\n\n" - ) - return content - -def features_table(arch, cc="gcc", pretty_name=None, **kwargs): - FakeCCompilerOpt.fake_info = (arch, cc, '') - ccopt = FakeCCompilerOpt(cpu_baseline="max") - features = ccopt.cpu_baseline_names() - ftable = ccopt.gen_features_table(features, **kwargs) - gtable = ccopt.gen_gfeatures_table(features, **kwargs) - - if not pretty_name: - pretty_name = arch + '/' + cc - return features_table_sections(pretty_name, ftable, gtable, **kwargs) - -def features_table_diff(arch, cc, cc_vs="gcc", pretty_name=None, **kwargs): - FakeCCompilerOpt.fake_info = (arch, cc, '') - ccopt = FakeCCompilerOpt(cpu_baseline="max") - fnames = ccopt.cpu_baseline_names() - features = {f:ccopt.feature_implies(f) for f in fnames} - - FakeCCompilerOpt.fake_info = (arch, cc_vs, '') - ccopt_vs = FakeCCompilerOpt(cpu_baseline="max") - fnames_vs = ccopt_vs.cpu_baseline_names() - features_vs = {f:ccopt_vs.feature_implies(f) for f in fnames_vs} - - common = set(fnames).intersection(fnames_vs) - extra_avl = set(fnames).difference(fnames_vs) - not_avl = set(fnames_vs).difference(fnames) - diff_impl_f = {f:features[f].difference(features_vs[f]) for f in common} - diff_impl = {k for k, v in diff_impl_f.items() if v} - - fbold = lambda ft: f'**{ft}**' if ft in extra_avl else f'``{ft}``' - fbold_implies = lambda origin, ft: ( - f'**{ft}**' if ft in diff_impl_f.get(origin, {}) else f'``{ft}``' - ) - diff_all = diff_impl.union(extra_avl) - ftable = ccopt.gen_features_table( - diff_all, fstyle=fbold, fstyle_implies=fbold_implies, **kwargs - ) - gtable = ccopt.gen_gfeatures_table( - diff_all, fstyle=fbold, fstyle_implies=fbold_implies, **kwargs - ) - if not pretty_name: - pretty_name = arch + '/' + cc - content = features_table_sections(pretty_name, ftable, gtable, **kwargs) - - if not_avl: - not_avl = ccopt_vs.feature_sorted(not_avl) - not_avl = ' '.join(not_avl) - content += ( - ".. note::\n" - f" The following features aren't supported by {pretty_name}:\n" - f" **{not_avl}**\n\n" - ) - return content - -if __name__ == '__main__': - pretty_names = { - "PPC64": "IBM/POWER big-endian", - "PPC64LE": "IBM/POWER little-endian", - "ARMHF": "ARMv7/A32", - "AARCH64": "ARMv8/A64", - "ICC": "Intel Compiler", - # "ICCW": "Intel Compiler msvc-like", - "MSVC": "Microsoft Visual C/C++" - } - with open(path.join(gen_path, 'simd-optimizations-tables.inc'), 'wt') as fd: - fd.write(f'.. generated via {__file__}\n\n') - for arch in ( - ("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64") - ): - pretty_name = pretty_names.get(arch, arch) - table = features_table(arch=arch, pretty_name=pretty_name) - assert(table) - fd.write(table) - - with open(path.join(gen_path, 'simd-optimizations-tables-diff.inc'), 'wt') as fd: - fd.write(f'.. 
generated via {__file__}\n\n') - for arch, cc_names in ( - ("x86", ("clang", "ICC", "MSVC")), - ("PPC64", ("clang",)), - ("PPC64LE", ("clang",)), - ("ARMHF", ("clang",)), - ("AARCH64", ("clang",)) - ): - arch_pname = pretty_names.get(arch, arch) - for cc in cc_names: - pretty_name = f"{arch_pname}::{pretty_names.get(cc, cc)}" - table = features_table_diff(arch=arch, cc=cc, pretty_name=pretty_name) - if table: - fd.write(table) diff --git a/doc/source/reference/simd/simd-optimizations.rst b/doc/source/reference/simd/simd-optimizations.rst index 9de6d1734..a18108266 100644 --- a/doc/source/reference/simd/simd-optimizations.rst +++ b/doc/source/reference/simd/simd-optimizations.rst @@ -1,527 +1,12 @@ -****************** -SIMD Optimizations -****************** +:orphan: -NumPy provides a set of macros that define `Universal Intrinsics`_ to -abstract out typical platform-specific intrinsics so SIMD code needs to be -written only once. There are three layers: +.. raw:: html -- Code is *written* using the universal intrinsic macros, with guards that - will enable use of the macros only when the compiler recognizes them. - In NumPy, these are used to construct multiple ufunc loops. Current policy is - to create three loops: One loop is the default and uses no intrinsics. One - uses the minimum intrinsics required on the architecture. And the third is - written using the maximum set of intrinsics possible. -- At *compile* time, a distutils command is used to define the minimum and - maximum features to support, based on user choice and compiler support. The - appropriate macros are overlaid with the platform / architecture intrinsics, - and the three loops are compiled. -- At *runtime import*, the CPU is probed for the set of supported intrinsic - features. A mechanism is used to grab the pointer to the most appropriate - function, and this will be the one called for the function. + <html> + <head> + <meta http-equiv="refresh" content="0; url=index.html"/> + </head> + </html> - -Build options for compilation -============================= - -- ``--cpu-baseline``: minimal set of required optimizations. Default - value is ``min`` which provides the minimum CPU features that can - safely run on a wide range of platforms within the processor family. - -- ``--cpu-dispatch``: dispatched set of additional optimizations. - The default value is ``max -xop -fma4`` which enables all CPU - features, except for AMD legacy features(in case of X86). - -The command arguments are available in ``build``, ``build_clib``, and -``build_ext``. -if ``build_clib`` or ``build_ext`` are not specified by the user, the arguments of -``build`` will be used instead, which also holds the default values. - -Optimization names can be CPU features or groups of features that gather -several features or :ref:`special options <special-options>` to perform a series of procedures. - - -The following tables show the current supported optimizations sorted from the lowest to the highest interest. - -.. include:: simd-optimizations-tables.inc - ----- - -.. _tables-diff: - -While the above tables are based on the GCC Compiler, the following tables showing the differences in the -other compilers: - -.. include:: simd-optimizations-tables-diff.inc - -.. 
_special-options: - -Special options -~~~~~~~~~~~~~~~ - -- ``NONE``: enable no features - -- ``NATIVE``: Enables all CPU features that supported by the current - machine, this operation is based on the compiler flags (``-march=native, -xHost, /QxHost``) - -- ``MIN``: Enables the minimum CPU features that can safely run on a wide range of platforms: - - .. table:: - :align: left - - ====================================== ======================================= - For Arch Returns - ====================================== ======================================= - ``x86`` ``SSE`` ``SSE2`` - ``x86`` ``64-bit mode`` ``SSE`` ``SSE2`` ``SSE3`` - ``IBM/POWER`` ``big-endian mode`` ``NONE`` - ``IBM/POWER`` ``little-endian mode`` ``VSX`` ``VSX2`` - ``ARMHF`` ``NONE`` - ``ARM64`` ``AARCH64`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` - ``ASIMD`` - ====================================== ======================================= - -- ``MAX``: Enables all supported CPU features by the Compiler and platform. - -- ``Operators-/+``: remove or add features, useful with options ``MAX``, ``MIN`` and ``NATIVE``. - -NOTES -~~~~~~~~~~~~~ -- CPU features and other options are case-insensitive. - -- The order of the requested optimizations doesn't matter. - -- Either commas or spaces can be used as a separator, e.g. ``--cpu-dispatch``\ = - "avx2 avx512f" or ``--cpu-dispatch``\ = "avx2, avx512f" both work, but the - arguments must be enclosed in quotes. - -- The operand ``+`` is only added for nominal reasons, For example: - ``--cpu-baseline= "min avx2"`` is equivalent to ``--cpu-baseline="min + avx2"``. - ``--cpu-baseline="min,avx2"`` is equivalent to ``--cpu-baseline`="min,+avx2"`` - -- If the CPU feature is not supported by the user platform or - compiler, it will be skipped rather than raising a fatal error. - -- Any specified CPU feature to ``--cpu-dispatch`` will be skipped if - it's part of CPU baseline features - -- The ``--cpu-baseline`` argument force-enables implied features, - e.g. ``--cpu-baseline``\ ="sse42" is equivalent to - ``--cpu-baseline``\ ="sse sse2 sse3 ssse3 sse41 popcnt sse42" - -- The value of ``--cpu-baseline`` will be treated as "native" if - compiler native flag ``-march=native`` or ``-xHost`` or ``QxHost`` is - enabled through environment variable ``CFLAGS`` - -- The validation process for the requested optimizations when it comes to - ``--cpu-baseline`` isn't strict. For example, if the user requested - ``AVX2`` but the compiler doesn't support it then we just skip it and return - the maximum optimization that the compiler can handle depending on the - implied features of ``AVX2``, let us assume ``AVX``. - -- The user should always check the final report through the build log - to verify the enabled features. - -Special cases -~~~~~~~~~~~~~ - -**Interrelated CPU features**: Some exceptional conditions force us to link some features together when it come to certain compilers or architectures, resulting in the impossibility of building them separately. -These conditions can be divided into two parts, as follows: - -- **Architectural compatibility**: The need to align certain CPU features that are assured - to be supported by successive generations of the same architecture, for example: - - - On ppc64le `VSX(ISA 2.06)` and `VSX2(ISA 2.07)` both imply one another since the - first generation that supports little-endian mode is Power-8`(ISA 2.07)` - - On AArch64 `NEON` `FP16` `VFPV4` `ASIMD` implies each other since they are part of the - hardware baseline. 
- -- **Compilation compatibility**: Not all **C/C++** compilers provide independent support for all CPU - features. For example, **Intel**'s compiler doesn't provide separated flags for `AVX2` and `FMA3`, - it makes sense since all Intel CPUs that comes with `AVX2` also support `FMA3` and vice versa, - but this approach is incompatible with other **x86** CPUs from **AMD** or **VIA**. - Therefore, there are differences in the depiction of CPU features between the C/C++ compilers, - as shown in the :ref:`tables above <tables-diff>`. - - -Behaviors and Errors -~~~~~~~~~~~~~~~~~~~~ - - - -Usage and Examples -~~~~~~~~~~~~~~~~~~ - -Report and Trace -~~~~~~~~~~~~~~~~ - -Understanding CPU Dispatching, How the NumPy dispatcher works? -============================================================== - -NumPy dispatcher is based on multi-source compiling, which means taking -a certain source and compiling it multiple times with different compiler -flags and also with different **C** definitions that affect the code -paths to enable certain instruction-sets for each compiled object -depending on the required optimizations, then combining the returned -objects together. - -.. figure:: ../figures/opt-infra.png - -This mechanism should support all compilers and it doesn't require any -compiler-specific extension, but at the same time it is adds a few steps to -normal compilation that are explained as follows: - -1- Configuration -~~~~~~~~~~~~~~~~ - -Configuring the required optimization by the user before starting to build the -source files via the two command arguments as explained above: - -- ``--cpu-baseline``: minimal set of required optimizations. - -- ``--cpu-dispatch``: dispatched set of additional optimizations. - - -2- Discovering the environment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In this part, we check the compiler and platform architecture -and cache some of the intermediary results to speed up rebuilding. - -3- Validating the requested optimizations -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -By testing them against the compiler, and seeing what the compiler can -support according to the requested optimizations. - -4- Generating the main configuration header -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The generated header ``_cpu_dispatch.h`` contains all the definitions and -headers of instruction-sets for the required optimizations that have been -validated during the previous step. - -It also contains extra C definitions that are used for defining NumPy's -Python-level module attributes ``__cpu_baseline__`` and ``__cpu_dispaٍtch__``. - -**What is in this header?** - -The example header was dynamically generated by gcc on an X86 machine. -The compiler supports ``--cpu-baseline="sse sse2 sse3"`` and -``--cpu-dispatch="ssse3 sse41"``, and the result is below. - -.. code:: c - - // The header should be located at numpy/numpy/core/src/common/_cpu_dispatch.h - /**NOTE - ** C definitions prefixed with "NPY_HAVE_" represent - ** the required optimzations. - ** - ** C definitions prefixed with 'NPY__CPU_TARGET_' are protected and - ** shouldn't be used by any NumPy C sources. 
- */ - /******* baseline features *******/ - /** SSE **/ - #define NPY_HAVE_SSE 1 - #include <xmmintrin.h> - /** SSE2 **/ - #define NPY_HAVE_SSE2 1 - #include <emmintrin.h> - /** SSE3 **/ - #define NPY_HAVE_SSE3 1 - #include <pmmintrin.h> - - /******* dispatch-able features *******/ - #ifdef NPY__CPU_TARGET_SSSE3 - /** SSSE3 **/ - #define NPY_HAVE_SSSE3 1 - #include <tmmintrin.h> - #endif - #ifdef NPY__CPU_TARGET_SSE41 - /** SSE41 **/ - #define NPY_HAVE_SSE41 1 - #include <smmintrin.h> - #endif - -**Baseline features** are the minimal set of required optimizations configured -via ``--cpu-baseline``. They have no preprocessor guards and they're -always on, which means they can be used in any source. - -Does this mean NumPy's infrastructure passes the compiler's flags of -baseline features to all sources? - -Definitely, yes. But the :ref:`dispatch-able sources <dispatchable-sources>` are -treated differently. - -What if the user specifies certain **baseline features** during the -build but at runtime the machine doesn't support even these -features? Will the compiled code be called via one of these definitions, or -maybe the compiler itself auto-generated/vectorized certain piece of code -based on the provided command line compiler flags? - -During the loading of the NumPy module, there's a validation step -which detects this behavior. It will raise a Python runtime error to inform the -user. This is to prevent the CPU reaching an illegal instruction error causing -a segfault. - -**Dispatch-able features** are our dispatched set of additional optimizations -that were configured via ``--cpu-dispatch``. They are not activated by -default and are always guarded by other C definitions prefixed with -``NPY__CPU_TARGET_``. C definitions ``NPY__CPU_TARGET_`` are only -enabled within **dispatch-able sources**. - -.. _dispatchable-sources: - -5- Dispatch-able sources and configuration statements -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Dispatch-able sources are special **C** files that can be compiled multiple -times with different compiler flags and also with different **C** -definitions. These affect code paths to enable certain -instruction-sets for each compiled object according to "**the -configuration statements**" that must be declared between a **C** -comment\ ``(/**/)`` and start with a special mark **@targets** at the -top of each dispatch-able source. At the same time, dispatch-able -sources will be treated as normal **C** sources if the optimization was -disabled by the command argument ``--disable-optimization`` . - -**What are configuration statements?** - -Configuration statements are sort of keywords combined together to -determine the required optimization for the dispatch-able source. - -Example: - -.. code:: c - - /*@targets avx2 avx512f vsx2 vsx3 asimd asimdhp */ - // C code - -The keywords mainly represent the additional optimizations configured -through ``--cpu-dispatch``, but it can also represent other options such as: - -- Target groups: pre-configured configuration statements used for - managing the required optimizations from outside the dispatch-able source. - -- Policies: collections of options used for changing the default - behaviors or forcing the compilers to perform certain things. 
- -- "baseline": a unique keyword represents the minimal optimizations - that configured through ``--cpu-baseline`` - -**Numpy's infrastructure handles dispatch-able sources in four steps**: - -- **(A) Recognition**: Just like source templates and F2PY, the - dispatch-able sources requires a special extension ``*.dispatch.c`` - to mark C dispatch-able source files, and for C++ - ``*.dispatch.cpp`` or ``*.dispatch.cxx`` - **NOTE**: C++ not supported yet. - -- **(B) Parsing and validating**: In this step, the - dispatch-able sources that had been filtered by the previous step - are parsed and validated by the configuration statements for each one - of them one by one in order to determine the required optimizations. - -- **(C) Wrapping**: This is the approach taken by NumPy's - infrastructure, which has proved to be sufficiently flexible in order - to compile a single source multiple times with different **C** - definitions and flags that affect the code paths. The process is - achieved by creating a temporary **C** source for each required - optimization that related to the additional optimization, which - contains the declarations of the **C** definitions and includes the - involved source via the **C** directive **#include**. For more - clarification take a look at the following code for AVX512F : - - .. code:: c - - /* - * this definition is used by NumPy utilities as suffixes for the - * exported symbols - */ - #define NPY__CPU_TARGET_CURRENT AVX512F - /* - * The following definitions enable - * definitions of the dispatch-able features that are defined within the main - * configuration header. These are definitions for the implied features. - */ - #define NPY__CPU_TARGET_SSE - #define NPY__CPU_TARGET_SSE2 - #define NPY__CPU_TARGET_SSE3 - #define NPY__CPU_TARGET_SSSE3 - #define NPY__CPU_TARGET_SSE41 - #define NPY__CPU_TARGET_POPCNT - #define NPY__CPU_TARGET_SSE42 - #define NPY__CPU_TARGET_AVX - #define NPY__CPU_TARGET_F16C - #define NPY__CPU_TARGET_FMA3 - #define NPY__CPU_TARGET_AVX2 - #define NPY__CPU_TARGET_AVX512F - // our dispatch-able source - #include "/the/absuolate/path/of/hello.dispatch.c" - -- **(D) Dispatch-able configuration header**: The infrastructure - generates a config header for each dispatch-able source, this header - mainly contains two abstract **C** macros used for identifying the - generated objects, so they can be used for runtime dispatching - certain symbols from the generated objects by any **C** source. It is - also used for forward declarations. - - The generated header takes the name of the dispatch-able source after - excluding the extension and replace it with '**.h**', for example - assume we have a dispatch-able source called **hello.dispatch.c** and - contains the following: - - .. code:: c - - // hello.dispatch.c - /*@targets baseline sse42 avx512f */ - #include <stdio.h> - #include "numpy/utils.h" // NPY_CAT, NPY_TOSTR - - #ifndef NPY__CPU_TARGET_CURRENT - // wrapping the dispatch-able source only happens to the additional optimizations - // but if the keyword 'baseline' provided within the configuration statements, - // the infrastructure will add extra compiling for the dispatch-able source by - // passing it as-is to the compiler without any changes. 
- #define CURRENT_TARGET(X) X - #define NPY__CPU_TARGET_CURRENT baseline // for printing only - #else - // since we reach to this point, that's mean we're dealing with - // the additional optimizations, so it could be SSE42 or AVX512F - #define CURRENT_TARGET(X) NPY_CAT(NPY_CAT(X, _), NPY__CPU_TARGET_CURRENT) - #endif - // Macro 'CURRENT_TARGET' adding the current target as suffux to the exported symbols, - // to avoid linking duplications, NumPy already has a macro called - // 'NPY_CPU_DISPATCH_CURFX' similar to it, located at - // numpy/numpy/core/src/common/npy_cpu_dispatch.h - // NOTE: we tend to not adding suffixes to the baseline exported symbols - void CURRENT_TARGET(simd_whoami)(const char *extra_info) - { - printf("I'm " NPY_TOSTR(NPY__CPU_TARGET_CURRENT) ", %s\n", extra_info); - } - - Now assume you attached **hello.dispatch.c** to the source tree, then - the infrastructure should generate a temporary config header called - **hello.dispatch.h** that can be reached by any source in the source - tree, and it should contain the following code : - - .. code:: c - - #ifndef NPY__CPU_DISPATCH_EXPAND_ - // To expand the macro calls in this header - #define NPY__CPU_DISPATCH_EXPAND_(X) X - #endif - // Undefining the following macros, due to the possibility of including config headers - // multiple times within the same source and since each config header represents - // different required optimizations according to the specified configuration - // statements in the dispatch-able source that derived from it. - #undef NPY__CPU_DISPATCH_BASELINE_CALL - #undef NPY__CPU_DISPATCH_CALL - // nothing strange here, just a normal preprocessor callback - // enabled only if 'baseline' specified within the configuration statements - #define NPY__CPU_DISPATCH_BASELINE_CALL(CB, ...) \ - NPY__CPU_DISPATCH_EXPAND_(CB(__VA_ARGS__)) - // 'NPY__CPU_DISPATCH_CALL' is an abstract macro is used for dispatching - // the required optimizations that specified within the configuration statements. - // - // @param CHK, Expected a macro that can be used to detect CPU features - // in runtime, which takes a CPU feature name without string quotes and - // returns the testing result in a shape of boolean value. - // NumPy already has macro called "NPY_CPU_HAVE", which fits this requirement. - // - // @param CB, a callback macro that expected to be called multiple times depending - // on the required optimizations, the callback should receive the following arguments: - // 1- The pending calls of @param CHK filled up with the required CPU features, - // that need to be tested first in runtime before executing call belong to - // the compiled object. - // 2- The required optimization name, same as in 'NPY__CPU_TARGET_CURRENT' - // 3- Extra arguments in the macro itself - // - // By default the callback calls are sorted depending on the highest interest - // unless the policy "$keep_sort" was in place within the configuration statements - // see "Dive into the CPU dispatcher" for more clarification. - #define NPY__CPU_DISPATCH_CALL(CHK, CB, ...) \ - NPY__CPU_DISPATCH_EXPAND_(CB((CHK(AVX512F)), AVX512F, __VA_ARGS__)) \ - NPY__CPU_DISPATCH_EXPAND_(CB((CHK(SSE)&&CHK(SSE2)&&CHK(SSE3)&&CHK(SSSE3)&&CHK(SSE41)), SSE41, __VA_ARGS__)) - - An example of using the config header in light of the above: - - .. code:: c - - // NOTE: The following macros are only defined for demonstration purposes only. 
- // NumPy already has a collection of macros located at - // numpy/numpy/core/src/common/npy_cpu_dispatch.h, that covers all dispatching - // and declarations scenarios. - - #include "numpy/npy_cpu_features.h" // NPY_CPU_HAVE - #include "numpy/utils.h" // NPY_CAT, NPY_EXPAND - - // An example for setting a macro that calls all the exported symbols at once - // after checking if they're supported by the running machine. - #define DISPATCH_CALL_ALL(FN, ARGS) \ - NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_ALL_CB, FN, ARGS) \ - NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_ALL_CB, FN, ARGS) - // The preprocessor callbacks. - // The same suffixes as we defined in the dispatch-able source. - #define DISPATCH_CALL_ALL_CB(CHECK, TARGET_NAME, FN, ARGS) \ - if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; } - #define DISPATCH_CALL_BASELINE_ALL_CB(FN, ARGS) \ - FN NPY_EXPAND(ARGS); - - // An example for setting a macro that calls the exported symbols of the highest - // interest optimization, after checking if they're supported by the running machine. - #define DISPATCH_CALL_HIGH(FN, ARGS) \ - if (0) {} \ - NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_HIGH_CB, FN, ARGS) \ - NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_HIGH_CB, FN, ARGS) - // The preprocessor callbacks - // The same suffixes as we defined in the dispatch-able source. - #define DISPATCH_CALL_HIGH_CB(CHECK, TARGET_NAME, FN, ARGS) \ - else if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; } - #define DISPATCH_CALL_BASELINE_HIGH_CB(FN, ARGS) \ - else { FN NPY_EXPAND(ARGS); } - - // NumPy has a macro called 'NPY_CPU_DISPATCH_DECLARE' that can be used - // for forward declarations of any kind of prototypes based on - // 'NPY__CPU_DISPATCH_CALL' and 'NPY__CPU_DISPATCH_BASELINE_CALL'. - // However, in this example, we just handle it manually. - void simd_whoami(const char *extra_info); - void simd_whoami_AVX512F(const char *extra_info); - void simd_whoami_SSE41(const char *extra_info); - - void trigger_me(void) - { - // bring in the auto-generated config header - // which contains the config macros 'NPY__CPU_DISPATCH_CALL' and - // 'NPY__CPU_DISPATCH_BASELINE_CALL'. - // it is highly recommended to include the config header before executing - // the dispatching macros in case there's another header in scope. - #include "hello.dispatch.h" - DISPATCH_CALL_ALL(simd_whoami, ("all")) - DISPATCH_CALL_HIGH(simd_whoami, ("the highest interest")) - // An example of including multiple config headers in the same source - // #include "hello2.dispatch.h" - // DISPATCH_CALL_HIGH(another_function, ("the highest interest")) - } - - -Dive into the CPU dispatcher -============================ - -The baseline -~~~~~~~~~~~~ - -Dispatcher -~~~~~~~~~~ - -Groups and Policies -~~~~~~~~~~~~~~~~~~~ - -Examples -~~~~~~~~ - -Report and Trace -~~~~~~~~~~~~~~~~ - - -.. _`Universal Intrinsics`: https://numpy.org/neps/nep-0038-SIMD-optimizations.html +The location of this document has been changed. If you are not +redirected in a few seconds, `click here <index.html>`_. diff --git a/doc/source/user/basics.creation.rst b/doc/source/user/basics.creation.rst index 84ff1c30e..523a05379 100644 --- a/doc/source/user/basics.creation.rst +++ b/doc/source/user/basics.creation.rst @@ -74,10 +74,11 @@ assign a new type that satisfies all of the array elements involved in the computation, here ``uint32`` and ``int32`` can both be represented in as ``int64``.
-The default NumPy behavior is to create arrays in either 64-bit signed -integers or double precision floating point numbers, ``int64`` and -``float``, respectively. If you expect your arrays to be a certain type, -then you need to specify the ``dtype`` while you create the array. +The default NumPy behavior is to create arrays in either 32- or 64-bit signed +integers (platform dependent, matching the C ``long`` size) or double precision +floating point numbers, ``int32``/``int64`` and ``float``, respectively. If you expect your +integer arrays to be a specific type, then you need to specify the ``dtype`` while +you create the array. 2) Intrinsic NumPy array creation functions =========================================== diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c index 4c6b09b80..934434370 100644 --- a/numpy/core/src/umath/dispatching.c +++ b/numpy/core/src/umath/dispatching.c @@ -592,17 +592,19 @@ legacy_promote_using_legacy_type_resolver(PyUFuncObject *ufunc, Py_INCREF(operation_DTypes[i]); Py_DECREF(out_descrs[i]); } - if (ufunc->type_resolver == &PyUFunc_SimpleBinaryComparisonTypeResolver) { - /* - * In this one case, the deprecation means that we actually override - * the signature. - */ - for (int i = 0; i < nargs; i++) { - if (signature[i] != NULL && signature[i] != operation_DTypes[i]) { - Py_INCREF(operation_DTypes[i]); - Py_SETREF(signature[i], operation_DTypes[i]); - *out_cacheable = 0; - } + /* + * The PyUFunc_SimpleBinaryComparisonTypeResolver has a deprecation + * warning (ignoring `dtype=`) and cannot be cached. + * All datetime ones *should* have a warning, but currently don't; + * they also ignore any passed signature. So they cannot be cached + * either, and they mutate the signature, which is of course wrong, + * but not doing it would confuse the code later.
+ */ + for (int i = 0; i < nargs; i++) { + if (signature[i] != NULL && signature[i] != operation_DTypes[i]) { + Py_INCREF(operation_DTypes[i]); + Py_SETREF(signature[i], operation_DTypes[i]); + *out_cacheable = 0; } } return 0; diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 9107323b0..1b310b471 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -2737,7 +2737,7 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc, } PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc, - ops, signature, operation_DTypes, NPY_FALSE, NPY_FALSE, NPY_TRUE); + ops, signature, operation_DTypes, NPY_FALSE, NPY_TRUE, NPY_TRUE); /* Output can currently get cleared, others XDECREF in case of error */ Py_XDECREF(operation_DTypes[1]); if (out != NULL) { @@ -5194,60 +5194,18 @@ PyUFunc_FromFuncAndDataAndSignatureAndIdentity(PyUFuncGenericFunction *func, voi return NULL; } } - - PyObject *promoter = NULL; - if (ufunc->ntypes == 1) { - npy_bool all_object = NPY_TRUE; - for (int i = 0; i < ufunc->nargs; i++) { - if (ufunc->types[i] != NPY_OBJECT) { - all_object = NPY_FALSE; - break; - } - } - if (all_object) { - promoter = PyCapsule_New(&object_only_ufunc_promoter, - "numpy._ufunc_promoter", NULL); - if (promoter == NULL) { - Py_DECREF(ufunc); - return NULL; - } - } - } - if (promoter == NULL && ufunc->nin > 1) { - promoter = PyCapsule_New(&default_ufunc_promoter, - "numpy._ufunc_promoter", NULL); - if (promoter == NULL) { - Py_DECREF(ufunc); - return NULL; - } - } - if (promoter != NULL) { - /* Always install default promoter using the common DType */ - PyObject *dtype_tuple = PyTuple_New(ufunc->nargs); - if (dtype_tuple == NULL) { - Py_DECREF(promoter); - Py_DECREF(ufunc); - return NULL; - } - for (int i = 0; i < ufunc->nargs; i++) { - Py_INCREF(Py_None); - PyTuple_SET_ITEM(dtype_tuple, i, Py_None); - } - PyObject *info = PyTuple_Pack(2, dtype_tuple, promoter); - Py_DECREF(dtype_tuple); - Py_DECREF(promoter); - if (info == NULL) { - Py_DECREF(ufunc); - return NULL; - } - - int res = PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0); - Py_DECREF(info); - if (res < 0) { - Py_DECREF(ufunc); - return NULL; - } - } + /* + * TODO: I tried adding a default promoter here (either all object for + * some special cases, or all homogeneous). Those are reasonable + * defaults, but they short-cut a deprecated SciPy loop, where the + * homogeneous loop `ddd->d` was deprecated, but an inhomogeneous + * one `dld->d` should be picked. + * The default promoter *is* a reasonable default, but it switched that + * behaviour. + * Another problem appeared due to buggy type-resolution for + * datetimes, which meant that `timedelta.sum(dtype="f8")` returned + * datetimes (and not floats or an error), arguably wrong, but... + */ return (PyObject *)ufunc; } diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py index c6a3d4e79..baae77a35 100644 --- a/numpy/core/tests/test_datetime.py +++ b/numpy/core/tests/test_datetime.py @@ -2029,11 +2029,17 @@ class TestDateTime: assert_equal(np.maximum.reduce(a), np.timedelta64(7, 's')) + def test_timedelta_correct_mean(self): + # test mainly because a bug previously allowed + # `timedelta.sum(dtype="f8")` to ignore the dtype request.
+ a = np.arange(1000, dtype="m8[s]") + assert_array_equal(a.mean(), a.sum() / len(a)) + def test_datetime_no_subtract_reducelike(self): # subtracting two datetime64 works, but we cannot reduce it, since # the result of that subtraction will have a different dtype. arr = np.array(["2021-12-02", "2019-05-12"], dtype="M8[ms]") - msg = r"ufunc 'subtract' did not contain a loop with signature " + msg = r"the resolved dtypes are not compatible" with pytest.raises(TypeError, match=msg): np.subtract.reduce(arr)
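The two test changes above pin down the user-visible behaviour of this patch. The following is a minimal standalone sketch of what they assert, assuming a NumPy build that includes this change (the matched error text, "the resolved dtypes are not compatible", is the message introduced here):

.. code:: python

    import numpy as np
    from numpy.testing import assert_array_equal

    # timedelta64 mean must agree with sum()/len(); a bug that let
    # `timedelta.sum(dtype="f8")` ignore the dtype request used to hide this.
    a = np.arange(1000, dtype="m8[s]")
    assert_array_equal(a.mean(), a.sum() / len(a))

    # Subtracting two datetime64 values works, but subtract cannot be used
    # as a reduction: the result dtype (timedelta64) differs from the inputs.
    arr = np.array(["2021-12-02", "2019-05-12"], dtype="M8[ms]")
    try:
        np.subtract.reduce(arr)
    except TypeError as exc:
        print("reduce rejected:", exc)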

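The SIMD documentation relocated earlier in this diff describes the ``--cpu-baseline`` / ``--cpu-dispatch`` build options and the Python-level ``__cpu_baseline__`` and ``__cpu_dispatch__`` attributes. As a hedged illustration of how that selection can be inspected at runtime (the attribute location in the private module ``numpy.core._multiarray_umath`` is an assumption based on NumPy sources of this era, not something promised by the documentation above):

.. code:: python

    # Inspect which SIMD features this NumPy build was compiled with and which
    # of the dispatched targets the running CPU can actually use.
    from numpy.core._multiarray_umath import (
        __cpu_baseline__,   # baseline features (--cpu-baseline), always enabled
        __cpu_dispatch__,   # additional dispatched targets (--cpu-dispatch)
        __cpu_features__,   # mapping: feature name -> detected on this CPU
    )

    print("baseline:", __cpu_baseline__)
    print("dispatch:", __cpu_dispatch__)
    usable = [f for f in __cpu_dispatch__ if __cpu_features__.get(f, False)]
    print("dispatched targets usable on this machine:", usable)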