summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
authorSayed Adel <seiko@imavr.com>2020-06-13 18:15:25 +0200
committerSayed Adel <seiko@imavr.com>2020-06-15 22:48:29 +0200
commitda21d28ef69e65c5bfef8dc22840fe16fec52540 (patch)
treedcff74da7f8712f0328d3c1ed49527a0fd6f131d /numpy
parent5345c2575a28fa2dfbbec83c99636669476c2745 (diff)
downloadnumpy-da21d28ef69e65c5bfef8dc22840fe16fec52540.tar.gz
ENH: [1/7] enable multi-platform SIMD compiler optimizations
Implement a new distutils class `CCompilerOpt`, used for handling CPU/hardware optimization: parsing the command arguments, managing the relationship between the CPU baseline and dispatch-able features, generating the required C headers, and finally compiling the sources with the proper compiler flags. `CCompilerOpt` is mainly used as a helper class for `CCompiler` and doesn't provide any runtime detection of CPU features; instead it focuses only on the compiler side, but it generates abstract C headers that can be used later for the final runtime dispatching process.
Diffstat (limited to 'numpy')
-rw-r--r--numpy/distutils/ccompiler_opt.py2438
-rw-r--r--numpy/distutils/checks/cpu_asimd.c25
-rw-r--r--numpy/distutils/checks/cpu_asimddp.c15
-rw-r--r--numpy/distutils/checks/cpu_asimdfhm.c17
-rw-r--r--numpy/distutils/checks/cpu_asimdhp.c14
-rw-r--r--numpy/distutils/checks/cpu_avx.c7
-rw-r--r--numpy/distutils/checks/cpu_avx2.c7
-rw-r--r--numpy/distutils/checks/cpu_avx512_clx.c8
-rw-r--r--numpy/distutils/checks/cpu_avx512_cnl.c10
-rw-r--r--numpy/distutils/checks/cpu_avx512_icl.c12
-rw-r--r--numpy/distutils/checks/cpu_avx512_knl.c11
-rw-r--r--numpy/distutils/checks/cpu_avx512_knm.c17
-rw-r--r--numpy/distutils/checks/cpu_avx512_skx.c12
-rw-r--r--numpy/distutils/checks/cpu_avx512cd.c7
-rw-r--r--numpy/distutils/checks/cpu_avx512f.c7
-rw-r--r--numpy/distutils/checks/cpu_f16c.c9
-rw-r--r--numpy/distutils/checks/cpu_fma3.c8
-rw-r--r--numpy/distutils/checks/cpu_fma4.c12
-rw-r--r--numpy/distutils/checks/cpu_neon.c15
-rw-r--r--numpy/distutils/checks/cpu_neon_fp16.c11
-rw-r--r--numpy/distutils/checks/cpu_neon_vfpv4.c19
-rw-r--r--numpy/distutils/checks/cpu_popcnt.c23
-rw-r--r--numpy/distutils/checks/cpu_sse.c7
-rw-r--r--numpy/distutils/checks/cpu_sse2.c7
-rw-r--r--numpy/distutils/checks/cpu_sse3.c7
-rw-r--r--numpy/distutils/checks/cpu_sse41.c7
-rw-r--r--numpy/distutils/checks/cpu_sse42.c7
-rw-r--r--numpy/distutils/checks/cpu_ssse3.c7
-rw-r--r--numpy/distutils/checks/cpu_vsx.c21
-rw-r--r--numpy/distutils/checks/cpu_vsx2.c13
-rw-r--r--numpy/distutils/checks/cpu_vsx3.c13
-rw-r--r--numpy/distutils/checks/cpu_xop.c12
-rw-r--r--numpy/distutils/checks/test_flags.c1
-rw-r--r--numpy/distutils/setup.py1
-rw-r--r--numpy/tests/test_public_api.py1
35 files changed, 2808 insertions, 0 deletions
diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py
new file mode 100644
index 000000000..0488173ca
--- /dev/null
+++ b/numpy/distutils/ccompiler_opt.py
@@ -0,0 +1,2438 @@
+"""Provides the `CCompilerOpt` class, used for handling the CPU/hardware
+optimization, starting from parsing the command arguments, to managing the
+relation between the CPU baseline and dispatch-able features,
+also generating the required C headers and ending with compiling
+the sources with proper compiler's flags.
+
+`CCompilerOpt` doesn't provide runtime detection for the CPU features,
+instead only focuses on the compiler side, but it creates abstract C headers
+that can be used later for the final runtime dispatching process."""
+
+import sys, io, os, re, textwrap, pprint, inspect, atexit, subprocess
+
+class _Config:
+ """An abstract class holds all configurable attributes of `CCompilerOpt`,
+ these class attributes can be used to change the default behavior
+ of `CCompilerOpt` in order to fit other requirements.
+
+ Attributes
+ ----------
+ conf_nocache : bool
+ Set True to disable memory and file cache.
+ Default is False.
+
+ conf_noopt : bool
+ Set True to forces the optimization to be disabled,
+ in this case `CCompilerOpt` tends to generate all
+ expected headers in order to 'not' break the build.
+ Default is False.
+
+ conf_cache_factors : list
+        Add extra factors to the primary caching factors. The caching factors
+        are used to determine whether changes have happened that
+        require discarding the cache and rebuilding it. The primary factors
+        are the arguments of `CCompilerOpt` and `CCompiler`'s properties (type, flags, etc.).
+ Default is list of two items, containing the time of last modification
+ of `ccompiler_opt` and value of attribute "conf_noopt"
+
+ conf_tmp_path : str,
+ The path of temporary directory. Default is auto-created
+ temporary directory via ``tempfile.mkdtemp()``.
+
+ conf_check_path : str
+ The path of testing files. Each added CPU feature must have a
+ **C** source file contains at least one intrinsic or instruction that
+ related to this feature, so it can be tested against the compiler.
+ Default is ``./distutils/checks``.
+
+ conf_target_groups : dict
+ Extra tokens that can be reached from dispatch-able sources through
+ the special mark ``@targets``. Default is an empty dictionary.
+
+ **Notes**:
+ - case-insensitive for tokens and group names
+ - sign '#' must stick in the begin of group name and only within ``@targets``
+
+ **Example**:
+ .. code-block:: console
+
+ $ "@targets #avx_group other_tokens" > group_inside.c
+
+ >>> CCompilerOpt.conf_target_groups["avx_group"] = \\
+ "$werror $maxopt avx2 avx512f avx512_skx"
+ >>> cco = CCompilerOpt(cc_instance)
+ >>> cco.try_dispatch(["group_inside.c"])
+
+ conf_c_prefix : str
+ The prefix of public C definitions. Default is ``"NPY_"``.
+
+ conf_c_prefix_ : str
+ The prefix of internal C definitions. Default is ``"NPY__"``.
+
+ conf_cc_flags : dict
+ Nested dictionaries defining several compiler flags
+ that linked to some major functions, the main key
+ represent the compiler name and sub-keys represent
+ flags names. Default is already covers all supported
+ **C** compilers.
+
+ Sub-keys explained as follows:
+
+ "native": str or None
+ used by argument option `native`, to detect the current
+ machine support via the compiler.
+ "werror": str or None
+ utilized to treat warning as errors during testing CPU features
+ against the compiler and also for target's policy `$werror`
+ via dispatch-able sources.
+ "maxopt": str or None
+ utilized for target's policy '$maxopt' and the value should
+ contains the maximum acceptable optimization by the compiler.
+ e.g. in gcc `'-O3'`
+
+ **Notes**:
+ * case-sensitive for compiler names and flags
+ * use space to separate multiple flags
+          * any flag will be tested against the compiler and will be skipped
+            if it is not applicable.
+
+ conf_min_features : dict
+ A dictionary defines the used CPU features for
+ argument option `'min'`, the key represent the CPU architecture
+ name e.g. `'x86'`. Default values provide the best effort
+ on wide range of users platforms.
+
+ **Note**: case-sensitive for architecture names.
+
+ conf_features : dict
+ Nested dictionaries used for identifying the CPU features.
+ the primary key is represented as a feature name or group name
+ that gathers several features. Default values covers all
+ supported features but without the major options like "flags",
+ these undefined options handle it by method `conf_features_partial()`.
+ Default value is covers almost all CPU features for *X86*, *IBM/Power64*
+ and *ARM 7/8*.
+
+ Sub-keys explained as follows:
+
+ "implies" : str or list, optional,
+ List of CPU feature names to be implied by it,
+ the feature name must be defined within `conf_features`.
+ Default is None.
+
+ "flags": str or list, optional
+ List of compiler flags. Default is None.
+
+ "detect": str or list, optional
+ List of CPU feature names that required to be detected
+ in runtime. By default, its the feature name or features
+ in "group" if its specified.
+
+ "implies_detect": bool, optional
+ If True, all "detect" of implied features will be combined.
+ Default is True. see `feature_detect()`.
+
+ "group": str or list, optional
+ Same as "implies" but doesn't require the feature name to be
+ defined within `conf_features`.
+
+ "interest": int, required
+ a key for sorting CPU features
+
+ "headers": str or list, optional
+ intrinsics C header file
+
+ "disable": str, optional
+ force disable feature, the string value should contains the
+ reason of disabling.
+
+ "autovec": bool or None, optional
+ True or False to declare that CPU feature can be auto-vectorized
+ by the compiler.
+ By default(None), treated as True if the feature contains at
+ least one applicable flag. see `feature_can_autovec()`
+
+ **NOTES**:
+ * space can be used as separator with options that supports "str or list"
+ * case-sensitive for all values and feature name must be in upper-case.
+          * if flags aren't applicable, they will be skipped rather than
+            disabling the CPU feature
+          * the CPU feature will be disabled if the compiler fails to compile
+            the test file
+ """
+ conf_nocache = False
+ conf_noopt = False
+ conf_cache_factors = None
+ conf_tmp_path = None
+ conf_check_path = os.path.join(
+ os.path.dirname(os.path.realpath(__file__)), "checks"
+ )
+ conf_target_groups = {}
+ conf_c_prefix = 'NPY_'
+ conf_c_prefix_ = 'NPY__'
+ conf_cc_flags = dict(
+ gcc = dict(
+ # native should always fail on arm and ppc64,
+ # native usually works only with x86
+ native = '-march=native',
+ opt = '-O3',
+ werror = '-Werror'
+ ),
+ clang = dict(
+ native = '-march=native',
+ opt = "-O3",
+ werror = '-Werror'
+ ),
+ icc = dict(
+ native = '-xHost',
+ opt = '-O3',
+ werror = '-Werror'
+ ),
+ iccw = dict(
+ native = '/QxHost',
+ opt = '/O3',
+ werror = '/Werror'
+ ),
+ msvc = dict(
+ native = None,
+ opt = '/O2',
+ werror = '/WX'
+ )
+ )
+ conf_min_features = dict(
+ x86 = "SSE SSE2",
+ x64 = "SSE SSE2 SSE3",
+ ppc64 = '', # play it safe
+ ppc64le = "VSX VSX2",
+ armhf = '', # play it safe
+ aarch64 = "NEON NEON_FP16 NEON_VFPV4 ASIMD"
+ )
+ conf_features = dict(
+ # X86
+ SSE = dict(
+ interest=1, headers="xmmintrin.h",
+ # enabling SSE without SSE2 is useless also
+ # it's non-optional for x86_64
+ implies="SSE2"
+ ),
+ SSE2 = dict(interest=2, implies="SSE", headers="emmintrin.h"),
+ SSE3 = dict(interest=3, implies="SSE2", headers="pmmintrin.h"),
+ SSSE3 = dict(interest=4, implies="SSE3", headers="tmmintrin.h"),
+ SSE41 = dict(interest=5, implies="SSSE3", headers="smmintrin.h"),
+ POPCNT = dict(interest=6, implies="SSE41", headers="popcntintrin.h"),
+ SSE42 = dict(interest=7, implies="POPCNT"),
+ AVX = dict(
+ interest=8, implies="SSE42", headers="immintrin.h",
+ implies_detect=False
+ ),
+ XOP = dict(interest=9, implies="AVX", headers="x86intrin.h"),
+ FMA4 = dict(interest=10, implies="AVX", headers="x86intrin.h"),
+ F16C = dict(interest=11, implies="AVX"),
+ FMA3 = dict(interest=12, implies="F16C"),
+ AVX2 = dict(interest=13, implies="F16C"),
+ AVX512F = dict(interest=20, implies="FMA3 AVX2", implies_detect=False),
+ AVX512CD = dict(interest=21, implies="AVX512F"),
+ AVX512_KNL = dict(
+ interest=40, implies="AVX512CD", group="AVX512ER AVX512PF",
+ detect="AVX512_KNL", implies_detect=False
+ ),
+ AVX512_KNM = dict(
+ interest=41, implies="AVX512_KNL",
+ group="AVX5124FMAPS AVX5124VNNIW AVX512VPOPCNTDQ",
+ detect="AVX512_KNM", implies_detect=False
+ ),
+ AVX512_SKX = dict(
+ interest=42, implies="AVX512CD", group="AVX512VL AVX512BW AVX512DQ",
+ detect="AVX512_SKX", implies_detect=False
+ ),
+ AVX512_CLX = dict(
+ interest=43, implies="AVX512_SKX", group="AVX512VNNI",
+ detect="AVX512_CLX"
+ ),
+ AVX512_CNL = dict(
+ interest=44, implies="AVX512_SKX", group="AVX512IFMA AVX512VBMI",
+ detect="AVX512_CNL", implies_detect=False
+ ),
+ AVX512_ICL = dict(
+ interest=45, implies="AVX512_CLX AVX512_CNL",
+ group="AVX512VBMI2 AVX512BITALG AVX512VPOPCNTDQ",
+ detect="AVX512_ICL", implies_detect=False
+ ),
+ # IBM/Power
+ ## Power7/ISA 2.06
+ VSX = dict(interest=1, headers="altivec.h"),
+ ## Power8/ISA 2.07
+ VSX2 = dict(interest=2, implies="VSX", implies_detect=False),
+ ## Power9/ISA 3.00
+ VSX3 = dict(interest=3, implies="VSX2", implies_detect=False),
+ # ARM
+ NEON = dict(interest=1, headers="arm_neon.h"),
+ NEON_FP16 = dict(interest=2, implies="NEON"),
+ ## FMA
+ NEON_VFPV4 = dict(interest=3, implies="NEON_FP16"),
+ ## Advanced SIMD
+ ASIMD = dict(interest=4, implies="NEON_FP16 NEON_VFPV4", implies_detect=False),
+ ## ARMv8.2 half-precision & vector arithm
+ ASIMDHP = dict(interest=5, implies="ASIMD"),
+ ## ARMv8.2 dot product
+ ASIMDDP = dict(interest=6, implies="ASIMD"),
+ ## ARMv8.2 Single & half-precision Multiply
+ ASIMDFHM = dict(interest=7, implies="ASIMDHP"),
+ )
    def conf_features_partial(self):
        """Return a dictionary of the CPU features supported by the current
        platform/compiler combination, leaving the rest of the undefined
        options to be accumulated from `conf_features`. The returned dict
        follows the same rules and notes as the class attribute
        `conf_features`, and it overrides any options already set there.
        """
        if self.cc_noopt:
            # optimization is disabled
            return {}

        on_x86 = self.cc_on_x86 or self.cc_on_x64
        is_unix = self.cc_is_gcc or self.cc_is_clang

        # unix-like compilers (gcc/clang) on x86: one '-m<feature>' flag each
        if on_x86 and is_unix: return dict(
            SSE = dict(flags="-msse"),
            SSE2 = dict(flags="-msse2"),
            SSE3 = dict(flags="-msse3"),
            SSSE3 = dict(flags="-mssse3"),
            SSE41 = dict(flags="-msse4.1"),
            POPCNT = dict(flags="-mpopcnt"),
            SSE42 = dict(flags="-msse4.2"),
            AVX = dict(flags="-mavx"),
            F16C = dict(flags="-mf16c"),
            XOP = dict(flags="-mxop"),
            FMA4 = dict(flags="-mfma4"),
            FMA3 = dict(flags="-mfma"),
            AVX2 = dict(flags="-mavx2"),
            AVX512F = dict(flags="-mavx512f"),
            AVX512CD = dict(flags="-mavx512cd"),
            AVX512_KNL = dict(flags="-mavx512er -mavx512pf"),
            AVX512_KNM = dict(
                flags="-mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq"
            ),
            AVX512_SKX = dict(flags="-mavx512vl -mavx512bw -mavx512dq"),
            AVX512_CLX = dict(flags="-mavx512vnni"),
            AVX512_CNL = dict(flags="-mavx512ifma -mavx512vbmi"),
            AVX512_ICL = dict(
                flags="-mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq"
            )
        )
        # Intel compiler, unix-like flavor: some features have no dedicated
        # flag and several can only be enabled via '-march' bundles
        if on_x86 and self.cc_is_icc: return dict(
            SSE = dict(flags="-msse"),
            SSE2 = dict(flags="-msse2"),
            SSE3 = dict(flags="-msse3"),
            SSSE3 = dict(flags="-mssse3"),
            SSE41 = dict(flags="-msse4.1"),
            POPCNT = {},
            SSE42 = dict(flags="-msse4.2"),
            AVX = dict(flags="-mavx"),
            F16C = {},
            XOP = dict(disable="Intel Compiler doesn't support it"),
            FMA4 = dict(disable="Intel Compiler doesn't support it"),
            # Intel Compiler doesn't support AVX2 or FMA3 independently
            FMA3 = dict(
                implies="F16C AVX2", flags="-march=core-avx2"
            ),
            AVX2 = dict(implies="FMA3", flags="-march=core-avx2"),
            # Intel Compiler doesn't support AVX512F or AVX512CD independently
            AVX512F = dict(
                implies="AVX2 AVX512CD", flags="-march=common-avx512"
            ),
            AVX512CD = dict(
                implies="AVX2 AVX512F", flags="-march=common-avx512"
            ),
            AVX512_KNL = dict(flags="-xKNL"),
            AVX512_KNM = dict(flags="-xKNM"),
            AVX512_SKX = dict(flags="-xSKYLAKE-AVX512"),
            AVX512_CLX = dict(flags="-xCASCADELAKE"),
            AVX512_CNL = dict(flags="-xCANNONLAKE"),
            AVX512_ICL = dict(flags="-xICELAKE-CLIENT"),
        )
        # Intel compiler, msvc-like flavor: same restrictions, '/arch' & '/Qx'
        if on_x86 and self.cc_is_iccw: return dict(
            SSE = dict(flags="/arch:SSE"),
            SSE2 = dict(flags="/arch:SSE2"),
            SSE3 = dict(flags="/arch:SSE3"),
            SSSE3 = dict(flags="/arch:SSSE3"),
            SSE41 = dict(flags="/arch:SSE4.1"),
            POPCNT = {},
            SSE42 = dict(flags="/arch:SSE4.2"),
            AVX = dict(flags="/arch:AVX"),
            F16C = {},
            XOP = dict(disable="Intel Compiler doesn't support it"),
            FMA4 = dict(disable="Intel Compiler doesn't support it"),
            # Intel Compiler doesn't support FMA3 or AVX2 independently
            FMA3 = dict(
                implies="F16C AVX2", flags="/arch:CORE-AVX2"
            ),
            AVX2 = dict(
                implies="FMA3", flags="/arch:CORE-AVX2"
            ),
            # Intel Compiler doesn't support AVX512F or AVX512CD independently
            AVX512F = dict(
                implies="AVX2 AVX512CD", flags="/Qx:COMMON-AVX512"
            ),
            AVX512CD = dict(
                implies="AVX2 AVX512F", flags="/Qx:COMMON-AVX512"
            ),
            AVX512_KNL = dict(flags="/Qx:KNL"),
            AVX512_KNM = dict(flags="/Qx:KNM"),
            AVX512_SKX = dict(flags="/Qx:SKYLAKE-AVX512"),
            AVX512_CLX = dict(flags="/Qx:CASCADELAKE"),
            AVX512_CNL = dict(flags="/Qx:CANNONLAKE"),
            AVX512_ICL = dict(flags="/Qx:ICELAKE-CLIENT")
        )
        # MSVC: many SSE levels have no flag at all (implicitly available),
        # some features only need an extra header
        if on_x86 and self.cc_is_msvc: return dict(
            SSE = dict(flags="/arch:SSE"),
            SSE2 = dict(flags="/arch:SSE2"),
            SSE3 = {},
            SSSE3 = {},
            SSE41 = {},
            POPCNT = dict(headers="nmmintrin.h"),
            SSE42 = {},
            AVX = dict(flags="/arch:AVX"),
            F16C = {},
            XOP = dict(headers="ammintrin.h"),
            FMA4 = dict(headers="ammintrin.h"),
            # MSVC doesn't support FMA3 or AVX2 independently
            FMA3 = dict(
                implies="F16C AVX2", flags="/arch:AVX2"
            ),
            AVX2 = dict(
                implies="F16C FMA3", flags="/arch:AVX2"
            ),
            # MSVC doesn't support AVX512F or AVX512CD independently,
            # always generate instructions belong to (VL/VW/DQ)
            AVX512F = dict(
                implies="AVX2 AVX512CD AVX512_SKX", flags="/arch:AVX512"
            ),
            AVX512CD = dict(
                implies="AVX512F AVX512_SKX", flags="/arch:AVX512"
            ),
            AVX512_KNL = dict(
                disable="MSVC compiler doesn't support it"
            ),
            AVX512_KNM = dict(
                disable="MSVC compiler doesn't support it"
            ),
            AVX512_SKX = dict(flags="/arch:AVX512"),
            AVX512_CLX = {},
            AVX512_CNL = {},
            AVX512_ICL = {}
        )

        # IBM/Power: start from the gcc flags, then patch for clang
        on_power = self.cc_on_ppc64le or self.cc_on_ppc64
        if on_power:
            partial = dict(
                VSX = dict(
                    implies=("VSX2" if self.cc_on_ppc64le else ""),
                    flags="-mvsx"
                ),
                VSX2 = dict(
                    flags="-mcpu=power8", implies_detect=False
                ),
                VSX3 = dict(
                    flags="-mcpu=power9 -mtune=power9", implies_detect=False
                )
            )
            if self.cc_is_clang:
                partial["VSX"]["flags"] = "-maltivec -mvsx"
                partial["VSX2"]["flags"] = "-mpower8-vector"
                partial["VSX3"]["flags"] = "-mpower9-vector"

            return partial

        # AArch64: NEON..ASIMD are baseline (no flags), so they all imply
        # each other; ARMv8.2 extensions need explicit '-march' flags
        if self.cc_on_aarch64 and is_unix: return dict(
            NEON = dict(
                implies="NEON_FP16 NEON_VFPV4 ASIMD", autovec=True
            ),
            NEON_FP16 = dict(
                implies="NEON NEON_VFPV4 ASIMD", autovec=True
            ),
            NEON_VFPV4 = dict(
                implies="NEON NEON_FP16 ASIMD", autovec=True
            ),
            ASIMD = dict(
                implies="NEON NEON_FP16 NEON_VFPV4", autovec=True
            ),
            ASIMDHP = dict(
                flags="-march=armv8.2-a+fp16"
            ),
            ASIMDDP = dict(
                flags="-march=armv8.2-a+dotprod"
            ),
            ASIMDFHM = dict(
                flags="-march=armv8.2-a+fp16fml"
            ),
        )
        # 32-bit ARM: each level needs its own '-mfpu'/'-march' flags
        if self.cc_on_armhf and is_unix: return dict(
            NEON = dict(
                flags="-mfpu=neon"
            ),
            NEON_FP16 = dict(
                flags="-mfpu=neon-fp16 -mfp16-format=ieee"
            ),
            NEON_VFPV4 = dict(
                flags="-mfpu=neon-vfpv4",
            ),
            ASIMD = dict(
                flags="-mfpu=neon-fp-armv8 -march=armv8-a+simd",
            ),
            ASIMDHP = dict(
                flags="-march=armv8.2-a+fp16"
            ),
            ASIMDDP = dict(
                flags="-march=armv8.2-a+dotprod",
            ),
            ASIMDFHM = dict(
                flags="-march=armv8.2-a+fp16fml"
            )
        )
        # TODO: ARM MSVC
        return {}
+
+ def __init__(self):
+ if self.conf_tmp_path is None:
+ import tempfile, shutil
+ tmp = tempfile.mkdtemp()
+ def rm_temp():
+ try:
+ shutil.rmtree(tmp)
+ except IOError:
+ pass
+ atexit.register(rm_temp)
+ self.conf_tmp_path = tmp
+
+ if self.conf_cache_factors is None:
+ self.conf_cache_factors = [
+ os.path.getmtime(__file__),
+ self.conf_nocache
+ ]
+
+class _Distutils:
+ """A helper class that provides a collection of fundamental methods
+ implemented in a top of Python and NumPy Distutils.
+
+ The idea behind this class is to gather all methods that it may
+ need to override in case of reuse 'CCompilerOpt' in environment
+ different than of what NumPy has.
+
+ Parameters
+ ----------
+    ccompiler : `CCompiler`
+        The compiler instance returned by `distutils.ccompiler.new_compiler()`.
+ """
    def __init__(self, ccompiler):
        # keep a handle on the distutils compiler instance; every dist_*
        # helper below delegates to it
        self._ccompiler = ccompiler
+
+ def dist_compile(self, sources, flags, **kwargs):
+ """Wrap CCompiler.compile()"""
+ assert(isinstance(sources, list))
+ assert(isinstance(flags, list))
+ flags = kwargs.pop("extra_postargs", []) + flags
+ return self._ccompiler.compile(
+ sources, extra_postargs=flags, **kwargs
+ )
+
    def dist_test(self, source, flags):
        """Return True if 'CCompiler.compile()' able to compile
        a source file with certain flags.
        """
        assert(isinstance(source, str))
        from distutils.errors import CompileError
        cc = self._ccompiler;
        # temporarily swap the compiler's spawn() with a quiet variant that
        # captures output and promotes "unknown option" warnings to errors
        bk_spawn = getattr(cc, 'spawn', None)
        if bk_spawn:
            cc_type = getattr(self._ccompiler, "compiler_type", "")
            if cc_type in ("msvc",):
                # msvc also needs the SDK paths injected into the env,
                # see _dist_test_spawn_paths
                setattr(cc, 'spawn', self._dist_test_spawn_paths)
            else:
                setattr(cc, 'spawn', self._dist_test_spawn)
        test = False
        try:
            self.dist_compile(
                [source], flags, output_dir=self.conf_tmp_path
            )
            test = True
        except CompileError as e:
            # a failed compile just means "flags/feature unsupported"
            self.dist_log(str(e), stderr=True)
        if bk_spawn:
            # always restore the original spawn()
            setattr(cc, 'spawn', bk_spawn)
        return test
+
+ def dist_info(self):
+ """Return a string containing all environment information, required
+ by the abstract class '_CCompiler' to discovering the platform
+ environment, also used as a cache factor in order to detect
+ any changes from outside.
+ """
+ if hasattr(self, "_dist_info"):
+ return self._dist_info
+ # play it safe
+ cc_info = ""
+ compiler = getattr(self._ccompiler, "compiler", None)
+ if compiler is not None:
+ if isinstance(compiler, str):
+ cc_info += compiler
+ elif hasattr(compiler, "__iter__"):
+ cc_info += ' '.join(compiler)
+ # in case if 'compiler' attribute doesn't provide anything
+ cc_type = getattr(self._ccompiler, "compiler_type", "")
+ if cc_type in ("intelem", "intelemw", "mingw64"):
+ cc_info += "x86_64"
+ elif cc_type in ("intel", "intelw", "intele"):
+ cc_info += "x86"
+ elif cc_type in ("msvc", "mingw32"):
+ import platform
+ if platform.architecture()[0] == "32bit":
+ cc_info += "x86"
+ else:
+ cc_info += "x86_64"
+ else:
+ # the last hope, too bad for cross-compiling
+ import platform
+ cc_info += platform.machine()
+
+ cc_info += cc_type
+ cflags = os.environ.get("CFLAGS", "")
+ if cflags not in cc_info:
+ cc_info += cflags
+
+ self._dist_info = cc_info
+ return cc_info
+
    @staticmethod
    def dist_error(*args):
        """Raise a compiler error (distutils `CompileError`),
        with args formatted by `_dist_str()`."""
        from distutils.errors import CompileError
        raise CompileError(_Distutils._dist_str(*args))

    @staticmethod
    def dist_fatal(*args):
        """Raise a distutils error (`DistutilsError`),
        with args formatted by `_dist_str()`."""
        from distutils.errors import DistutilsError
        raise DistutilsError(_Distutils._dist_str(*args))

    @staticmethod
    def dist_log(*args, stderr=False):
        """Print a console message through numpy.distutils' logger;
        `stderr=True` logs at warning level instead of info."""
        from numpy.distutils import log
        out = _Distutils._dist_str(*args)
        if stderr:
            log.warn(out)
        else:
            log.info(out)

    @staticmethod
    def dist_load_module(name, path):
        """Load a module from file, required by the abstract class '_Cache'."""
        from numpy.compat import npy_load_module
        try:
            return npy_load_module(name, path)
        except Exception as e:
            # a broken cache file is not fatal; report it and return None
            _Distutils.dist_log(e, stderr=True)
        return None
+
+ @staticmethod
+ def _dist_str(*args):
+ """Return a string to print by log and errors."""
+ def to_str(arg):
+ if not isinstance(arg, str) and hasattr(arg, '__iter__'):
+ ret = []
+ for a in arg:
+ ret.append(to_str(a))
+ return '('+ ' '.join(ret) + ')'
+ return str(arg)
+
+ stack = inspect.stack()[2]
+ start = "CCompilerOpt.%s[%d] : " % (stack.function, stack.lineno)
+ out = ' '.join([
+ to_str(a)
+ for a in (*args,)
+ ])
+ return start + out
+
+ def _dist_test_spawn_paths(self, cmd, display=None):
+ """
+ Fix msvc SDK ENV path same as distutils do
+ without it we get c1: fatal error C1356: unable to find mspdbcore.dll
+ """
+ if not hasattr(self._ccompiler, "_paths"):
+ self._dist_test_spawn(cmd)
+ return
+ old_path = os.getenv("path")
+ try:
+ os.environ["path"] = self._ccompiler._paths
+ self._dist_test_spawn(cmd)
+ finally:
+ os.environ["path"] = old_path
+
    _dist_warn_regex = re.compile(
        # intel and msvc compilers don't raise
        # fatal errors when flags are wrong or unsupported
        ".*("
        "ignoring unknown option|" # msvc
        "invalid argument for option" # intel
        ").*"
    )
    @staticmethod
    def _dist_test_spawn(cmd, display=None):
        """Execute `cmd` while capturing stdout+stderr; raise a compile
        error (via `dist_error`) when the command fails, can't be started,
        or when its output matches `_dist_warn_regex` — since some
        compilers merely warn about unsupported flags.
        """
        from distutils.errors import CompileError
        try:
            o = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
            if isinstance(o, bytes):
                o = o.decode()
            if o and re.match(_Distutils._dist_warn_regex, o):
                # flag-warning promoted to a hard failure
                _Distutils.dist_error(
                    "Flags in command", cmd ,"aren't supported by the compiler"
                    ", output -> \n%s" % o
                )
        except subprocess.CalledProcessError as exc:
            o = exc.output
            s = exc.returncode
        except OSError:
            # the command itself couldn't be started
            o = b''
            s = 127
        else:
            # success path (no exception raised)
            return None
        o = o.decode()
        _Distutils.dist_error(
            "Command", cmd, "failed with exit status %d output -> \n%s" % (
                s, o
            ))
+
+_share_cache = {}
+class _Cache:
+ """An abstract class handles caching functionality, provides two
+ levels of caching, in-memory by share instances attributes among
+ each other and by store attributes into files.
+
+ **Note**:
+ any attributes that start with ``_`` or ``conf_`` will be ignored.
+
+ Parameters
+ ----------
+ cache_path: str or None
+ The path of cache file, if None then cache in file will disabled.
+
+ *factors:
+ The caching factors that need to utilize next to `conf_cache_factors`.
+
+ Attributes
+ ----------
+ cache_private: set
+ Hold the attributes that need be skipped from "in-memory cache".
+
+ cache_infile: bool
+        Set while initializing this class, to record whether the cache could be
+        loaded from the cache file specified by 'cache_path'.
+ """
+
+ # skip attributes from cache
+ _cache_ignore = re.compile("^(_|conf_)")
+
    def __init__(self, cache_path=None, *factors):
        """Initialize the two-level cache: try the file cache first, then
        the in-memory share; register the file writer to run at exit.
        """
        self.cache_me = {}
        self.cache_private = set()
        self.cache_infile = False

        if self.conf_nocache:
            self.dist_log("cache is disabled by `Config`")
            return

        # one hash covers the caller's factors plus the config-level ones
        chash = self.cache_hash(*factors, *self.conf_cache_factors)
        if cache_path:
            if os.path.exists(cache_path):
                self.dist_log("load cache from file ->", cache_path)
                cache_mod = self.dist_load_module("cache", cache_path)
                if not cache_mod:
                    self.dist_log(
                        "unable to load the cache file as a module",
                        stderr=True
                    )
                elif not hasattr(cache_mod, "hash") or \
                    not hasattr(cache_mod, "data"):
                    self.dist_log("invalid cache file", stderr=True)
                elif chash == cache_mod.hash:
                    # file cache is valid: adopt all of its attributes
                    self.dist_log("hit the file cache")
                    for attr, val in cache_mod.data.items():
                        setattr(self, attr, val)
                    self.cache_infile = True
                else:
                    self.dist_log("miss the file cache")

            atexit.register(self._cache_write, cache_path, chash)

        if not self.cache_infile:
            # fall back to sharing attributes with a live instance that
            # was built from the same caching factors
            other_cache = _share_cache.get(chash)
            if other_cache:
                self.dist_log("hit the memory cache")
                for attr, val in other_cache.__dict__.items():
                    if attr in other_cache.cache_private or \
                        re.match(self._cache_ignore, attr):
                        continue
                    setattr(self, attr, val)

        _share_cache[chash] = self
+
    def __del__(self):
        # TODO: remove the cache from `_share_cache` on deletion
        pass
+
    def _cache_write(self, cache_path, cache_hash):
        # TODO: don't write if the cache doesn't change
        self.dist_log("write cache to path ->", cache_path)
        # drop private ('_*') and config ('conf_*') attributes before dumping
        for attr in list(self.__dict__.keys()):
            if re.match(self._cache_ignore, attr):
                self.__dict__.pop(attr)

        d = os.path.dirname(cache_path)
        if not os.path.exists(d):
            os.makedirs(d)

        # the cache file is a plain importable python module:
        # `hash` guards validity and `data` holds the cached attributes
        repr_dict = pprint.pformat(self.__dict__, compact=True)
        with open(cache_path, "w") as f:
            f.write(textwrap.dedent("""\
            # AUTOGENERATED DON'T EDIT
            # Please make changes to the code generator \
            (distutils/ccompiler_opt.py)
            hash = {}
            data = \\
            """).format(cache_hash))
            f.write(repr_dict)
+
+ def cache_hash(self, *factors):
+ # is there a built-in non-crypto hash?
+ # sdbm
+ chash = 0
+ for f in factors:
+ for char in str(f):
+ chash = ord(char) + (chash << 6) + (chash << 16) - chash
+ chash &= 0xFFFFFFFF
+ return chash
+
+ @staticmethod
+ def me(cb):
+ """
+ A static method that can be treated as a decorator to
+ dynamically cache certain methods.
+ """
+ def cache_wrap_me(self, *args, **kwargs):
+ # good for normal args
+ cache_key = str((
+ cb.__name__, *args, *kwargs.keys(), *kwargs.values()
+ ))
+ if cache_key in self.cache_me:
+ return self.cache_me[cache_key]
+ ccb = cb(self, *args, **kwargs)
+ self.cache_me[cache_key] = ccb
+ return ccb
+ return cache_wrap_me
+
+class _CCompiler(object):
+ """A helper class for `CCompilerOpt` containing all utilities that
+ related to the fundamental compiler's functions.
+
+ Attributes
+ ----------
+ cc_on_x86 : bool
+ True when the target architecture is 32-bit x86
+ cc_on_x64 : bool
+ True when the target architecture is 64-bit x86
+ cc_on_ppc64 : bool
+ True when the target architecture is 64-bit big-endian PowerPC
+ cc_on_armhf : bool
+ True when the target architecture is 32-bit ARMv7+
+ cc_on_aarch64 : bool
+ True when the target architecture is 64-bit Armv8-a+
+ cc_on_noarch : bool
+ True when the target architecture is unknown or not supported
+ cc_is_gcc : bool
+ True if the compiler is GNU or
+ if the compiler is unknown
+ cc_is_clang : bool
+ True if the compiler is Clang
+ cc_is_icc : bool
+ True if the compiler is Intel compiler (unix like)
+ cc_is_iccw : bool
+ True if the compiler is Intel compiler (msvc like)
+ cc_is_nocc : bool
+ True if the compiler isn't supported directly,
+ Note: that cause a fail-back to gcc
+ cc_has_debug : bool
+ True if the compiler has debug flags
+ cc_has_native : bool
+ True if the compiler has native flags
+ cc_noopt : bool
+ True if the compiler has definition 'DISABLE_OPT*',
+ or 'cc_on_noarch' is True
+ cc_march : str
+ The target architecture name, or "unknown" if
+ the architecture isn't supported
+ cc_name : str
+ The compiler name, or "unknown" if the compiler isn't supported
+ cc_flags : dict
+ Dictionary containing the initialized flags of `_Config.conf_cc_flags`
+ """
    def __init__(self):
        # detection results are shared via _Cache; skip if already populated
        if hasattr(self, "cc_is_cached"):
            return
        # each inner tuple below is one detection "section": the first regex
        # that matches dist_info() wins, and exactly one attribute per
        # section ends up True (the ""-regex entries act as catch-alls)
        to_detect = (
            # attr regex
            (
                ("cc_on_x64", "^(x|x86_|amd)64"),
                ("cc_on_x86", "^(x86|i386|i686)"),
                ("cc_on_ppc64le", "^(powerpc|ppc)64(el|le)"),
                ("cc_on_ppc64", "^(powerpc|ppc)64"),
                ("cc_on_armhf", "^arm"),
                ("cc_on_aarch64", "^aarch64"),
                # priority is given to first of string
                # if it fail we search in the rest, due
                # to append platform.machine() at the end,
                # check method 'dist_info()' for more clarification.
                ("cc_on_x64", ".*(x|x86_|amd)64.*"),
                ("cc_on_x86", ".*(x86|i386|i686).*"),
                ("cc_on_ppc64le", ".*(powerpc|ppc)64(el|le).*"),
                ("cc_on_ppc64", ".*(powerpc|ppc)64.*"),
                ("cc_on_armhf", ".*arm.*"),
                ("cc_on_aarch64", ".*aarch64.*"),
                # undefined platform
                ("cc_on_noarch", ""),
            ),
            (
                ("cc_is_gcc", r".*(gcc|gnu\-g).*"),
                ("cc_is_clang", ".*clang.*"),
                ("cc_is_iccw", ".*(intelw|intelemw|iccw).*"), # intel msvc like
                ("cc_is_icc", ".*(intel|icc).*"), # intel unix like
                ("cc_is_msvc", ".*msvc.*"),
                ("cc_is_nocc", ""),
            ),
            (("cc_has_debug", ".*(O0|Od|ggdb|coverage|debug:full).*"),),
            (("cc_has_native", ".*(-march=native|-xHost|/QxHost).*"),),
            # in case if the class run with -DNPY_DISABLE_OPTIMIZATION
            (("cc_noopt", ".*DISABLE_OPT.*"),),
        )
        # default every attribute to False before detection
        for section in to_detect:
            for attr, rgex in section:
                setattr(self, attr, False)

        dist_info = self.dist_info()
        for section in to_detect:
            for attr, rgex in section:
                # empty regex always "matches" (acts as the fallback entry)
                if rgex and not re.match(rgex, dist_info, re.IGNORECASE):
                    continue
                setattr(self, attr, True)
                break

        if self.cc_on_noarch:
            self.dist_log(
                "unable to detect CPU arch via compiler info, "
                "optimization is disabled \ninfo << %s >> " % dist_info,
                stderr=True
            )
            self.cc_noopt = True

        if self.conf_noopt:
            self.dist_log("Optimization is disabled by the Config", stderr=True)
            self.cc_noopt = True

        if self.cc_is_nocc:
            """
            mingw can be treated as a gcc, and also xlc even if it based on clang,
            but still has the same gcc optimization flags.
            """
            self.dist_log(
                "unable to detect compiler name via info <<\n%s\n>> "
                "treating it as a gcc" % dist_info,
                stderr=True
            )
            self.cc_is_gcc = True

        # resolve the single winning arch/compiler names for readability
        self.cc_march = "unknown"
        for arch in ("x86", "x64", "ppc64", "ppc64le", "armhf", "aarch64"):
            if getattr(self, "cc_on_" + arch):
                self.cc_march = arch
                break

        self.cc_name = "unknown"
        for name in ("gcc", "clang", "iccw", "icc", "msvc"):
            if getattr(self, "cc_is_" + name):
                self.cc_name = name
                break

        # initialize cc_flags from the config table, keeping only the flags
        # that the compiler actually accepts
        self.cc_flags = {}
        compiler_flags = self.conf_cc_flags.get(self.cc_name)
        if compiler_flags is None:
            self.dist_fatal(
                "undefined flag for compiler '%s', "
                "leave an empty dict instead" % self.cc_name
            )
        for name, flags in compiler_flags.items():
            self.cc_flags[name] = nflags = []
            if flags:
                assert(isinstance(flags, str))
                flags = flags.split()
                for f in flags:
                    if self.cc_test_flags([f]):
                        nflags.append(f)

        self.cc_is_cached = True
+
+ @_Cache.me
+ def cc_test_flags(self, flags):
+ """
+ Returns True if the compiler supports 'flags'.
+ """
+ assert(isinstance(flags, list))
+ self.dist_log("testing flags", flags)
+ test_path = os.path.join(self.conf_check_path, "test_flags.c")
+ test = self.dist_test(test_path, flags)
+ if not test:
+ self.dist_log("testing failed", stderr=True)
+ return test
+
+ def cc_normalize_flags(self, flags):
+ """
+ Remove the conflicts that caused due gathering implied features flags.
+
+ Parameters
+ ----------
+ 'flags' list, compiler flags
+ flags should be sorted from the lowest to the highest interest.
+
+ Returns
+ -------
+ list, filtered from any conflicts.
+
+ Examples
+ --------
+ >>> self.cc_normalize_flags(['-march=armv8.2-a+fp16', '-march=armv8.2-a+dotprod'])
+ ['armv8.2-a+fp16+dotprod']
+
+ >>> self.cc_normalize_flags(
+ ['-msse', '-msse2', '-msse3', '-mssse3', '-msse4.1', '-msse4.2', '-mavx', '-march=core-avx2']
+ )
+ ['-march=core-avx2']
+ """
+ assert(isinstance(flags, list))
+ if self.cc_is_gcc or self.cc_is_clang or self.cc_is_icc:
+ return self._cc_normalize_unix(flags)
+
+ if self.cc_is_msvc or self.cc_is_iccw:
+ return self._cc_normalize_win(flags)
+ return flags
+
    # Pre-compiled regexes used by `_cc_normalize_unix()`; see that method
    # for how the three stages interact.
    _cc_normalize_unix_mrgx = re.compile(
        # 1- to check the highest of
        # arch-selecting flags: -mcpu=, -march= and Intel's -x<ARCH>
        r"^(-mcpu=|-march=|-x[A-Z0-9\-])"
    )
    _cc_normalize_unix_frgx = re.compile(
        # 2- to remove any flags starts with
        # -march, -mcpu, -x(INTEL) and '-m' without '='
        # (i.e. single-feature switches like -msse2 that the chosen
        # arch flag already subsumes)
        r"^(?!(-mcpu=|-march=|-x[A-Z0-9\-]))(?!-m[a-z0-9\-\.]*.$)"
    )
    _cc_normalize_unix_krgx = re.compile(
        # 3- keep only the highest of
        # per-flag-kind overridable options (-mfpu, -mtune)
        r"^(-mfpu|-mtune)"
    )
    # matches the digits/dots of an arch version, e.g. "8.2" in armv8.2-a
    _cc_normalize_arch_ver = re.compile(
        r"[0-9.]"
    )
    def _cc_normalize_unix(self, flags):
        """Normalize gcc/clang/icc style flags.

        Keeps only the last (highest-interest) arch-selecting flag,
        merges '+subflag' extensions from same-version arch flags into it,
        drops single-feature '-m<feat>' switches it subsumes, and keeps
        only the last occurrence of overridable flags (-mfpu/-mtune).
        'flags' must be sorted from lowest to highest interest.
        """
        def ver_flags(f):
            # arch ver subflag
            # -march=armv8.2-a+fp16fml
            # -> (8.2, "-march=armv8.2-a", ["fp16fml"])
            tokens = f.split('+')
            ver = float('0' + ''.join(
                re.findall(self._cc_normalize_arch_ver, tokens[0])
            ))
            return ver, tokens[0], tokens[1:]

        if len(flags) <= 1:
            return flags
        # get the highest matched flag
        # (scan from the end; flags are sorted low -> high interest)
        for i, cur_flag in enumerate(reversed(flags)):
            if not re.match(self._cc_normalize_unix_mrgx, cur_flag):
                continue
            lower_flags = flags[:-(i+1)]
            # NOTE: when i == 0 this slice is the whole list, which is why
            # it is only appended below under the `if i > 0` guard
            upper_flags = flags[-i:]
            filterd = list(filter(
                self._cc_normalize_unix_frgx.search, lower_flags
            ))
            # gather subflags
            # merge '+ext' suffixes from lower arch flags of the same version
            ver, arch, subflags = ver_flags(cur_flag)
            if ver > 0 and len(subflags) > 0:
                for xflag in lower_flags:
                    xver, _, xsubflags = ver_flags(xflag)
                    if ver == xver:
                        subflags = xsubflags + subflags
                cur_flag = arch + '+' + '+'.join(subflags)

            flags = filterd + [cur_flag]
            if i > 0:
                flags += upper_flags
            break

        # to remove overridable flags
        # keep only the last occurrence of each -mfpu/-mtune kind
        final_flags = []
        matched = set()
        for f in reversed(flags):
            match = re.match(self._cc_normalize_unix_krgx, f)
            if not match:
                pass
            elif match[0] in matched:
                continue
            else:
                matched.add(match[0])
            final_flags.insert(0, f)
        return final_flags
+
    # Pre-compiled regexes used by `_cc_normalize_win()`.
    # drops earlier arch-selecting flags (/arch:, /Qx:)
    # NOTE(review): the filter matches "/arch:" while the matcher below
    # matches bare "/arch" — presumably intentional, but worth confirming
    _cc_normalize_win_frgx = re.compile(
        r"^(?!(/arch\:|/Qx\:))"
    )
    # matches the arch-selecting flags (/arch, /Qx:)
    _cc_normalize_win_mrgx = re.compile(
        r"^(/arch|/Qx:)"
    )
+ def _cc_normalize_win(self, flags):
+ for i, f in enumerate(reversed(flags)):
+ if not re.match(self._cc_normalize_win_mrgx, f):
+ continue
+ i += 1
+ return list(filter(
+ self._cc_normalize_win_frgx.search, flags[:-i]
+ )) + flags[-i:]
+ return flags
+
+class _Feature:
+    """A helper class for `CCompilerOpt` that manages CPU features.
+
+ Attributes
+ ----------
+    feature_supported : dict
+        Dictionary containing all CPU features supported
+        by the platform, according to the specified values in attribute
+        `_Config.conf_features` and `_Config.conf_features_partial()`
+
+ feature_min : set
+ The minimum support of CPU features, according to
+ the specified values in attribute `_Config.conf_min_features`.
+ """
    def __init__(self):
        # this mixin can be re-initialized through the MRO; skip the work
        # when a previous run has already populated the feature tables
        if hasattr(self, "feature_is_cached"):
            return
        # start from the platform-specific subset and fill in any missing
        # options from the full per-feature configuration
        self.feature_supported = pfeatures = self.conf_features_partial()
        for feature_name in list(pfeatures.keys()):
            feature = pfeatures[feature_name]
            cfeature = self.conf_features[feature_name]
            feature.update({
                k:v for k,v in cfeature.items() if k not in feature
            })
            disabled = feature.get("disable")
            if disabled is not None:
                # drop features explicitly disabled by the configuration
                pfeatures.pop(feature_name)
                self.dist_log(
                    "feature '%s' is disabled," % feature_name,
                    disabled, stderr=True
                )
                continue
            # list is used internally for these options
            # (the config may specify them as space-separated strings)
            for option in (
                "implies", "group", "detect", "headers", "flags"
            ) :
                oval = feature.get(option)
                if isinstance(oval, str):
                    feature[option] = oval.split()

        # minimum feature set for the detected arch, restricted to what
        # the platform actually supports
        self.feature_min = set()
        min_f = self.conf_min_features.get(self.cc_march, "")
        for F in min_f.upper().split():
            if F in self.feature_supported:
                self.feature_min.add(F)

        self.feature_is_cached = True
+
+ def feature_names(self, names=None, force_flags=None):
+ """
+ Returns a set of CPU feature names that supported by platform and the **C** compiler.
+
+ Parameters
+ ----------
+ 'names': sequence or None, optional
+ Specify certain CPU features to test it against the **C** compiler.
+ if None(default), it will test all current supported features.
+ **Note**: feature names must be in upper-case.
+
+ 'force_flags': list or None, optional
+ If None(default), default compiler flags for every CPU feature will be used
+ during the test.
+ """
+ assert(
+ names is None or (
+ not isinstance(names, str) and
+ hasattr(names, "__iter__")
+ )
+ )
+ assert(force_flags is None or isinstance(force_flags, list))
+ if names is None:
+ names = self.feature_supported.keys()
+ supported_names = set()
+ for f in names:
+ if self.feature_is_supported(f, force_flags=force_flags):
+ supported_names.add(f)
+ return supported_names
+
+ def feature_is_exist(self, name):
+ """
+ Returns True if a certain feature is exist and covered within
+ `_Config.conf_features`.
+
+ Parameters
+ ----------
+ 'name': str
+ feature name in uppercase.
+ """
+ assert(name.isupper())
+ return name in self.conf_features
+
+ def feature_sorted(self, names, reverse=False):
+ """
+ Sort a list of CPU features ordered by the lowest interest.
+
+ Parameters
+ ----------
+ 'names': sequence
+ sequence of supported feature names in uppercase.
+ 'reverse': bool, optional
+ If true, the sorted features is reversed. (highest interest)
+
+ Returns
+ -------
+ list, sorted CPU features
+ """
+ def sort_cb(k):
+ if isinstance(k, str):
+ return self.feature_supported[k]["interest"]
+ # multiple features
+ rank = max([self.feature_supported[f]["interest"] for f in k])
+ # FIXME: that's not a safe way to increase the rank for
+ # multi targets
+ rank += len(k) -1
+ return rank
+ return sorted(names, reverse=reverse, key=sort_cb)
+
    def feature_implies(self, names):
        """Return a set of CPU features that implied by 'names'

        'names' may be a single feature name (str) or an iterable of them;
        the returned set never includes the input names themselves.
        """
        def get_implies(name, _caller=[]):
            # NOTE: the mutable default '_caller' is shared across calls
            # of this nested function within one feature_implies() call;
            # it only suppresses re-expansion of already-visited features,
            # the final union is unaffected — presumably intentional
            implies = set()
            d = self.feature_supported[name]
            for i in d.get("implies", []):
                implies.add(i)
                if i in _caller:
                    # infinity recursive guard since
                    # features can imply each other
                    continue
                _caller.append(name)
                implies = implies.union(get_implies(i, _caller))
            return implies

        if isinstance(names, str):
            return get_implies(names)

        assert(hasattr(names, "__iter__"))
        implies = set()
        for n in names:
            implies = implies.union(get_implies(n))
        return implies
+
+ def feature_implies_c(self, names):
+ """same as feature_implies() but combining 'names'"""
+ if isinstance(names, str):
+ names = set((names,))
+ else:
+ names = set(names)
+ return names.union(self.feature_implies(names))
+
+ def feature_ahead(self, names):
+ """
+ Return list of features in 'names' after remove any
+ implied features and keep the origins.
+
+ Parameters
+ ----------
+ 'names': sequence
+ sequence of CPU feature names in uppercase.
+
+ Returns
+ -------
+ list of CPU features sorted as-is 'names'
+
+ Examples
+ --------
+ >>> self.feature_untied(["SSE2", "SSE3", "SSE41"])
+ ["SSE41"]
+ # assume AVX2 and FMA3 implies each other and AVX2
+ # is the highest interest
+ >>> self.feature_untied(["SSE2", "SSE3", "SSE41", "AVX2", "FMA3"])
+ ["AVX2"]
+ # assume AVX2 and FMA3 don't implies each other
+ >>> self.feature_untied(["SSE2", "SSE3", "SSE41", "AVX2", "FMA3"])
+ ["AVX2", "FMA3"]
+ """
+ assert(
+ not isinstance(names, str)
+ and hasattr(names, '__iter__')
+ )
+ implies = self.feature_implies(names)
+ ahead = [n for n in names if n not in implies]
+ if len(ahead) == 0:
+ # return the highest interested feature
+ # if all features imply each other
+ ahead = self.feature_sorted(names, reverse=True)[:1]
+ return ahead
+
    def feature_untied(self, names):
        """
        same as 'feature_ahead()' but if both features implied each other
        and keep the highest interest.

        Parameters
        ----------
        'names': sequence
            sequence of CPU feature names in uppercase.

        Returns
        -------
        list of CPU features sorted as-is 'names'

        Examples
        --------
        >>> self.feature_untied(["SSE2", "SSE3", "SSE41"])
        ["SSE2", "SSE3", "SSE41"]
        # assume AVX2 and FMA3 implies each other
        >>> self.feature_untied(["SSE2", "SSE3", "SSE41", "FMA3", "AVX2"])
        ["SSE2", "SSE3", "SSE41", "AVX2"]
        """
        assert(
            not isinstance(names, str)
            and hasattr(names, '__iter__')
        )
        final = []
        for n in names:
            implies = self.feature_implies(n)
            # features already collected that are mutually implied with 'n'
            tied = [
                nn for nn in final
                if nn in implies and n in self.feature_implies(nn)
            ]
            if tied:
                # keep only the tied feature with the highest interest
                tied = self.feature_sorted(tied + [n])
                if n not in tied[1:]:
                    # 'n' has the lowest interest among its ties; drop it
                    continue
                # 'n' wins: remove the lowest-interest tied feature
                final.remove(tied[:1][0])
            final.append(n)
        return final
+
    def feature_get_til(self, names, keyisfalse):
        """
        same as `feature_implies_c()` but stop collecting implied
        features when feature's option that provided through
        parameter 'keyisfalse' is False, also sorting the returned
        features.
        """
        def til(tnames):
            # sort from highest to lowest interest then cut if "key" is False
            tnames = self.feature_implies_c(tnames)
            tnames = self.feature_sorted(tnames, reverse=True)
            for i, n in enumerate(tnames):
                if not self.feature_supported[n].get(keyisfalse, True):
                    # NOTE: the cutting feature itself is kept (slice i+1)
                    tnames = tnames[:i+1]
                    break
            return tnames

        if isinstance(names, str) or len(names) <= 1:
            names = til(names)
            # normalize the sort
            # (til() returns highest-first; callers expect lowest-first)
            names.reverse()
            return names

        # multiple names: expand each origin feature separately,
        # then merge and sort the union
        names = self.feature_ahead(names)
        names = {t for n in names for t in til(n)}
        return self.feature_sorted(names)
+
+ def feature_detect(self, names):
+ """
+ Return a list of CPU features that required to be detected
+ sorted from the lowest to highest interest.
+ """
+ names = self.feature_get_til(names, "implies_detect")
+ detect = []
+ for n in names:
+ d = self.feature_supported[n]
+ detect += d.get("detect", d.get("group", [n]))
+ return detect
+
+ @_Cache.me
+ def feature_flags(self, names):
+ """
+ Return a list of CPU features flags sorted from the lowest
+ to highest interest.
+ """
+ names = self.feature_sorted(self.feature_implies_c(names))
+ flags = []
+ for n in names:
+ d = self.feature_supported[n]
+ f = d.get("flags", [])
+ if not f or not self.cc_test_flags(f):
+ continue
+ flags += f
+ return self.cc_normalize_flags(flags)
+
+ @_Cache.me
+ def feature_test(self, name, force_flags=None):
+ """
+ Test a certain CPU feature against the compiler through its own
+ check file.
+
+ Parameters
+ ----------
+ 'name': str
+ Supported CPU feature name.
+
+ 'force_flags': list or None, optional
+ If None(default), the returned flags from `feature_flags()`
+ will be used.
+ """
+ if force_flags is None:
+ force_flags = self.feature_flags(name)
+
+ self.dist_log(
+ "testing feature '%s' with flags (%s)" % (
+ name, ' '.join(force_flags)
+ ))
+ # Each CPU feature must have C source code contains at
+ # least one intrinsic or instruction related to this feature.
+ test_path = os.path.join(
+ self.conf_check_path, "cpu_%s.c" % name.lower()
+ )
+ if not os.path.exists(test_path):
+ self.dist_fatal("feature test file is not exist", path)
+
+ test = self.dist_test(test_path, force_flags + self.cc_flags["werror"])
+ if not test:
+ self.dist_log("testing failed", stderr=True)
+ return test
+
+ @_Cache.me
+ def feature_is_supported(self, name, force_flags=None):
+ """
+ Check if a certain CPU feature is supported by the platform and compiler.
+
+ Parameters
+ ----------
+ 'name': str
+ CPU feature name in uppercase.
+
+ 'force_flags': list or None, optional
+ If None(default), default compiler flags for every CPU feature will be used
+ during test.
+ """
+ assert(name.isupper())
+ assert(force_flags is None or isinstance(force_flags, list))
+
+ supported = name in self.feature_supported
+ if supported:
+ for impl in self.feature_implies(name):
+ if not self.feature_test(impl, force_flags):
+ return False
+ if not self.feature_test(name, force_flags):
+ return False
+ return supported
+
+ @_Cache.me
+ def feature_can_autovec(self, name):
+ """
+ check if the feature can be auto-vectorized by the compiler
+ """
+ assert(isinstance(name, str))
+ d = self.feature_supported[name]
+ can = d.get("autovec", None)
+ if can is None:
+ valid_flags = [
+ self.cc_test_flags([f]) for f in d.get("flags", [])
+ ]
+ can = valid_flags and any(valid_flags)
+ return can
+
+ def feature_c_preprocessor(self, feature_name, tabs=0):
+ """
+ Generate C preprocessor definitions and include headers of a CPU feature.
+
+ Parameters
+ ----------
+ 'feature_name': str
+ CPU feature name in uppercase.
+ 'tabs': int
+ if > 0, align the generated strings to the right depend on number of tabs.
+
+ Returns
+ -------
+ str, generated C preprocessor
+
+ Examples
+ --------
+ >>> self.feature_c_preprocessor("SSE3")
+ /** SSE3 **/
+ #define NPY_HAVE_SSE3 1
+ #include <pmmintrin.h>
+ """
+ assert(feature_name.isupper())
+ feature = self.feature_supported.get(feature_name)
+ assert(feature is not None)
+
+ prepr = [
+ "/** %s **/" % feature_name,
+ "#define %sHAVE_%s 1" % (self.conf_c_prefix, feature_name)
+ ]
+ prepr += [
+ "#include <%s>" % h for h in feature.get("headers", [])
+ ]
+ group = feature.get("group", [])
+ for f in group:
+ # Guard features in case of duplicate definitions
+ prepr += [
+ "#ifndef %sHAVE_%s" % (self.conf_c_prefix, f),
+ "\t#define %sHAVE_%s 1" % (self.conf_c_prefix, f),
+ "#endif",
+ ]
+ if tabs > 0:
+ prepr = [('\t'*tabs) + l for l in prepr]
+ return '\n'.join(prepr)
+
+class _Parse:
+    """A helper class that parses the main arguments of `CCompilerOpt`
+    and the configuration statements of dispatch-able sources.
+
+ Parameters
+ ----------
+ cpu_baseline: str or None
+ minimal set of required CPU features or special options.
+
+ cpu_dispatch: str or None
+ dispatched set of additional CPU features or special options.
+
+ Special options can be:
+ - **MIN**: Enables the minimum CPU features that utilized via `_Config.conf_min_features`
+ - **MAX**: Enables all supported CPU features by the Compiler and platform.
+ - **NATIVE**: Enables all CPU features that supported by the current machine.
+ - **NONE**: Enables nothing
+ - **Operand +/-**: remove or add features, useful with options **MAX**, **MIN** and **NATIVE**.
+ NOTE: operand + is only added for nominal reason.
+
+ NOTES:
+ - Case-insensitive among all CPU features and special options.
+ - Comma or space can be used as a separator.
+ - If the CPU feature is not supported by the user platform or compiler,
+ it will be skipped rather than raising a fatal error.
+ - Any specified CPU features to 'cpu_dispatch' will be skipped if its part of CPU baseline features
+ - 'cpu_baseline' force enables implied features.
+
+ Attributes
+ ----------
+ parse_baseline_names : list
+ Final CPU baseline's feature names(sorted from low to high)
+ parse_baseline_flags : list
+ Compiler flags of baseline features
+ parse_dispatch_names : list
+ Final CPU dispatch-able feature names(sorted from low to high)
+ parse_target_groups : dict
+ Dictionary containing initialized target groups that configured
+ through class attribute `conf_target_groups`.
+
+        The key represents the group name and the value is a tuple
+        containing three items :
+ - bool, True if group has the 'baseline' option.
+ - list, list of CPU features.
+ - list, list of extra compiler flags.
+
+ """
    def __init__(self, cpu_baseline, cpu_dispatch):
        # policy table is (re)built every time so the bound-method
        # references stay attached to this instance
        self._parse_policies = dict(
            # POLICY NAME, (HAVE, NOT HAVE, [DEB])
            KEEP_BASELINE = (
                None, self._parse_policy_not_keepbase,
                []
            ),
            KEEP_SORT = (
                self._parse_policy_keepsort,
                self._parse_policy_not_keepsort,
                []
            ),
            MAXOPT = (
                self._parse_policy_maxopt, None,
                []
            ),
            WERROR = (
                self._parse_policy_werror, None,
                []
            ),
            AUTOVEC = (
                self._parse_policy_autovec, None,
                ["MAXOPT"]
            )
        )
        # skip the heavy parsing when a previous init already cached it
        if hasattr(self, "parse_is_cached"):
            return

        self.parse_baseline_names = []
        self.parse_baseline_flags = []
        self.parse_dispatch_names = []
        self.parse_target_groups = {}

        if self.cc_noopt:
            # skip parsing baseline and dispatch args and keep parsing target groups
            cpu_baseline = cpu_dispatch = None

        self.dist_log("check requested baseline")
        if cpu_baseline is not None:
            cpu_baseline = self._parse_arg_features("cpu_baseline", cpu_baseline)
            baseline_names = self.feature_names(cpu_baseline)
            self.parse_baseline_flags = self.feature_flags(baseline_names)
            # the final baseline includes every implied feature, sorted
            # from the lowest to the highest interest
            self.parse_baseline_names = self.feature_sorted(
                self.feature_implies_c(baseline_names)
            )

        self.dist_log("check requested dispatch-able features")
        if cpu_dispatch is not None:
            cpu_dispatch_ = self._parse_arg_features("cpu_dispatch", cpu_dispatch)
            # features already in the baseline must not be dispatched
            cpu_dispatch = {
                f for f in cpu_dispatch_
                if f not in self.parse_baseline_names
            }
            conflict_baseline = cpu_dispatch_.difference(cpu_dispatch)
            self.parse_dispatch_names = self.feature_sorted(
                self.feature_names(cpu_dispatch)
            )
            if len(conflict_baseline) > 0:
                self.dist_log(
                    "skip features", conflict_baseline, "since its part of baseline"
                )

        self.dist_log("initialize targets groups")
        for group_name, tokens in self.conf_target_groups.items():
            self.dist_log("parse target group", group_name)
            GROUP_NAME = group_name.upper()
            if not tokens or not tokens.strip():
                # allow empty groups, useful in case if there's a need
                # to disable certain group since '_parse_target_tokens()'
                # requires at least one valid target
                self.parse_target_groups[GROUP_NAME] = (
                    False, [], []
                )
                continue
            has_baseline, features, extra_flags = \
                self._parse_target_tokens(tokens)
            self.parse_target_groups[GROUP_NAME] = (
                has_baseline, features, extra_flags
            )

        self.parse_is_cached = True
+
    def parse_targets(self, source):
        """
        Fetch and parse configuration statements that required for
        defining the targeted CPU features, statements should be declared
        in the top of source in between **C** comment and start
        with a special mark **@targets**.

        Configuration statements are sort of keywords representing
        CPU features names, group of statements and policies, combined
        together to determine the required optimization.

        Parameters
        ----------
        source: str
            the path of **C** source file.

        Returns
        -------
        - bool, True if group has the 'baseline' option
        - list, list of CPU features
        - list, list of extra compiler flags

        Raises
        ------
        DistutilsError (via dist_fatal)
            when no '@targets' comment is found within the first 1000
            lines, or the comment is never closed with '*/'.
        """
        self.dist_log("looking for '@targets' inside -> ", source)
        # get lines between /*@targets and */
        with open(source) as fd:
            tokens = ""
            max_to_reach = 1000 # good enough, isn't?
            start_with = "@targets"
            start_pos = -1
            end_with = "*/"
            end_pos = -1
            for current_line, line in enumerate(fd):
                if current_line == max_to_reach:
                    self.dist_fatal("reached the max of lines")
                    break
                if start_pos == -1:
                    start_pos = line.find(start_with)
                    if start_pos == -1:
                        continue
                    start_pos += len(start_with)
                tokens += line
                end_pos = line.find(end_with)
                if end_pos != -1:
                    # translate the in-line offset to an offset within
                    # the accumulated 'tokens' buffer
                    end_pos += len(tokens) - len(line)
                    break

        if start_pos == -1:
            self.dist_fatal("expected to find '%s' within a C comment" % start_with)
        if end_pos == -1:
            self.dist_fatal("expected to end with '%s'" % end_with)

        tokens = tokens[start_pos:end_pos]
        return self._parse_target_tokens(tokens)
+
    # splits on whitespace/comma and captures the +/- operands
    _parse_regex_arg = re.compile(r'\s|[,]|([+-])')
    def _parse_arg_features(self, arg_name, req_features):
        """Parse a 'cpu_baseline'/'cpu_dispatch' argument string into the
        final set of upper-case feature names, honoring the special
        options NONE/NATIVE/MAX/MIN and the +/- add/remove operands.
        """
        if not isinstance(req_features, str):
            self.dist_fatal("expected a string in '%s'" % arg_name)

        final_features = set()
        # space and comma can be used as a separator
        tokens = list(filter(None, re.split(self._parse_regex_arg, req_features)))
        append = True # append is the default
        for tok in tokens:
            if tok[0] in ("#", "$"):
                self.dist_fatal(
                    arg_name, "target groups and policies "
                    "aren't allowed from arguments, "
                    "only from dispatch-able sources"
                )
            if tok == '+':
                append = True
                continue
            if tok == '-':
                append = False
                continue

            TOK = tok.upper() # we use upper-case internally
            features_to = set()
            if TOK == "NONE":
                pass
            elif TOK == "NATIVE":
                native = self.cc_flags["native"]
                if not native:
                    self.dist_fatal(arg_name,
                        "native option isn't supported by the compiler"
                    )
                features_to = self.feature_names(force_flags=native)
            elif TOK == "MAX":
                features_to = self.feature_supported.keys()
            elif TOK == "MIN":
                features_to = self.feature_min
            else:
                if TOK in self.feature_supported:
                    features_to.add(TOK)
                else:
                    # unsupported-but-known features are silently skipped;
                    # unknown names are fatal
                    if not self.feature_is_exist(TOK):
                        self.dist_fatal(arg_name,
                            ", '%s' isn't a known feature or option" % tok
                        )
            if append:
                final_features = final_features.union(features_to)
            else:
                final_features = final_features.difference(features_to)

            append = True # back to default
        return final_features
+
    # splits on whitespace and ,*/ separators and captures parentheses
    _parse_regex_target = re.compile(r'\s|[*,/]|([()])')
    def _parse_target_tokens(self, tokens):
        """Parse '@targets' tokens (features, '#groups', '$policies' and
        '(multi targets)') and return the tuple
        (has_baseline, final_targets, extra_flags).
        """
        assert(isinstance(tokens, str))
        final_targets = [] # to keep it sorted as specified
        extra_flags = []
        has_baseline = False

        skipped  = set()
        policies = set()
        # None while outside '()'; a set while collecting a multi-target
        multi_target = None

        tokens = list(filter(None, re.split(self._parse_regex_target, tokens)))
        if not tokens:
            self.dist_fatal("expected one token at least")

        for tok in tokens:
            TOK = tok.upper()
            ch = tok[0]
            if ch in ('+', '-'):
                self.dist_fatal(
                    "+/- are 'not' allowed from target's groups or @targets, "
                    "only from cpu_baseline and cpu_dispatch parms"
                )
            elif ch == '$':
                if multi_target is not None:
                    self.dist_fatal(
                        "policies aren't allowed inside multi-target '()'"
                        ", only CPU features"
                    )
                policies.add(self._parse_token_policy(TOK))
            elif ch == '#':
                if multi_target is not None:
                    self.dist_fatal(
                        "target groups aren't allowed inside multi-target '()'"
                        ", only CPU features"
                    )
                has_baseline, final_targets, extra_flags = \
                self._parse_token_group(TOK, has_baseline, final_targets, extra_flags)
            elif ch == '(':
                if multi_target is not None:
                    self.dist_fatal("unclosed multi-target, missing ')'")
                multi_target = set()
            elif ch == ')':
                if multi_target is None:
                    self.dist_fatal("multi-target opener '(' wasn't found")
                targets = self._parse_multi_target(multi_target)
                if targets is None:
                    skipped.add(tuple(multi_target))
                else:
                    # a one-member multi-target collapses to a plain feature
                    if len(targets) == 1:
                        targets = targets[0]
                    if targets and targets not in final_targets:
                        final_targets.append(targets)
                multi_target = None # back to default
            else:
                if TOK == "BASELINE":
                    if multi_target is not None:
                        self.dist_fatal("baseline isn't allowed inside multi-target '()'")
                    has_baseline = True
                    continue

                if multi_target is not None:
                    multi_target.add(TOK)
                    continue

                if not self.feature_is_exist(TOK):
                    self.dist_fatal("invalid target name '%s'" % TOK)

                # keep only targets enabled through baseline or dispatch
                is_enabled = (
                    TOK in self.parse_baseline_names or
                    TOK in self.parse_dispatch_names
                )
                if is_enabled:
                    if TOK not in final_targets:
                        final_targets.append(TOK)
                    continue

                skipped.add(TOK)

        if multi_target is not None:
            self.dist_fatal("unclosed multi-target, missing ')'")
        if skipped:
            self.dist_log(
                "skip targets", skipped,
                "not part of baseline or dispatch-able features"
            )

        final_targets = self.feature_untied(final_targets)

        # add polices dependencies
        for p in list(policies):
            _, _, deps = self._parse_policies[p]
            for d in deps:
                if d in policies:
                    continue
                self.dist_log(
                    "policy '%s' force enables '%s'" % (
                    p, d
                ))
                policies.add(d)

        # release policies filtrations
        # (each policy contributes either its HAVE or NOT-HAVE callback)
        for p, (have, nhave, _) in self._parse_policies.items():
            func = None
            if p in policies:
                func = have
                self.dist_log("policy '%s' is ON" % p)
            else:
                func = nhave
            if not func:
                continue
            has_baseline, final_targets, extra_flags = func(
                has_baseline, final_targets, extra_flags
            )

        return has_baseline, final_targets, extra_flags
+
+ def _parse_token_policy(self, token):
+ """validate policy token"""
+ if len(token) <= 1 or token[-1:] == token[0]:
+ self.dist_fatal("'$' must stuck in the begin of policy name")
+ token = token[1:]
+ if token not in self._parse_policies:
+ self.dist_fatal(
+ "'%s' is an invalid policy name, available policies are" % token,
+ self._parse_policies.keys()
+ )
+ return token
+
+ def _parse_token_group(self, token, has_baseline, final_targets, extra_flags):
+ """validate group token"""
+ if len(token) <= 1 or token[-1:] == token[0]:
+ self.dist_fatal("'#' must stuck in the begin of group name")
+
+ token = token[1:]
+ ghas_baseline, gtargets, gextra_flags = self.parse_target_groups.get(
+ token, (False, None, [])
+ )
+ if gtargets is None:
+ self.dist_fatal(
+ "'%s' is an invalid target group name, " % token + \
+ "available target groups are",
+ self.parse_target_groups.keys()
+ )
+ if ghas_baseline:
+ has_baseline = True
+ # always keep sorting as specified
+ final_targets += [f for f in gtargets if f not in final_targets]
+ extra_flags += [f for f in gextra_flags if f not in extra_flags]
+ return has_baseline, final_targets, extra_flags
+
+ def _parse_multi_target(self, targets):
+ """validate multi targets that defined between parentheses()"""
+ # remove any implied features and keep the origins
+ if not targets:
+ self.dist_fatal("empty multi-target '()'")
+ if not all([
+ self.feature_is_exist(tar) for tar in targets
+ ]) :
+ self.dist_fatal("invalid target name in multi-target", targets)
+ if not all([
+ (
+ tar in self.parse_baseline_names or
+ tar in self.parse_dispatch_names
+ )
+ for tar in targets
+ ]) :
+ return None
+ targets = self.feature_ahead(targets)
+ if not targets:
+ return None
+ # force sort multi targets, so it can be comparable
+ targets = self.feature_sorted(targets)
+ targets = tuple(targets) # hashable
+ return targets
+
+ def _parse_policy_not_keepbase(self, has_baseline, final_targets, extra_flags):
+ """skip all baseline features"""
+ skipped = []
+ for tar in final_targets[:]:
+ is_base = False
+ if isinstance(tar, str):
+ is_base = tar in self.parse_baseline_names
+ else:
+ # multi targets
+ is_base = all([
+ f in self.parse_baseline_names
+ for f in tar
+ ])
+ if is_base:
+ skipped.append(tar)
+ final_targets.remove(tar)
+
+ if skipped:
+ self.dist_log("skip baseline features", skipped)
+
+ return has_baseline, final_targets, extra_flags
+
+ def _parse_policy_keepsort(self, has_baseline, final_targets, extra_flags):
+ """leave a notice that $keep_sort is on"""
+ self.dist_log(
+ "policy 'keep_sort' is on, dispatch-able targets", final_targets, "\n"
+ "are 'not' sorted depend on the highest interest but"
+ "as specified in the dispatch-able source or the extra group"
+ )
+ return has_baseline, final_targets, extra_flags
+
+ def _parse_policy_not_keepsort(self, has_baseline, final_targets, extra_flags):
+ """sorted depend on the highest interest"""
+ final_targets = self.feature_sorted(final_targets, reverse=True)
+ return has_baseline, final_targets, extra_flags
+
+ def _parse_policy_maxopt(self, has_baseline, final_targets, extra_flags):
+ """append the compiler optimization flags"""
+ if self.cc_has_debug:
+ self.dist_log("debug mode is detected, policy 'maxopt' is skipped.")
+ elif self.cc_noopt:
+ self.dist_log("optimization is disabled, policy 'maxopt' is skipped.")
+ else:
+ flags = self.cc_flags["opt"]
+ if not flags:
+ self.dist_log(
+ "current compiler doesn't support optimization flags, "
+ "policy 'maxopt' is skipped", stderr=True
+ )
+ else:
+ extra_flags += flags
+ return has_baseline, final_targets, extra_flags
+
+ def _parse_policy_werror(self, has_baseline, final_targets, extra_flags):
+ """force warnings to treated as errors"""
+ flags = self.cc_flags["werror"]
+ if not flags:
+ self.dist_log(
+ "current compiler doesn't support werror flags, "
+ "warnings will 'not' treated as errors", stderr=True
+ )
+ else:
+ self.dist_log("compiler warnings are treated as errors")
+ extra_flags += flags
+ return has_baseline, final_targets, extra_flags
+
+ def _parse_policy_autovec(self, has_baseline, final_targets, extra_flags):
+ """skip features that has no auto-vectorized support by compiler"""
+ skipped = []
+ for tar in final_targets[:]:
+ if isinstance(tar, str):
+ can = self.feature_can_autovec(tar)
+ else: # multiple target
+ can = all([
+ self.feature_can_autovec(t)
+ for t in tar
+ ])
+ if not can:
+ final_targets.remove(tar)
+ skipped.append(tar)
+
+ if skipped:
+ self.dist_log("skip non auto-vectorized features", skipped)
+
+ return has_baseline, final_targets, extra_flags
+
+class CCompilerOpt(_Config, _Distutils, _Cache, _CCompiler, _Feature, _Parse):
+ """
+ A helper class for `CCompiler` aims to provide extra build options
+ to effectively control of compiler optimizations that are directly
+ related to CPU features.
+ """
    def __init__(self, ccompiler, cpu_baseline="min", cpu_dispatch="max", cache_path=None):
        # initialize the mixins in dependency order; _Cache must come
        # before the mixins that rely on cached attributes
        _Config.__init__(self)
        _Distutils.__init__(self, ccompiler)
        _Cache.__init__(self, cache_path, self.dist_info(), cpu_baseline, cpu_dispatch)
        _CCompiler.__init__(self)
        _Feature.__init__(self)
        if not self.cc_noopt and self.cc_has_native:
            self.dist_log(
                "native flag is specified through environment variables. "
                "force cpu-baseline='native'"
            )
            cpu_baseline = "native"
        _Parse.__init__(self, cpu_baseline, cpu_dispatch)
        # keep the requested features untouched, need it later for report
        # and trace purposes
        self._requested_baseline = cpu_baseline
        self._requested_dispatch = cpu_dispatch
        # key is the dispatch-able source and value is a tuple
        # contains two items (has_baseline[boolean], dispatched-features[list])
        self.sources_status = getattr(self, "sources_status", {})
        # every instance should has a separate one
        self.cache_private.add("sources_status")
        # set it at the end to make sure the cache writing was done after init
        # this class
        self.hit_cache = hasattr(self, "hit_cache")
+
+ def is_cached(self):
+ """
+ Returns True if the class loaded from the cache file
+ """
+ return self.cache_infile and self.hit_cache
+
    def cpu_baseline_flags(self):
        """
        Returns a list of final CPU baseline compiler flags,
        as computed by `_Parse.__init__()` from the requested baseline.
        """
        return self.parse_baseline_flags
+
    def cpu_baseline_names(self):
        """
        return a list of final CPU baseline feature names,
        sorted from the lowest to the highest interest.
        """
        return self.parse_baseline_names
+
    def cpu_dispatch_names(self):
        """
        return a list of final CPU dispatch feature names,
        sorted from the lowest to the highest interest.
        """
        return self.parse_dispatch_names
+
+ def try_dispatch(self, sources, src_dir=None, **kwargs):
+ """
+ Compile one or more dispatch-able sources and generate object files,
+ also generate abstract C config headers and macros that
+ are used later for the final runtime dispatching process.
+
+ The mechanism behind it is to take each source file that is specified
+ in 'sources' and branch it into several files depending on
+ special configuration statements that must be declared at the
+ top of each source which contains targeted CPU features,
+ then it compiles every branched source with the proper compiler flags.
+
+ Parameters
+ ----------
+ sources : list
+ Must be a list of dispatch-able source file paths,
+ and configuration statements must be declared inside
+ each file.
+
+ src_dir : str
+ Path of parent directory for the generated headers and wrapped sources.
+ If None (default), the files will be generated in-place.
+
+ **kwargs : any
+ Arguments to pass on to the `CCompiler.compile()`
+
+ Returns
+ -------
+ list : generated object files
+
+ Raises
+ ------
+ CompileError
+ Raised by `CCompiler.compile()` on compiling failure.
+ DistutilsError
+ Raised on errors while checking the sanity of the
+ configuration statements.
+
+ See Also
+ --------
+ parse_targets() :
+ Parsing the configuration statements of dispatch-able sources.
+ """
+ to_compile = {}
+ baseline_flags = self.cpu_baseline_flags()
+ include_dirs = kwargs.setdefault("include_dirs", [])
+
+ for src in sources:
+ output_dir = os.path.dirname(src)
+ if src_dir and not output_dir.startswith(src_dir):
+ output_dir = os.path.join(src_dir, output_dir)
+ if output_dir not in include_dirs:
+ include_dirs.append(output_dir)
+
+ has_baseline, targets, extra_flags = self.parse_targets(src)
+ nochange = self._generate_config(output_dir, src, targets, has_baseline)
+ for tar in targets:
+ tar_src = self._wrap_target(output_dir, src, tar, nochange=nochange)
+ flags = tuple(extra_flags + self.feature_flags(tar))
+ to_compile.setdefault(flags, []).append(tar_src)
+
+ if has_baseline:
+ flags = tuple(extra_flags + baseline_flags)
+ to_compile.setdefault(flags, []).append(src)
+
+ self.sources_status[src] = (has_baseline, targets)
+
+ # For these reasons, the sources are compiled in a separate loop:
+ # - Gathering all sources with the same flags to benefit from
+ # the parallel compiling as much as possible.
+ # - To generate all config headers of the dispatchable sources,
+ # before the compilation in case there are dependency relationships
+ # among them.
+ objects = []
+ for flags, srcs in to_compile.items():
+ objects += self.dist_compile(srcs, list(flags), **kwargs)
+ return objects
+
+ def generate_dispatch_header(self, header_path):
+ """
+ Generate the dispatch header which contains all definitions
+ and headers of instruction-sets for the enabled CPU baseline and
+ dispatch-able features.
+
+ It's highly recommended to take a look at the generated header
+ also the generated source files via `try_dispatch()`
+ in order to get the full picture.
+ """
+ self.dist_log("generate CPU dispatch header: (%s)" % header_path)
+
+ baseline_names = self.cpu_baseline_names()
+ dispatch_names = self.cpu_dispatch_names()
+ baseline_len = len(baseline_names)
+ dispatch_len = len(dispatch_names)
+
+ with open(header_path, 'w') as f:
+ baseline_calls = ' \\\n'.join([
+ (
+ "\t%sWITH_CPU_EXPAND_(MACRO_TO_CALL(%s, __VA_ARGS__))"
+ ) % (self.conf_c_prefix, f)
+ for f in baseline_names
+ ])
+ dispatch_calls = ' \\\n'.join([
+ (
+ "\t%sWITH_CPU_EXPAND_(MACRO_TO_CALL(%s, __VA_ARGS__))"
+ ) % (self.conf_c_prefix, f)
+ for f in dispatch_names
+ ])
+ f.write(textwrap.dedent("""\
+ /*
+ * AUTOGENERATED DON'T EDIT
+ * Please make changes to the code generator (distutils/ccompiler_opt.py)
+ */
+ #define {pfx}WITH_CPU_BASELINE "{baseline_str}"
+ #define {pfx}WITH_CPU_DISPATCH "{dispatch_str}"
+ #define {pfx}WITH_CPU_BASELINE_N {baseline_len}
+ #define {pfx}WITH_CPU_DISPATCH_N {dispatch_len}
+ #define {pfx}WITH_CPU_EXPAND_(X) X
+ #define {pfx}WITH_CPU_BASELINE_CALL(MACRO_TO_CALL, ...) \\
+ {baseline_calls}
+ #define {pfx}WITH_CPU_DISPATCH_CALL(MACRO_TO_CALL, ...) \\
+ {dispatch_calls}
+ """).format(
+ pfx=self.conf_c_prefix, baseline_str=" ".join(baseline_names),
+ dispatch_str=" ".join(dispatch_names), baseline_len=baseline_len,
+ dispatch_len=dispatch_len, baseline_calls=baseline_calls,
+ dispatch_calls=dispatch_calls
+ ))
+ baseline_pre = ''
+ for name in baseline_names:
+ baseline_pre += self.feature_c_preprocessor(name, tabs=1) + '\n'
+
+ dispatch_pre = ''
+ for name in dispatch_names:
+ dispatch_pre += textwrap.dedent("""\
+ #ifdef {pfx}CPU_TARGET_{name}
+ {pre}
+ #endif /*{pfx}CPU_TARGET_{name}*/
+ """).format(
+ pfx=self.conf_c_prefix_, name=name, pre=self.feature_c_preprocessor(
+ name, tabs=1
+ ))
+
+ f.write(textwrap.dedent("""\
+ /******* baseline features *******/
+ {baseline_pre}
+ /******* dispatch features *******/
+ {dispatch_pre}
+ """).format(
+ pfx=self.conf_c_prefix_, baseline_pre=baseline_pre,
+ dispatch_pre=dispatch_pre
+ ))
+
+ def report(self, full=False):
+ report = []
+ baseline_rows = []
+ dispatch_rows = []
+ report.append(("CPU baseline", baseline_rows))
+ report.append(("", ""))
+ report.append(("CPU dispatch", dispatch_rows))
+
+ ########## baseline ##########
+ if self.cc_noopt:
+ baseline_rows.append((
+ "Requested", "optimization disabled %s" % (
+ "(unsupported arch)" if self.cc_on_noarch else ""
+ )
+ ))
+ else:
+ baseline_rows.append(("Requested", repr(self._requested_baseline)))
+
+ baseline_names = self.cpu_baseline_names()
+ baseline_rows.append((
+ "Enabled", (' '.join(baseline_names) if baseline_names else "none")
+ ))
+ baseline_flags = self.cpu_baseline_flags()
+ baseline_rows.append((
+ "Flags", (' '.join(baseline_flags) if baseline_flags else "none")
+ ))
+
+ ########## dispatch ##########
+ if self.cc_noopt:
+ dispatch_rows.append((
+ "Requested", "optimization disabled %s" % (
+ "(unsupported arch)" if self.cc_on_noarch else ""
+ )
+ ))
+ else:
+ dispatch_rows.append(("Requested", repr(self._requested_dispatch)))
+
+ dispatch_names = self.cpu_dispatch_names()
+ dispatch_rows.append((
+ "Enabled", (' '.join(dispatch_names) if dispatch_names else "none")
+ ))
+ ########## Generated ##########
+ # TODO:
+ # - collect object names from 'try_dispatch()'
+ # then get the size of each object and print it
+ # - give more details about the features that were not
+ # generated due to missing compiler support
+ # - find a better output design.
+ #
+ target_sources = {}
+ for source, (_, targets) in self.sources_status.items():
+ for tar in targets:
+ target_sources.setdefault(tar, []).append(source)
+
+ if not full or not target_sources:
+ generated = ""
+ for tar in self.feature_sorted(target_sources):
+ sources = target_sources[tar]
+ name = tar if isinstance(tar, str) else '(%s)' % ' '.join(tar)
+ generated += name + "[%d] " % len(sources)
+ dispatch_rows.append(("Generated", generated[:-1] if generated else "none"))
+ else:
+ dispatch_rows.append(("Generated", ''))
+ for tar in self.feature_sorted(target_sources):
+ sources = target_sources[tar]
+ name = tar if isinstance(tar, str) else '(%s)' % ' '.join(tar)
+ flags = ' '.join(self.feature_flags(tar))
+ implies = ' '.join(self.feature_sorted(self.feature_implies(tar)))
+ detect = ' '.join(self.feature_detect(tar))
+ dispatch_rows.append(('', ''))
+ dispatch_rows.append((name, implies))
+ dispatch_rows.append(("Flags", flags))
+ dispatch_rows.append(("Detect", detect))
+ for src in sources:
+ dispatch_rows.append(("", src))
+
+ ###############################
+ # TODO: add support for 'markdown' format
+ text = []
+ secs_len = [len(secs) for secs, _ in report]
+ cols_len = [len(col) for _, rows in report for col, _ in rows]
+ tab = ' ' * 2
+ pad = max(max(secs_len), max(cols_len))
+ for sec, rows in report:
+ if not sec:
+ text.append("") # empty line
+ continue
+ sec += ' ' * (pad - len(sec))
+ text.append(sec + tab + ': ')
+ for col, val in rows:
+ col += ' ' * (pad - len(col))
+ text.append(tab + col + ': ' + val)
+
+ return '\n'.join(text)
+
+ def _wrap_target(self, output_dir, dispatch_src, target, nochange=False):
+ assert(isinstance(target, (str, tuple))) # one feature name, or a tuple of names (multi-target)
+ if isinstance(target, str):
+ ext_name = target_name = target
+ else:
+ # multi-target
+ ext_name = '.'.join(target)
+ target_name = '__'.join(target)
+
+ wrap_path = os.path.join(output_dir, os.path.basename(dispatch_src))
+ wrap_path = "{0}.{2}{1}".format(*os.path.splitext(wrap_path), ext_name.lower()) # e.g. "foo.c" -> "foo.avx2.c"
+ if nochange and os.path.exists(wrap_path): # targets unchanged and wrapper already generated; reuse it
+ return wrap_path
+
+ self.dist_log("wrap dispatch-able target -> ", wrap_path)
+ # sorting for readability
+ features = self.feature_sorted(self.feature_implies_c(target))
+ target_join = "#define %sCPU_TARGET_" % self.conf_c_prefix_
+ target_defs = [target_join + f for f in features]
+ target_defs = '\n'.join(target_defs)
+
+ with open(wrap_path, "w") as fd:
+ fd.write(textwrap.dedent("""\
+ /**
+ * AUTOGENERATED DON'T EDIT
+ * Please make changes to the code generator \
+ (distutils/ccompiler_opt.py)
+ */
+ #define {pfx}CPU_TARGET_MODE
+ #define {pfx}CPU_TARGET_CURRENT {target_name}
+ {target_defs}
+ #include "{path}"
+ """).format(
+ pfx=self.conf_c_prefix_, target_name=target_name,
+ path=os.path.abspath(dispatch_src), target_defs=target_defs
+ ))
+ return wrap_path
+
+ def _generate_config(self, output_dir, dispatch_src, targets, has_baseline=False):
+ config_path = os.path.basename(dispatch_src).replace(".c", ".h")
+ config_path = os.path.join(output_dir, config_path)
+ # check if targets didn't change to avoid recompiling
+ cache_hash = self.cache_hash(targets, has_baseline)
+ try:
+ with open(config_path) as f:
+ last_hash = f.readline().split("cache_hash:")
+ if len(last_hash) == 2 and int(last_hash[1]) == cache_hash:
+ return True # config is up to date; nothing to regenerate
+ except IOError:
+ pass # no readable previous config; regenerate it below
+
+ self.dist_log("generate dispatched config -> ", config_path)
+ dispatch_calls = []
+ for tar in targets:
+ if isinstance(tar, str):
+ target_name = tar
+ else: # multi target
+ target_name = '__'.join([t for t in tar])
+ req_detect = self.feature_detect(tar)
+ req_detect = '&&'.join([
+ "CHK(%s)" % f for f in req_detect
+ ])
+ dispatch_calls.append(
+ "\t%sCPU_DISPATCH_EXPAND_(CB((%s), %s, __VA_ARGS__))" % (
+ self.conf_c_prefix_, req_detect, target_name
+ ))
+ dispatch_calls = ' \\\n'.join(dispatch_calls)
+
+ if has_baseline:
+ baseline_calls = (
+ "\t%sCPU_DISPATCH_EXPAND_(CB(__VA_ARGS__))"
+ ) % self.conf_c_prefix_
+ else:
+ baseline_calls = ''
+
+ with open(config_path, "w") as fd:
+ fd.write(textwrap.dedent("""\
+ // cache_hash:{cache_hash}
+ /**
+ * AUTOGENERATED DON'T EDIT
+ * Please make changes to the code generator (distutils/ccompiler_opt.py)
+ */
+ #ifndef {pfx}CPU_DISPATCH_EXPAND_
+ #define {pfx}CPU_DISPATCH_EXPAND_(X) X
+ #endif
+ #undef {pfx}CPU_DISPATCH_BASELINE_CALL
+ #undef {pfx}CPU_DISPATCH_CALL
+ #define {pfx}CPU_DISPATCH_BASELINE_CALL(CB, ...) \\
+ {baseline_calls}
+ #define {pfx}CPU_DISPATCH_CALL(CHK, CB, ...) \\
+ {dispatch_calls}
+ """).format(
+ pfx=self.conf_c_prefix_, baseline_calls=baseline_calls,
+ dispatch_calls=dispatch_calls, cache_hash=cache_hash
+ ))
+ return False
+
+def new_ccompiler_opt(compiler, **kwargs):
+ """
+ Create a new instance of 'CCompilerOpt' and generate the dispatch header
+ inside NumPy source dir.
+
+ Parameters
+ ----------
+ 'compiler' : CCompiler instance
+ '**kwargs': passed as-is to `CCompilerOpt(...)`
+
+ Returns
+ -------
+ new instance of CCompilerOpt
+ """
+ opt = CCompilerOpt(compiler, **kwargs)
+ npy_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+ header_dir = os.path.join(npy_path, *("core/src/common".split("/")))
+ header_path = os.path.join(header_dir, "_cpu_dispatch.h")
+ if not os.path.exists(header_path) or not opt.is_cached():
+ if not os.path.exists(header_dir):
+ opt.dist_log(
+ "dispatch header dir '%s' isn't exist, creating it" % header_dir,
+ stderr=True
+ )
+ os.makedirs(header_dir)
+ opt.generate_dispatch_header(header_path)
+ return opt
diff --git a/numpy/distutils/checks/cpu_asimd.c b/numpy/distutils/checks/cpu_asimd.c
new file mode 100644
index 000000000..8df556b6c
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimd.c
@@ -0,0 +1,25 @@
+#ifdef _MSC_VER
+ #include <Intrin.h>
+#endif
+#include <arm_neon.h> /* ARM Advanced SIMD (ASIMD) intrinsics */
+
+int main(void)
+{
+ float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f);
+ /* MAXMIN */
+ int ret = (int)vgetq_lane_f32(vmaxnmq_f32(v1, v2), 0);
+ ret += (int)vgetq_lane_f32(vminnmq_f32(v1, v2), 0);
+ /* ROUNDING */
+ ret += (int)vgetq_lane_f32(vrndq_f32(v1), 0);
+#ifdef __aarch64__
+ {
+ float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0);
+ /* MAXMIN */
+ ret += (int)vgetq_lane_f64(vmaxnmq_f64(vd1, vd2), 0);
+ ret += (int)vgetq_lane_f64(vminnmq_f64(vd1, vd2), 0);
+ /* ROUNDING */
+ ret += (int)vgetq_lane_f64(vrndq_f64(vd1), 0);
+ }
+#endif
+ return ret;
+}
diff --git a/numpy/distutils/checks/cpu_asimddp.c b/numpy/distutils/checks/cpu_asimddp.c
new file mode 100644
index 000000000..0158d1354
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimddp.c
@@ -0,0 +1,15 @@
+#ifdef _MSC_VER
+ #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+ uint8x16_t v1 = vdupq_n_u8((unsigned char)1), v2 = vdupq_n_u8((unsigned char)2);
+ uint32x4_t va = vdupq_n_u32(3);
+ int ret = (int)vgetq_lane_u32(vdotq_u32(va, v1, v2), 0); /* dot-product (DotProd extension) */
+#ifdef __aarch64__
+ ret += (int)vgetq_lane_u32(vdotq_laneq_u32(va, v1, v2, 0), 0);
+#endif
+ return ret;
+}
diff --git a/numpy/distutils/checks/cpu_asimdfhm.c b/numpy/distutils/checks/cpu_asimdfhm.c
new file mode 100644
index 000000000..bb437aa40
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimdfhm.c
@@ -0,0 +1,17 @@
+#ifdef _MSC_VER
+ #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+ float16x8_t vhp = vdupq_n_f16((float16_t)1);
+ float16x4_t vlhp = vdup_n_f16((float16_t)1);
+ float32x4_t vf = vdupq_n_f32(1.0f);
+ float32x2_t vlf = vdup_n_f32(1.0f);
+
+ int ret = (int)vget_lane_f32(vfmlal_low_u32(vlf, vlhp, vlhp), 0); /* NOTE(review): newer ACLE spells this vfmlal_low_f16 — confirm compiler coverage */
+ ret += (int)vgetq_lane_f32(vfmlslq_high_u32(vf, vhp, vhp), 0);
+
+ return ret;
+}
diff --git a/numpy/distutils/checks/cpu_asimdhp.c b/numpy/distutils/checks/cpu_asimdhp.c
new file mode 100644
index 000000000..80b94000f
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimdhp.c
@@ -0,0 +1,14 @@
+#ifdef _MSC_VER
+ #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+ float16x8_t vhp = vdupq_n_f16((float16_t)-1);
+ float16x4_t vlhp = vdup_n_f16((float16_t)-1);
+
+ int ret = (int)vgetq_lane_f16(vabdq_f16(vhp, vhp), 0); /* half-precision (fp16) arithmetic */
+ ret += (int)vget_lane_f16(vabd_f16(vlhp, vlhp), 0);
+ return ret;
+}
diff --git a/numpy/distutils/checks/cpu_avx.c b/numpy/distutils/checks/cpu_avx.c
new file mode 100644
index 000000000..737c0d2e9
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ __m256 a = _mm256_add_ps(_mm256_setzero_ps(), _mm256_setzero_ps()); /* 256-bit float op: requires AVX */
+ return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx2.c b/numpy/distutils/checks/cpu_avx2.c
new file mode 100644
index 000000000..dfb11fd79
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx2.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ __m256i a = _mm256_abs_epi16(_mm256_setzero_si256()); /* 256-bit integer op: requires AVX2 */
+ return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_clx.c b/numpy/distutils/checks/cpu_avx512_clx.c
new file mode 100644
index 000000000..71dad83a7
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_clx.c
@@ -0,0 +1,8 @@
+#include <immintrin.h> /* AVX512-VNNI probe (Cascade Lake) */
+
+int main(void)
+{
+ /* VNNI */
+ __m512i a = _mm512_dpbusd_epi32(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512());
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_cnl.c b/numpy/distutils/checks/cpu_avx512_cnl.c
new file mode 100644
index 000000000..dfab4436d
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_cnl.c
@@ -0,0 +1,10 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ /* IFMA */
+ __m512i a = _mm512_madd52hi_epu64(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512());
+ /* VBMI */
+ a = _mm512_permutex2var_epi8(a, _mm512_setzero_si512(), _mm512_setzero_si512());
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_icl.c b/numpy/distutils/checks/cpu_avx512_icl.c
new file mode 100644
index 000000000..cf2706b3b
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_icl.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ /* VBMI2 */
+ __m512i a = _mm512_shrdv_epi64(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512());
+ /* BITALG */
+ a = _mm512_popcnt_epi8(a);
+ /* VPOPCNTDQ */
+ a = _mm512_popcnt_epi64(a);
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_knl.c b/numpy/distutils/checks/cpu_avx512_knl.c
new file mode 100644
index 000000000..0699f37a6
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_knl.c
@@ -0,0 +1,11 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ int base[128]; /* scratch buffer for the scatter prefetch below */
+ /* ER */
+ __m512i a = _mm512_castpd_si512(_mm512_exp2a23_pd(_mm512_setzero_pd()));
+ /* PF */
+ _mm512_mask_prefetch_i64scatter_pd(base, _mm512_cmpeq_epi64_mask(a, a), a, 1, _MM_HINT_T1);
+ return base[0];
+}
diff --git a/numpy/distutils/checks/cpu_avx512_knm.c b/numpy/distutils/checks/cpu_avx512_knm.c
new file mode 100644
index 000000000..db61b4bfa
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_knm.c
@@ -0,0 +1,17 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ __m512i a = _mm512_setzero_si512();
+ __m512 b = _mm512_setzero_ps();
+
+ /* 4FMAPS */
+ b = _mm512_4fmadd_ps(b, b, b, b, b, NULL); /* NOTE(review): last arg is a memory operand pointer; NULL looks acceptable for a compile-only probe — confirm */
+ /* 4VNNIW */
+ a = _mm512_4dpwssd_epi32(a, a, a, a, a, NULL);
+ /* VPOPCNTDQ */
+ a = _mm512_popcnt_epi64(a);
+
+ a = _mm512_add_epi32(a, _mm512_castps_si512(b));
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_skx.c b/numpy/distutils/checks/cpu_avx512_skx.c
new file mode 100644
index 000000000..1d5e15b5e
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_skx.c
@@ -0,0 +1,12 @@
+#include <immintrin.h> /* Skylake-X probe: VL + DQ + BW */
+
+int main(void)
+{
+ /* VL */
+ __m256i a = _mm256_abs_epi64(_mm256_setzero_si256());
+ /* DQ */
+ __m512i b = _mm512_broadcast_i32x8(a);
+ /* BW */
+ b = _mm512_abs_epi16(b);
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(b));
+}
diff --git a/numpy/distutils/checks/cpu_avx512cd.c b/numpy/distutils/checks/cpu_avx512cd.c
new file mode 100644
index 000000000..61bef6b82
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512cd.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ __m512i a = _mm512_lzcnt_epi32(_mm512_setzero_si512()); /* conflict-detection subset: vector lzcnt */
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512f.c b/numpy/distutils/checks/cpu_avx512f.c
new file mode 100644
index 000000000..f60cc09dd
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512f.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ __m512i a = _mm512_abs_epi32(_mm512_setzero_si512()); /* AVX512 foundation op */
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_f16c.c b/numpy/distutils/checks/cpu_f16c.c
new file mode 100644
index 000000000..a5a343e2d
--- /dev/null
+++ b/numpy/distutils/checks/cpu_f16c.c
@@ -0,0 +1,9 @@
+#include <emmintrin.h>
+#include <immintrin.h>
+
+int main(void)
+{
+ __m128 a = _mm_cvtph_ps(_mm_setzero_si128()); /* half <-> float conversions (F16C) */
+ __m256 a8 = _mm256_cvtph_ps(_mm_setzero_si128());
+ return (int)(_mm_cvtss_f32(a) + _mm_cvtss_f32(_mm256_castps256_ps128(a8)));
+}
diff --git a/numpy/distutils/checks/cpu_fma3.c b/numpy/distutils/checks/cpu_fma3.c
new file mode 100644
index 000000000..cf34c6cb1
--- /dev/null
+++ b/numpy/distutils/checks/cpu_fma3.c
@@ -0,0 +1,8 @@
+#include <xmmintrin.h>
+#include <immintrin.h>
+
+int main(void)
+{
+ __m256 a = _mm256_fmadd_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps()); /* fused multiply-add (FMA3) */
+ return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
diff --git a/numpy/distutils/checks/cpu_fma4.c b/numpy/distutils/checks/cpu_fma4.c
new file mode 100644
index 000000000..1ad717033
--- /dev/null
+++ b/numpy/distutils/checks/cpu_fma4.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+#ifdef _MSC_VER
+ #include <ammintrin.h>
+#else
+ #include <x86intrin.h>
+#endif
+
+int main(void)
+{
+ __m256 a = _mm256_macc_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps()); /* AMD FMA4 multiply-accumulate */
+ return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
diff --git a/numpy/distutils/checks/cpu_neon.c b/numpy/distutils/checks/cpu_neon.c
new file mode 100644
index 000000000..4eab1f384
--- /dev/null
+++ b/numpy/distutils/checks/cpu_neon.c
@@ -0,0 +1,15 @@
+#ifdef _MSC_VER
+ #include <Intrin.h>
+#endif
+#include <arm_neon.h> /* baseline NEON */
+
+int main(void)
+{
+ float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f);
+ int ret = (int)vgetq_lane_f32(vmulq_f32(v1, v2), 0);
+#ifdef __aarch64__
+ float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0);
+ ret += (int)vgetq_lane_f64(vmulq_f64(vd1, vd2), 0);
+#endif
+ return ret;
+}
diff --git a/numpy/distutils/checks/cpu_neon_fp16.c b/numpy/distutils/checks/cpu_neon_fp16.c
new file mode 100644
index 000000000..745d2e793
--- /dev/null
+++ b/numpy/distutils/checks/cpu_neon_fp16.c
@@ -0,0 +1,11 @@
+#ifdef _MSC_VER
+ #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+ short z4[] = {0, 0, 0, 0, 0, 0, 0, 0};
+ float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16((const short*)z4)); /* cast reinterprets 4 x s16 lanes as f16 */
+ return (int)vgetq_lane_f32(v_z4, 0);
+}
diff --git a/numpy/distutils/checks/cpu_neon_vfpv4.c b/numpy/distutils/checks/cpu_neon_vfpv4.c
new file mode 100644
index 000000000..45f7b5d69
--- /dev/null
+++ b/numpy/distutils/checks/cpu_neon_vfpv4.c
@@ -0,0 +1,19 @@
+#ifdef _MSC_VER
+ #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+ float32x4_t v1 = vdupq_n_f32(1.0f);
+ float32x4_t v2 = vdupq_n_f32(2.0f);
+ float32x4_t v3 = vdupq_n_f32(3.0f);
+ int ret = (int)vgetq_lane_f32(vfmaq_f32(v1, v2, v3), 0); /* fused multiply-accumulate (VFPv4) */
+#ifdef __aarch64__
+ float64x2_t vd1 = vdupq_n_f64(1.0);
+ float64x2_t vd2 = vdupq_n_f64(2.0);
+ float64x2_t vd3 = vdupq_n_f64(3.0);
+ ret += (int)vgetq_lane_f64(vfmaq_f64(vd1, vd2, vd3), 0);
+#endif
+ return ret;
+}
diff --git a/numpy/distutils/checks/cpu_popcnt.c b/numpy/distutils/checks/cpu_popcnt.c
new file mode 100644
index 000000000..e6a80fb40
--- /dev/null
+++ b/numpy/distutils/checks/cpu_popcnt.c
@@ -0,0 +1,23 @@
+#ifdef _MSC_VER
+ #include <nmmintrin.h>
+#else
+ #include <popcntintrin.h>
+#endif
+
+int main(void)
+{
+ long long a = 0;
+ int b;
+#ifdef _MSC_VER
+ #ifdef _M_X64
+ a = _mm_popcnt_u64(1); /* 64-bit popcnt only on x64 builds */
+ #endif
+ b = _mm_popcnt_u32(1);
+#else
+ #ifdef __x86_64__
+ a = __builtin_popcountll(1);
+ #endif
+ b = __builtin_popcount(1);
+#endif
+ return (int)a + b;
+}
diff --git a/numpy/distutils/checks/cpu_sse.c b/numpy/distutils/checks/cpu_sse.c
new file mode 100644
index 000000000..bb98bf63c
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse.c
@@ -0,0 +1,7 @@
+#include <xmmintrin.h>
+
+int main(void)
+{
+ __m128 a = _mm_add_ps(_mm_setzero_ps(), _mm_setzero_ps()); /* SSE float op */
+ return (int)_mm_cvtss_f32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse2.c b/numpy/distutils/checks/cpu_sse2.c
new file mode 100644
index 000000000..658afc9b4
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse2.c
@@ -0,0 +1,7 @@
+#include <emmintrin.h>
+
+int main(void)
+{
+ __m128i a = _mm_add_epi16(_mm_setzero_si128(), _mm_setzero_si128()); /* SSE2 integer op */
+ return _mm_cvtsi128_si32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse3.c b/numpy/distutils/checks/cpu_sse3.c
new file mode 100644
index 000000000..aece1e601
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse3.c
@@ -0,0 +1,7 @@
+#include <pmmintrin.h>
+
+int main(void)
+{
+ __m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps()); /* horizontal add (SSE3) */
+ return (int)_mm_cvtss_f32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse41.c b/numpy/distutils/checks/cpu_sse41.c
new file mode 100644
index 000000000..bfdb9feac
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse41.c
@@ -0,0 +1,7 @@
+#include <smmintrin.h>
+
+int main(void)
+{
+ __m128 a = _mm_floor_ps(_mm_setzero_ps()); /* _mm_floor_ps is SSE4.1 */
+ return (int)_mm_cvtss_f32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse42.c b/numpy/distutils/checks/cpu_sse42.c
new file mode 100644
index 000000000..24f5d93fe
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse42.c
@@ -0,0 +1,7 @@
+#include <smmintrin.h>
+
+int main(void)
+{
+ __m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps()); /* NOTE(review): _mm_hadd_ps is an SSE3 intrinsic — this may compile without SSE4.2; an SSE4.2-only intrinsic (e.g. _mm_crc32_u8) would be a stricter probe. Confirm. */
+ return (int)_mm_cvtss_f32(a);
+}
diff --git a/numpy/distutils/checks/cpu_ssse3.c b/numpy/distutils/checks/cpu_ssse3.c
new file mode 100644
index 000000000..ad0abc1e6
--- /dev/null
+++ b/numpy/distutils/checks/cpu_ssse3.c
@@ -0,0 +1,7 @@
+#include <tmmintrin.h>
+
+int main(void)
+{
+ __m128i a = _mm_hadd_epi16(_mm_setzero_si128(), _mm_setzero_si128()); /* _mm_hadd_epi16 is SSSE3 */
+ return (int)_mm_cvtsi128_si32(a);
+}
diff --git a/numpy/distutils/checks/cpu_vsx.c b/numpy/distutils/checks/cpu_vsx.c
new file mode 100644
index 000000000..0b3f30d6a
--- /dev/null
+++ b/numpy/distutils/checks/cpu_vsx.c
@@ -0,0 +1,21 @@
+#ifndef __VSX__
+ #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
+ /* presumably works around compilers that lack vec_xl/vec_xst — confirm */
+ #define vsx_ld vec_vsx_ld
+ #define vsx_st vec_vsx_st
+#else
+ #define vsx_ld vec_xl
+ #define vsx_st vec_xst
+#endif
+
+int main(void)
+{
+ unsigned int zout[4];
+ unsigned int z4[] = {0, 0, 0, 0};
+ __vector unsigned int v_z4 = vsx_ld(0, z4);
+ vsx_st(v_z4, 0, zout);
+ return zout[0];
+}
diff --git a/numpy/distutils/checks/cpu_vsx2.c b/numpy/distutils/checks/cpu_vsx2.c
new file mode 100644
index 000000000..410fb29d6
--- /dev/null
+++ b/numpy/distutils/checks/cpu_vsx2.c
@@ -0,0 +1,13 @@
+#ifndef __VSX__
+ #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+typedef __vector unsigned long long v_uint64x2;
+
+int main(void)
+{
+ v_uint64x2 z2 = (v_uint64x2){0, 0};
+ z2 = (v_uint64x2)vec_cmpeq(z2, z2); /* 64-bit vector compare — presumably the VSX2 (ISA 2.07) discriminator; confirm */
+ return (int)vec_extract(z2, 0);
+}
diff --git a/numpy/distutils/checks/cpu_vsx3.c b/numpy/distutils/checks/cpu_vsx3.c
new file mode 100644
index 000000000..857526535
--- /dev/null
+++ b/numpy/distutils/checks/cpu_vsx3.c
@@ -0,0 +1,13 @@
+#ifndef __VSX__
+ #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+typedef __vector unsigned int v_uint32x4;
+
+int main(void)
+{
+ v_uint32x4 z4 = (v_uint32x4){0, 0, 0, 0};
+ z4 = vec_absd(z4, z4); /* vec_absd — presumably the VSX3 (ISA 3.0) discriminator; confirm */
+ return (int)vec_extract(z4, 0);
+}
diff --git a/numpy/distutils/checks/cpu_xop.c b/numpy/distutils/checks/cpu_xop.c
new file mode 100644
index 000000000..51d70cf2b
--- /dev/null
+++ b/numpy/distutils/checks/cpu_xop.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+#ifdef _MSC_VER
+ #include <ammintrin.h>
+#else
+ #include <x86intrin.h>
+#endif
+
+int main(void)
+{
+ __m128i a = _mm_comge_epu32(_mm_setzero_si128(), _mm_setzero_si128()); /* AMD XOP compare */
+ return _mm_cvtsi128_si32(a);
+}
diff --git a/numpy/distutils/checks/test_flags.c b/numpy/distutils/checks/test_flags.c
new file mode 100644
index 000000000..4cd09d42a
--- /dev/null
+++ b/numpy/distutils/checks/test_flags.c
@@ -0,0 +1 @@
+int test_flags; /* presumably a minimal TU used only to probe whether a compiler flag is accepted — see CCompilerOpt */
diff --git a/numpy/distutils/setup.py b/numpy/distutils/setup.py
index 88cd1a160..798c3686f 100644
--- a/numpy/distutils/setup.py
+++ b/numpy/distutils/setup.py
@@ -7,6 +7,7 @@ def configuration(parent_package='',top_path=None):
config.add_subpackage('tests')
config.add_data_files('site.cfg')
config.add_data_files('mingw/gfortran_vs2003_hack.c')
+ config.add_data_dir('checks')
config.make_config_py()
return config
diff --git a/numpy/tests/test_public_api.py b/numpy/tests/test_public_api.py
index 7ce74bc43..0d3be2ccf 100644
--- a/numpy/tests/test_public_api.py
+++ b/numpy/tests/test_public_api.py
@@ -209,6 +209,7 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [
"core.umath",
"core.umath_tests",
"distutils.ccompiler",
+ 'distutils.ccompiler_opt',
"distutils.command",
"distutils.command.autodist",
"distutils.command.bdist_rpm",