path: root/numpy
author    Matti Picus <matti.picus@gmail.com>    2020-06-17 21:35:26 +0300
committer GitHub <noreply@github.com>            2020-06-17 21:35:26 +0300
commit    8245b392a344a1ae0db6e569ab68b368ad8883c1 (patch)
tree      e1c5325e8d9114b69e59b3b3ba0cb538fdcbc1e0 /numpy
parent    02883d85b5d3f68c12cb1df75f96e0fed741d4a4 (diff)
parent    e72653810f470415f4d78c8a9ea874370a526126 (diff)
download  numpy-8245b392a344a1ae0db6e569ab68b368ad8883c1.tar.gz
Merge pull request #13516 from seiko2plus/core_improve_infa_build
ENH: enable multi-platform SIMD compiler optimizations
Diffstat (limited to 'numpy')
-rw-r--r--  numpy/_pytesttester.py | 15
-rw-r--r--  numpy/core/include/numpy/ndarraytypes.h | 3
-rw-r--r--  numpy/core/include/numpy/utils.h | 34
-rw-r--r--  numpy/core/setup.py | 8
-rw-r--r--  numpy/core/src/common/npy_config.h | 1
-rw-r--r--  numpy/core/src/common/npy_cpu_dispatch.h | 260
-rw-r--r--  numpy/core/src/common/npy_cpu_features.c.src | 43
-rw-r--r--  numpy/core/src/common/npy_cpu_features.h | 42
-rw-r--r--  numpy/core/src/multiarray/multiarraymodule.c | 20
-rw-r--r--  numpy/core/src/umath/_umath_tests.c.src | 52
-rw-r--r--  numpy/core/src/umath/_umath_tests.dispatch.c | 33
-rw-r--r--  numpy/core/tests/test_cpu_dispatcher.py | 42
-rw-r--r--  numpy/distutils/ccompiler_opt.py | 2438
-rw-r--r--  numpy/distutils/checks/cpu_asimd.c | 25
-rw-r--r--  numpy/distutils/checks/cpu_asimddp.c | 15
-rw-r--r--  numpy/distutils/checks/cpu_asimdfhm.c | 17
-rw-r--r--  numpy/distutils/checks/cpu_asimdhp.c | 14
-rw-r--r--  numpy/distutils/checks/cpu_avx.c | 7
-rw-r--r--  numpy/distutils/checks/cpu_avx2.c | 7
-rw-r--r--  numpy/distutils/checks/cpu_avx512_clx.c | 8
-rw-r--r--  numpy/distutils/checks/cpu_avx512_cnl.c | 10
-rw-r--r--  numpy/distutils/checks/cpu_avx512_icl.c | 12
-rw-r--r--  numpy/distutils/checks/cpu_avx512_knl.c | 11
-rw-r--r--  numpy/distutils/checks/cpu_avx512_knm.c | 17
-rw-r--r--  numpy/distutils/checks/cpu_avx512_skx.c | 12
-rw-r--r--  numpy/distutils/checks/cpu_avx512cd.c | 7
-rw-r--r--  numpy/distutils/checks/cpu_avx512f.c | 7
-rw-r--r--  numpy/distutils/checks/cpu_f16c.c | 9
-rw-r--r--  numpy/distutils/checks/cpu_fma3.c | 8
-rw-r--r--  numpy/distutils/checks/cpu_fma4.c | 12
-rw-r--r--  numpy/distutils/checks/cpu_neon.c | 15
-rw-r--r--  numpy/distutils/checks/cpu_neon_fp16.c | 11
-rw-r--r--  numpy/distutils/checks/cpu_neon_vfpv4.c | 19
-rw-r--r--  numpy/distutils/checks/cpu_popcnt.c | 23
-rw-r--r--  numpy/distutils/checks/cpu_sse.c | 7
-rw-r--r--  numpy/distutils/checks/cpu_sse2.c | 7
-rw-r--r--  numpy/distutils/checks/cpu_sse3.c | 7
-rw-r--r--  numpy/distutils/checks/cpu_sse41.c | 7
-rw-r--r--  numpy/distutils/checks/cpu_sse42.c | 7
-rw-r--r--  numpy/distutils/checks/cpu_ssse3.c | 7
-rw-r--r--  numpy/distutils/checks/cpu_vsx.c | 21
-rw-r--r--  numpy/distutils/checks/cpu_vsx2.c | 13
-rw-r--r--  numpy/distutils/checks/cpu_vsx3.c | 13
-rw-r--r--  numpy/distutils/checks/cpu_xop.c | 12
-rw-r--r--  numpy/distutils/checks/test_flags.c | 1
-rw-r--r--  numpy/distutils/command/build.py | 9
-rw-r--r--  numpy/distutils/command/build_clib.py | 66
-rw-r--r--  numpy/distutils/command/build_ext.py | 61
-rw-r--r--  numpy/distutils/setup.py | 1
-rw-r--r--  numpy/distutils/tests/test_ccompiler_opt.py | 787
-rw-r--r--  numpy/distutils/tests/test_ccompiler_opt_conf.py | 169
-rw-r--r--  numpy/tests/test_public_api.py | 1
52 files changed, 4431 insertions, 22 deletions
diff --git a/numpy/_pytesttester.py b/numpy/_pytesttester.py
index ca86aeb22..1c32367f3 100644
--- a/numpy/_pytesttester.py
+++ b/numpy/_pytesttester.py
@@ -35,12 +35,27 @@ __all__ = ['PytestTester']
def _show_numpy_info():
+ from numpy.core._multiarray_umath import (
+ __cpu_features__, __cpu_baseline__, __cpu_dispatch__
+ )
import numpy as np
print("NumPy version %s" % np.__version__)
relaxed_strides = np.ones((10, 1), order="C").flags.f_contiguous
print("NumPy relaxed strides checking option:", relaxed_strides)
+ if len(__cpu_baseline__) == 0 and len(__cpu_dispatch__) == 0:
+ enabled_features = "nothing enabled"
+ else:
+ enabled_features = ' '.join(__cpu_baseline__)
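+ # each dispatched feature is marked with '*' if the running CPU supports it, or '?' if it does not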
+ for feature in __cpu_dispatch__:
+ if __cpu_features__[feature]:
+ enabled_features += " %s*" % feature
+ else:
+ enabled_features += " %s?" % feature
+ print("NumPy CPU features:", enabled_features)
+
+
class PytestTester:
"""
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index 1b61899fa..275bb336b 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -341,9 +341,6 @@ struct NpyAuxData_tag {
#define NPY_ERR(str) fprintf(stderr, #str); fflush(stderr);
#define NPY_ERR2(str) fprintf(stderr, str); fflush(stderr);
-#define NPY_STRINGIFY(x) #x
-#define NPY_TOSTRING(x) NPY_STRINGIFY(x)
-
/*
* Macros to define how array, and dimension/strides data is
* allocated.
diff --git a/numpy/core/include/numpy/utils.h b/numpy/core/include/numpy/utils.h
index 32218b8c7..e251a5201 100644
--- a/numpy/core/include/numpy/utils.h
+++ b/numpy/core/include/numpy/utils.h
@@ -2,20 +2,36 @@
#define __NUMPY_UTILS_HEADER__
#ifndef __COMP_NPY_UNUSED
- #if defined(__GNUC__)
- #define __COMP_NPY_UNUSED __attribute__ ((__unused__))
- # elif defined(__ICC)
- #define __COMP_NPY_UNUSED __attribute__ ((__unused__))
- # elif defined(__clang__)
- #define __COMP_NPY_UNUSED __attribute__ ((unused))
- #else
- #define __COMP_NPY_UNUSED
- #endif
+ #if defined(__GNUC__)
+ #define __COMP_NPY_UNUSED __attribute__ ((__unused__))
+ #elif defined(__ICC)
+ #define __COMP_NPY_UNUSED __attribute__ ((__unused__))
+ #elif defined(__clang__)
+ #define __COMP_NPY_UNUSED __attribute__ ((unused))
+ #else
+ #define __COMP_NPY_UNUSED
+ #endif
+#endif
+
+#if defined(__GNUC__) || defined(__ICC) || defined(__clang__)
+ #define NPY_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
+#elif defined(_MSC_VER)
+ #define NPY_DECL_ALIGNED(x) __declspec(align(x))
+#else
+ #define NPY_DECL_ALIGNED(x)
#endif
/* Use this to tag a variable as not used. It will remove unused variable
* warning on support platforms (see __COM_NPY_UNUSED) and mangle the variable
* to avoid accidental use */
#define NPY_UNUSED(x) (__NPY_UNUSED_TAGGED ## x) __COMP_NPY_UNUSED
+#define NPY_EXPAND(x) x
+
+#define NPY_STRINGIFY(x) #x
+#define NPY_TOSTRING(x) NPY_STRINGIFY(x)
+
+#define NPY_CAT__(a, b) a ## b
+#define NPY_CAT_(a, b) NPY_CAT__(a, b)
+#define NPY_CAT(a, b) NPY_CAT_(a, b)
#endif
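The token-pasting helpers added to utils.h above are small but load-bearing for the dispatcher: the extra NPY_CAT_ level of indirection forces macro arguments to expand before '##' pastes them. The following standalone sketch (the macros are re-declared here outside of NumPy, and TARGET is a hypothetical stand-in for a per-target build definition such as NPY__CPU_TARGET_CURRENT) illustrates the effect:

    #include <stdio.h>

    #define NPY_STRINGIFY(x) #x
    #define NPY_TOSTRING(x) NPY_STRINGIFY(x)
    #define NPY_CAT__(a, b) a ## b
    #define NPY_CAT_(a, b) NPY_CAT__(a, b)
    #define NPY_CAT(a, b) NPY_CAT_(a, b)

    /* hypothetical stand-in for a per-target build definition */
    #define TARGET AVX2

    int main(void)
    {
        /* TARGET expands to AVX2 before pasting, so this prints "dispatch_me_AVX2" */
        printf("%s\n", NPY_TOSTRING(NPY_CAT(NPY_CAT(dispatch_me, _), TARGET)));
        return 0;
    }

With a single-level 'a ## b' definition the unexpanded token TARGET would be pasted instead; the indirection is what lets NPY_CPU_DISPATCH_CURFX below build suffixed symbol names from NPY__CPU_TARGET_CURRENT.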
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 5351b30bf..549860179 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -738,6 +738,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'common', 'ufunc_override.h'),
join('src', 'common', 'umathmodule.h'),
join('src', 'common', 'numpyos.h'),
+ join('src', 'common', 'npy_cpu_dispatch.h'),
]
common_src = [
@@ -939,8 +940,11 @@ def configuration(parent_package='',top_path=None):
# umath_tests module #
#######################################################################
- config.add_extension('_umath_tests',
- sources=[join('src', 'umath', '_umath_tests.c.src')])
+ config.add_extension('_umath_tests', sources=[
+ join('src', 'umath', '_umath_tests.c.src'),
+ join('src', 'umath', '_umath_tests.dispatch.c'),
+ join('src', 'common', 'npy_cpu_features.c.src'),
+ ])
#######################################################################
# custom rational dtype module #
diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h
index aebe241a5..4493409bb 100644
--- a/numpy/core/src/common/npy_config.h
+++ b/numpy/core/src/common/npy_config.h
@@ -3,6 +3,7 @@
#include "config.h"
#include "npy_cpu_features.h"
+#include "npy_cpu_dispatch.h"
#include "numpy/numpyconfig.h"
#include "numpy/npy_cpu.h"
#include "numpy/npy_os.h"
diff --git a/numpy/core/src/common/npy_cpu_dispatch.h b/numpy/core/src/common/npy_cpu_dispatch.h
new file mode 100644
index 000000000..846d1ebb9
--- /dev/null
+++ b/numpy/core/src/common/npy_cpu_dispatch.h
@@ -0,0 +1,260 @@
+#ifndef NPY_CPU_DISPATCH_H_
+#define NPY_CPU_DISPATCH_H_
+/**
+ * This file is part of the NumPy CPU dispatcher. Please have a look at doc/reference/simd-optimizations.html
+ * to get a better understanding of the mechanism behind it.
+ */
+#include "npy_cpu_features.h" // NPY_CPU_HAVE
+#include "numpy/utils.h" // NPY_EXPAND, NPY_CAT
+/**
+ * Bring in the main configuration header '_cpu_dispatch.h'.
+ *
+ * This header is generated by the distutils module 'ccompiler_opt',
+ * and contains all the #definitions and headers of the instruction sets
+ * that have been configured through the command arguments '--cpu-baseline' and '--cpu-dispatch'.
+ *
+ * It also contains extra C #definitions and macros that are used for implementing
+ * the NumPy module's attributes `__cpu_baseline__` and `__cpu_dispatch__`.
+ */
+/**
+ * Note: Always guard the generated headers within 'NPY_DISABLE_OPTIMIZATION',
+ * due to the nature of the command argument '--disable-optimization',
+ * which explicitly disables the module ccompiler_opt.
+ */
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #if defined(__powerpc64__) && !defined(__cplusplus) && defined(bool)
+ /**
+ * The "altivec.h" header contains the definitions (bool, vector, pixel);
+ * usually in C++ we undefine them after including the header.
+ * It's better anyway to take them off and use the built-in types (__vector, __pixel, __bool) instead,
+ * since C99 supports bool variables, which may lead to ambiguous errors.
+ */
+ // back up 'bool' before including '_cpu_dispatch.h', since it may not be defined as a compiler token.
+ #define NPY__DISPATCH_DEFBOOL
+ typedef bool npy__dispatch_bkbool;
+ #endif
+ #include "_cpu_dispatch.h"
+ #ifdef NPY_HAVE_VSX
+ #undef bool
+ #undef vector
+ #undef pixel
+ #ifdef NPY__DISPATCH_DEFBOOL
+ #define bool npy__dispatch_bkbool
+ #endif
+ #endif
+#endif // !NPY_DISABLE_OPTIMIZATION
+/**
+ * Macro NPY_CPU_DISPATCH_CURFX(NAME)
+ *
+ * Returns @NAME suffixed with "_" + "the current target" while compiling
+ * the wrapped sources that are generated from the dispatch-able sources according
+ * to the provided configuration statements.
+ *
+ * It also returns @NAME as-is without any suffix when it comes to the baseline or
+ * in case the optimization is disabled.
+ *
+ * The idea behind this Macro is to allow exporting certain symbols and to
+ * avoid linking duplications due to the nature of the dispatch-able sources.
+ *
+ * Example:
+ * @targets baseline avx avx512_skx vsx3 asimdhp // configuration statements
+ *
+ * void NPY_CPU_DISPATCH_CURFX(dispatch_me)(const int *src, int *dst)
+ * {
+ * // the kernel
+ * }
+ *
+ * Assuming the required optimizations are enabled via '--cpu-dispatch' and
+ * the compiler supports them too, the generated symbols will be named as follows:
+ *
+ * - x86:
+ * dispatch_me(const int*, int*) // baseline
+ * dispatch_me_AVX(const int*, int*)
+ * dispatch_me_AVX512_SKX(const int*, int*)
+ *
+ * - ppc64:
+ * dispatch_me(const int*, int*)
+ * dispatch_me_VSX3(const int*, int*)
+ *
+ * - ARM:
+ * dispatch_me(const int*, int*)
+ * dispatch_me_ASIMDHP(const int*, int*)
+ *
+ * - unsupported arch or when optimization is disabled:
+ * dispatch_me(const int*, int*)
+ *
+ * For forward declarations, see 'NPY_CPU_DISPATCH_DECLARE'.
+ */
+#ifdef NPY__CPU_TARGET_CURRENT
+ // 'NPY__CPU_TARGET_CURRENT': only defined by the dispatch-able sources
+ #define NPY_CPU_DISPATCH_CURFX(NAME) NPY_CAT(NPY_CAT(NAME, _), NPY__CPU_TARGET_CURRENT)
+#else
+ #define NPY_CPU_DISPATCH_CURFX(NAME) NPY_EXPAND(NAME)
+#endif
+/**
+ * Defining the default behavior for the configurable macros of dispatch-able sources,
+ * 'NPY__CPU_DISPATCH_CALL(...)' and 'NPY__CPU_DISPATCH_BASELINE_CALL(...)'
+ *
+ * These macros are defined inside the generated config files that have been derived from
+ * the configuration statements of the dispatch-able sources.
+ *
+ * The generated config file takes the same name as the dispatch-able source, with the
+ * extension '.c' replaced by '.h', and it should be treated as a header template.
+ *
+ * For more clarification, please have a look at doc/reference/simd-optimizations.html.
+ */
+#ifndef NPY_DISABLE_OPTIMIZATION
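+ // These deliberately-invalid fallback definitions act as a guard: if a dispatch macro is used
+ // without including the generated config header (which overrides them), compilation fails with
+ // the string below surfaced in the error message.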
+ #define NPY__CPU_DISPATCH_BASELINE_CALL(CB, ...) \
+ &&"Expected config header of the dispatch-able source";
+ #define NPY__CPU_DISPATCH_CALL(CHK, CB, ...) \
+ &&"Expected config header of the dispatch-able source";
+#else
+ /**
+ * We assume by default that all configuration statements contain the 'baseline' option; however,
+ * if the dispatch-able source doesn't require it, then the dispatch-able source and the following macros
+ * need to be guarded with '#ifndef NPY_DISABLE_OPTIMIZATION'
+ */
+ #define NPY__CPU_DISPATCH_BASELINE_CALL(CB, ...) \
+ NPY_EXPAND(CB(__VA_ARGS__))
+ #define NPY__CPU_DISPATCH_CALL(CHK, CB, ...)
+#endif // !NPY_DISABLE_OPTIMIZATION
+/**
+ * Macro NPY_CPU_DISPATCH_DECLARE(LEFT, ...) is used to provide forward
+ * declarations for the exported variables and functions that defined inside
+ * the dispatch-able sources.
+ *
+ * The first argument should end with the exported function or variable name,
+ * while the macro pastes the extra arguments.
+ *
+ * Examples:
+ * #ifndef NPY_DISABLE_OPTIMIZATION
+ * #include "dispatchable_source_name.dispatch.h"
+ * #endif
+ *
+ * NPY_CPU_DISPATCH_DECLARE(void dispatch_me, (const int*, int*))
+ * NPY_CPU_DISPATCH_DECLARE(extern cb_type callback_tab, [TAB_SIZE])
+ *
+ * Assuming the provided config header is derived from a dispatch-able source
+ * that is configured with "@targets baseline sse41 vsx3 asimdhp",
+ * the targets are supported by the compiler, and they are enabled via '--cpu-dispatch',
+ * then the prototype declarations in the above example will be equivalent to the following:
+ *
+ * - x86:
+ * void dispatch_me(const int*, int*); // baseline
+ * void dispatch_me_SSE41(const int*, int*);
+ *
+ * extern cb_type callback_tab[TAB_SIZE];
+ * extern cb_type callback_tab_SSE41[TAB_SIZE];
+ *
+ * - ppc64:
+ * void dispatch_me(const int*, int*);
+ * void dispatch_me_VSX3(const int*, int*);
+ *
+ * extern cb_type callback_tab[TAB_SIZE];
+ * extern cb_type callback_tab_VSX3[TAB_SIZE];
+ *
+ * - ARM:
+ * void dispatch_me(const int*, int*);
+ * void dispatch_me_ASIMDHP(const int*, int*);
+ *
+ * extern cb_type callback_tab[TAB_SIZE];
+ * extern cb_type callback_tab_ASIMDHP[TAB_SIZE];
+ *
+ * - unsupported arch or when optimization is disabled:
+ * void dispatch_me(const int*, int*);
+ * extern cb_type callback_tab[TAB_SIZE];
+ *
+ * For runtime dispatching, see 'NPY_CPU_DISPATCH_CALL'
+ */
+#define NPY_CPU_DISPATCH_DECLARE(...) \
+ NPY__CPU_DISPATCH_CALL(NPY_CPU_DISPATCH_DECLARE_CHK_, NPY_CPU_DISPATCH_DECLARE_CB_, __VA_ARGS__) \
+ NPY__CPU_DISPATCH_BASELINE_CALL(NPY_CPU_DISPATCH_DECLARE_BASE_CB_, __VA_ARGS__)
+// Preprocessor callbacks
+#define NPY_CPU_DISPATCH_DECLARE_CB_(DUMMY, TARGET_NAME, LEFT, ...) \
+ NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__;
+#define NPY_CPU_DISPATCH_DECLARE_BASE_CB_(LEFT, ...) \
+ LEFT __VA_ARGS__;
+// Dummy CPU runtime checking
+#define NPY_CPU_DISPATCH_DECLARE_CHK_(FEATURE)
+/**
+ * Macro NPY_CPU_DISPATCH_DECLARE_XB(LEFT, ...)
+ *
+ * Same as `NPY_CPU_DISPATCH_DECLARE` but excludes the baseline declaration even
+ * if it was provided within the configuration statements.
+ */
+#define NPY_CPU_DISPATCH_DECLARE_XB(...) \
+ NPY__CPU_DISPATCH_CALL(NPY_CPU_DISPATCH_DECLARE_CHK_, NPY_CPU_DISPATCH_DECLARE_CB_, __VA_ARGS__)
+/**
+ * Macro NPY_CPU_DISPATCH_CALL(LEFT, ...) is used for runtime dispatching
+ * of the exported functions and variables within the dispatch-able sources
+ * according to the highest interested CPU feature that is supported by the
+ * running machine, among the required optimizations.
+ *
+ * The first argument should end with the exported function or variable name,
+ * while the macro pastes the extra arguments.
+ *
+ * Example:
+ * Assume we have a dispatch-able source exporting the following function:
+ *
+ * @targets baseline avx2 avx512_skx // configuration statements
+ *
+ * void NPY_CPU_DISPATCH_CURFX(dispatch_me)(const int *src, int *dst)
+ * {
+ * // the kernel
+ * }
+ *
+ * In order to call it, or to assign its pointer, from outside the dispatch-able source,
+ * you have to use this Macro as follows:
+ *
+ * // bring in the generated config header of the dispatch-able source
+ * #ifndef NPY_DISABLE_OPTIMIZATION
+ * #include "dispatchable_source_name.dispatch.h"
+ * #endif
+ * // forward declaration
+ * NPY_CPU_DISPATCH_DECLARE(dispatch_me, (const int *src, int *dst))
+ *
+ * typedef void(*func_type)(const int*, int*);
+ * func_type the_callee(const int *src, int *dst, func_type *cb)
+ * {
+ * // direct call
+ * NPY_CPU_DISPATCH_CALL(dispatch_me, (src, dst))
+ * // assign the pointer
+ * NPY_CPU_DISPATCH_CALL(*cb = dispatch_me)
+ * // return the pointer
+ * NPY_CPU_DISPATCH_CALL(return dispatch_me)
+ * }
+ */
+#define NPY_CPU_DISPATCH_CALL(...) \
+ if (0) {/*DUMMY*/} \
+ NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_CB_, __VA_ARGS__) \
+ NPY__CPU_DISPATCH_BASELINE_CALL(NPY_CPU_DISPATCH_CALL_BASE_CB_, __VA_ARGS__)
+// Preprocessor callbacks
+#define NPY_CPU_DISPATCH_CALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
+ else if (TESTED_FEATURES) { NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__; }
+#define NPY_CPU_DISPATCH_CALL_BASE_CB_(LEFT, ...) \
+ else { LEFT __VA_ARGS__; }
+/**
+ * Macro NPY_CPU_DISPATCH_CALL_XB(LEFT, ...)
+ *
+ * Same as `NPY_CPU_DISPATCH_CALL` but excludes the baseline call even
+ * if it was provided within the configuration statements.
+ */
+#define NPY_CPU_DISPATCH_CALL_XB(...) \
+ if (0) {/*DUMMY*/} \
+ NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_CB_, __VA_ARGS__)
+/**
+ * Macro NPY_CPU_DISPATCH_CALL_ALL(LEFT, ...)
+ *
+ * Same as `NPY_CPU_DISPATCH_CALL` but dispatches all the required optimizations for
+ * the exported functions and variables instead of only the highest interested one.
+ */
+#define NPY_CPU_DISPATCH_CALL_ALL(...) \
+ NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_ALL_CB_, __VA_ARGS__) \
+ NPY__CPU_DISPATCH_BASELINE_CALL(NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_, __VA_ARGS__)
+// Preprocessor callbacks
+#define NPY_CPU_DISPATCH_CALL_ALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
+ if (TESTED_FEATURES) { NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__; }
+#define NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_(LEFT, ...) \
+ { LEFT __VA_ARGS__; }
+
+#endif // NPY_CPU_DISPATCH_H_
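As a rough, self-contained illustration of the control flow these macros generate (plain C with no NumPy headers; cpu_has_avx2, dispatch_me_AVX2 and dispatch_me are invented stand-ins for the real runtime feature check and kernels), a call such as NPY_CPU_DISPATCH_CALL(dispatch_me, (&src, &dst)) unfolds into a dead 'if (0)' opener, one 'else if' per dispatched target guarded by a runtime feature check, and the baseline as the final 'else':

    #include <stdio.h>

    /* stand-in for npy_cpu_have(NPY_CPU_FEATURE_AVX2); pretend AVX2 is unavailable */
    static int cpu_has_avx2(void) { return 0; }

    /* hypothetical per-target kernel and its baseline counterpart */
    static void dispatch_me_AVX2(const int *src, int *dst) { *dst = *src * 2; }
    static void dispatch_me(const int *src, int *dst)      { *dst = *src + 1; }

    int main(void)
    {
        int src = 10, dst = 0;
        /* roughly what the expanded dispatch chain looks like */
        if (0) {/*DUMMY*/}
        else if (cpu_has_avx2()) { dispatch_me_AVX2(&src, &dst); }
        else { dispatch_me(&src, &dst); }
        printf("dst = %d\n", dst);  /* prints 11: the baseline branch was taken */
        return 0;
    }

The dead 'if (0)' opener is what lets the per-target callbacks be emitted as 'else if' branches in any number and order, with the baseline always acting as the final fallback; NPY_CPU_DISPATCH_CALL_ALL drops the 'else' chaining so every matching target runs, and the _XB variants simply omit the baseline branch.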
diff --git a/numpy/core/src/common/npy_cpu_features.c.src b/numpy/core/src/common/npy_cpu_features.c.src
index d35199760..facd27f3c 100644
--- a/numpy/core/src/common/npy_cpu_features.c.src
+++ b/numpy/core/src/common/npy_cpu_features.c.src
@@ -1,6 +1,7 @@
#include "npy_cpu_features.h"
+#include "npy_cpu_dispatch.h" // To guarantee the CPU baseline definitions are in scope.
#include "numpy/npy_common.h" // for NPY_INLINE
-#include "numpy/npy_cpu.h" // To guarantee of having CPU definitions in scope.
+#include "numpy/npy_cpu.h" // To guarantee the CPU definitions are in scope.
/******************** Private Definitions *********************/
@@ -55,6 +56,44 @@ npy_cpu_features_dict(void)
return dict;
}
+#define NPY__CPU_PYLIST_APPEND_CB(FEATURE, LIST) \
+ item = PyUnicode_FromString(NPY_TOSTRING(FEATURE)); \
+ if (item == NULL) { \
+ Py_DECREF(LIST); \
+ return NULL; \
+ } \
+ PyList_SET_ITEM(LIST, index++, item);
+
+NPY_VISIBILITY_HIDDEN PyObject *
+npy_cpu_baseline_list(void)
+{
+#if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_BASELINE_N > 0
+ PyObject *list = PyList_New(NPY_WITH_CPU_BASELINE_N), *item;
+ int index = 0;
+ if (list != NULL) {
+ NPY_WITH_CPU_BASELINE_CALL(NPY__CPU_PYLIST_APPEND_CB, list)
+ }
+ return list;
+#else
+ return PyList_New(0);
+#endif
+}
+
+NPY_VISIBILITY_HIDDEN PyObject *
+npy_cpu_dispatch_list(void)
+{
+#if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_DISPATCH_N > 0
+ PyObject *list = PyList_New(NPY_WITH_CPU_DISPATCH_N), *item;
+ int index = 0;
+ if (list != NULL) {
+ NPY_WITH_CPU_DISPATCH_CALL(NPY__CPU_PYLIST_APPEND_CB, list)
+ }
+ return list;
+#else
+ return PyList_New(0);
+#endif
+}
+
/****************************************************************
* This section is reserved to defining @npy__cpu_init_features
* for each CPU architecture, please try to keep it clean. Ty
@@ -366,7 +405,7 @@ npy__cpu_init_features(void)
return;
#endif
// We have nothing else todo
-#if defined(NPY_HAVE_NEON_ARM8) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
+#if defined(NPY_HAVE_ASIMD) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
#if defined(NPY_HAVE_FPHP) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
npy__cpu_have[NPY_CPU_FEATURE_FPHP] = 1;
#endif
diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h
index 0e8901328..fffdef38e 100644
--- a/numpy/core/src/common/npy_cpu_features.h
+++ b/numpy/core/src/common/npy_cpu_features.h
@@ -109,6 +109,48 @@ npy_cpu_have(NPY_CPU_FEATURE_##FEATURE_NAME)
*/
NPY_VISIBILITY_HIDDEN PyObject *
npy_cpu_features_dict(void);
+/*
+ * Return a new Python list containing the minimal set of required optimizations
+ * that are supported by the compiler and platform, according to the specified
+ * values of the command argument '--cpu-baseline'.
+ *
+ * This function is mainly used to implement umath's attribute '__cpu_baseline__',
+ * and the items are sorted from the lowest to the highest interest.
+ *
+ * For example, according to the default build configuration and assuming the compiler
+ * supports all the involved optimizations, the returned list should be equivalent to:
+ *
+ * On x86: ['SSE', 'SSE2']
+ * On x64: ['SSE', 'SSE2', 'SSE3']
+ * On armhf: []
+ * On aarch64: ['NEON', 'NEON_FP16', 'NEON_VFPV4', 'ASIMD']
+ * On ppc64: []
+ * On ppc64le: ['VSX', 'VSX2']
+ * On any other arch or if the optimization is disabled: []
+ */
+NPY_VISIBILITY_HIDDEN PyObject *
+npy_cpu_baseline_list(void);
+/*
+ * Return a new Python list containing the dispatched set of additional optimizations
+ * that are supported by the compiler and platform, according to the specified
+ * values of the command argument '--cpu-dispatch'.
+ *
+ * This function is mainly used to implement umath's attribute '__cpu_dispatch__',
+ * and the items are sorted from the lowest to highest interest.
+ *
+ * For example, according to the default build configuration and assuming the compiler
+ * supports all the involved optimizations, the returned list should be equivalent to:
+ *
+ * On x86: ['SSE3', 'SSSE3', 'SSE41', 'POPCNT', 'SSE42', 'AVX', 'F16C', 'FMA3', 'AVX2', 'AVX512F', ...]
+ * On x64: ['SSSE3', 'SSE41', 'POPCNT', 'SSE42', 'AVX', 'F16C', 'FMA3', 'AVX2', 'AVX512F', ...]
+ * On armhf: ['NEON', 'NEON_FP16', 'NEON_VFPV4', 'ASIMD', 'ASIMDHP', 'ASIMDDP', 'ASIMDFHM']
+ * On aarch64: ['ASIMDHP', 'ASIMDDP', 'ASIMDFHM']
+ * On ppc64: ['VSX', 'VSX2', 'VSX3']
+ * On ppc64le: ['VSX3']
+ * On any other arch or if the optimization is disabled: []
+ */
+NPY_VISIBILITY_HIDDEN PyObject *
+npy_cpu_dispatch_list(void);
#ifdef __cplusplus
}
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 84c22ba65..4190c53bd 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -4542,6 +4542,26 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
}
Py_DECREF(s);
+ s = npy_cpu_baseline_list();
+ if (s == NULL) {
+ goto err;
+ }
+ if (PyDict_SetItemString(d, "__cpu_baseline__", s) < 0) {
+ Py_DECREF(s);
+ goto err;
+ }
+ Py_DECREF(s);
+
+ s = npy_cpu_dispatch_list();
+ if (s == NULL) {
+ goto err;
+ }
+ if (PyDict_SetItemString(d, "__cpu_dispatch__", s) < 0) {
+ Py_DECREF(s);
+ goto err;
+ }
+ Py_DECREF(s);
+
s = NpyCapsule_FromVoidPtr((void *)_datetime_strings, NULL);
if (s == NULL) {
goto err;
diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src
index abc8d78c4..d08aabd64 100644
--- a/numpy/core/src/umath/_umath_tests.c.src
+++ b/numpy/core/src/umath/_umath_tests.c.src
@@ -576,6 +576,51 @@ fail:
return NULL;
}
+// Testing the utilities of the CPU dispatcher
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "_umath_tests.dispatch.h"
+#endif
+NPY_CPU_DISPATCH_DECLARE(extern const char *_umath_tests_dispatch_var)
+NPY_CPU_DISPATCH_DECLARE(const char *_umath_tests_dispatch_func, (void))
+NPY_CPU_DISPATCH_DECLARE(void _umath_tests_dispatch_attach, (PyObject *list))
+
+static PyObject *
+UMath_Tests_test_dispatch(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(dummy2))
+{
+ const char *highest_func, *highest_var;
+ NPY_CPU_DISPATCH_CALL(highest_func = _umath_tests_dispatch_func, ())
+ NPY_CPU_DISPATCH_CALL(highest_var = _umath_tests_dispatch_var)
+ const char *highest_func_xb = "nobase", *highest_var_xb = "nobase";
+ NPY_CPU_DISPATCH_CALL_XB(highest_func_xb = _umath_tests_dispatch_func, ())
+ NPY_CPU_DISPATCH_CALL_XB(highest_var_xb = _umath_tests_dispatch_var)
+
+ PyObject *dict = PyDict_New(), *item;
+ if (dict == NULL) {
+ return NULL;
+ }
+ /**begin repeat
+ * #str = func, var, func_xb, var_xb#
+ */
+ item = PyUnicode_FromString(highest_@str@);
+ if (item == NULL || PyDict_SetItemString(dict, "@str@", item) < 0) {
+ goto err;
+ }
+ /**end repeat**/
+ item = PyList_New(0);
+ if (item == NULL || PyDict_SetItemString(dict, "all", item) < 0) {
+ goto err;
+ }
+ NPY_CPU_DISPATCH_CALL_ALL(_umath_tests_dispatch_attach, (item))
+ if (PyErr_Occurred()) {
+ goto err;
+ }
+ return dict;
+err:
+ Py_XDECREF(item);
+ Py_DECREF(dict);
+ return NULL;
+}
+
static PyMethodDef UMath_TestsMethods[] = {
{"test_signature", UMath_Tests_test_signature, METH_VARARGS,
"Test signature parsing of ufunc. \n"
@@ -583,6 +628,7 @@ static PyMethodDef UMath_TestsMethods[] = {
"If fails, it returns NULL. Otherwise it returns a tuple of ufunc "
"internals. \n",
},
+ {"test_dispatch", UMath_Tests_test_dispatch, METH_NOARGS, NULL},
{NULL, NULL, 0, NULL} /* Sentinel */
};
@@ -604,6 +650,11 @@ PyMODINIT_FUNC PyInit__umath_tests(void) {
PyObject *d;
PyObject *version;
+ // Initialize CPU features
+ if (npy_cpu_init() < 0) {
+ return NULL;
+ }
+
m = PyModule_Create(&moduledef);
if (m == NULL) {
return NULL;
@@ -632,6 +683,5 @@ PyMODINIT_FUNC PyInit__umath_tests(void) {
"cannot load _umath_tests module.");
return NULL;
}
-
return m;
}
diff --git a/numpy/core/src/umath/_umath_tests.dispatch.c b/numpy/core/src/umath/_umath_tests.dispatch.c
new file mode 100644
index 000000000..d86a54411
--- /dev/null
+++ b/numpy/core/src/umath/_umath_tests.dispatch.c
@@ -0,0 +1,33 @@
+/**
+ * Testing the utilities of the CPU dispatcher
+ *
+ * @targets $werror baseline
+ * SSE2 SSE41 AVX2
+ * VSX VSX2 VSX3
+ * NEON ASIMD ASIMDHP
+ */
+#include <Python.h>
+#include "npy_cpu_dispatch.h"
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "_umath_tests.dispatch.h"
+#endif
+
+NPY_CPU_DISPATCH_DECLARE(const char *_umath_tests_dispatch_func, (void))
+NPY_CPU_DISPATCH_DECLARE(extern const char *_umath_tests_dispatch_var)
+NPY_CPU_DISPATCH_DECLARE(void _umath_tests_dispatch_attach, (PyObject *list))
+
+const char *NPY_CPU_DISPATCH_CURFX(_umath_tests_dispatch_var) = NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(var));
+const char *NPY_CPU_DISPATCH_CURFX(_umath_tests_dispatch_func)(void)
+{
+ static const char *current = NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(func));
+ return current;
+}
+
+void NPY_CPU_DISPATCH_CURFX(_umath_tests_dispatch_attach)(PyObject *list)
+{
+ PyObject *item = PyUnicode_FromString(NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(func)));
+ if (item) {
+ PyList_Append(list, item);
+ }
+}
diff --git a/numpy/core/tests/test_cpu_dispatcher.py b/numpy/core/tests/test_cpu_dispatcher.py
new file mode 100644
index 000000000..8712dee1a
--- /dev/null
+++ b/numpy/core/tests/test_cpu_dispatcher.py
@@ -0,0 +1,42 @@
+from numpy.core._multiarray_umath import __cpu_features__, __cpu_baseline__, __cpu_dispatch__
+from numpy.core import _umath_tests
+from numpy.testing import assert_equal
+
+def test_dispatcher():
+ """
+ Testing the utilities of the CPU dispatcher
+ """
+ targets = (
+ "SSE2", "SSE41", "AVX2",
+ "VSX", "VSX2", "VSX3",
+ "NEON", "ASIMD", "ASIMDHP"
+ )
+ highest_sfx = "" # no suffix for the baseline
+ all_sfx = []
+ for feature in reversed(targets):
+ # skip baseline features; by default `CCompilerOpt` does not generate separate objects
+ # for the baseline, just one object that combines all of them via the 'baseline' option
+ # within the configuration statements.
+ if feature in __cpu_baseline__:
+ continue
+ # check compiler and running machine support
+ if feature not in __cpu_dispatch__ or not __cpu_features__[feature]:
+ continue
+
+ if not highest_sfx:
+ highest_sfx = "_" + feature
+ all_sfx.append("func" + "_" + feature)
+
+ test = _umath_tests.test_dispatch()
+ assert_equal(test["func"], "func" + highest_sfx)
+ assert_equal(test["var"], "var" + highest_sfx)
+
+ if highest_sfx:
+ assert_equal(test["func_xb"], "func" + highest_sfx)
+ assert_equal(test["var_xb"], "var" + highest_sfx)
+ else:
+ assert_equal(test["func_xb"], "nobase")
+ assert_equal(test["var_xb"], "nobase")
+
+ all_sfx.append("func") # add the baseline
+ assert_equal(test["all"], all_sfx)
diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py
new file mode 100644
index 000000000..0488173ca
--- /dev/null
+++ b/numpy/distutils/ccompiler_opt.py
@@ -0,0 +1,2438 @@
+"""Provides the `CCompilerOpt` class, used for handling the CPU/hardware
+optimization, starting from parsing the command arguments, to managing the
+relation between the CPU baseline and dispatch-able features,
+also generating the required C headers and ending with compiling
+the sources with proper compiler's flags.
+
+`CCompilerOpt` doesn't provide runtime detection for the CPU features,
+instead only focuses on the compiler side, but it creates abstract C headers
+that can be used later for the final runtime dispatching process."""
+
+import sys, io, os, re, textwrap, pprint, inspect, atexit, subprocess
+
+class _Config:
+ """An abstract class that holds all configurable attributes of `CCompilerOpt`;
+ these class attributes can be used to change the default behavior
+ of `CCompilerOpt` in order to fit other requirements.
+
+ Attributes
+ ----------
+ conf_nocache : bool
+ Set True to disable memory and file cache.
+ Default is False.
+
+ conf_noopt : bool
+ Set True to force the optimization to be disabled;
+ in this case `CCompilerOpt` tends to generate all
+ expected headers in order not to break the build.
+ Default is False.
+
+ conf_cache_factors : list
+ Add extra factors to the primary caching factors. The caching factors
+ are used to determine whether changes have happened that
+ require discarding the cache and rebuilding it. The primary factors
+ are the arguments of `CCompilerOpt` and `CCompiler`'s properties (type, flags, etc.).
+ Default is a list of two items: the time of last modification
+ of `ccompiler_opt` and the value of the attribute "conf_nocache".
+
+ conf_tmp_path : str,
+ The path of temporary directory. Default is auto-created
+ temporary directory via ``tempfile.mkdtemp()``.
+
+ conf_check_path : str
+ The path of the testing files. Each added CPU feature must have a
+ **C** source file that contains at least one intrinsic or instruction
+ related to this feature, so it can be tested against the compiler.
+ Default is ``./distutils/checks``.
+
+ conf_target_groups : dict
+ Extra tokens that can be reached from dispatch-able sources through
+ the special mark ``@targets``. Default is an empty dictionary.
+
+ **Notes**:
+ - case-insensitive for tokens and group names
+ - the sign '#' must be attached to the beginning of the group name and is only used within ``@targets``
+
+ **Example**:
+ .. code-block:: console
+
+ $ echo "@targets #avx_group other_tokens" > group_inside.c
+
+ >>> CCompilerOpt.conf_target_groups["avx_group"] = \\
+ "$werror $maxopt avx2 avx512f avx512_skx"
+ >>> cco = CCompilerOpt(cc_instance)
+ >>> cco.try_dispatch(["group_inside.c"])
+
+ conf_c_prefix : str
+ The prefix of public C definitions. Default is ``"NPY_"``.
+
+ conf_c_prefix_ : str
+ The prefix of internal C definitions. Default is ``"NPY__"``.
+
+ conf_cc_flags : dict
+ Nested dictionaries defining several compiler flags
+ that are linked to some major functions; the main key
+ represents the compiler name and the sub-keys represent
+ flag names. The default already covers all supported
+ **C** compilers.
+
+ Sub-keys explained as follows:
+
+ "native": str or None
+ used by argument option `native`, to detect the current
+ machine support via the compiler.
+ "werror": str or None
+ utilized to treat warnings as errors when testing CPU features
+ against the compiler and also for the target's policy `$werror`
+ via dispatch-able sources.
+ "maxopt": str or None
+ utilized for the target's policy '$maxopt'; the value should
+ contain the maximum acceptable optimization level for the compiler,
+ e.g. `'-O3'` for gcc.
+
+ **Notes**:
+ * case-sensitive for compiler names and flags
+ * use space to separate multiple flags
+ * every flag is tested against the compiler and will be skipped
+ if it's not applicable.
+
+ conf_min_features : dict
+ A dictionary defining the CPU features used for the
+ argument option `'min'`; the key represents the CPU architecture
+ name, e.g. `'x86'`. Default values provide the best effort
+ across a wide range of user platforms.
+
+ **Note**: case-sensitive for architecture names.
+
+ conf_features : dict
+ Nested dictionaries used for identifying the CPU features;
+ the primary key is a feature name or a group name
+ that gathers several features. Default values cover all
+ supported features but leave out the major options like "flags";
+ these undefined options are handled by the method `conf_features_partial()`.
+ The default value covers almost all CPU features for *X86*, *IBM/Power64*
+ and *ARM 7/8*.
+
+ Sub-keys explained as follows:
+
+ "implies" : str or list, optional,
+ List of CPU feature names to be implied by it,
+ the feature name must be defined within `conf_features`.
+ Default is None.
+
+ "flags": str or list, optional
+ List of compiler flags. Default is None.
+
+ "detect": str or list, optional
+ List of CPU feature names that are required to be detected
+ at runtime. By default, it's the feature name itself, or the features
+ in "group" if it is specified.
+
+ "implies_detect": bool, optional
+ If True, all "detect" of implied features will be combined.
+ Default is True. see `feature_detect()`.
+
+ "group": str or list, optional
+ Same as "implies" but doesn't require the feature name to be
+ defined within `conf_features`.
+
+ "interest": int, required
+ a key for sorting CPU features
+
+ "headers": str or list, optional
+ intrinsics C header file
+
+ "disable": str, optional
+ force-disable the feature; the string value should contain the
+ reason for disabling it.
+
+ "autovec": bool or None, optional
+ True or False to declare that the CPU feature can be auto-vectorized
+ by the compiler.
+ By default (None), it is treated as True if the feature contains at
+ least one applicable flag. See `feature_can_autovec()`.
+
+ **NOTES**:
+ * a space can be used as a separator with options that support "str or list"
+ * case-sensitive for all values, and feature names must be in upper-case.
+ * if flags aren't applicable, they will be skipped rather than disabling the
+ CPU feature
+ * the CPU feature will be disabled if the compiler fails to compile
+ the test file
+ """
+ conf_nocache = False
+ conf_noopt = False
+ conf_cache_factors = None
+ conf_tmp_path = None
+ conf_check_path = os.path.join(
+ os.path.dirname(os.path.realpath(__file__)), "checks"
+ )
+ conf_target_groups = {}
+ conf_c_prefix = 'NPY_'
+ conf_c_prefix_ = 'NPY__'
+ conf_cc_flags = dict(
+ gcc = dict(
+ # native should always fail on arm and ppc64,
+ # native usually works only with x86
+ native = '-march=native',
+ opt = '-O3',
+ werror = '-Werror'
+ ),
+ clang = dict(
+ native = '-march=native',
+ opt = "-O3",
+ werror = '-Werror'
+ ),
+ icc = dict(
+ native = '-xHost',
+ opt = '-O3',
+ werror = '-Werror'
+ ),
+ iccw = dict(
+ native = '/QxHost',
+ opt = '/O3',
+ werror = '/Werror'
+ ),
+ msvc = dict(
+ native = None,
+ opt = '/O2',
+ werror = '/WX'
+ )
+ )
+ conf_min_features = dict(
+ x86 = "SSE SSE2",
+ x64 = "SSE SSE2 SSE3",
+ ppc64 = '', # play it safe
+ ppc64le = "VSX VSX2",
+ armhf = '', # play it safe
+ aarch64 = "NEON NEON_FP16 NEON_VFPV4 ASIMD"
+ )
+ conf_features = dict(
+ # X86
+ SSE = dict(
+ interest=1, headers="xmmintrin.h",
+ # enabling SSE without SSE2 is useless also
+ # it's non-optional for x86_64
+ implies="SSE2"
+ ),
+ SSE2 = dict(interest=2, implies="SSE", headers="emmintrin.h"),
+ SSE3 = dict(interest=3, implies="SSE2", headers="pmmintrin.h"),
+ SSSE3 = dict(interest=4, implies="SSE3", headers="tmmintrin.h"),
+ SSE41 = dict(interest=5, implies="SSSE3", headers="smmintrin.h"),
+ POPCNT = dict(interest=6, implies="SSE41", headers="popcntintrin.h"),
+ SSE42 = dict(interest=7, implies="POPCNT"),
+ AVX = dict(
+ interest=8, implies="SSE42", headers="immintrin.h",
+ implies_detect=False
+ ),
+ XOP = dict(interest=9, implies="AVX", headers="x86intrin.h"),
+ FMA4 = dict(interest=10, implies="AVX", headers="x86intrin.h"),
+ F16C = dict(interest=11, implies="AVX"),
+ FMA3 = dict(interest=12, implies="F16C"),
+ AVX2 = dict(interest=13, implies="F16C"),
+ AVX512F = dict(interest=20, implies="FMA3 AVX2", implies_detect=False),
+ AVX512CD = dict(interest=21, implies="AVX512F"),
+ AVX512_KNL = dict(
+ interest=40, implies="AVX512CD", group="AVX512ER AVX512PF",
+ detect="AVX512_KNL", implies_detect=False
+ ),
+ AVX512_KNM = dict(
+ interest=41, implies="AVX512_KNL",
+ group="AVX5124FMAPS AVX5124VNNIW AVX512VPOPCNTDQ",
+ detect="AVX512_KNM", implies_detect=False
+ ),
+ AVX512_SKX = dict(
+ interest=42, implies="AVX512CD", group="AVX512VL AVX512BW AVX512DQ",
+ detect="AVX512_SKX", implies_detect=False
+ ),
+ AVX512_CLX = dict(
+ interest=43, implies="AVX512_SKX", group="AVX512VNNI",
+ detect="AVX512_CLX"
+ ),
+ AVX512_CNL = dict(
+ interest=44, implies="AVX512_SKX", group="AVX512IFMA AVX512VBMI",
+ detect="AVX512_CNL", implies_detect=False
+ ),
+ AVX512_ICL = dict(
+ interest=45, implies="AVX512_CLX AVX512_CNL",
+ group="AVX512VBMI2 AVX512BITALG AVX512VPOPCNTDQ",
+ detect="AVX512_ICL", implies_detect=False
+ ),
+ # IBM/Power
+ ## Power7/ISA 2.06
+ VSX = dict(interest=1, headers="altivec.h"),
+ ## Power8/ISA 2.07
+ VSX2 = dict(interest=2, implies="VSX", implies_detect=False),
+ ## Power9/ISA 3.00
+ VSX3 = dict(interest=3, implies="VSX2", implies_detect=False),
+ # ARM
+ NEON = dict(interest=1, headers="arm_neon.h"),
+ NEON_FP16 = dict(interest=2, implies="NEON"),
+ ## FMA
+ NEON_VFPV4 = dict(interest=3, implies="NEON_FP16"),
+ ## Advanced SIMD
+ ASIMD = dict(interest=4, implies="NEON_FP16 NEON_VFPV4", implies_detect=False),
+ ## ARMv8.2 half-precision & vector arithm
+ ASIMDHP = dict(interest=5, implies="ASIMD"),
+ ## ARMv8.2 dot product
+ ASIMDDP = dict(interest=6, implies="ASIMD"),
+ ## ARMv8.2 Single & half-precision Multiply
+ ASIMDFHM = dict(interest=7, implies="ASIMDHP"),
+ )
+ def conf_features_partial(self):
+ """Return a dictionary of the CPU features supported by the platform,
+ and accumulate the rest of the undefined options from `conf_features`;
+ the returned dict follows the same rules and notes as the
+ class attribute `conf_features`, and it also overrides
+ any options that have been set in 'conf_features'.
+ """
+ if self.cc_noopt:
+ # optimization is disabled
+ return {}
+
+ on_x86 = self.cc_on_x86 or self.cc_on_x64
+ is_unix = self.cc_is_gcc or self.cc_is_clang
+
+ if on_x86 and is_unix: return dict(
+ SSE = dict(flags="-msse"),
+ SSE2 = dict(flags="-msse2"),
+ SSE3 = dict(flags="-msse3"),
+ SSSE3 = dict(flags="-mssse3"),
+ SSE41 = dict(flags="-msse4.1"),
+ POPCNT = dict(flags="-mpopcnt"),
+ SSE42 = dict(flags="-msse4.2"),
+ AVX = dict(flags="-mavx"),
+ F16C = dict(flags="-mf16c"),
+ XOP = dict(flags="-mxop"),
+ FMA4 = dict(flags="-mfma4"),
+ FMA3 = dict(flags="-mfma"),
+ AVX2 = dict(flags="-mavx2"),
+ AVX512F = dict(flags="-mavx512f"),
+ AVX512CD = dict(flags="-mavx512cd"),
+ AVX512_KNL = dict(flags="-mavx512er -mavx512pf"),
+ AVX512_KNM = dict(
+ flags="-mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq"
+ ),
+ AVX512_SKX = dict(flags="-mavx512vl -mavx512bw -mavx512dq"),
+ AVX512_CLX = dict(flags="-mavx512vnni"),
+ AVX512_CNL = dict(flags="-mavx512ifma -mavx512vbmi"),
+ AVX512_ICL = dict(
+ flags="-mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq"
+ )
+ )
+ if on_x86 and self.cc_is_icc: return dict(
+ SSE = dict(flags="-msse"),
+ SSE2 = dict(flags="-msse2"),
+ SSE3 = dict(flags="-msse3"),
+ SSSE3 = dict(flags="-mssse3"),
+ SSE41 = dict(flags="-msse4.1"),
+ POPCNT = {},
+ SSE42 = dict(flags="-msse4.2"),
+ AVX = dict(flags="-mavx"),
+ F16C = {},
+ XOP = dict(disable="Intel Compiler doesn't support it"),
+ FMA4 = dict(disable="Intel Compiler doesn't support it"),
+ # Intel Compiler doesn't support AVX2 or FMA3 independently
+ FMA3 = dict(
+ implies="F16C AVX2", flags="-march=core-avx2"
+ ),
+ AVX2 = dict(implies="FMA3", flags="-march=core-avx2"),
+ # Intel Compiler doesn't support AVX512F or AVX512CD independently
+ AVX512F = dict(
+ implies="AVX2 AVX512CD", flags="-march=common-avx512"
+ ),
+ AVX512CD = dict(
+ implies="AVX2 AVX512F", flags="-march=common-avx512"
+ ),
+ AVX512_KNL = dict(flags="-xKNL"),
+ AVX512_KNM = dict(flags="-xKNM"),
+ AVX512_SKX = dict(flags="-xSKYLAKE-AVX512"),
+ AVX512_CLX = dict(flags="-xCASCADELAKE"),
+ AVX512_CNL = dict(flags="-xCANNONLAKE"),
+ AVX512_ICL = dict(flags="-xICELAKE-CLIENT"),
+ )
+ if on_x86 and self.cc_is_iccw: return dict(
+ SSE = dict(flags="/arch:SSE"),
+ SSE2 = dict(flags="/arch:SSE2"),
+ SSE3 = dict(flags="/arch:SSE3"),
+ SSSE3 = dict(flags="/arch:SSSE3"),
+ SSE41 = dict(flags="/arch:SSE4.1"),
+ POPCNT = {},
+ SSE42 = dict(flags="/arch:SSE4.2"),
+ AVX = dict(flags="/arch:AVX"),
+ F16C = {},
+ XOP = dict(disable="Intel Compiler doesn't support it"),
+ FMA4 = dict(disable="Intel Compiler doesn't support it"),
+ # Intel Compiler doesn't support FMA3 or AVX2 independently
+ FMA3 = dict(
+ implies="F16C AVX2", flags="/arch:CORE-AVX2"
+ ),
+ AVX2 = dict(
+ implies="FMA3", flags="/arch:CORE-AVX2"
+ ),
+ # Intel Compiler doesn't support AVX512F or AVX512CD independently
+ AVX512F = dict(
+ implies="AVX2 AVX512CD", flags="/Qx:COMMON-AVX512"
+ ),
+ AVX512CD = dict(
+ implies="AVX2 AVX512F", flags="/Qx:COMMON-AVX512"
+ ),
+ AVX512_KNL = dict(flags="/Qx:KNL"),
+ AVX512_KNM = dict(flags="/Qx:KNM"),
+ AVX512_SKX = dict(flags="/Qx:SKYLAKE-AVX512"),
+ AVX512_CLX = dict(flags="/Qx:CASCADELAKE"),
+ AVX512_CNL = dict(flags="/Qx:CANNONLAKE"),
+ AVX512_ICL = dict(flags="/Qx:ICELAKE-CLIENT")
+ )
+ if on_x86 and self.cc_is_msvc: return dict(
+ SSE = dict(flags="/arch:SSE"),
+ SSE2 = dict(flags="/arch:SSE2"),
+ SSE3 = {},
+ SSSE3 = {},
+ SSE41 = {},
+ POPCNT = dict(headers="nmmintrin.h"),
+ SSE42 = {},
+ AVX = dict(flags="/arch:AVX"),
+ F16C = {},
+ XOP = dict(headers="ammintrin.h"),
+ FMA4 = dict(headers="ammintrin.h"),
+ # MSVC doesn't support FMA3 or AVX2 independently
+ FMA3 = dict(
+ implies="F16C AVX2", flags="/arch:AVX2"
+ ),
+ AVX2 = dict(
+ implies="F16C FMA3", flags="/arch:AVX2"
+ ),
+ # MSVC doesn't support AVX512F or AVX512CD independently,
+ # always generate instructions belong to (VL/VW/DQ)
+ AVX512F = dict(
+ implies="AVX2 AVX512CD AVX512_SKX", flags="/arch:AVX512"
+ ),
+ AVX512CD = dict(
+ implies="AVX512F AVX512_SKX", flags="/arch:AVX512"
+ ),
+ AVX512_KNL = dict(
+ disable="MSVC compiler doesn't support it"
+ ),
+ AVX512_KNM = dict(
+ disable="MSVC compiler doesn't support it"
+ ),
+ AVX512_SKX = dict(flags="/arch:AVX512"),
+ AVX512_CLX = {},
+ AVX512_CNL = {},
+ AVX512_ICL = {}
+ )
+
+ on_power = self.cc_on_ppc64le or self.cc_on_ppc64
+ if on_power:
+ partial = dict(
+ VSX = dict(
+ implies=("VSX2" if self.cc_on_ppc64le else ""),
+ flags="-mvsx"
+ ),
+ VSX2 = dict(
+ flags="-mcpu=power8", implies_detect=False
+ ),
+ VSX3 = dict(
+ flags="-mcpu=power9 -mtune=power9", implies_detect=False
+ )
+ )
+ if self.cc_is_clang:
+ partial["VSX"]["flags"] = "-maltivec -mvsx"
+ partial["VSX2"]["flags"] = "-mpower8-vector"
+ partial["VSX3"]["flags"] = "-mpower9-vector"
+
+ return partial
+
+ if self.cc_on_aarch64 and is_unix: return dict(
+ NEON = dict(
+ implies="NEON_FP16 NEON_VFPV4 ASIMD", autovec=True
+ ),
+ NEON_FP16 = dict(
+ implies="NEON NEON_VFPV4 ASIMD", autovec=True
+ ),
+ NEON_VFPV4 = dict(
+ implies="NEON NEON_FP16 ASIMD", autovec=True
+ ),
+ ASIMD = dict(
+ implies="NEON NEON_FP16 NEON_VFPV4", autovec=True
+ ),
+ ASIMDHP = dict(
+ flags="-march=armv8.2-a+fp16"
+ ),
+ ASIMDDP = dict(
+ flags="-march=armv8.2-a+dotprod"
+ ),
+ ASIMDFHM = dict(
+ flags="-march=armv8.2-a+fp16fml"
+ ),
+ )
+ if self.cc_on_armhf and is_unix: return dict(
+ NEON = dict(
+ flags="-mfpu=neon"
+ ),
+ NEON_FP16 = dict(
+ flags="-mfpu=neon-fp16 -mfp16-format=ieee"
+ ),
+ NEON_VFPV4 = dict(
+ flags="-mfpu=neon-vfpv4",
+ ),
+ ASIMD = dict(
+ flags="-mfpu=neon-fp-armv8 -march=armv8-a+simd",
+ ),
+ ASIMDHP = dict(
+ flags="-march=armv8.2-a+fp16"
+ ),
+ ASIMDDP = dict(
+ flags="-march=armv8.2-a+dotprod",
+ ),
+ ASIMDFHM = dict(
+ flags="-march=armv8.2-a+fp16fml"
+ )
+ )
+ # TODO: ARM MSVC
+ return {}
+
+ def __init__(self):
+ if self.conf_tmp_path is None:
+ import tempfile, shutil
+ tmp = tempfile.mkdtemp()
+ def rm_temp():
+ try:
+ shutil.rmtree(tmp)
+ except IOError:
+ pass
+ atexit.register(rm_temp)
+ self.conf_tmp_path = tmp
+
+ if self.conf_cache_factors is None:
+ self.conf_cache_factors = [
+ os.path.getmtime(__file__),
+ self.conf_nocache
+ ]
+
+class _Distutils:
+ """A helper class that provides a collection of fundamental methods
+ implemented on top of Python and NumPy Distutils.
+
+ The idea behind this class is to gather all the methods that may
+ need to be overridden when 'CCompilerOpt' is reused in an environment
+ different from the one NumPy has.
+
+ Parameters
+ ----------
+ ccompiler : `CCompiler`
+ The compiler instance returned from `distutils.ccompiler.new_compiler()`.
+ """
+ def __init__(self, ccompiler):
+ self._ccompiler = ccompiler
+
+ def dist_compile(self, sources, flags, **kwargs):
+ """Wrap CCompiler.compile()"""
+ assert(isinstance(sources, list))
+ assert(isinstance(flags, list))
+ flags = kwargs.pop("extra_postargs", []) + flags
+ return self._ccompiler.compile(
+ sources, extra_postargs=flags, **kwargs
+ )
+
+ def dist_test(self, source, flags):
+ """Return True if 'CCompiler.compile()' is able to compile
+ a source file with certain flags.
+ """
+ assert(isinstance(source, str))
+ from distutils.errors import CompileError
+ cc = self._ccompiler;
+ bk_spawn = getattr(cc, 'spawn', None)
+ if bk_spawn:
+ cc_type = getattr(self._ccompiler, "compiler_type", "")
+ if cc_type in ("msvc",):
+ setattr(cc, 'spawn', self._dist_test_spawn_paths)
+ else:
+ setattr(cc, 'spawn', self._dist_test_spawn)
+ test = False
+ try:
+ self.dist_compile(
+ [source], flags, output_dir=self.conf_tmp_path
+ )
+ test = True
+ except CompileError as e:
+ self.dist_log(str(e), stderr=True)
+ if bk_spawn:
+ setattr(cc, 'spawn', bk_spawn)
+ return test
+
+ def dist_info(self):
+ """Return a string containing all environment information, required
+ by the abstract class '_CCompiler' to discover the platform
+ environment; it is also used as a cache factor in order to detect
+ any changes from outside.
+ """
+ if hasattr(self, "_dist_info"):
+ return self._dist_info
+ # play it safe
+ cc_info = ""
+ compiler = getattr(self._ccompiler, "compiler", None)
+ if compiler is not None:
+ if isinstance(compiler, str):
+ cc_info += compiler
+ elif hasattr(compiler, "__iter__"):
+ cc_info += ' '.join(compiler)
+ # in case if 'compiler' attribute doesn't provide anything
+ cc_type = getattr(self._ccompiler, "compiler_type", "")
+ if cc_type in ("intelem", "intelemw", "mingw64"):
+ cc_info += "x86_64"
+ elif cc_type in ("intel", "intelw", "intele"):
+ cc_info += "x86"
+ elif cc_type in ("msvc", "mingw32"):
+ import platform
+ if platform.architecture()[0] == "32bit":
+ cc_info += "x86"
+ else:
+ cc_info += "x86_64"
+ else:
+ # the last hope, too bad for cross-compiling
+ import platform
+ cc_info += platform.machine()
+
+ cc_info += cc_type
+ cflags = os.environ.get("CFLAGS", "")
+ if cflags not in cc_info:
+ cc_info += cflags
+
+ self._dist_info = cc_info
+ return cc_info
+
+ @staticmethod
+ def dist_error(*args):
+ """Raise a compiler error"""
+ from distutils.errors import CompileError
+ raise CompileError(_Distutils._dist_str(*args))
+
+ @staticmethod
+ def dist_fatal(*args):
+ """Raise a distutils error"""
+ from distutils.errors import DistutilsError
+ raise DistutilsError(_Distutils._dist_str(*args))
+
+ @staticmethod
+ def dist_log(*args, stderr=False):
+ """Print a console message"""
+ from numpy.distutils import log
+ out = _Distutils._dist_str(*args)
+ if stderr:
+ log.warn(out)
+ else:
+ log.info(out)
+
+ @staticmethod
+ def dist_load_module(name, path):
+ """Load a module from file, required by the abstract class '_Cache'."""
+ from numpy.compat import npy_load_module
+ try:
+ return npy_load_module(name, path)
+ except Exception as e:
+ _Distutils.dist_log(e, stderr=True)
+ return None
+
+ @staticmethod
+ def _dist_str(*args):
+ """Return a string to print by log and errors."""
+ def to_str(arg):
+ if not isinstance(arg, str) and hasattr(arg, '__iter__'):
+ ret = []
+ for a in arg:
+ ret.append(to_str(a))
+ return '('+ ' '.join(ret) + ')'
+ return str(arg)
+
+ stack = inspect.stack()[2]
+ start = "CCompilerOpt.%s[%d] : " % (stack.function, stack.lineno)
+ out = ' '.join([
+ to_str(a)
+ for a in (*args,)
+ ])
+ return start + out
+
+ def _dist_test_spawn_paths(self, cmd, display=None):
+ """
+ Fix the MSVC SDK environment path the same way distutils does;
+ without it we get "c1: fatal error C1356: unable to find mspdbcore.dll".
+ """
+ if not hasattr(self._ccompiler, "_paths"):
+ self._dist_test_spawn(cmd)
+ return
+ old_path = os.getenv("path")
+ try:
+ os.environ["path"] = self._ccompiler._paths
+ self._dist_test_spawn(cmd)
+ finally:
+ os.environ["path"] = old_path
+
+ _dist_warn_regex = re.compile(
+ # intel and msvc compilers don't raise
+ # fatal errors when flags are wrong or unsupported
+ ".*("
+ "ignoring unknown option|" # msvc
+ "invalid argument for option" # intel
+ ").*"
+ )
+ @staticmethod
+ def _dist_test_spawn(cmd, display=None):
+ from distutils.errors import CompileError
+ try:
+ o = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
+ if isinstance(o, bytes):
+ o = o.decode()
+ if o and re.match(_Distutils._dist_warn_regex, o):
+ _Distutils.dist_error(
+ "Flags in command", cmd ,"aren't supported by the compiler"
+ ", output -> \n%s" % o
+ )
+ except subprocess.CalledProcessError as exc:
+ o = exc.output
+ s = exc.returncode
+ except OSError:
+ o = b''
+ s = 127
+ else:
+ return None
+ o = o.decode()
+ _Distutils.dist_error(
+ "Command", cmd, "failed with exit status %d output -> \n%s" % (
+ s, o
+ ))
+
+_share_cache = {}
+class _Cache:
+ """An abstract class that handles caching functionality; it provides two
+ levels of caching: in-memory, by sharing instance attributes among
+ each other, and on-disk, by storing attributes in files.
+
+ **Note**:
+ any attributes that start with ``_`` or ``conf_`` will be ignored.
+
+ Parameters
+ ----------
+ cache_path: str or None
+ The path of the cache file; if None, the file cache is disabled.
+
+ *factors:
+ The caching factors to be used in addition to `conf_cache_factors`.
+
+ Attributes
+ ----------
+ cache_private: set
+ Holds the attributes that need to be skipped by the "in-memory cache".
+
+ cache_infile: bool
+ Set while initializing this class, to indicate whether the cache was able
+ to be loaded from the cache path specified in 'cache_path'.
+ """
+
+ # skip attributes from cache
+ _cache_ignore = re.compile("^(_|conf_)")
+
+ def __init__(self, cache_path=None, *factors):
+ self.cache_me = {}
+ self.cache_private = set()
+ self.cache_infile = False
+
+ if self.conf_nocache:
+ self.dist_log("cache is disabled by `Config`")
+ return
+
+ chash = self.cache_hash(*factors, *self.conf_cache_factors)
+ if cache_path:
+ if os.path.exists(cache_path):
+ self.dist_log("load cache from file ->", cache_path)
+ cache_mod = self.dist_load_module("cache", cache_path)
+ if not cache_mod:
+ self.dist_log(
+ "unable to load the cache file as a module",
+ stderr=True
+ )
+ elif not hasattr(cache_mod, "hash") or \
+ not hasattr(cache_mod, "data"):
+ self.dist_log("invalid cache file", stderr=True)
+ elif chash == cache_mod.hash:
+ self.dist_log("hit the file cache")
+ for attr, val in cache_mod.data.items():
+ setattr(self, attr, val)
+ self.cache_infile = True
+ else:
+ self.dist_log("miss the file cache")
+
+ atexit.register(self._cache_write, cache_path, chash)
+
+ if not self.cache_infile:
+ other_cache = _share_cache.get(chash)
+ if other_cache:
+ self.dist_log("hit the memory cache")
+ for attr, val in other_cache.__dict__.items():
+ if attr in other_cache.cache_private or \
+ re.match(self._cache_ignore, attr):
+ continue
+ setattr(self, attr, val)
+
+ _share_cache[chash] = self
+
+ def __del__(self):
+ # TODO: remove the cache from share on del
+ pass
+
+ def _cache_write(self, cache_path, cache_hash):
+ # TODO: don't write if the cache doesn't change
+ self.dist_log("write cache to path ->", cache_path)
+ for attr in list(self.__dict__.keys()):
+ if re.match(self._cache_ignore, attr):
+ self.__dict__.pop(attr)
+
+ d = os.path.dirname(cache_path)
+ if not os.path.exists(d):
+ os.makedirs(d)
+
+ repr_dict = pprint.pformat(self.__dict__, compact=True)
+ with open(cache_path, "w") as f:
+ f.write(textwrap.dedent("""\
+ # AUTOGENERATED DON'T EDIT
+ # Please make changes to the code generator \
+ (distutils/ccompiler_opt.py)
+ hash = {}
+ data = \\
+ """).format(cache_hash))
+ f.write(repr_dict)
+
+ def cache_hash(self, *factors):
+ # is there a built-in non-crypto hash?
+ # sdbm
+ chash = 0
+ for f in factors:
+ for char in str(f):
+ chash = ord(char) + (chash << 6) + (chash << 16) - chash
+ chash &= 0xFFFFFFFF
+ return chash
+
+ @staticmethod
+ def me(cb):
+ """
+ A static method that can be treated as a decorator to
+ dynamically cache certain methods.
+ """
+ def cache_wrap_me(self, *args, **kwargs):
+ # good for normal args
+ cache_key = str((
+ cb.__name__, *args, *kwargs.keys(), *kwargs.values()
+ ))
+ if cache_key in self.cache_me:
+ return self.cache_me[cache_key]
+ ccb = cb(self, *args, **kwargs)
+ self.cache_me[cache_key] = ccb
+ return ccb
+ return cache_wrap_me
+
+class _CCompiler(object):
+ """A helper class for `CCompilerOpt` containing all utilities that
+ relate to the fundamental compiler functions.
+
+ Attributes
+ ----------
+ cc_on_x86 : bool
+ True when the target architecture is 32-bit x86
+ cc_on_x64 : bool
+ True when the target architecture is 64-bit x86
+ cc_on_ppc64 : bool
+ True when the target architecture is 64-bit big-endian PowerPC
+ cc_on_armhf : bool
+ True when the target architecture is 32-bit ARMv7+
+ cc_on_aarch64 : bool
+ True when the target architecture is 64-bit Armv8-a+
+ cc_on_noarch : bool
+ True when the target architecture is unknown or not supported
+ cc_is_gcc : bool
+ True if the compiler is GNU or
+ if the compiler is unknown
+ cc_is_clang : bool
+ True if the compiler is Clang
+ cc_is_icc : bool
+ True if the compiler is Intel compiler (unix like)
+ cc_is_iccw : bool
+ True if the compiler is Intel compiler (msvc like)
+ cc_is_nocc : bool
+ True if the compiler isn't supported directly;
+ note: that causes a fall-back to gcc
+ cc_has_debug : bool
+ True if the compiler has debug flags
+ cc_has_native : bool
+ True if the compiler has native flags
+ cc_noopt : bool
+ True if the compiler has definition 'DISABLE_OPT*',
+ or 'cc_on_noarch' is True
+ cc_march : str
+ The target architecture name, or "unknown" if
+ the architecture isn't supported
+ cc_name : str
+ The compiler name, or "unknown" if the compiler isn't supported
+ cc_flags : dict
+ Dictionary containing the initialized flags of `_Config.conf_cc_flags`
+ """
+ def __init__(self):
+ if hasattr(self, "cc_is_cached"):
+ return
+ to_detect = (
+ # attr regex
+ (
+ ("cc_on_x64", "^(x|x86_|amd)64"),
+ ("cc_on_x86", "^(x86|i386|i686)"),
+ ("cc_on_ppc64le", "^(powerpc|ppc)64(el|le)"),
+ ("cc_on_ppc64", "^(powerpc|ppc)64"),
+ ("cc_on_armhf", "^arm"),
+ ("cc_on_aarch64", "^aarch64"),
+ # priority is given to the start of the string;
+ # if that fails we search in the rest, because
+ # platform.machine() is appended at the end,
+ # see method 'dist_info()' for more clarification.
+ ("cc_on_x64", ".*(x|x86_|amd)64.*"),
+ ("cc_on_x86", ".*(x86|i386|i686).*"),
+ ("cc_on_ppc64le", ".*(powerpc|ppc)64(el|le).*"),
+ ("cc_on_ppc64", ".*(powerpc|ppc)64.*"),
+ ("cc_on_armhf", ".*arm.*"),
+ ("cc_on_aarch64", ".*aarch64.*"),
+ # undefined platform
+ ("cc_on_noarch", ""),
+ ),
+ (
+ ("cc_is_gcc", r".*(gcc|gnu\-g).*"),
+ ("cc_is_clang", ".*clang.*"),
+ ("cc_is_iccw", ".*(intelw|intelemw|iccw).*"), # intel msvc like
+ ("cc_is_icc", ".*(intel|icc).*"), # intel unix like
+ ("cc_is_msvc", ".*msvc.*"),
+ ("cc_is_nocc", ""),
+ ),
+ (("cc_has_debug", ".*(O0|Od|ggdb|coverage|debug:full).*"),),
+ (("cc_has_native", ".*(-march=native|-xHost|/QxHost).*"),),
+ # in case the class is run with -DNPY_DISABLE_OPTIMIZATION
+ (("cc_noopt", ".*DISABLE_OPT.*"),),
+ )
+ for section in to_detect:
+ for attr, rgex in section:
+ setattr(self, attr, False)
+
+ dist_info = self.dist_info()
+ for section in to_detect:
+ for attr, rgex in section:
+ if rgex and not re.match(rgex, dist_info, re.IGNORECASE):
+ continue
+ setattr(self, attr, True)
+ break
+
+ if self.cc_on_noarch:
+ self.dist_log(
+ "unable to detect CPU arch via compiler info, "
+ "optimization is disabled \ninfo << %s >> " % dist_info,
+ stderr=True
+ )
+ self.cc_noopt = True
+
+ if self.conf_noopt:
+ self.dist_log("Optimization is disabled by the Config", stderr=True)
+ self.cc_noopt = True
+
+ if self.cc_is_nocc:
+ """
+ mingw can be treated as gcc, and so can xlc even if it is based on clang,
+ since both still accept the same gcc optimization flags.
+ """
+ self.dist_log(
+ "unable to detect compiler name via info <<\n%s\n>> "
+ "treating it as a gcc" % dist_info,
+ stderr=True
+ )
+ self.cc_is_gcc = True
+
+ self.cc_march = "unknown"
+ for arch in ("x86", "x64", "ppc64", "ppc64le", "armhf", "aarch64"):
+ if getattr(self, "cc_on_" + arch):
+ self.cc_march = arch
+ break
+
+ self.cc_name = "unknown"
+ for name in ("gcc", "clang", "iccw", "icc", "msvc"):
+ if getattr(self, "cc_is_" + name):
+ self.cc_name = name
+ break
+
+ self.cc_flags = {}
+ compiler_flags = self.conf_cc_flags.get(self.cc_name)
+ if compiler_flags is None:
+ self.dist_fatal(
+ "undefined flag for compiler '%s', "
+ "leave an empty dict instead" % self.cc_name
+ )
+ for name, flags in compiler_flags.items():
+ self.cc_flags[name] = nflags = []
+ if flags:
+ assert(isinstance(flags, str))
+ flags = flags.split()
+ for f in flags:
+ if self.cc_test_flags([f]):
+ nflags.append(f)
+
+ self.cc_is_cached = True
+
+ @_Cache.me
+ def cc_test_flags(self, flags):
+ """
+ Returns True if the compiler supports 'flags'.
+ """
+ assert(isinstance(flags, list))
+ self.dist_log("testing flags", flags)
+ test_path = os.path.join(self.conf_check_path, "test_flags.c")
+ test = self.dist_test(test_path, flags)
+ if not test:
+ self.dist_log("testing failed", stderr=True)
+ return test
+
+ def cc_normalize_flags(self, flags):
+ """
+ Remove the conflicts caused by gathering the flags of implied features.
+
+ Parameters
+ ----------
+ 'flags' : list
+ compiler flags, sorted from the lowest to the highest interest.
+
+ Returns
+ -------
+ list, with any conflicting flags removed.
+
+ Examples
+ --------
+ >>> self.cc_normalize_flags(['-march=armv8.2-a+fp16', '-march=armv8.2-a+dotprod'])
+ ['-march=armv8.2-a+fp16+dotprod']
+
+ >>> self.cc_normalize_flags(
+ ['-msse', '-msse2', '-msse3', '-mssse3', '-msse4.1', '-msse4.2', '-mavx', '-march=core-avx2']
+ )
+ ['-march=core-avx2']
+ """
+ assert(isinstance(flags, list))
+ if self.cc_is_gcc or self.cc_is_clang or self.cc_is_icc:
+ return self._cc_normalize_unix(flags)
+
+ if self.cc_is_msvc or self.cc_is_iccw:
+ return self._cc_normalize_win(flags)
+ return flags
+
+ _cc_normalize_unix_mrgx = re.compile(
+ # 1- to check the highest of
+ r"^(-mcpu=|-march=|-x[A-Z0-9\-])"
+ )
+ _cc_normalize_unix_frgx = re.compile(
+ # 2- to remove any flag that starts with
+ # -march, -mcpu, -x(INTEL) or '-m' without '='
+ r"^(?!(-mcpu=|-march=|-x[A-Z0-9\-]))(?!-m[a-z0-9\-\.]*.$)"
+ )
+ _cc_normalize_unix_krgx = re.compile(
+ # 3- keep only the highest of
+ r"^(-mfpu|-mtune)"
+ )
+ _cc_normalize_arch_ver = re.compile(
+ r"[0-9.]"
+ )
+ def _cc_normalize_unix(self, flags):
+ def ver_flags(f):
+ # arch ver subflag
+ # -march=armv8.2-a+fp16fml
+ tokens = f.split('+')
+ ver = float('0' + ''.join(
+ re.findall(self._cc_normalize_arch_ver, tokens[0])
+ ))
+ return ver, tokens[0], tokens[1:]
+
+ if len(flags) <= 1:
+ return flags
+ # get the highest matched flag
+ for i, cur_flag in enumerate(reversed(flags)):
+ if not re.match(self._cc_normalize_unix_mrgx, cur_flag):
+ continue
+ lower_flags = flags[:-(i+1)]
+ upper_flags = flags[-i:]
+ filterd = list(filter(
+ self._cc_normalize_unix_frgx.search, lower_flags
+ ))
+ # gather subflags
+ ver, arch, subflags = ver_flags(cur_flag)
+ if ver > 0 and len(subflags) > 0:
+ for xflag in lower_flags:
+ xver, _, xsubflags = ver_flags(xflag)
+ if ver == xver:
+ subflags = xsubflags + subflags
+ cur_flag = arch + '+' + '+'.join(subflags)
+
+ flags = filterd + [cur_flag]
+ if i > 0:
+ flags += upper_flags
+ break
+
+ # to remove overridable flags
+ final_flags = []
+ matched = set()
+ for f in reversed(flags):
+ match = re.match(self._cc_normalize_unix_krgx, f)
+ if not match:
+ pass
+ elif match[0] in matched:
+ continue
+ else:
+ matched.add(match[0])
+ final_flags.insert(0, f)
+ return final_flags
+
+ _cc_normalize_win_frgx = re.compile(
+ r"^(?!(/arch\:|/Qx\:))"
+ )
+ _cc_normalize_win_mrgx = re.compile(
+ r"^(/arch|/Qx:)"
+ )
+ def _cc_normalize_win(self, flags):
+ for i, f in enumerate(reversed(flags)):
+ if not re.match(self._cc_normalize_win_mrgx, f):
+ continue
+ i += 1
+ return list(filter(
+ self._cc_normalize_win_frgx.search, flags[:-i]
+ )) + flags[-i:]
+ return flags
+
+class _Feature:
+ """A helper class for `CCompilerOpt` that managing CPU features.
+
+ Attributes
+ ----------
+ feature_supported : dict
+ Dictionary containing all CPU features supported
+ by the platform, according to the specified values in attribute
+ `_Config.conf_features` and `_Config.conf_features_partial()`
+
+ feature_min : set
+ The minimum set of required CPU features, according to
+ the specified values in attribute `_Config.conf_min_features`.
+ """
+ def __init__(self):
+ if hasattr(self, "feature_is_cached"):
+ return
+ self.feature_supported = pfeatures = self.conf_features_partial()
+ for feature_name in list(pfeatures.keys()):
+ feature = pfeatures[feature_name]
+ cfeature = self.conf_features[feature_name]
+ feature.update({
+ k:v for k,v in cfeature.items() if k not in feature
+ })
+ disabled = feature.get("disable")
+ if disabled is not None:
+ pfeatures.pop(feature_name)
+ self.dist_log(
+ "feature '%s' is disabled," % feature_name,
+ disabled, stderr=True
+ )
+ continue
+ # list is used internally for these options
+ for option in (
+ "implies", "group", "detect", "headers", "flags"
+ ) :
+ oval = feature.get(option)
+ if isinstance(oval, str):
+ feature[option] = oval.split()
+
+ self.feature_min = set()
+ min_f = self.conf_min_features.get(self.cc_march, "")
+ for F in min_f.upper().split():
+ if F in self.feature_supported:
+ self.feature_min.add(F)
+
+ self.feature_is_cached = True
+
+ def feature_names(self, names=None, force_flags=None):
+ """
+ Returns a set of CPU feature names supported by the platform and the **C** compiler.
+
+ Parameters
+ ----------
+ 'names': sequence or None, optional
+ Specify certain CPU features to test against the **C** compiler.
+ If None (default), all currently supported features will be tested.
+ **Note**: feature names must be in upper-case.
+
+ 'force_flags': list or None, optional
+ If None (default), the default compiler flags of every CPU feature will be used
+ during the test.
+ """
+ assert(
+ names is None or (
+ not isinstance(names, str) and
+ hasattr(names, "__iter__")
+ )
+ )
+ assert(force_flags is None or isinstance(force_flags, list))
+ if names is None:
+ names = self.feature_supported.keys()
+ supported_names = set()
+ for f in names:
+ if self.feature_is_supported(f, force_flags=force_flags):
+ supported_names.add(f)
+ return supported_names
+
+ def feature_is_exist(self, name):
+ """
+ Returns True if a certain feature exists and is covered within
+ `_Config.conf_features`.
+
+ Parameters
+ ----------
+ 'name': str
+ feature name in uppercase.
+ """
+ assert(name.isupper())
+ return name in self.conf_features
+
+ def feature_sorted(self, names, reverse=False):
+ """
+ Sort a list of CPU features from the lowest to the highest interest.
+
+ Parameters
+ ----------
+ 'names': sequence
+ sequence of supported feature names in uppercase.
+ 'reverse': bool, optional
+ If True, the sort is reversed (highest interest first).
+
+ Returns
+ -------
+ list, sorted CPU features
+ """
+ def sort_cb(k):
+ if isinstance(k, str):
+ return self.feature_supported[k]["interest"]
+ # multiple features
+ rank = max([self.feature_supported[f]["interest"] for f in k])
+ # FIXME: that's not a safe way to increase the rank for
+ # multi targets
+ rank += len(k) -1
+ return rank
+ return sorted(names, reverse=reverse, key=sort_cb)
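+ # Illustrative sketch (assuming the default x86 configuration, where the
+ # interest order is roughly SSE < SSE2 < ... < SSE41 < ... < AVX2):
+ #
+ #   self.feature_sorted(["AVX2", "SSE2", "SSE41"])        # -> ["SSE2", "SSE41", "AVX2"]
+ #   self.feature_sorted(["AVX2", "SSE2"], reverse=True)   # -> ["AVX2", "SSE2"]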
+
+ def feature_implies(self, names):
+ """Return a set of CPU features that implied by 'names'"""
+ def get_implies(name, _caller=[]):
+ implies = set()
+ d = self.feature_supported[name]
+ for i in d.get("implies", []):
+ implies.add(i)
+ if i in _caller:
+ # infinite recursion guard since
+ # features can imply each other
+ continue
+ _caller.append(name)
+ implies = implies.union(get_implies(i, _caller))
+ return implies
+
+ if isinstance(names, str):
+ return get_implies(names)
+
+ assert(hasattr(names, "__iter__"))
+ implies = set()
+ for n in names:
+ implies = implies.union(get_implies(n))
+ return implies
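+ # Illustrative sketch (assuming the default x86 configuration, where
+ # SSE3 implies SSE2 and SSE2 implies SSE):
+ #
+ #   self.feature_implies("SSE3")             # -> {"SSE", "SSE2"}
+ #   self.feature_implies(["SSE3", "SSE41"])  # -> also includes "SSSE3", "SSE3", ...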
+
+ def feature_implies_c(self, names):
+ """same as feature_implies() but combining 'names'"""
+ if isinstance(names, str):
+ names = set((names,))
+ else:
+ names = set(names)
+ return names.union(self.feature_implies(names))
+
+ def feature_ahead(self, names):
+ """
+ Return the list of features in 'names' after removing any
+ implied features, keeping the original ones.
+
+ Parameters
+ ----------
+ 'names': sequence
+ sequence of CPU feature names in uppercase.
+
+ Returns
+ -------
+ list of CPU features, in the same order as 'names'
+
+ Examples
+ --------
+ >>> self.feature_untied(["SSE2", "SSE3", "SSE41"])
+ ["SSE41"]
+ # assume AVX2 and FMA3 implies each other and AVX2
+ # is the highest interest
+ >>> self.feature_untied(["SSE2", "SSE3", "SSE41", "AVX2", "FMA3"])
+ ["AVX2"]
+ # assume AVX2 and FMA3 don't implies each other
+ >>> self.feature_untied(["SSE2", "SSE3", "SSE41", "AVX2", "FMA3"])
+ ["AVX2", "FMA3"]
+ """
+ assert(
+ not isinstance(names, str)
+ and hasattr(names, '__iter__')
+ )
+ implies = self.feature_implies(names)
+ ahead = [n for n in names if n not in implies]
+ if len(ahead) == 0:
+ # return the highest interested feature
+ # if all features imply each other
+ ahead = self.feature_sorted(names, reverse=True)[:1]
+ return ahead
+
+ def feature_untied(self, names):
+ """
+ Same as 'feature_ahead()' but, when two features imply each other,
+ keep only the one with the highest interest.
+
+ Parameters
+ ----------
+ 'names': sequence
+ sequence of CPU feature names in uppercase.
+
+ Returns
+ -------
+ list of CPU features, in the same order as 'names'
+
+ Examples
+ --------
+ >>> self.feature_untied(["SSE2", "SSE3", "SSE41"])
+ ["SSE2", "SSE3", "SSE41"]
+ # assume AVX2 and FMA3 imply each other
+ >>> self.feature_untied(["SSE2", "SSE3", "SSE41", "FMA3", "AVX2"])
+ ["SSE2", "SSE3", "SSE41", "AVX2"]
+ """
+ assert(
+ not isinstance(names, str)
+ and hasattr(names, '__iter__')
+ )
+ final = []
+ for n in names:
+ implies = self.feature_implies(n)
+ tied = [
+ nn for nn in final
+ if nn in implies and n in self.feature_implies(nn)
+ ]
+ if tied:
+ tied = self.feature_sorted(tied + [n])
+ if n not in tied[1:]:
+ continue
+ final.remove(tied[:1][0])
+ final.append(n)
+ return final
+
+ def feature_get_til(self, names, keyisfalse):
+ """
+ Same as `feature_implies_c()` but stop collecting implied
+ features when the feature option named by parameter
+ 'keyisfalse' is False; the returned features are also sorted.
+ """
+ def til(tnames):
+ # sort from highest to lowest interest then cut if "key" is False
+ tnames = self.feature_implies_c(tnames)
+ tnames = self.feature_sorted(tnames, reverse=True)
+ for i, n in enumerate(tnames):
+ if not self.feature_supported[n].get(keyisfalse, True):
+ tnames = tnames[:i+1]
+ break
+ return tnames
+
+ if isinstance(names, str) or len(names) <= 1:
+ names = til(names)
+ # normalize the sort
+ names.reverse()
+ return names
+
+ names = self.feature_ahead(names)
+ names = {t for n in names for t in til(n)}
+ return self.feature_sorted(names)
+
+ def feature_detect(self, names):
+ """
+ Return a list of CPU features that are required to be detected,
+ sorted from the lowest to the highest interest.
+ """
+ names = self.feature_get_til(names, "implies_detect")
+ detect = []
+ for n in names:
+ d = self.feature_supported[n]
+ detect += d.get("detect", d.get("group", [n]))
+ return detect
+
+ @_Cache.me
+ def feature_flags(self, names):
+ """
+ Return a list of CPU feature flags, sorted from the lowest
+ to the highest interest.
+ """
+ names = self.feature_sorted(self.feature_implies_c(names))
+ flags = []
+ for n in names:
+ d = self.feature_supported[n]
+ f = d.get("flags", [])
+ if not f or not self.cc_test_flags(f):
+ continue
+ flags += f
+ return self.cc_normalize_flags(flags)
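+ # Illustrative sketch (an assumption, the exact output depends on
+ # `_Config.conf_cc_flags` and the compiler): for GCC,
+ #
+ #   self.feature_flags("AVX2")
+ #
+ # gathers the flags of AVX2 and of every implied feature (sorted by
+ # interest), drops any flag the compiler rejects, and returns the
+ # normalized list, e.g. something like ['-msse', ..., '-mavx2'].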
+
+ @_Cache.me
+ def feature_test(self, name, force_flags=None):
+ """
+ Test a certain CPU feature against the compiler through its own
+ check file.
+
+ Parameters
+ ----------
+ 'name': str
+ Supported CPU feature name.
+
+ 'force_flags': list or None, optional
+ If None(default), the returned flags from `feature_flags()`
+ will be used.
+ """
+ if force_flags is None:
+ force_flags = self.feature_flags(name)
+
+ self.dist_log(
+ "testing feature '%s' with flags (%s)" % (
+ name, ' '.join(force_flags)
+ ))
+ # Each CPU feature must have a C source file that contains at
+ # least one intrinsic or instruction related to this feature.
+ test_path = os.path.join(
+ self.conf_check_path, "cpu_%s.c" % name.lower()
+ )
+ if not os.path.exists(test_path):
+ self.dist_fatal("feature test file is not exist", path)
+
+ test = self.dist_test(test_path, force_flags + self.cc_flags["werror"])
+ if not test:
+ self.dist_log("testing failed", stderr=True)
+ return test
+
+ @_Cache.me
+ def feature_is_supported(self, name, force_flags=None):
+ """
+ Check if a certain CPU feature is supported by the platform and compiler.
+
+ Parameters
+ ----------
+ 'name': str
+ CPU feature name in uppercase.
+
+ 'force_flags': list or None, optional
+ If None (default), the default compiler flags of every CPU feature will be used
+ during the test.
+ """
+ assert(name.isupper())
+ assert(force_flags is None or isinstance(force_flags, list))
+
+ supported = name in self.feature_supported
+ if supported:
+ for impl in self.feature_implies(name):
+ if not self.feature_test(impl, force_flags):
+ return False
+ if not self.feature_test(name, force_flags):
+ return False
+ return supported
+
+ @_Cache.me
+ def feature_can_autovec(self, name):
+ """
+ Check whether the feature can be auto-vectorized by the compiler.
+ """
+ assert(isinstance(name, str))
+ d = self.feature_supported[name]
+ can = d.get("autovec", None)
+ if can is None:
+ valid_flags = [
+ self.cc_test_flags([f]) for f in d.get("flags", [])
+ ]
+ can = valid_flags and any(valid_flags)
+ return can
+
+ def feature_c_preprocessor(self, feature_name, tabs=0):
+ """
+ Generate C preprocessor definitions and include headers of a CPU feature.
+
+ Parameters
+ ----------
+ 'feature_name': str
+ CPU feature name in uppercase.
+ 'tabs': int
+ if > 0, indent the generated lines by the given number of tabs.
+
+ Returns
+ -------
+ str, generated C preprocessor
+
+ Examples
+ --------
+ >>> self.feature_c_preprocessor("SSE3")
+ /** SSE3 **/
+ #define NPY_HAVE_SSE3 1
+ #include <pmmintrin.h>
+ """
+ assert(feature_name.isupper())
+ feature = self.feature_supported.get(feature_name)
+ assert(feature is not None)
+
+ prepr = [
+ "/** %s **/" % feature_name,
+ "#define %sHAVE_%s 1" % (self.conf_c_prefix, feature_name)
+ ]
+ prepr += [
+ "#include <%s>" % h for h in feature.get("headers", [])
+ ]
+ group = feature.get("group", [])
+ for f in group:
+ # Guard features in case of duplicate definitions
+ prepr += [
+ "#ifndef %sHAVE_%s" % (self.conf_c_prefix, f),
+ "\t#define %sHAVE_%s 1" % (self.conf_c_prefix, f),
+ "#endif",
+ ]
+ if tabs > 0:
+ prepr = [('\t'*tabs) + l for l in prepr]
+ return '\n'.join(prepr)
+
+class _Parse:
+ """A helper class that parsing main arguments of `CCompilerOpt`,
+ also parsing configuration statements in dispatch-able sources.
+
+ Parameters
+ ----------
+ cpu_baseline: str or None
+ minimal set of required CPU features or special options.
+
+ cpu_dispatch: str or None
+ dispatched set of additional CPU features or special options.
+
+ Special options can be:
+ - **MIN**: Enables the minimum CPU features that are utilized via `_Config.conf_min_features`
+ - **MAX**: Enables all CPU features supported by the compiler and platform.
+ - **NATIVE**: Enables all CPU features supported by the current machine.
+ - **NONE**: Enables nothing
+ - **Operand +/-**: remove or add features, useful with the options **MAX**, **MIN** and **NATIVE**.
+ NOTE: the operand '+' is only provided for readability.
+
+ NOTES:
+ - CPU features and special options are case-insensitive.
+ - Comma or space can be used as a separator.
+ - If a CPU feature is not supported by the user platform or compiler,
+ it will be skipped rather than raising a fatal error.
+ - Any CPU feature specified in 'cpu_dispatch' will be skipped if it is part of the CPU baseline features
+ - 'cpu_baseline' force enables implied features.
+
+ Attributes
+ ----------
+ parse_baseline_names : list
+ Final CPU baseline feature names (sorted from low to high)
+ parse_baseline_flags : list
+ Compiler flags of baseline features
+ parse_dispatch_names : list
+ Final CPU dispatch-able feature names (sorted from low to high)
+ parse_target_groups : dict
+ Dictionary containing initialized target groups that configured
+ through class attribute `conf_target_groups`.
+
+ The key represents the group name and the value is a tuple
+ containing three items:
+ - bool, True if group has the 'baseline' option.
+ - list, list of CPU features.
+ - list, list of extra compiler flags.
+
+ """
+ def __init__(self, cpu_baseline, cpu_dispatch):
+ self._parse_policies = dict(
+ # POLICY NAME, (HAVE, NOT HAVE, [DEB])
+ KEEP_BASELINE = (
+ None, self._parse_policy_not_keepbase,
+ []
+ ),
+ KEEP_SORT = (
+ self._parse_policy_keepsort,
+ self._parse_policy_not_keepsort,
+ []
+ ),
+ MAXOPT = (
+ self._parse_policy_maxopt, None,
+ []
+ ),
+ WERROR = (
+ self._parse_policy_werror, None,
+ []
+ ),
+ AUTOVEC = (
+ self._parse_policy_autovec, None,
+ ["MAXOPT"]
+ )
+ )
+ if hasattr(self, "parse_is_cached"):
+ return
+
+ self.parse_baseline_names = []
+ self.parse_baseline_flags = []
+ self.parse_dispatch_names = []
+ self.parse_target_groups = {}
+
+ if self.cc_noopt:
+ # skip parsing baseline and dispatch args and keep parsing target groups
+ cpu_baseline = cpu_dispatch = None
+
+ self.dist_log("check requested baseline")
+ if cpu_baseline is not None:
+ cpu_baseline = self._parse_arg_features("cpu_baseline", cpu_baseline)
+ baseline_names = self.feature_names(cpu_baseline)
+ self.parse_baseline_flags = self.feature_flags(baseline_names)
+ self.parse_baseline_names = self.feature_sorted(
+ self.feature_implies_c(baseline_names)
+ )
+
+ self.dist_log("check requested dispatch-able features")
+ if cpu_dispatch is not None:
+ cpu_dispatch_ = self._parse_arg_features("cpu_dispatch", cpu_dispatch)
+ cpu_dispatch = {
+ f for f in cpu_dispatch_
+ if f not in self.parse_baseline_names
+ }
+ conflict_baseline = cpu_dispatch_.difference(cpu_dispatch)
+ self.parse_dispatch_names = self.feature_sorted(
+ self.feature_names(cpu_dispatch)
+ )
+ if len(conflict_baseline) > 0:
+ self.dist_log(
+ "skip features", conflict_baseline, "since its part of baseline"
+ )
+
+ self.dist_log("initialize targets groups")
+ for group_name, tokens in self.conf_target_groups.items():
+ self.dist_log("parse target group", group_name)
+ GROUP_NAME = group_name.upper()
+ if not tokens or not tokens.strip():
+ # allow empty groups, useful in case there's a need
+ # to disable a certain group since '_parse_target_tokens()'
+ # requires at least one valid target
+ self.parse_target_groups[GROUP_NAME] = (
+ False, [], []
+ )
+ continue
+ has_baseline, features, extra_flags = \
+ self._parse_target_tokens(tokens)
+ self.parse_target_groups[GROUP_NAME] = (
+ has_baseline, features, extra_flags
+ )
+
+ self.parse_is_cached = True
+
+ def parse_targets(self, source):
+ """
+ Fetch and parse the configuration statements required for
+ defining the targeted CPU features. The statements should be declared
+ at the top of the source within a **C** comment and start
+ with the special mark **@targets**.
+
+ Configuration statements are a sort of keywords representing
+ CPU feature names, groups of statements and policies, combined
+ together to determine the required optimization.
+
+ Parameters
+ ----------
+ source: str
+ the path of **C** source file.
+
+ Returns
+ -------
+ - bool, True if group has the 'baseline' option
+ - list, list of CPU features
+ - list, list of extra compiler flags
+ """
+ self.dist_log("looking for '@targets' inside -> ", source)
+ # get lines between /*@targets and */
+ with open(source) as fd:
+ tokens = ""
+ max_to_reach = 1000 # good enough, isn't it?
+ start_with = "@targets"
+ start_pos = -1
+ end_with = "*/"
+ end_pos = -1
+ for current_line, line in enumerate(fd):
+ if current_line == max_to_reach:
+ self.dist_fatal("reached the max of lines")
+ break
+ if start_pos == -1:
+ start_pos = line.find(start_with)
+ if start_pos == -1:
+ continue
+ start_pos += len(start_with)
+ tokens += line
+ end_pos = line.find(end_with)
+ if end_pos != -1:
+ end_pos += len(tokens) - len(line)
+ break
+
+ if start_pos == -1:
+ self.dist_fatal("expected to find '%s' within a C comment" % start_with)
+ if end_pos == -1:
+ self.dist_fatal("expected to end with '%s'" % end_with)
+
+ tokens = tokens[start_pos:end_pos]
+ return self._parse_target_tokens(tokens)
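+ # Illustrative sketch of a dispatch-able source header (the feature names
+ # and policies here are just an example):
+ #
+ #   /*@targets
+ #    * $maxopt baseline
+ #    * SSE42 AVX2 AVX512F
+ #    * VSX2
+ #    * NEON ASIMD
+ #    */
+ #
+ # parse_targets() then returns the parsed tuple, e.g. something like
+ # (True, ["SSE42", "AVX2", "AVX512F"], [<opt flags>]) after filtering out
+ # anything that isn't part of the requested baseline/dispatch features.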
+
+ _parse_regex_arg = re.compile(r'\s|[,]|([+-])')
+ def _parse_arg_features(self, arg_name, req_features):
+ if not isinstance(req_features, str):
+ self.dist_fatal("expected a string in '%s'" % arg_name)
+
+ final_features = set()
+ # space and comma can be used as a separator
+ tokens = list(filter(None, re.split(self._parse_regex_arg, req_features)))
+ append = True # append is the default
+ for tok in tokens:
+ if tok[0] in ("#", "$"):
+ self.dist_fatal(
+ arg_name, "target groups and policies "
+ "aren't allowed from arguments, "
+ "only from dispatch-able sources"
+ )
+ if tok == '+':
+ append = True
+ continue
+ if tok == '-':
+ append = False
+ continue
+
+ TOK = tok.upper() # we use upper-case internally
+ features_to = set()
+ if TOK == "NONE":
+ pass
+ elif TOK == "NATIVE":
+ native = self.cc_flags["native"]
+ if not native:
+ self.dist_fatal(arg_name,
+ "native option isn't supported by the compiler"
+ )
+ features_to = self.feature_names(force_flags=native)
+ elif TOK == "MAX":
+ features_to = self.feature_supported.keys()
+ elif TOK == "MIN":
+ features_to = self.feature_min
+ else:
+ if TOK in self.feature_supported:
+ features_to.add(TOK)
+ else:
+ if not self.feature_is_exist(TOK):
+ self.dist_fatal(arg_name,
+ ", '%s' isn't a known feature or option" % tok
+ )
+ if append:
+ final_features = final_features.union(features_to)
+ else:
+ final_features = final_features.difference(features_to)
+
+ append = True # back to default
+
+ return final_features
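+ # Illustrative sketch (assuming an x86 build with the default minimum
+ # features): parsing
+ #
+ #   self._parse_arg_features("cpu_baseline", "min avx2 -avx512f")
+ #
+ # starts from the minimum feature set, adds AVX2, then removes AVX512F
+ # if it was collected; commas or spaces may be used as separators and
+ # matching is case-insensitive.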
+
+ _parse_regex_target = re.compile(r'\s|[*,/]|([()])')
+ def _parse_target_tokens(self, tokens):
+ assert(isinstance(tokens, str))
+ final_targets = [] # to keep it sorted as specified
+ extra_flags = []
+ has_baseline = False
+
+ skipped = set()
+ policies = set()
+ multi_target = None
+
+ tokens = list(filter(None, re.split(self._parse_regex_target, tokens)))
+ if not tokens:
+ self.dist_fatal("expected one token at least")
+
+ for tok in tokens:
+ TOK = tok.upper()
+ ch = tok[0]
+ if ch in ('+', '-'):
+ self.dist_fatal(
+ "+/- are 'not' allowed from target's groups or @targets, "
+ "only from cpu_baseline and cpu_dispatch parms"
+ )
+ elif ch == '$':
+ if multi_target is not None:
+ self.dist_fatal(
+ "policies aren't allowed inside multi-target '()'"
+ ", only CPU features"
+ )
+ policies.add(self._parse_token_policy(TOK))
+ elif ch == '#':
+ if multi_target is not None:
+ self.dist_fatal(
+ "target groups aren't allowed inside multi-target '()'"
+ ", only CPU features"
+ )
+ has_baseline, final_targets, extra_flags = \
+ self._parse_token_group(TOK, has_baseline, final_targets, extra_flags)
+ elif ch == '(':
+ if multi_target is not None:
+ self.dist_fatal("unclosed multi-target, missing ')'")
+ multi_target = set()
+ elif ch == ')':
+ if multi_target is None:
+ self.dist_fatal("multi-target opener '(' wasn't found")
+ targets = self._parse_multi_target(multi_target)
+ if targets is None:
+ skipped.add(tuple(multi_target))
+ else:
+ if len(targets) == 1:
+ targets = targets[0]
+ if targets and targets not in final_targets:
+ final_targets.append(targets)
+ multi_target = None # back to default
+ else:
+ if TOK == "BASELINE":
+ if multi_target is not None:
+ self.dist_fatal("baseline isn't allowed inside multi-target '()'")
+ has_baseline = True
+ continue
+
+ if multi_target is not None:
+ multi_target.add(TOK)
+ continue
+
+ if not self.feature_is_exist(TOK):
+ self.dist_fatal("invalid target name '%s'" % TOK)
+
+ is_enabled = (
+ TOK in self.parse_baseline_names or
+ TOK in self.parse_dispatch_names
+ )
+ if is_enabled:
+ if TOK not in final_targets:
+ final_targets.append(TOK)
+ continue
+
+ skipped.add(TOK)
+
+ if multi_target is not None:
+ self.dist_fatal("unclosed multi-target, missing ')'")
+ if skipped:
+ self.dist_log(
+ "skip targets", skipped,
+ "not part of baseline or dispatch-able features"
+ )
+
+ final_targets = self.feature_untied(final_targets)
+
+ # add policies' dependencies
+ for p in list(policies):
+ _, _, deps = self._parse_policies[p]
+ for d in deps:
+ if d in policies:
+ continue
+ self.dist_log(
+ "policy '%s' force enables '%s'" % (
+ p, d
+ ))
+ policies.add(d)
+
+ # apply the policies' filters
+ for p, (have, nhave, _) in self._parse_policies.items():
+ func = None
+ if p in policies:
+ func = have
+ self.dist_log("policy '%s' is ON" % p)
+ else:
+ func = nhave
+ if not func:
+ continue
+ has_baseline, final_targets, extra_flags = func(
+ has_baseline, final_targets, extra_flags
+ )
+
+ return has_baseline, final_targets, extra_flags
+
+ def _parse_token_policy(self, token):
+ """validate policy token"""
+ if len(token) <= 1 or token[-1:] == token[0]:
+ self.dist_fatal("'$' must stuck in the begin of policy name")
+ token = token[1:]
+ if token not in self._parse_policies:
+ self.dist_fatal(
+ "'%s' is an invalid policy name, available policies are" % token,
+ self._parse_policies.keys()
+ )
+ return token
+
+ def _parse_token_group(self, token, has_baseline, final_targets, extra_flags):
+ """validate group token"""
+ if len(token) <= 1 or token[-1:] == token[0]:
+ self.dist_fatal("'#' must stuck in the begin of group name")
+
+ token = token[1:]
+ ghas_baseline, gtargets, gextra_flags = self.parse_target_groups.get(
+ token, (False, None, [])
+ )
+ if gtargets is None:
+ self.dist_fatal(
+ "'%s' is an invalid target group name, " % token + \
+ "available target groups are",
+ self.parse_target_groups.keys()
+ )
+ if ghas_baseline:
+ has_baseline = True
+ # always keep sorting as specified
+ final_targets += [f for f in gtargets if f not in final_targets]
+ extra_flags += [f for f in gextra_flags if f not in extra_flags]
+ return has_baseline, final_targets, extra_flags
+
+ def _parse_multi_target(self, targets):
+ """validate multi targets that defined between parentheses()"""
+ # remove any implied features and keep the origins
+ if not targets:
+ self.dist_fatal("empty multi-target '()'")
+ if not all([
+ self.feature_is_exist(tar) for tar in targets
+ ]) :
+ self.dist_fatal("invalid target name in multi-target", targets)
+ if not all([
+ (
+ tar in self.parse_baseline_names or
+ tar in self.parse_dispatch_names
+ )
+ for tar in targets
+ ]) :
+ return None
+ targets = self.feature_ahead(targets)
+ if not targets:
+ return None
+ # force sort multi targets, so it can be comparable
+ targets = self.feature_sorted(targets)
+ targets = tuple(targets) # hashable
+ return targets
+
+ def _parse_policy_not_keepbase(self, has_baseline, final_targets, extra_flags):
+ """skip all baseline features"""
+ skipped = []
+ for tar in final_targets[:]:
+ is_base = False
+ if isinstance(tar, str):
+ is_base = tar in self.parse_baseline_names
+ else:
+ # multi targets
+ is_base = all([
+ f in self.parse_baseline_names
+ for f in tar
+ ])
+ if is_base:
+ skipped.append(tar)
+ final_targets.remove(tar)
+
+ if skipped:
+ self.dist_log("skip baseline features", skipped)
+
+ return has_baseline, final_targets, extra_flags
+
+ def _parse_policy_keepsort(self, has_baseline, final_targets, extra_flags):
+ """leave a notice that $keep_sort is on"""
+ self.dist_log(
+ "policy 'keep_sort' is on, dispatch-able targets", final_targets, "\n"
+ "are 'not' sorted depend on the highest interest but"
+ "as specified in the dispatch-able source or the extra group"
+ )
+ return has_baseline, final_targets, extra_flags
+
+ def _parse_policy_not_keepsort(self, has_baseline, final_targets, extra_flags):
+ """sorted depend on the highest interest"""
+ final_targets = self.feature_sorted(final_targets, reverse=True)
+ return has_baseline, final_targets, extra_flags
+
+ def _parse_policy_maxopt(self, has_baseline, final_targets, extra_flags):
+ """append the compiler optimization flags"""
+ if self.cc_has_debug:
+ self.dist_log("debug mode is detected, policy 'maxopt' is skipped.")
+ elif self.cc_noopt:
+ self.dist_log("optimization is disabled, policy 'maxopt' is skipped.")
+ else:
+ flags = self.cc_flags["opt"]
+ if not flags:
+ self.dist_log(
+ "current compiler doesn't support optimization flags, "
+ "policy 'maxopt' is skipped", stderr=True
+ )
+ else:
+ extra_flags += flags
+ return has_baseline, final_targets, extra_flags
+
+ def _parse_policy_werror(self, has_baseline, final_targets, extra_flags):
+ """force warnings to treated as errors"""
+ flags = self.cc_flags["werror"]
+ if not flags:
+ self.dist_log(
+ "current compiler doesn't support werror flags, "
+ "warnings will 'not' treated as errors", stderr=True
+ )
+ else:
+ self.dist_log("compiler warnings are treated as errors")
+ extra_flags += flags
+ return has_baseline, final_targets, extra_flags
+
+ def _parse_policy_autovec(self, has_baseline, final_targets, extra_flags):
+ """skip features that has no auto-vectorized support by compiler"""
+ skipped = []
+ for tar in final_targets[:]:
+ if isinstance(tar, str):
+ can = self.feature_can_autovec(tar)
+ else: # multiple target
+ can = all([
+ self.feature_can_autovec(t)
+ for t in tar
+ ])
+ if not can:
+ final_targets.remove(tar)
+ skipped.append(tar)
+
+ if skipped:
+ self.dist_log("skip non auto-vectorized features", skipped)
+
+ return has_baseline, final_targets, extra_flags
+
+class CCompilerOpt(_Config, _Distutils, _Cache, _CCompiler, _Feature, _Parse):
+ """
+ A helper class for `CCompiler` that aims to provide extra build options
+ for effective control of the compiler optimizations that are directly
+ related to CPU features.
+ """
+ def __init__(self, ccompiler, cpu_baseline="min", cpu_dispatch="max", cache_path=None):
+ _Config.__init__(self)
+ _Distutils.__init__(self, ccompiler)
+ _Cache.__init__(self, cache_path, self.dist_info(), cpu_baseline, cpu_dispatch)
+ _CCompiler.__init__(self)
+ _Feature.__init__(self)
+ if not self.cc_noopt and self.cc_has_native:
+ self.dist_log(
+ "native flag is specified through environment variables. "
+ "force cpu-baseline='native'"
+ )
+ cpu_baseline = "native"
+ _Parse.__init__(self, cpu_baseline, cpu_dispatch)
+ # keep the requested features untouched, need it later for report
+ # and trace purposes
+ self._requested_baseline = cpu_baseline
+ self._requested_dispatch = cpu_dispatch
+ # key is the dispatch-able source and value is a tuple
+ # contains two items (has_baseline[boolean], dispatched-features[list])
+ self.sources_status = getattr(self, "sources_status", {})
+ # every instance should have a separate one
+ self.cache_private.add("sources_status")
+ # set it at the end to make sure the cache writing is done after
+ # this class has been initialized
+ self.hit_cache = hasattr(self, "hit_cache")
+
+ def is_cached(self):
+ """
+ Returns True if the class was loaded from the cache file
+ """
+ return self.cache_infile and self.hit_cache
+
+ def cpu_baseline_flags(self):
+ """
+ Returns a list of final CPU baseline compiler flags
+ """
+ return self.parse_baseline_flags
+
+ def cpu_baseline_names(self):
+ """
+ return a list of final CPU baseline feature names
+ """
+ return self.parse_baseline_names
+
+ def cpu_dispatch_names(self):
+ """
+ return a list of final CPU dispatch feature names
+ """
+ return self.parse_dispatch_names
+
+ def try_dispatch(self, sources, src_dir=None, **kwargs):
+ """
+ Compile one or more dispatch-able sources and generate object files,
+ also generate abstract C config headers and macros that
+ are used later for the final runtime dispatching process.
+
+ The mechanism behind it is to take each source file specified
+ in 'sources' and branch it into several files depending on
+ the special configuration statements that must be declared at the
+ top of each source, containing the targeted CPU features,
+ then compile every branched source with the proper compiler flags.
+
+ Parameters
+ ----------
+ sources : list
+ Must be a list of dispatch-able sources file paths,
+ and configuration statements must be declared inside
+ each file.
+
+ src_dir : str
+ Path of parent directory for the generated headers and wrapped sources.
+ If None (default), the files will be generated in-place.
+
+ **kwargs : any
+ Arguments to pass on to the `CCompiler.compile()`
+
+ Returns
+ -------
+ list : generated object files
+
+ Raises
+ ------
+ CompileError
+ Raised by `CCompiler.compile()` on compiling failure.
+ DistutilsError
+ Raised when the sanity check of the configuration statements fails.
+
+ See Also
+ --------
+ parse_targets() :
+ Parsing the configuration statements of dispatch-able sources.
+ """
+ to_compile = {}
+ baseline_flags = self.cpu_baseline_flags()
+ include_dirs = kwargs.setdefault("include_dirs", [])
+
+ for src in sources:
+ output_dir = os.path.dirname(src)
+ if src_dir and not output_dir.startswith(src_dir):
+ output_dir = os.path.join(src_dir, output_dir)
+ if output_dir not in include_dirs:
+ include_dirs.append(output_dir)
+
+ has_baseline, targets, extra_flags = self.parse_targets(src)
+ nochange = self._generate_config(output_dir, src, targets, has_baseline)
+ for tar in targets:
+ tar_src = self._wrap_target(output_dir, src, tar, nochange=nochange)
+ flags = tuple(extra_flags + self.feature_flags(tar))
+ to_compile.setdefault(flags, []).append(tar_src)
+
+ if has_baseline:
+ flags = tuple(extra_flags + baseline_flags)
+ to_compile.setdefault(flags, []).append(src)
+
+ self.sources_status[src] = (has_baseline, targets)
+
+ # For these reasons, the sources are compiled in a separate loop:
+ # - Gathering all sources with the same flags to benefit from
+ # the parallel compiling as much as possible.
+ # - To generate all config headers of the dispatchable sources,
+ # before the compilation, in case there are dependency relationships
+ # among them.
+ objects = []
+ for flags, srcs in to_compile.items():
+ objects += self.dist_compile(srcs, list(flags), **kwargs)
+ return objects
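+ # Illustrative usage sketch (hypothetical paths), normally driven by the
+ # numpy.distutils build_clib/build_ext commands:
+ #
+ #   opt = new_ccompiler_opt(ccompiler, cpu_baseline="min", cpu_dispatch="max")
+ #   objs = opt.try_dispatch(
+ #       ["numpy/core/src/umath/_umath_tests.dispatch.c"], src_dir="build/src"
+ #   )
+ #
+ # each source is wrapped once per enabled target and compiled with the
+ # matching flags, and the resulting objects are returned for linking.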
+
+ def generate_dispatch_header(self, header_path):
+ """
+ Generate the dispatch header, which contains all definitions
+ and headers of the instruction sets for the enabled CPU baseline and
+ dispatch-able features.
+
+ It's highly recommended to take a look at the generated header
+ and at the source files generated via `try_dispatch()`
+ in order to get the full picture.
+ """
+ self.dist_log("generate CPU dispatch header: (%s)" % header_path)
+
+ baseline_names = self.cpu_baseline_names()
+ dispatch_names = self.cpu_dispatch_names()
+ baseline_len = len(baseline_names)
+ dispatch_len = len(dispatch_names)
+
+ with open(header_path, 'w') as f:
+ baseline_calls = ' \\\n'.join([
+ (
+ "\t%sWITH_CPU_EXPAND_(MACRO_TO_CALL(%s, __VA_ARGS__))"
+ ) % (self.conf_c_prefix, f)
+ for f in baseline_names
+ ])
+ dispatch_calls = ' \\\n'.join([
+ (
+ "\t%sWITH_CPU_EXPAND_(MACRO_TO_CALL(%s, __VA_ARGS__))"
+ ) % (self.conf_c_prefix, f)
+ for f in dispatch_names
+ ])
+ f.write(textwrap.dedent("""\
+ /*
+ * AUTOGENERATED DON'T EDIT
+ * Please make changes to the code generator (distutils/ccompiler_opt.py)
+ */
+ #define {pfx}WITH_CPU_BASELINE "{baseline_str}"
+ #define {pfx}WITH_CPU_DISPATCH "{dispatch_str}"
+ #define {pfx}WITH_CPU_BASELINE_N {baseline_len}
+ #define {pfx}WITH_CPU_DISPATCH_N {dispatch_len}
+ #define {pfx}WITH_CPU_EXPAND_(X) X
+ #define {pfx}WITH_CPU_BASELINE_CALL(MACRO_TO_CALL, ...) \\
+ {baseline_calls}
+ #define {pfx}WITH_CPU_DISPATCH_CALL(MACRO_TO_CALL, ...) \\
+ {dispatch_calls}
+ """).format(
+ pfx=self.conf_c_prefix, baseline_str=" ".join(baseline_names),
+ dispatch_str=" ".join(dispatch_names), baseline_len=baseline_len,
+ dispatch_len=dispatch_len, baseline_calls=baseline_calls,
+ dispatch_calls=dispatch_calls
+ ))
+ baseline_pre = ''
+ for name in baseline_names:
+ baseline_pre += self.feature_c_preprocessor(name, tabs=1) + '\n'
+
+ dispatch_pre = ''
+ for name in dispatch_names:
+ dispatch_pre += textwrap.dedent("""\
+ #ifdef {pfx}CPU_TARGET_{name}
+ {pre}
+ #endif /*{pfx}CPU_TARGET_{name}*/
+ """).format(
+ pfx=self.conf_c_prefix_, name=name, pre=self.feature_c_preprocessor(
+ name, tabs=1
+ ))
+
+ f.write(textwrap.dedent("""\
+ /******* baseline features *******/
+ {baseline_pre}
+ /******* dispatch features *******/
+ {dispatch_pre}
+ """).format(
+ pfx=self.conf_c_prefix_, baseline_pre=baseline_pre,
+ dispatch_pre=dispatch_pre
+ ))
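+ # Illustrative C-side sketch (an assumption about how the generated
+ # macros are typically consumed, shown with the default "NPY_" prefix):
+ #
+ #   #define DISPATCH_INIT(FEATURE_NAME, ...) \
+ #       /* do something once per feature */
+ #   NPY_WITH_CPU_DISPATCH_CALL(DISPATCH_INIT, void)
+ #
+ # expands DISPATCH_INIT once for every enabled dispatch-able feature.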
+
+ def report(self, full=False):
+ report = []
+ baseline_rows = []
+ dispatch_rows = []
+ report.append(("CPU baseline", baseline_rows))
+ report.append(("", ""))
+ report.append(("CPU dispatch", dispatch_rows))
+
+ ########## baseline ##########
+ if self.cc_noopt:
+ baseline_rows.append((
+ "Requested", "optimization disabled %s" % (
+ "(unsupported arch)" if self.cc_on_noarch else ""
+ )
+ ))
+ else:
+ baseline_rows.append(("Requested", repr(self._requested_baseline)))
+
+ baseline_names = self.cpu_baseline_names()
+ baseline_rows.append((
+ "Enabled", (' '.join(baseline_names) if baseline_names else "none")
+ ))
+ baseline_flags = self.cpu_baseline_flags()
+ baseline_rows.append((
+ "Flags", (' '.join(baseline_flags) if baseline_flags else "none")
+ ))
+
+ ########## dispatch ##########
+ if self.cc_noopt:
+ dispatch_rows.append((
+ "Requested", "optimization disabled %s" % (
+ "(unsupported arch)" if self.cc_on_noarch else ""
+ )
+ ))
+ else:
+ dispatch_rows.append(("Requested", repr(self._requested_dispatch)))
+
+ dispatch_names = self.cpu_dispatch_names()
+ dispatch_rows.append((
+ "Enabled", (' '.join(dispatch_names) if dispatch_names else "none")
+ ))
+ ########## Generated ##########
+ # TODO:
+ # - collect object names from 'try_dispatch()'
+ # then get the size of each object and print it
+ # - give more details about the features that were not
+ # generated due to lack of compiler support
+ # - find a better output design.
+ #
+ target_sources = {}
+ for source, (_, targets) in self.sources_status.items():
+ for tar in targets:
+ target_sources.setdefault(tar, []).append(source)
+
+ if not full or not target_sources:
+ generated = ""
+ for tar in self.feature_sorted(target_sources):
+ sources = target_sources[tar]
+ name = tar if isinstance(tar, str) else '(%s)' % ' '.join(tar)
+ generated += name + "[%d] " % len(sources)
+ dispatch_rows.append(("Generated", generated[:-1] if generated else "none"))
+ else:
+ dispatch_rows.append(("Generated", ''))
+ for tar in self.feature_sorted(target_sources):
+ sources = target_sources[tar]
+ name = tar if isinstance(tar, str) else '(%s)' % ' '.join(tar)
+ flags = ' '.join(self.feature_flags(tar))
+ implies = ' '.join(self.feature_sorted(self.feature_implies(tar)))
+ detect = ' '.join(self.feature_detect(tar))
+ dispatch_rows.append(('', ''))
+ dispatch_rows.append((name, implies))
+ dispatch_rows.append(("Flags", flags))
+ dispatch_rows.append(("Detect", detect))
+ for src in sources:
+ dispatch_rows.append(("", src))
+
+ ###############################
+ # TODO: add support for 'markdown' format
+ text = []
+ secs_len = [len(secs) for secs, _ in report]
+ cols_len = [len(col) for _, rows in report for col, _ in rows]
+ tab = ' ' * 2
+ pad = max(max(secs_len), max(cols_len))
+ for sec, rows in report:
+ if not sec:
+ text.append("") # empty line
+ continue
+ sec += ' ' * (pad - len(sec))
+ text.append(sec + tab + ': ')
+ for col, val in rows:
+ col += ' ' * (pad - len(col))
+ text.append(tab + col + ': ' + val)
+
+ return '\n'.join(text)
+
+ def _wrap_target(self, output_dir, dispatch_src, target, nochange=False):
+ assert(isinstance(target, (str, tuple)))
+ if isinstance(target, str):
+ ext_name = target_name = target
+ else:
+ # multi-target
+ ext_name = '.'.join(target)
+ target_name = '__'.join(target)
+
+ wrap_path = os.path.join(output_dir, os.path.basename(dispatch_src))
+ wrap_path = "{0}.{2}{1}".format(*os.path.splitext(wrap_path), ext_name.lower())
+ if nochange and os.path.exists(wrap_path):
+ return wrap_path
+
+ self.dist_log("wrap dispatch-able target -> ", wrap_path)
+ # sorting for readability
+ features = self.feature_sorted(self.feature_implies_c(target))
+ target_join = "#define %sCPU_TARGET_" % self.conf_c_prefix_
+ target_defs = [target_join + f for f in features]
+ target_defs = '\n'.join(target_defs)
+
+ with open(wrap_path, "w") as fd:
+ fd.write(textwrap.dedent("""\
+ /**
+ * AUTOGENERATED DON'T EDIT
+ * Please make changes to the code generator \
+ (distutils/ccompiler_opt.py)
+ */
+ #define {pfx}CPU_TARGET_MODE
+ #define {pfx}CPU_TARGET_CURRENT {target_name}
+ {target_defs}
+ #include "{path}"
+ """).format(
+ pfx=self.conf_c_prefix_, target_name=target_name,
+ path=os.path.abspath(dispatch_src), target_defs=target_defs
+ ))
+ return wrap_path
+
+ def _generate_config(self, output_dir, dispatch_src, targets, has_baseline=False):
+ config_path = os.path.basename(dispatch_src).replace(".c", ".h")
+ config_path = os.path.join(output_dir, config_path)
+ # check if targets didn't change to avoid recompiling
+ cache_hash = self.cache_hash(targets, has_baseline)
+ try:
+ with open(config_path) as f:
+ last_hash = f.readline().split("cache_hash:")
+ if len(last_hash) == 2 and int(last_hash[1]) == cache_hash:
+ return True
+ except IOError:
+ pass
+
+ self.dist_log("generate dispatched config -> ", config_path)
+ dispatch_calls = []
+ for tar in targets:
+ if isinstance(tar, str):
+ target_name = tar
+ else: # multi target
+ target_name = '__'.join([t for t in tar])
+ req_detect = self.feature_detect(tar)
+ req_detect = '&&'.join([
+ "CHK(%s)" % f for f in req_detect
+ ])
+ dispatch_calls.append(
+ "\t%sCPU_DISPATCH_EXPAND_(CB((%s), %s, __VA_ARGS__))" % (
+ self.conf_c_prefix_, req_detect, target_name
+ ))
+ dispatch_calls = ' \\\n'.join(dispatch_calls)
+
+ if has_baseline:
+ baseline_calls = (
+ "\t%sCPU_DISPATCH_EXPAND_(CB(__VA_ARGS__))"
+ ) % self.conf_c_prefix_
+ else:
+ baseline_calls = ''
+
+ with open(config_path, "w") as fd:
+ fd.write(textwrap.dedent("""\
+ // cache_hash:{cache_hash}
+ /**
+ * AUTOGENERATED DON'T EDIT
+ * Please make changes to the code generator (distutils/ccompiler_opt.py)
+ */
+ #ifndef {pfx}CPU_DISPATCH_EXPAND_
+ #define {pfx}CPU_DISPATCH_EXPAND_(X) X
+ #endif
+ #undef {pfx}CPU_DISPATCH_BASELINE_CALL
+ #undef {pfx}CPU_DISPATCH_CALL
+ #define {pfx}CPU_DISPATCH_BASELINE_CALL(CB, ...) \\
+ {baseline_calls}
+ #define {pfx}CPU_DISPATCH_CALL(CHK, CB, ...) \\
+ {dispatch_calls}
+ """).format(
+ pfx=self.conf_c_prefix_, baseline_calls=baseline_calls,
+ dispatch_calls=dispatch_calls, cache_hash=cache_hash
+ ))
+ return False
+
+def new_ccompiler_opt(compiler, **kwargs):
+ """
+ Create a new instance of 'CCompilerOpt' and generate the dispatch header
+ inside NumPy source dir.
+
+ Parameters
+ ----------
+ 'compiler' : CCompiler instance
+ '**kwargs': passed as-is to `CCompilerOpt(...)`
+
+ Returns
+ -------
+ new instance of CCompilerOpt
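+
+ Examples
+ --------
+ A hypothetical usage sketch (the real callers are the numpy.distutils
+ build commands):
+
+ >>> opt = new_ccompiler_opt(compiler, cpu_baseline="min", cpu_dispatch="max")
+ >>> print(opt.report())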
+ """
+ opt = CCompilerOpt(compiler, **kwargs)
+ npy_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+ header_dir = os.path.join(npy_path, *("core/src/common".split("/")))
+ header_path = os.path.join(header_dir, "_cpu_dispatch.h")
+ if not os.path.exists(header_path) or not opt.is_cached():
+ if not os.path.exists(header_dir):
+ opt.dist_log(
+ "dispatch header dir '%s' isn't exist, creating it" % header_dir,
+ stderr=True
+ )
+ os.makedirs(header_dir)
+ opt.generate_dispatch_header(header_path)
+ return opt
diff --git a/numpy/distutils/checks/cpu_asimd.c b/numpy/distutils/checks/cpu_asimd.c
new file mode 100644
index 000000000..8df556b6c
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimd.c
@@ -0,0 +1,25 @@
+#ifdef _MSC_VER
+ #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+ float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f);
+ /* MAXMIN */
+ int ret = (int)vgetq_lane_f32(vmaxnmq_f32(v1, v2), 0);
+ ret += (int)vgetq_lane_f32(vminnmq_f32(v1, v2), 0);
+ /* ROUNDING */
+ ret += (int)vgetq_lane_f32(vrndq_f32(v1), 0);
+#ifdef __aarch64__
+ {
+ float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0);
+ /* MAXMIN */
+ ret += (int)vgetq_lane_f64(vmaxnmq_f64(vd1, vd2), 0);
+ ret += (int)vgetq_lane_f64(vminnmq_f64(vd1, vd2), 0);
+ /* ROUNDING */
+ ret += (int)vgetq_lane_f64(vrndq_f64(vd1), 0);
+ }
+#endif
+ return ret;
+}
diff --git a/numpy/distutils/checks/cpu_asimddp.c b/numpy/distutils/checks/cpu_asimddp.c
new file mode 100644
index 000000000..0158d1354
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimddp.c
@@ -0,0 +1,15 @@
+#ifdef _MSC_VER
+ #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+ uint8x16_t v1 = vdupq_n_u8((unsigned char)1), v2 = vdupq_n_u8((unsigned char)2);
+ uint32x4_t va = vdupq_n_u32(3);
+ int ret = (int)vgetq_lane_u32(vdotq_u32(va, v1, v2), 0);
+#ifdef __aarch64__
+ ret += (int)vgetq_lane_u32(vdotq_laneq_u32(va, v1, v2, 0), 0);
+#endif
+ return ret;
+}
diff --git a/numpy/distutils/checks/cpu_asimdfhm.c b/numpy/distutils/checks/cpu_asimdfhm.c
new file mode 100644
index 000000000..bb437aa40
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimdfhm.c
@@ -0,0 +1,17 @@
+#ifdef _MSC_VER
+ #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+ float16x8_t vhp = vdupq_n_f16((float16_t)1);
+ float16x4_t vlhp = vdup_n_f16((float16_t)1);
+ float32x4_t vf = vdupq_n_f32(1.0f);
+ float32x2_t vlf = vdup_n_f32(1.0f);
+
+ int ret = (int)vget_lane_f32(vfmlal_low_u32(vlf, vlhp, vlhp), 0);
+ ret += (int)vgetq_lane_f32(vfmlslq_high_u32(vf, vhp, vhp), 0);
+
+ return ret;
+}
diff --git a/numpy/distutils/checks/cpu_asimdhp.c b/numpy/distutils/checks/cpu_asimdhp.c
new file mode 100644
index 000000000..80b94000f
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimdhp.c
@@ -0,0 +1,14 @@
+#ifdef _MSC_VER
+ #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+ float16x8_t vhp = vdupq_n_f16((float16_t)-1);
+ float16x4_t vlhp = vdup_n_f16((float16_t)-1);
+
+ int ret = (int)vgetq_lane_f16(vabdq_f16(vhp, vhp), 0);
+ ret += (int)vget_lane_f16(vabd_f16(vlhp, vlhp), 0);
+ return ret;
+}
diff --git a/numpy/distutils/checks/cpu_avx.c b/numpy/distutils/checks/cpu_avx.c
new file mode 100644
index 000000000..737c0d2e9
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ __m256 a = _mm256_add_ps(_mm256_setzero_ps(), _mm256_setzero_ps());
+ return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx2.c b/numpy/distutils/checks/cpu_avx2.c
new file mode 100644
index 000000000..dfb11fd79
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx2.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ __m256i a = _mm256_abs_epi16(_mm256_setzero_si256());
+ return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_clx.c b/numpy/distutils/checks/cpu_avx512_clx.c
new file mode 100644
index 000000000..71dad83a7
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_clx.c
@@ -0,0 +1,8 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ /* VNNI */
+ __m512i a = _mm512_dpbusd_epi32(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512());
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_cnl.c b/numpy/distutils/checks/cpu_avx512_cnl.c
new file mode 100644
index 000000000..dfab4436d
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_cnl.c
@@ -0,0 +1,10 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ /* IFMA */
+ __m512i a = _mm512_madd52hi_epu64(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512());
+ /* VBMI */
+ a = _mm512_permutex2var_epi8(a, _mm512_setzero_si512(), _mm512_setzero_si512());
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_icl.c b/numpy/distutils/checks/cpu_avx512_icl.c
new file mode 100644
index 000000000..cf2706b3b
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_icl.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ /* VBMI2 */
+ __m512i a = _mm512_shrdv_epi64(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512());
+ /* BITALG */
+ a = _mm512_popcnt_epi8(a);
+ /* VPOPCNTDQ */
+ a = _mm512_popcnt_epi64(a);
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_knl.c b/numpy/distutils/checks/cpu_avx512_knl.c
new file mode 100644
index 000000000..0699f37a6
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_knl.c
@@ -0,0 +1,11 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ int base[128];
+ /* ER */
+ __m512i a = _mm512_castpd_si512(_mm512_exp2a23_pd(_mm512_setzero_pd()));
+ /* PF */
+ _mm512_mask_prefetch_i64scatter_pd(base, _mm512_cmpeq_epi64_mask(a, a), a, 1, _MM_HINT_T1);
+ return base[0];
+}
diff --git a/numpy/distutils/checks/cpu_avx512_knm.c b/numpy/distutils/checks/cpu_avx512_knm.c
new file mode 100644
index 000000000..db61b4bfa
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_knm.c
@@ -0,0 +1,17 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ __m512i a = _mm512_setzero_si512();
+ __m512 b = _mm512_setzero_ps();
+
+ /* 4FMAPS */
+ b = _mm512_4fmadd_ps(b, b, b, b, b, NULL);
+ /* 4VNNIW */
+ a = _mm512_4dpwssd_epi32(a, a, a, a, a, NULL);
+ /* VPOPCNTDQ */
+ a = _mm512_popcnt_epi64(a);
+
+ a = _mm512_add_epi32(a, _mm512_castps_si512(b));
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_skx.c b/numpy/distutils/checks/cpu_avx512_skx.c
new file mode 100644
index 000000000..1d5e15b5e
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_skx.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ /* VL */
+ __m256i a = _mm256_abs_epi64(_mm256_setzero_si256());
+ /* DQ */
+ __m512i b = _mm512_broadcast_i32x8(a);
+ /* BW */
+ b = _mm512_abs_epi16(b);
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(b));
+}
diff --git a/numpy/distutils/checks/cpu_avx512cd.c b/numpy/distutils/checks/cpu_avx512cd.c
new file mode 100644
index 000000000..61bef6b82
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512cd.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ __m512i a = _mm512_lzcnt_epi32(_mm512_setzero_si512());
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512f.c b/numpy/distutils/checks/cpu_avx512f.c
new file mode 100644
index 000000000..f60cc09dd
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512f.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+ __m512i a = _mm512_abs_epi32(_mm512_setzero_si512());
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_f16c.c b/numpy/distutils/checks/cpu_f16c.c
new file mode 100644
index 000000000..a5a343e2d
--- /dev/null
+++ b/numpy/distutils/checks/cpu_f16c.c
@@ -0,0 +1,9 @@
+#include <emmintrin.h>
+#include <immintrin.h>
+
+int main(void)
+{
+ __m128 a = _mm_cvtph_ps(_mm_setzero_si128());
+ __m256 a8 = _mm256_cvtph_ps(_mm_setzero_si128());
+ return (int)(_mm_cvtss_f32(a) + _mm_cvtss_f32(_mm256_castps256_ps128(a8)));
+}
diff --git a/numpy/distutils/checks/cpu_fma3.c b/numpy/distutils/checks/cpu_fma3.c
new file mode 100644
index 000000000..cf34c6cb1
--- /dev/null
+++ b/numpy/distutils/checks/cpu_fma3.c
@@ -0,0 +1,8 @@
+#include <xmmintrin.h>
+#include <immintrin.h>
+
+int main(void)
+{
+ __m256 a = _mm256_fmadd_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps());
+ return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
diff --git a/numpy/distutils/checks/cpu_fma4.c b/numpy/distutils/checks/cpu_fma4.c
new file mode 100644
index 000000000..1ad717033
--- /dev/null
+++ b/numpy/distutils/checks/cpu_fma4.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+#ifdef _MSC_VER
+ #include <ammintrin.h>
+#else
+ #include <x86intrin.h>
+#endif
+
+int main(void)
+{
+ __m256 a = _mm256_macc_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps());
+ return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
diff --git a/numpy/distutils/checks/cpu_neon.c b/numpy/distutils/checks/cpu_neon.c
new file mode 100644
index 000000000..4eab1f384
--- /dev/null
+++ b/numpy/distutils/checks/cpu_neon.c
@@ -0,0 +1,15 @@
+#ifdef _MSC_VER
+ #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+ float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f);
+ int ret = (int)vgetq_lane_f32(vmulq_f32(v1, v2), 0);
+#ifdef __aarch64__
+ float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0);
+ ret += (int)vgetq_lane_f64(vmulq_f64(vd1, vd2), 0);
+#endif
+ return ret;
+}
diff --git a/numpy/distutils/checks/cpu_neon_fp16.c b/numpy/distutils/checks/cpu_neon_fp16.c
new file mode 100644
index 000000000..745d2e793
--- /dev/null
+++ b/numpy/distutils/checks/cpu_neon_fp16.c
@@ -0,0 +1,11 @@
+#ifdef _MSC_VER
+ #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+ short z4[] = {0, 0, 0, 0, 0, 0, 0, 0};
+ float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16((const short*)z4));
+ return (int)vgetq_lane_f32(v_z4, 0);
+}
diff --git a/numpy/distutils/checks/cpu_neon_vfpv4.c b/numpy/distutils/checks/cpu_neon_vfpv4.c
new file mode 100644
index 000000000..45f7b5d69
--- /dev/null
+++ b/numpy/distutils/checks/cpu_neon_vfpv4.c
@@ -0,0 +1,19 @@
+#ifdef _MSC_VER
+ #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+ float32x4_t v1 = vdupq_n_f32(1.0f);
+ float32x4_t v2 = vdupq_n_f32(2.0f);
+ float32x4_t v3 = vdupq_n_f32(3.0f);
+ int ret = (int)vgetq_lane_f32(vfmaq_f32(v1, v2, v3), 0);
+#ifdef __aarch64__
+ float64x2_t vd1 = vdupq_n_f64(1.0);
+ float64x2_t vd2 = vdupq_n_f64(2.0);
+ float64x2_t vd3 = vdupq_n_f64(3.0);
+ ret += (int)vgetq_lane_f64(vfmaq_f64(vd1, vd2, vd3), 0);
+#endif
+ return ret;
+}
diff --git a/numpy/distutils/checks/cpu_popcnt.c b/numpy/distutils/checks/cpu_popcnt.c
new file mode 100644
index 000000000..e6a80fb40
--- /dev/null
+++ b/numpy/distutils/checks/cpu_popcnt.c
@@ -0,0 +1,23 @@
+#ifdef _MSC_VER
+ #include <nmmintrin.h>
+#else
+ #include <popcntintrin.h>
+#endif
+
+int main(void)
+{
+ long long a = 0;
+ int b;
+#ifdef _MSC_VER
+ #ifdef _M_X64
+ a = _mm_popcnt_u64(1);
+ #endif
+ b = _mm_popcnt_u32(1);
+#else
+ #ifdef __x86_64__
+ a = __builtin_popcountll(1);
+ #endif
+ b = __builtin_popcount(1);
+#endif
+ return (int)a + b;
+}
diff --git a/numpy/distutils/checks/cpu_sse.c b/numpy/distutils/checks/cpu_sse.c
new file mode 100644
index 000000000..bb98bf63c
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse.c
@@ -0,0 +1,7 @@
+#include <xmmintrin.h>
+
+int main(void)
+{
+ __m128 a = _mm_add_ps(_mm_setzero_ps(), _mm_setzero_ps());
+ return (int)_mm_cvtss_f32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse2.c b/numpy/distutils/checks/cpu_sse2.c
new file mode 100644
index 000000000..658afc9b4
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse2.c
@@ -0,0 +1,7 @@
+#include <emmintrin.h>
+
+int main(void)
+{
+ __m128i a = _mm_add_epi16(_mm_setzero_si128(), _mm_setzero_si128());
+ return _mm_cvtsi128_si32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse3.c b/numpy/distutils/checks/cpu_sse3.c
new file mode 100644
index 000000000..aece1e601
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse3.c
@@ -0,0 +1,7 @@
+#include <pmmintrin.h>
+
+int main(void)
+{
+ __m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps());
+ return (int)_mm_cvtss_f32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse41.c b/numpy/distutils/checks/cpu_sse41.c
new file mode 100644
index 000000000..bfdb9feac
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse41.c
@@ -0,0 +1,7 @@
+#include <smmintrin.h>
+
+int main(void)
+{
+ __m128 a = _mm_floor_ps(_mm_setzero_ps());
+ return (int)_mm_cvtss_f32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse42.c b/numpy/distutils/checks/cpu_sse42.c
new file mode 100644
index 000000000..24f5d93fe
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse42.c
@@ -0,0 +1,7 @@
+#include <smmintrin.h>
+
+int main(void)
+{
+ __m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps());
+ return (int)_mm_cvtss_f32(a);
+}
diff --git a/numpy/distutils/checks/cpu_ssse3.c b/numpy/distutils/checks/cpu_ssse3.c
new file mode 100644
index 000000000..ad0abc1e6
--- /dev/null
+++ b/numpy/distutils/checks/cpu_ssse3.c
@@ -0,0 +1,7 @@
+#include <tmmintrin.h>
+
+int main(void)
+{
+ __m128i a = _mm_hadd_epi16(_mm_setzero_si128(), _mm_setzero_si128());
+ return (int)_mm_cvtsi128_si32(a);
+}
diff --git a/numpy/distutils/checks/cpu_vsx.c b/numpy/distutils/checks/cpu_vsx.c
new file mode 100644
index 000000000..0b3f30d6a
--- /dev/null
+++ b/numpy/distutils/checks/cpu_vsx.c
@@ -0,0 +1,21 @@
+#ifndef __VSX__
+ #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
+ #define vsx_ld vec_vsx_ld
+ #define vsx_st vec_vsx_st
+#else
+ #define vsx_ld vec_xl
+ #define vsx_st vec_xst
+#endif
+
+int main(void)
+{
+ unsigned int zout[4];
+ unsigned int z4[] = {0, 0, 0, 0};
+ __vector unsigned int v_z4 = vsx_ld(0, z4);
+ vsx_st(v_z4, 0, zout);
+ return zout[0];
+}
diff --git a/numpy/distutils/checks/cpu_vsx2.c b/numpy/distutils/checks/cpu_vsx2.c
new file mode 100644
index 000000000..410fb29d6
--- /dev/null
+++ b/numpy/distutils/checks/cpu_vsx2.c
@@ -0,0 +1,13 @@
+#ifndef __VSX__
+ #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+typedef __vector unsigned long long v_uint64x2;
+
+int main(void)
+{
+ v_uint64x2 z2 = (v_uint64x2){0, 0};
+ z2 = (v_uint64x2)vec_cmpeq(z2, z2);
+ return (int)vec_extract(z2, 0);
+}
diff --git a/numpy/distutils/checks/cpu_vsx3.c b/numpy/distutils/checks/cpu_vsx3.c
new file mode 100644
index 000000000..857526535
--- /dev/null
+++ b/numpy/distutils/checks/cpu_vsx3.c
@@ -0,0 +1,13 @@
+#ifndef __VSX__
+ #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+typedef __vector unsigned int v_uint32x4;
+
+int main(void)
+{
+ v_uint32x4 z4 = (v_uint32x4){0, 0, 0, 0};
+ z4 = vec_absd(z4, z4);
+ return (int)vec_extract(z4, 0);
+}
diff --git a/numpy/distutils/checks/cpu_xop.c b/numpy/distutils/checks/cpu_xop.c
new file mode 100644
index 000000000..51d70cf2b
--- /dev/null
+++ b/numpy/distutils/checks/cpu_xop.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+#ifdef _MSC_VER
+ #include <ammintrin.h>
+#else
+ #include <x86intrin.h>
+#endif
+
+int main(void)
+{
+ __m128i a = _mm_comge_epu32(_mm_setzero_si128(), _mm_setzero_si128());
+ return _mm_cvtsi128_si32(a);
+}
diff --git a/numpy/distutils/checks/test_flags.c b/numpy/distutils/checks/test_flags.c
new file mode 100644
index 000000000..4cd09d42a
--- /dev/null
+++ b/numpy/distutils/checks/test_flags.c
@@ -0,0 +1 @@
+int test_flags;
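
The cpu_*.c files above are single-feature probes: the new CCompilerOpt layer compiles each one with the candidate flags to decide whether the corresponding SIMD feature can be enabled for the selected compiler, while test_flags.c is an empty translation unit apparently used to verify that a set of flags is accepted on its own. The unit tests added later in this diff simulate a failing probe by "trapping" its file name; a minimal sketch of that mechanism, assuming the FakeCCompilerOpt helper from numpy/distutils/tests/test_ccompiler_opt.py (added further down) is importable, and using an illustrative feature and regex:

    from numpy.distutils.tests.test_ccompiler_opt import FakeCCompilerOpt

    FakeCCompilerOpt.conf_nocache = True        # skip the on-disk cache, as the tests do
    FakeCCompilerOpt.fake_info = "x86_gcc"      # pretend to be gcc on x86
    # pretend the AVX512F probe fails to compile; no real compiler is invoked
    opt = FakeCCompilerOpt(trap_files=r".*cpu_avx512f\.c",
                           cpu_baseline="min", cpu_dispatch="max")
    # avx512f and the features built on top of it should drop out of the dispatch set
    print(' '.join(opt.cpu_dispatch_names()))
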
diff --git a/numpy/distutils/command/build.py b/numpy/distutils/command/build.py
index a156a7c6e..60ba4c917 100644
--- a/numpy/distutils/command/build.py
+++ b/numpy/distutils/command/build.py
@@ -16,6 +16,12 @@ class build(old_build):
"specify the Fortran compiler type"),
('warn-error', None,
"turn all warnings into errors (-Werror)"),
+ ('cpu-baseline=', None,
+ "specify a list of enabled baseline CPU optimizations"),
+ ('cpu-dispatch=', None,
+ "specify a list of dispatched CPU optimizations"),
+ ('disable-optimization', None,
+ "disable CPU optimized code(dispatch,simd,fast...)"),
]
help_options = old_build.help_options + [
@@ -27,6 +33,9 @@ class build(old_build):
old_build.initialize_options(self)
self.fcompiler = None
self.warn_error = False
+ self.cpu_baseline = "min"
+ self.cpu_dispatch = "max -xop -fma4" # drop AMD legacy features by default
+ self.disable_optimization = False
def finalize_options(self):
build_scripts = self.build_scripts
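
The build command only stores the three new options; cpu-baseline defaults to "min" and cpu-dispatch to "max -xop -fma4", and build_clib/build_ext below inherit them through set_undefined_options. They can be overridden on the command line, e.g. python setup.py build --cpu-baseline=sse42 --cpu-dispatch="max -avx512f", or switched off entirely with --disable-optimization. A rough standalone sketch of the CCompilerOpt object the build commands construct from these options (it really compiles the cpu_*.c probes above, so it assumes an already-usable C compiler; the cache path here is illustrative):

    import os, tempfile
    from distutils.ccompiler import new_compiler
    from distutils.sysconfig import customize_compiler
    from numpy.distutils.ccompiler_opt import new_ccompiler_opt

    compiler = new_compiler()
    customize_compiler(compiler)
    opt = new_ccompiler_opt(
        compiler=compiler,
        cpu_baseline="min",              # build.py default
        cpu_dispatch="max -xop -fma4",   # build.py default (drops AMD legacy features)
        cache_path=os.path.join(tempfile.mkdtemp(), "ccompiler_opt_cache.py"),
    )
    print(' '.join(opt.cpu_baseline_names()))   # features compiled unconditionally
    print(' '.join(opt.cpu_dispatch_names()))   # features generated for runtime dispatch
    print(opt.report(full=True))                # same report the build commands log at exit
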
diff --git a/numpy/distutils/command/build_clib.py b/numpy/distutils/command/build_clib.py
index f6a84e351..87345adbc 100644
--- a/numpy/distutils/command/build_clib.py
+++ b/numpy/distutils/command/build_clib.py
@@ -13,6 +13,7 @@ from numpy.distutils.misc_util import (
filter_sources, get_lib_source_files, get_numpy_include_dirs,
has_cxx_sources, has_f_sources, is_sequence
)
+from numpy.distutils.ccompiler_opt import new_ccompiler_opt
# Fix Python distutils bug sf #1718574:
_l = old_build_clib.user_options
@@ -34,9 +35,16 @@ class build_clib(old_build_clib):
"number of parallel jobs"),
('warn-error', None,
"turn all warnings into errors (-Werror)"),
+ ('cpu-baseline=', None,
+ "specify a list of enabled baseline CPU optimizations"),
+ ('cpu-dispatch=', None,
+ "specify a list of dispatched CPU optimizations"),
+ ('disable-optimization', None,
+ "disable CPU optimized code(dispatch,simd,fast...)"),
]
- boolean_options = old_build_clib.boolean_options + ['inplace', 'warn-error']
+ boolean_options = old_build_clib.boolean_options + \
+ ['inplace', 'warn-error', 'disable-optimization']
def initialize_options(self):
old_build_clib.initialize_options(self)
@@ -44,6 +52,10 @@ class build_clib(old_build_clib):
self.inplace = 0
self.parallel = None
self.warn_error = None
+ self.cpu_baseline = None
+ self.cpu_dispatch = None
+ self.disable_optimization = None
+
def finalize_options(self):
if self.parallel:
@@ -55,6 +67,9 @@ class build_clib(old_build_clib):
self.set_undefined_options('build',
('parallel', 'parallel'),
('warn_error', 'warn_error'),
+ ('cpu_baseline', 'cpu_baseline'),
+ ('cpu_dispatch', 'cpu_dispatch'),
+ ('disable_optimization', 'disable_optimization')
)
def have_f_sources(self):
@@ -102,6 +117,25 @@ class build_clib(old_build_clib):
self.compiler.show_customization()
+ if not self.disable_optimization:
+ opt_cache_path = os.path.abspath(
+ os.path.join(self.build_temp, 'ccompiler_opt_cache_clib.py'
+ ))
+ self.compiler_opt = new_ccompiler_opt(
+ compiler=self.compiler, cpu_baseline=self.cpu_baseline,
+ cpu_dispatch=self.cpu_dispatch, cache_path=opt_cache_path
+ )
+ if not self.compiler_opt.is_cached():
+ log.info("Detected changes on compiler optimizations, force rebuilding")
+ self.force = True
+
+ import atexit
+ def report():
+ log.info("\n########### CLIB COMPILER OPTIMIZATION ###########")
+ log.info(self.compiler_opt.report(full=True))
+
+ atexit.register(report)
+
if self.have_f_sources():
from numpy.distutils.fcompiler import new_fcompiler
self._f_compiler = new_fcompiler(compiler=self.fcompiler,
@@ -211,6 +245,8 @@ class build_clib(old_build_clib):
'extra_f90_compile_args') or []
macros = build_info.get('macros')
+ if macros is None:
+ macros = []
include_dirs = build_info.get('include_dirs')
if include_dirs is None:
include_dirs = []
@@ -223,6 +259,31 @@ class build_clib(old_build_clib):
if requiref90:
self.mkpath(module_build_dir)
+ dispatch_objects = []
+ if not self.disable_optimization:
+ dispatch_sources = [
+ c_sources.pop(c_sources.index(src))
+ for src in c_sources[:] if src.endswith(".dispatch.c")
+ ]
+ if dispatch_sources:
+ if not self.inplace:
+ build_src = self.get_finalized_command("build_src").build_src
+ else:
+ build_src = None
+ dispatch_objects = self.compiler_opt.try_dispatch(
+ dispatch_sources,
+ output_dir=self.build_temp,
+ src_dir=build_src,
+ macros=macros,
+ include_dirs=include_dirs,
+ debug=self.debug,
+ extra_postargs=extra_postargs
+ )
+ extra_args_baseopt = extra_postargs + self.compiler_opt.cpu_baseline_flags()
+ else:
+ extra_args_baseopt = extra_postargs
+ macros.append(("NPY_DISABLE_OPTIMIZATION", 1))
+
if compiler.compiler_type == 'msvc':
# this hack works around the msvc compiler attributes
# problem, msvc uses its own convention :(
@@ -237,7 +298,8 @@ class build_clib(old_build_clib):
macros=macros,
include_dirs=include_dirs,
debug=self.debug,
- extra_postargs=extra_postargs)
+ extra_postargs=extra_args_baseopt)
+ objects.extend(dispatch_objects)
if cxx_sources:
log.info("compiling C++ sources")
diff --git a/numpy/distutils/command/build_ext.py b/numpy/distutils/command/build_ext.py
index d53285c92..b6557fcf6 100644
--- a/numpy/distutils/command/build_ext.py
+++ b/numpy/distutils/command/build_ext.py
@@ -19,7 +19,7 @@ from numpy.distutils.misc_util import (
has_cxx_sources, has_f_sources, is_sequence
)
from numpy.distutils.command.config_compiler import show_fortran_compilers
-
+from numpy.distutils.ccompiler_opt import new_ccompiler_opt
class build_ext (old_build_ext):
@@ -33,6 +33,12 @@ class build_ext (old_build_ext):
"number of parallel jobs"),
('warn-error', None,
"turn all warnings into errors (-Werror)"),
+ ('cpu-baseline=', None,
+ "specify a list of enabled baseline CPU optimizations"),
+ ('cpu-dispatch=', None,
+ "specify a list of dispatched CPU optimizations"),
+ ('disable-optimization', None,
+ "disable CPU optimized code(dispatch,simd,fast...)"),
]
help_options = old_build_ext.help_options + [
@@ -40,13 +46,16 @@ class build_ext (old_build_ext):
show_fortran_compilers),
]
- boolean_options = old_build_ext.boolean_options + ['warn-error']
+ boolean_options = old_build_ext.boolean_options + ['warn-error', 'disable-optimization']
def initialize_options(self):
old_build_ext.initialize_options(self)
self.fcompiler = None
self.parallel = None
self.warn_error = None
+ self.cpu_baseline = None
+ self.cpu_dispatch = None
+ self.disable_optimization = None
def finalize_options(self):
if self.parallel:
@@ -75,6 +84,9 @@ class build_ext (old_build_ext):
self.set_undefined_options('build',
('parallel', 'parallel'),
('warn_error', 'warn_error'),
+ ('cpu_baseline', 'cpu_baseline'),
+ ('cpu_dispatch', 'cpu_dispatch'),
+ ('disable_optimization', 'disable_optimization'),
)
def run(self):
@@ -129,6 +141,22 @@ class build_ext (old_build_ext):
self.compiler.show_customization()
+ if not self.disable_optimization:
+ opt_cache_path = os.path.abspath(os.path.join(self.build_temp, 'ccompiler_opt_cache_ext.py'))
+ self.compiler_opt = new_ccompiler_opt(compiler=self.compiler,
+ cpu_baseline=self.cpu_baseline,
+ cpu_dispatch=self.cpu_dispatch,
+ cache_path=opt_cache_path)
+ if not self.compiler_opt.is_cached():
+ log.info("Detected changes on compiler optimizations, force rebuilding")
+ self.force = True
+
+ import atexit
+ def report():
+ log.info("\n########### EXT COMPILER OPTIMIZATION ###########")
+ log.info(self.compiler_opt.report(full=True))
+ atexit.register(report)
+
# Setup directory for storing generated extra DLL files on Windows
self.extra_dll_dir = os.path.join(self.build_temp, '.libs')
if not os.path.isdir(self.extra_dll_dir):
@@ -378,6 +406,32 @@ class build_ext (old_build_ext):
include_dirs = ext.include_dirs + get_numpy_include_dirs()
+ dispatch_objects = []
+ if not self.disable_optimization:
+ dispatch_sources = [
+ c_sources.pop(c_sources.index(src))
+ for src in c_sources[:] if src.endswith(".dispatch.c")
+ ]
+ if dispatch_sources:
+ if not self.inplace:
+ build_src = self.get_finalized_command("build_src").build_src
+ else:
+ build_src = None
+ dispatch_objects = self.compiler_opt.try_dispatch(
+ dispatch_sources,
+ output_dir=output_dir,
+ src_dir=build_src,
+ macros=macros,
+ include_dirs=include_dirs,
+ debug=self.debug,
+ extra_postargs=extra_args,
+ **kws
+ )
+ extra_args_baseopt = extra_args + self.compiler_opt.cpu_baseline_flags()
+ else:
+ extra_args_baseopt = extra_args
+ macros.append(("NPY_DISABLE_OPTIMIZATION", 1))
+
c_objects = []
if c_sources:
log.info("compiling C sources")
@@ -386,8 +440,9 @@ class build_ext (old_build_ext):
macros=macros,
include_dirs=include_dirs,
debug=self.debug,
- extra_postargs=extra_args,
+ extra_postargs=extra_args_baseopt,
**kws)
+ c_objects.extend(dispatch_objects)
if cxx_sources:
log.info("compiling C++ sources")
diff --git a/numpy/distutils/setup.py b/numpy/distutils/setup.py
index 88cd1a160..798c3686f 100644
--- a/numpy/distutils/setup.py
+++ b/numpy/distutils/setup.py
@@ -7,6 +7,7 @@ def configuration(parent_package='',top_path=None):
config.add_subpackage('tests')
config.add_data_files('site.cfg')
config.add_data_files('mingw/gfortran_vs2003_hack.c')
+ config.add_data_dir('checks')
config.make_config_py()
return config
diff --git a/numpy/distutils/tests/test_ccompiler_opt.py b/numpy/distutils/tests/test_ccompiler_opt.py
new file mode 100644
index 000000000..a789be1ea
--- /dev/null
+++ b/numpy/distutils/tests/test_ccompiler_opt.py
@@ -0,0 +1,787 @@
+import re, textwrap, os
+from os import sys, path
+from distutils.errors import DistutilsError
+
+is_standalone = __name__ == '__main__' and __package__ is None
+if is_standalone:
+ import unittest, contextlib, tempfile, shutil
+ sys.path.append(path.abspath(path.join(path.dirname(__file__), "..")))
+ from ccompiler_opt import CCompilerOpt
+
+ # from numpy/testing/_private/utils.py
+ @contextlib.contextmanager
+ def tempdir(*args, **kwargs):
+ tmpdir = tempfile.mkdtemp(*args, **kwargs)
+ try:
+ yield tmpdir
+ finally:
+ shutil.rmtree(tmpdir)
+
+ def assert_(expr, msg=''):
+ if not expr:
+ raise AssertionError(msg)
+else:
+ from numpy.distutils.ccompiler_opt import CCompilerOpt
+ from numpy.testing import assert_, tempdir
+
+# architectures and compilers to test
+arch_compilers = dict(
+ x86 = ("gcc", "clang", "icc", "iccw", "msvc"),
+ x64 = ("gcc", "clang", "icc", "iccw", "msvc"),
+ ppc64 = ("gcc", "clang"),
+ ppc64le = ("gcc", "clang"),
+ armhf = ("gcc", "clang"),
+ aarch64 = ("gcc", "clang"),
+ noarch = ("gcc",)
+)
+
+class FakeCCompilerOpt(CCompilerOpt):
+ fake_info = ""
+ def __init__(self, trap_files="", trap_flags="", *args, **kwargs):
+ self.fake_trap_files = trap_files
+ self.fake_trap_flags = trap_flags
+ CCompilerOpt.__init__(self, None, **kwargs)
+
+ def __repr__(self):
+ return textwrap.dedent("""\
+ <<<<
+ march : {}
+ compiler : {}
+ ----------------
+ {}
+ >>>>
+ """).format(self.cc_march, self.cc_name, self.report())
+
+ def dist_compile(self, sources, flags, **kwargs):
+ assert(isinstance(sources, list))
+ assert(isinstance(flags, list))
+ if self.fake_trap_files:
+ for src in sources:
+ if re.match(self.fake_trap_files, src):
+ self.dist_error("source is trapped by a fake interface")
+ if self.fake_trap_flags:
+ for f in flags:
+ if re.match(self.fake_trap_flags, f):
+ self.dist_error("flag is trapped by a fake interface")
+ # fake objects
+ return zip(sources, [' '.join(flags)] * len(sources))
+
+ def dist_info(self):
+ return FakeCCompilerOpt.fake_info
+
+ @staticmethod
+ def dist_log(*args, stderr=False):
+ pass
+
+class _Test_CCompilerOpt(object):
+ arch = None # x86_64
+ cc = None # gcc
+
+ def setup(self):
+ FakeCCompilerOpt.conf_nocache = True
+ self._opt = None
+
+ def nopt(self, *args, **kwargs):
+ FakeCCompilerOpt.fake_info = self.arch + '_' + self.cc
+ return FakeCCompilerOpt(*args, **kwargs)
+
+ def opt(self):
+ if not self._opt:
+ self._opt = self.nopt()
+ return self._opt
+
+ def march(self):
+ return self.opt().cc_march
+
+ def cc_name(self):
+ return self.opt().cc_name
+
+ def get_targets(self, targets, groups, **kwargs):
+ FakeCCompilerOpt.conf_target_groups = groups
+ opt = self.nopt(
+ cpu_baseline=kwargs.get("baseline", "min"),
+ cpu_dispatch=kwargs.get("dispatch", "max"),
+ trap_files=kwargs.get("trap_files", ""),
+ trap_flags=kwargs.get("trap_flags", "")
+ )
+ with tempdir() as tmpdir:
+ file = os.path.join(tmpdir, "test_targets.c")
+ with open(file, 'w') as f:
+ f.write(targets)
+ gtargets = []
+ gflags = {}
+ fake_objects = opt.try_dispatch([file])
+ for source, flags in fake_objects:
+ gtar = source.split('.')[1:-1]
+ glen = len(gtar)
+ if glen == 0:
+ gtar = "baseline"
+ elif glen == 1:
+ gtar = gtar[0].upper()
+ else:
+                    # convert the multi-target tuple into the parenthesized string format,
+                    # to match the syntax of the configuration statements.
+ gtar = ('('+' '.join(gtar)+')').upper()
+ gtargets.append(gtar)
+ gflags[gtar] = flags
+
+ has_baseline, targets = opt.sources_status[file]
+ targets = targets + ["baseline"] if has_baseline else targets
+            # convert tuples that represent multi-targets into the parenthesized string format
+ targets = [
+ '('+' '.join(tar)+')' if isinstance(tar, tuple) else tar
+ for tar in targets
+ ]
+ if len(targets) != len(gtargets) or not all(t in gtargets for t in targets):
+ raise AssertionError(
+ "'sources_status' returns different targets than the compiled targets\n"
+ "%s != %s" % (targets, gtargets)
+ )
+        # return targets from 'sources_status' since the order matters
+ return targets, gflags
+
+ def arg_regex(self, **kwargs):
+ map2origin = dict(
+ x64 = "x86",
+ ppc64le = "ppc64",
+ aarch64 = "armhf",
+ clang = "gcc",
+ )
+ march = self.march(); cc_name = self.cc_name()
+ map_march = map2origin.get(march, march)
+ map_cc = map2origin.get(cc_name, cc_name)
+ for key in (
+ march, cc_name, map_march, map_cc,
+ march + '_' + cc_name,
+ map_march + '_' + cc_name,
+ march + '_' + map_cc,
+ map_march + '_' + map_cc,
+ ) :
+ regex = kwargs.pop(key, None)
+ if regex is not None:
+ break
+ if regex:
+ if isinstance(regex, dict):
+ for k, v in regex.items():
+ if v[-1:] not in ')}$?\\.+*':
+ regex[k] = v + '$'
+ else:
+ assert(isinstance(regex, str))
+ if regex[-1:] not in ')}$?\\.+*':
+ regex += '$'
+ return regex
+
+ def expect(self, dispatch, baseline="", **kwargs):
+ match = self.arg_regex(**kwargs)
+ if match is None:
+ return
+ opt = self.nopt(
+ cpu_baseline=baseline, cpu_dispatch=dispatch,
+ trap_files=kwargs.get("trap_files", ""),
+ trap_flags=kwargs.get("trap_flags", "")
+ )
+ features = ' '.join(opt.cpu_dispatch_names())
+ if not match:
+ if len(features) != 0:
+ raise AssertionError(
+ 'expected empty features, not "%s"' % features
+ )
+ return
+ if not re.match(match, features, re.IGNORECASE):
+ raise AssertionError(
+                'dispatch features "%s" do not match "%s"' % (features, match)
+ )
+
+ def expect_baseline(self, baseline, dispatch="", **kwargs):
+ match = self.arg_regex(**kwargs)
+ if match is None:
+ return
+ opt = self.nopt(
+ cpu_baseline=baseline, cpu_dispatch=dispatch,
+ trap_files=kwargs.get("trap_files", ""),
+ trap_flags=kwargs.get("trap_flags", "")
+ )
+ features = ' '.join(opt.cpu_baseline_names())
+ if not match:
+ if len(features) != 0:
+ raise AssertionError(
+ 'expected empty features, not "%s"' % features
+ )
+ return
+ if not re.match(match, features, re.IGNORECASE):
+ raise AssertionError(
+                'baseline features "%s" do not match "%s"' % (features, match)
+ )
+
+ def expect_flags(self, baseline, dispatch="", **kwargs):
+ match = self.arg_regex(**kwargs)
+ if match is None:
+ return
+ opt = self.nopt(
+ cpu_baseline=baseline, cpu_dispatch=dispatch,
+ trap_files=kwargs.get("trap_files", ""),
+ trap_flags=kwargs.get("trap_flags", "")
+ )
+ flags = ' '.join(opt.cpu_baseline_flags())
+ if not match:
+ if len(flags) != 0:
+ raise AssertionError(
+                    'expected empty flags, not "%s"' % flags
+ )
+ return
+ if not re.match(match, flags):
+ raise AssertionError(
+ 'flags "%s" not match "%s"' % (flags, match)
+ )
+
+ def expect_targets(self, targets, groups={}, **kwargs):
+ match = self.arg_regex(**kwargs)
+ if match is None:
+ return
+ targets, _ = self.get_targets(targets=targets, groups=groups, **kwargs)
+ targets = ' '.join(targets)
+ if not match:
+ if len(targets) != 0:
+ raise AssertionError(
+ 'expected empty targets, not "%s"' % targets
+ )
+ return
+ if not re.match(match, targets, re.IGNORECASE):
+ raise AssertionError(
+ 'targets "%s" not match "%s"' % (targets, match)
+ )
+
+ def expect_target_flags(self, targets, groups={}, **kwargs):
+ match_dict = self.arg_regex(**kwargs)
+ if match_dict is None:
+ return
+ assert(isinstance(match_dict, dict))
+ _, tar_flags = self.get_targets(targets=targets, groups=groups)
+
+ for match_tar, match_flags in match_dict.items():
+ if match_tar not in tar_flags:
+ raise AssertionError(
+ 'expected to find target "%s"' % match_tar
+ )
+ flags = tar_flags[match_tar]
+ if not match_flags:
+ if len(flags) != 0:
+ raise AssertionError(
+ 'expected to find empty flags in target "%s"' % match_tar
+ )
+ if not re.match(match_flags, flags):
+ raise AssertionError(
+ '"%s" flags "%s" not match "%s"' % (match_tar, flags, match_flags)
+ )
+
+ def test_interface(self):
+ wrong_arch = "ppc64" if self.arch != "ppc64" else "x86"
+ wrong_cc = "clang" if self.cc != "clang" else "icc"
+ opt = self.opt()
+ assert_(getattr(opt, "cc_on_" + self.arch))
+ assert_(not getattr(opt, "cc_on_" + wrong_arch))
+ assert_(getattr(opt, "cc_is_" + self.cc))
+ assert_(not getattr(opt, "cc_is_" + wrong_cc))
+
+ def test_args_empty(self):
+ for baseline, dispatch in (
+ ("", "none"),
+ (None, ""),
+ ("none +none", "none - none"),
+ ("none -max", "min - max"),
+ ("+vsx2 -VSX2", "vsx avx2 avx512f -max"),
+ ("max -vsx - avx + avx512f neon -MAX ",
+ "min -min + max -max -vsx + avx2 -avx2 +NONE")
+ ) :
+ opt = self.nopt(cpu_baseline=baseline, cpu_dispatch=dispatch)
+ assert(len(opt.cpu_baseline_names()) == 0)
+ assert(len(opt.cpu_dispatch_names()) == 0)
+
+ def test_args_validation(self):
+ if self.march() == "unknown":
+ return
+        # check the sanity of argument validation
+ for baseline, dispatch in (
+ ("unkown_feature - max +min", "unknown max min"), # unknowing features
+ ("#avx2", "$vsx") # groups and polices aren't acceptable
+ ) :
+ try:
+ self.nopt(cpu_baseline=baseline, cpu_dispatch=dispatch)
+ raise AssertionError("excepted an exception for invalid arguments")
+ except DistutilsError:
+ pass
+
+ def test_skip(self):
+        # only take what the platform supports and skip the others
+        # without raising exceptions
+ self.expect(
+ "sse vsx neon",
+ x86="sse", ppc64="vsx", armhf="neon", unknown=""
+ )
+ self.expect(
+ "sse41 avx avx2 vsx2 vsx3 neon_vfpv4 asimd",
+ x86 = "sse41 avx avx2",
+ ppc64 = "vsx2 vsx3",
+ armhf = "neon_vfpv4 asimd",
+ unknown = ""
+ )
+        # any feature in cpu_dispatch must be ignored if it's part of the baseline
+ self.expect(
+ "sse neon vsx", baseline="sse neon vsx",
+ x86="", ppc64="", armhf=""
+ )
+ self.expect(
+ "avx2 vsx3 asimdhp", baseline="avx2 vsx3 asimdhp",
+ x86="", ppc64="", armhf=""
+ )
+
+ def test_implies(self):
+        # the baseline combines implied features, so we count
+        # on it instead of testing 'feature_implies()' directly
+ self.expect_baseline(
+ "fma3 avx2 asimd vsx3",
+ # .* between two spaces can validate features in between
+ x86 = "sse .* sse41 .* fma3.*avx2",
+ ppc64 = "vsx vsx2 vsx3",
+ armhf = "neon neon_fp16 neon_vfpv4 asimd"
+ )
+ """
+ special cases
+ """
+ # in icc and msvc, FMA3 and AVX2 can't be separated
+        # both need to imply each other, same for avx512f & avx512cd
+ for f0, f1 in (
+ ("fma3", "avx2"),
+ ("avx512f", "avx512cd"),
+ ):
+ diff = ".* sse42 .* %s .*%s$" % (f0, f1)
+ self.expect_baseline(f0,
+ x86_gcc=".* sse42 .* %s$" % f0,
+ x86_icc=diff, x86_iccw=diff
+ )
+ self.expect_baseline(f1,
+ x86_gcc=".* avx .* %s$" % f1,
+ x86_icc=diff, x86_iccw=diff
+ )
+        # in msvc, the following features can't be separated either
+ for f in (("fma3", "avx2"), ("avx512f", "avx512cd", "avx512_skx")):
+ for ff in f:
+ self.expect_baseline(ff,
+ x86_msvc=".*%s" % ' '.join(f)
+ )
+
+ # in ppc64le VSX and VSX2 can't be separated
+ self.expect_baseline("vsx", ppc64le="vsx vsx2")
+        # in aarch64, the following features can't be separated
+ for f in ("neon", "neon_fp16", "neon_vfpv4", "asimd"):
+ self.expect_baseline(f, aarch64="neon neon_fp16 neon_vfpv4 asimd")
+
+ def test_args_options(self):
+ # max & native
+ for o in ("max", "native"):
+ if o == "native" and self.cc_name() == "msvc":
+ continue
+ self.expect(o,
+ trap_files=".*cpu_(sse|vsx|neon).c",
+ x86="", ppc64="", armhf=""
+ )
+ self.expect(o,
+ trap_files=".*cpu_(sse3|vsx2|neon_vfpv4).c",
+ x86="sse sse2", ppc64="vsx", armhf="neon neon_fp16",
+ aarch64="", ppc64le=""
+ )
+ self.expect(o,
+ trap_files=".*cpu_(popcnt|vsx3).c",
+ x86="sse .* sse41", ppc64="vsx vsx2",
+ armhf="neon neon_fp16 .* asimd .*"
+ )
+ self.expect(o,
+ x86_gcc=".* xop fma4 .* avx512f .* avx512_knl avx512_knm avx512_skx .*",
+            # in icc, xop and fma4 aren't supported
+ x86_icc=".* avx512f .* avx512_knl avx512_knm avx512_skx .*",
+ x86_iccw=".* avx512f .* avx512_knl avx512_knm avx512_skx .*",
+            # in msvc, avx512_knl and avx512_knm aren't supported
+ x86_msvc=".* xop fma4 .* avx512f .* avx512_skx .*",
+ armhf=".* asimd asimdhp asimddp .*",
+ ppc64="vsx vsx2 vsx3.*"
+ )
+ # min
+ self.expect("min",
+ x86="sse sse2", x64="sse sse2 sse3",
+ armhf="", aarch64="neon neon_fp16 .* asimd",
+ ppc64="", ppc64le="vsx vsx2"
+ )
+ self.expect(
+ "min", trap_files=".*cpu_(sse2|vsx2).c",
+ x86="", ppc64le=""
+ )
+        # an exception must be triggered if the native flag isn't supported
+        # when the "native" option is activated through the args
+ try:
+ self.expect("native",
+ trap_flags=".*(-march=native|-xHost|/QxHost).*",
+ x86=".*", ppc64=".*", armhf=".*"
+ )
+ if self.march() != "unknown":
+ raise AssertionError(
+ "excepted an exception for %s" % self.march()
+ )
+ except DistutilsError:
+ if self.march() == "unknown":
+ raise AssertionError("excepted no exceptions")
+
+ def test_flags(self):
+ self.expect_flags(
+ "sse sse2 vsx vsx2 neon neon_fp16",
+ x86_gcc="-msse -msse2", x86_icc="-msse -msse2",
+ x86_iccw="/arch:SSE2", x86_msvc="/arch:SSE2",
+ ppc64_gcc= "-mcpu=power8",
+ ppc64_clang="-maltivec -mvsx -mpower8-vector",
+ armhf_gcc="-mfpu=neon-fp16 -mfp16-format=ieee",
+ aarch64=""
+ )
+        # test normalizing -march
+ self.expect_flags(
+ "asimd",
+ aarch64="",
+ armhf_gcc=r"-mfp16-format=ieee -mfpu=neon-fp-armv8 -march=armv8-a\+simd"
+ )
+ self.expect_flags(
+ "asimdhp",
+ aarch64_gcc=r"-march=armv8.2-a\+fp16",
+ armhf_gcc=r"-mfp16-format=ieee -mfpu=neon-fp-armv8 -march=armv8.2-a\+fp16"
+ )
+ self.expect_flags(
+ "asimddp", aarch64_gcc=r"-march=armv8.2-a\+dotprod"
+ )
+ self.expect_flags(
+ # asimdfhm implies asimdhp
+ "asimdfhm", aarch64_gcc=r"-march=armv8.2-a\+fp16\+fp16fml"
+ )
+ self.expect_flags(
+ "asimddp asimdhp asimdfhm",
+ aarch64_gcc=r"-march=armv8.2-a\+dotprod\+fp16\+fp16fml"
+ )
+
+ def test_targets_exceptions(self):
+ for targets in (
+ "bla bla", "/*@targets",
+ "/*@targets */",
+ "/*@targets unknown */",
+ "/*@targets $unknown_policy avx2 */",
+ "/*@targets #unknown_group avx2 */",
+ "/*@targets $ */",
+ "/*@targets # vsx */",
+ "/*@targets #$ vsx */",
+ "/*@targets vsx avx2 ) */",
+ "/*@targets vsx avx2 (avx2 */",
+ "/*@targets vsx avx2 () */",
+ "/*@targets vsx avx2 ($autovec) */", # no features
+ "/*@targets vsx avx2 (xxx) */",
+ "/*@targets vsx avx2 (baseline) */",
+ ) :
+ try:
+ self.expect_targets(
+ targets,
+ x86="", armhf="", ppc64=""
+ )
+ if self.march() != "unknown":
+ raise AssertionError(
+ "excepted an exception for %s" % self.march()
+ )
+ except DistutilsError:
+ if self.march() == "unknown":
+ raise AssertionError("excepted no exceptions")
+
+ def test_targets_syntax(self):
+ for targets in (
+ "/*@targets $keep_baseline sse vsx neon*/",
+ "/*@targets,$keep_baseline,sse,vsx,neon*/",
+ "/*@targets*$keep_baseline*sse*vsx*neon*/",
+ """
+ /*
+ ** @targets
+ ** $keep_baseline, sse vsx,neon
+ */
+ """,
+ """
+ /*
+ ************@targets*************
+ ** $keep_baseline, sse vsx, neon
+ *********************************
+ */
+ """,
+ """
+ /*
+ /////////////@targets/////////////////
+ //$keep_baseline//sse//vsx//neon
+ /////////////////////////////////////
+ */
+ """,
+ """
+ /*
+ @targets
+ $keep_baseline
+ SSE VSX NEON*/
+ """
+ ) :
+ self.expect_targets(targets,
+ x86="sse", ppc64="vsx", armhf="neon", unknown=""
+ )
+
+ def test_targets(self):
+ # test skipping baseline features
+ self.expect_targets(
+ """
+ /*@targets
+ sse sse2 sse41 avx avx2 avx512f
+ vsx vsx2 vsx3
+ neon neon_fp16 asimdhp asimddp
+ */
+ """,
+ baseline="avx vsx2 asimd",
+ x86="avx512f avx2", armhf="asimddp asimdhp", ppc64="vsx3"
+ )
+ # test skipping non-dispatch features
+ self.expect_targets(
+ """
+ /*@targets
+ sse41 avx avx2 avx512f
+ vsx2 vsx3
+ asimd asimdhp asimddp
+ */
+ """,
+ baseline="", dispatch="sse41 avx2 vsx2 asimd asimddp",
+ x86="avx2 sse41", armhf="asimddp asimd", ppc64="vsx2"
+ )
+        # test skipping features that aren't supported
+ self.expect_targets(
+ """
+ /*@targets
+ sse2 sse41 avx2 avx512f
+ vsx2 vsx3
+ neon asimdhp asimddp
+ */
+ """,
+ baseline="",
+ trap_files=".*(avx2|avx512f|vsx3|asimddp).c",
+ x86="sse41 sse2", ppc64="vsx2", armhf="asimdhp neon"
+ )
+        # test skipping features that imply each other
+ self.expect_targets(
+ """
+ /*@targets
+ sse sse2 avx fma3 avx2 avx512f avx512cd
+ vsx vsx2 vsx3
+ neon neon_vfpv4 neon_fp16 neon_fp16 asimd asimdhp
+ asimddp asimdfhm
+ */
+ """,
+ baseline="",
+ x86_gcc="avx512cd avx512f avx2 fma3 avx sse2",
+ x86_msvc="avx512cd avx2 avx sse2",
+ x86_icc="avx512cd avx2 avx sse2",
+ x86_iccw="avx512cd avx2 avx sse2",
+ ppc64="vsx3 vsx2 vsx",
+ ppc64le="vsx3 vsx2",
+ armhf="asimdfhm asimddp asimdhp asimd neon_vfpv4 neon_fp16 neon",
+ aarch64="asimdfhm asimddp asimdhp asimd"
+ )
+
+ def test_targets_policies(self):
+ # 'keep_baseline', generate objects for baseline features
+ self.expect_targets(
+ """
+ /*@targets
+ $keep_baseline
+ sse2 sse42 avx2 avx512f
+ vsx2 vsx3
+ neon neon_vfpv4 asimd asimddp
+ */
+ """,
+ baseline="sse41 avx2 vsx2 asimd vsx3",
+ x86="avx512f avx2 sse42 sse2",
+ ppc64="vsx3 vsx2",
+ armhf="asimddp asimd neon_vfpv4 neon",
+            # neon, neon_vfpv4, asimd imply each other
+ aarch64="asimddp asimd"
+ )
+ # 'keep_sort', leave the sort as-is
+ self.expect_targets(
+ """
+ /*@targets
+ $keep_baseline $keep_sort
+ avx512f sse42 avx2 sse2
+ vsx2 vsx3
+ asimd neon neon_vfpv4 asimddp
+ */
+ """,
+ x86="avx512f sse42 avx2 sse2",
+ ppc64="vsx2 vsx3",
+ armhf="asimd neon neon_vfpv4 asimddp",
+            # neon, neon_vfpv4, asimd imply each other
+ aarch64="asimd asimddp"
+ )
+ # 'autovec', skipping features that can't be
+ # vectorized by the compiler
+ self.expect_targets(
+ """
+ /*@targets
+ $keep_baseline $keep_sort $autovec
+ avx512f avx2 sse42 sse41 sse2
+ vsx3 vsx2
+ asimddp asimd neon_vfpv4 neon
+ */
+ """,
+ x86_gcc="avx512f avx2 sse42 sse41 sse2",
+ x86_icc="avx512f avx2 sse42 sse41 sse2",
+ x86_iccw="avx512f avx2 sse42 sse41 sse2",
+ x86_msvc="avx512f avx2 sse2",
+ ppc64="vsx3 vsx2",
+ armhf="asimddp asimd neon_vfpv4 neon",
+            # neon, neon_vfpv4, asimd imply each other
+ aarch64="asimddp asimd"
+ )
+ for policy in ("$maxopt", "$autovec"):
+ # 'maxopt' and autovec set the max acceptable optimization flags
+ self.expect_target_flags(
+ "/*@targets baseline %s */" % policy,
+ gcc={"baseline":".*-O3.*"}, icc={"baseline":".*-O3.*"},
+ iccw={"baseline":".*/O3.*"}, msvc={"baseline":".*/O2.*"},
+ unknown={"baseline":".*"}
+ )
+
+ # 'werror', force compilers to treat warnings as errors
+ self.expect_target_flags(
+ "/*@targets baseline $werror */",
+ gcc={"baseline":".*-Werror.*"}, icc={"baseline":".*-Werror.*"},
+ iccw={"baseline":".*/Werror.*"}, msvc={"baseline":".*/WX.*"},
+ unknown={"baseline":".*"}
+ )
+
+ def test_targets_groups(self):
+ self.expect_targets(
+ """
+ /*@targets $keep_baseline baseline #test_group */
+ """,
+ groups=dict(
+ test_group=("""
+ $keep_baseline
+ asimddp sse2 vsx2 avx2 vsx3
+ avx512f asimdhp
+ """)
+ ),
+ x86="avx512f avx2 sse2 baseline",
+ ppc64="vsx3 vsx2 baseline",
+ armhf="asimddp asimdhp baseline"
+ )
+        # test skipping duplicates and sorting
+ self.expect_targets(
+ """
+ /*@targets
+ * sse42 avx avx512f
+ * #test_group_1
+ * vsx2
+ * #test_group_2
+ * asimddp asimdfhm
+ */
+ """,
+ groups=dict(
+ test_group_1=("""
+ VSX2 vsx3 asimd avx2 SSE41
+ """),
+ test_group_2=("""
+ vsx2 vsx3 asImd aVx2 sse41
+ """)
+ ),
+ x86="avx512f avx2 avx sse42 sse41",
+ ppc64="vsx3 vsx2",
+ # vsx2 part of the default baseline of ppc64le, option ("min")
+ ppc64le="vsx3",
+ armhf="asimdfhm asimddp asimd",
+ # asimd part of the default baseline of aarch64, option ("min")
+ aarch64="asimdfhm asimddp"
+ )
+
+ def test_targets_multi(self):
+ self.expect_targets(
+ """
+ /*@targets
+ (avx512_clx avx512_cnl) (asimdhp asimddp)
+ */
+ """,
+ x86=r"\(avx512_clx avx512_cnl\)",
+ armhf=r"\(asimdhp asimddp\)",
+ )
+ # test skipping implied features and auto-sort
+ self.expect_targets(
+ """
+ /*@targets
+ f16c (sse41 avx sse42) (sse3 avx2 avx512f)
+ vsx2 (vsx vsx3 vsx2)
+ (neon neon_vfpv4 asimd asimdhp asimddp)
+ */
+ """,
+ x86="avx512f f16c avx",
+ ppc64="vsx3 vsx2",
+ ppc64le="vsx3", # vsx2 part of baseline
+ armhf=r"\(asimdhp asimddp\)",
+ )
+ # test skipping implied features and keep sort
+ self.expect_targets(
+ """
+ /*@targets $keep_sort
+ (sse41 avx sse42) (sse3 avx2 avx512f)
+ (vsx vsx3 vsx2)
+ (asimddp neon neon_vfpv4 asimd asimdhp)
+ */
+ """,
+ x86="avx avx512f",
+ ppc64="vsx3",
+ armhf=r"\(asimdhp asimddp\)",
+ )
+        # test compiler variety and avoid duplicates
+ self.expect_targets(
+ """
+ /*@targets $keep_sort
+ fma3 avx2 (fma3 avx2) (avx2 fma3) avx2 fma3
+ */
+ """,
+ x86_gcc=r"fma3 avx2 \(fma3 avx2\)",
+ x86_icc="avx2", x86_iccw="avx2",
+ x86_msvc="avx2"
+ )
+
+def new_test(arch, cc):
+ if is_standalone: return textwrap.dedent("""\
+ class TestCCompilerOpt_{class_name}(_Test_CCompilerOpt, unittest.TestCase):
+ arch = '{arch}'
+ cc = '{cc}'
+ def __init__(self, methodName="runTest"):
+ unittest.TestCase.__init__(self, methodName)
+ self.setup()
+ """).format(
+ class_name=arch + '_' + cc, arch=arch, cc=cc
+ )
+ return textwrap.dedent("""\
+ class TestCCompilerOpt_{class_name}(_Test_CCompilerOpt):
+ arch = '{arch}'
+ cc = '{cc}'
+ """).format(
+ class_name=arch + '_' + cc, arch=arch, cc=cc
+ )
+"""
+if 1 and is_standalone:
+ FakeCCompilerOpt.fake_info = "x86_icc"
+ cco = FakeCCompilerOpt(None, cpu_baseline="avx2")
+ print(' '.join(cco.cpu_baseline_names()))
+ print(cco.cpu_baseline_flags())
+ unittest.main()
+ sys.exit()
+"""
+for arch, compilers in arch_compilers.items():
+ for cc in compilers:
+ exec(new_test(arch, cc))
+
+if is_standalone:
+ unittest.main()
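
Since FakeCCompilerOpt never invokes a real compiler, it also works as a quick way to see how a baseline string expands into implied features and flags for a given architecture/compiler pair, which is what the disabled standalone block near the end of the file does with icc. A small sketch of that interactive use, assuming the test module is importable and using gcc on x86 and an "avx2" baseline purely as examples:

    from numpy.distutils.tests.test_ccompiler_opt import FakeCCompilerOpt

    FakeCCompilerOpt.conf_nocache = True
    FakeCCompilerOpt.fake_info = "x86_gcc"      # pretend to be gcc on x86
    opt = FakeCCompilerOpt(cpu_baseline="avx2")
    print(' '.join(opt.cpu_baseline_names()))   # avx2 plus everything it implies
    print(opt.cpu_baseline_flags())             # the flags a real build would append, e.g. -mavx2
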
diff --git a/numpy/distutils/tests/test_ccompiler_opt_conf.py b/numpy/distutils/tests/test_ccompiler_opt_conf.py
new file mode 100644
index 000000000..2f83a59e0
--- /dev/null
+++ b/numpy/distutils/tests/test_ccompiler_opt_conf.py
@@ -0,0 +1,169 @@
+import unittest
+from os import sys, path
+
+is_standalone = __name__ == '__main__' and __package__ is None
+if is_standalone:
+ sys.path.append(path.abspath(path.join(path.dirname(__file__), "..")))
+ from ccompiler_opt import CCompilerOpt
+else:
+ from numpy.distutils.ccompiler_opt import CCompilerOpt
+
+arch_compilers = dict(
+ x86 = ("gcc", "clang", "icc", "iccw", "msvc"),
+ x64 = ("gcc", "clang", "icc", "iccw", "msvc"),
+ ppc64 = ("gcc", "clang"),
+ ppc64le = ("gcc", "clang"),
+ armhf = ("gcc", "clang"),
+ aarch64 = ("gcc", "clang"),
+    noarch = ("gcc",)
+)
+
+class FakeCCompilerOpt(CCompilerOpt):
+ fake_info = ""
+ def __init__(self, *args, **kwargs):
+ CCompilerOpt.__init__(self, None, **kwargs)
+ def dist_compile(self, sources, flags, **kwargs):
+ return sources
+ def dist_info(self):
+ return FakeCCompilerOpt.fake_info
+ @staticmethod
+ def dist_log(*args, stderr=False):
+ pass
+
+class _TestConfFeatures(FakeCCompilerOpt):
+ """A hook to check the sanity of configured features
+- before it called by the abstract class '_Feature'
+ """
+
+ def conf_features_partial(self):
+ conf_all = self.conf_features
+ for feature_name, feature in conf_all.items():
+ self.test_feature(
+ "attribute conf_features",
+ conf_all, feature_name, feature
+ )
+
+ conf_partial = FakeCCompilerOpt.conf_features_partial(self)
+ for feature_name, feature in conf_partial.items():
+ self.test_feature(
+ "conf_features_partial()",
+ conf_partial, feature_name, feature
+ )
+ return conf_partial
+
+ def test_feature(self, log, search_in, feature_name, feature_dict):
+ error_msg = (
+ "during validate '{}' within feature '{}', "
+ "march '{}' and compiler '{}'\n>> "
+ ).format(log, feature_name, self.cc_march, self.cc_name)
+
+ if not feature_name.isupper():
+ raise AssertionError(error_msg + "feature name must be in uppercase")
+
+ for option, val in feature_dict.items():
+ self.test_option_types(error_msg, option, val)
+ self.test_duplicates(error_msg, option, val)
+
+ self.test_implies(error_msg, search_in, feature_name, feature_dict)
+ self.test_group(error_msg, search_in, feature_name, feature_dict)
+
+ def test_option_types(self, error_msg, option, val):
+ for tp, available in (
+ ((str, list), (
+ "implies", "headers", "flags", "group", "detect"
+ )),
+ ((str,), ("disable",)),
+ ((int,), ("interest",)),
+ ((bool,), ("implies_detect",)),
+ ((bool, type(None)), ("autovec",)),
+ ) :
+ found_it = option in available
+ if not found_it:
+ continue
+ if not isinstance(val, tp):
+ error_tp = [t.__name__ for t in (*tp,)]
+ error_tp = ' or '.join(error_tp)
+ raise AssertionError(error_msg + \
+ "expected '%s' type for option '%s' not '%s'" % (
+ error_tp, option, type(val).__name__
+ ))
+ break
+
+ if not found_it:
+ raise AssertionError(error_msg + \
+ "invalid option name '%s'" % option
+ )
+
+ def test_duplicates(self, error_msg, option, val):
+ if option not in (
+ "implies", "headers", "flags", "group", "detect"
+ ) : return
+
+ if isinstance(val, str):
+ val = val.split()
+
+ if len(val) != len(set(val)):
+ raise AssertionError(error_msg + \
+ "duplicated values in option '%s'" % option
+ )
+
+ def test_implies(self, error_msg, search_in, feature_name, feature_dict):
+ if feature_dict.get("disabled") is not None:
+ return
+ implies = feature_dict.get("implies", "")
+ if not implies:
+ return
+ if isinstance(implies, str):
+ implies = implies.split()
+
+ if feature_name in implies:
+ raise AssertionError(error_msg + \
+ "feature implies itself"
+ )
+
+ for impl in implies:
+ impl_dict = search_in.get(impl)
+ if impl_dict is not None:
+ if "disable" in impl_dict:
+ raise AssertionError(error_msg + \
+ "implies disabled feature '%s'" % impl
+ )
+ continue
+ raise AssertionError(error_msg + \
+ "implies non-exist feature '%s'" % impl
+ )
+
+ def test_group(self, error_msg, search_in, feature_name, feature_dict):
+ if feature_dict.get("disabled") is not None:
+ return
+ group = feature_dict.get("group", "")
+ if not group:
+ return
+ if isinstance(group, str):
+ group = group.split()
+
+ for f in group:
+ impl_dict = search_in.get(f)
+ if not impl_dict or "disable" in impl_dict:
+ continue
+            raise AssertionError(error_msg + \
+                "in option 'group', '%s' already exists as a feature name" % (
+                    f,
+                ))
+
+class TestConfFeatures(unittest.TestCase):
+ def __init__(self, methodName="runTest"):
+ unittest.TestCase.__init__(self, methodName)
+ self.setup()
+
+ def setup(self):
+ FakeCCompilerOpt.conf_nocache = True
+
+ def test_features(self):
+ for arch, compilers in arch_compilers.items():
+ for cc in compilers:
+ FakeCCompilerOpt.fake_info = arch + cc
+ _TestConfFeatures()
+
+if is_standalone:
+ unittest.main()
diff --git a/numpy/tests/test_public_api.py b/numpy/tests/test_public_api.py
index df0e04285..cc4c5d8c5 100644
--- a/numpy/tests/test_public_api.py
+++ b/numpy/tests/test_public_api.py
@@ -214,6 +214,7 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [
"core.umath",
"core.umath_tests",
"distutils.ccompiler",
+ 'distutils.ccompiler_opt',
"distutils.command",
"distutils.command.autodist",
"distutils.command.bdist_rpm",