diff options
author | Matti Picus <matti.picus@gmail.com> | 2020-06-17 21:35:26 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-06-17 21:35:26 +0300 |
commit | 8245b392a344a1ae0db6e569ab68b368ad8883c1 (patch) | |
tree | e1c5325e8d9114b69e59b3b3ba0cb538fdcbc1e0 /numpy | |
parent | 02883d85b5d3f68c12cb1df75f96e0fed741d4a4 (diff) | |
parent | e72653810f470415f4d78c8a9ea874370a526126 (diff) | |
download | numpy-8245b392a344a1ae0db6e569ab68b368ad8883c1.tar.gz |
Merge pull request #13516 from seiko2plus/core_improve_infa_build
ENH: enable multi-platform SIMD compiler optimizations
Diffstat (limited to 'numpy')
52 files changed, 4431 insertions, 22 deletions
diff --git a/numpy/_pytesttester.py b/numpy/_pytesttester.py index ca86aeb22..1c32367f3 100644 --- a/numpy/_pytesttester.py +++ b/numpy/_pytesttester.py @@ -35,12 +35,27 @@ __all__ = ['PytestTester'] def _show_numpy_info(): + from numpy.core._multiarray_umath import ( + __cpu_features__, __cpu_baseline__, __cpu_dispatch__ + ) import numpy as np print("NumPy version %s" % np.__version__) relaxed_strides = np.ones((10, 1), order="C").flags.f_contiguous print("NumPy relaxed strides checking option:", relaxed_strides) + if len(__cpu_baseline__) == 0 and len(__cpu_dispatch__) == 0: + enabled_features = "nothing enabled" + else: + enabled_features = ' '.join(__cpu_baseline__) + for feature in __cpu_dispatch__: + if __cpu_features__[feature]: + enabled_features += " %s*" % feature + else: + enabled_features += " %s?" % feature + print("NumPy CPU features:", enabled_features) + + class PytestTester: """ diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h index 1b61899fa..275bb336b 100644 --- a/numpy/core/include/numpy/ndarraytypes.h +++ b/numpy/core/include/numpy/ndarraytypes.h @@ -341,9 +341,6 @@ struct NpyAuxData_tag { #define NPY_ERR(str) fprintf(stderr, #str); fflush(stderr); #define NPY_ERR2(str) fprintf(stderr, str); fflush(stderr); -#define NPY_STRINGIFY(x) #x -#define NPY_TOSTRING(x) NPY_STRINGIFY(x) - /* * Macros to define how array, and dimension/strides data is * allocated. 
diff --git a/numpy/core/include/numpy/utils.h b/numpy/core/include/numpy/utils.h index 32218b8c7..e251a5201 100644 --- a/numpy/core/include/numpy/utils.h +++ b/numpy/core/include/numpy/utils.h @@ -2,20 +2,36 @@ #define __NUMPY_UTILS_HEADER__ #ifndef __COMP_NPY_UNUSED - #if defined(__GNUC__) - #define __COMP_NPY_UNUSED __attribute__ ((__unused__)) - # elif defined(__ICC) - #define __COMP_NPY_UNUSED __attribute__ ((__unused__)) - # elif defined(__clang__) - #define __COMP_NPY_UNUSED __attribute__ ((unused)) - #else - #define __COMP_NPY_UNUSED - #endif + #if defined(__GNUC__) + #define __COMP_NPY_UNUSED __attribute__ ((__unused__)) + #elif defined(__ICC) + #define __COMP_NPY_UNUSED __attribute__ ((__unused__)) + #elif defined(__clang__) + #define __COMP_NPY_UNUSED __attribute__ ((unused)) + #else + #define __COMP_NPY_UNUSED + #endif +#endif + +#if defined(__GNUC__) || defined(__ICC) || defined(__clang__) + #define NPY_DECL_ALIGNED(x) __attribute__ ((aligned (x))) +#elif defined(_MSC_VER) + #define NPY_DECL_ALIGNED(x) __declspec(align(x)) +#else + #define NPY_DECL_ALIGNED(x) #endif /* Use this to tag a variable as not used. 
It will remove unused variable * warning on support platforms (see __COM_NPY_UNUSED) and mangle the variable * to avoid accidental use */ #define NPY_UNUSED(x) (__NPY_UNUSED_TAGGED ## x) __COMP_NPY_UNUSED +#define NPY_EXPAND(x) x + +#define NPY_STRINGIFY(x) #x +#define NPY_TOSTRING(x) NPY_STRINGIFY(x) + +#define NPY_CAT__(a, b) a ## b +#define NPY_CAT_(a, b) NPY_CAT__(a, b) +#define NPY_CAT(a, b) NPY_CAT_(a, b) #endif diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 5351b30bf..549860179 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -738,6 +738,7 @@ def configuration(parent_package='',top_path=None): join('src', 'common', 'ufunc_override.h'), join('src', 'common', 'umathmodule.h'), join('src', 'common', 'numpyos.h'), + join('src', 'common', 'npy_cpu_dispatch.h'), ] common_src = [ @@ -939,8 +940,11 @@ def configuration(parent_package='',top_path=None): # umath_tests module # ####################################################################### - config.add_extension('_umath_tests', - sources=[join('src', 'umath', '_umath_tests.c.src')]) + config.add_extension('_umath_tests', sources=[ + join('src', 'umath', '_umath_tests.c.src'), + join('src', 'umath', '_umath_tests.dispatch.c'), + join('src', 'common', 'npy_cpu_features.c.src'), + ]) ####################################################################### # custom rational dtype module # diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h index aebe241a5..4493409bb 100644 --- a/numpy/core/src/common/npy_config.h +++ b/numpy/core/src/common/npy_config.h @@ -3,6 +3,7 @@ #include "config.h" #include "npy_cpu_features.h" +#include "npy_cpu_dispatch.h" #include "numpy/numpyconfig.h" #include "numpy/npy_cpu.h" #include "numpy/npy_os.h" diff --git a/numpy/core/src/common/npy_cpu_dispatch.h b/numpy/core/src/common/npy_cpu_dispatch.h new file mode 100644 index 000000000..846d1ebb9 --- /dev/null +++ b/numpy/core/src/common/npy_cpu_dispatch.h @@ -0,0 +1,260 @@ 
+#ifndef NPY_CPU_DISPATCH_H_ +#define NPY_CPU_DISPATCH_H_ +/** + * This file is part of the NumPy CPU dispatcher. Please have a look at doc/reference/simd-optimizations.html + * To get a better understanding of the mechanism behind it. + */ +#include "npy_cpu_features.h" // NPY_CPU_HAVE +#include "numpy/utils.h" // NPY_EXPAND, NPY_CAT +/** + * Bringing the main configration header '_cpu_dispatch.h'. + * + * This header is generated by the distutils module 'ccompiler_opt', + * and contains all the #definitions and headers of instruction-sets, + * that had been configured through command arguments '--cpu-baseline' and '--cpu-dispatch'. + * + * It also contains extra C #definitions and macros that are used for implementing + * NumPy module's attributes `__cpu_baseline__` and `__cpu_dispaٍtch__`. + */ +/** + * Note: Always gaurd the genreated headers within 'NPY_DISABLE_OPTIMIZATION', + * due the nature of command argument '--disable-optimization', + * which is explicitly disabling the module ccompiler_opt. + */ +#ifndef NPY_DISABLE_OPTIMIZATION + #if defined(__powerpc64__) && !defined(__cplusplus) && defined(bool) + /** + * "altivec.h" header contains the definitions(bool, vector, pixel), + * usually in c++ we undefine them after including the header. + * It's better anyway to take them off and use built-in types(__vector, __pixel, __bool) instead, + * since c99 supports bool variables which may lead to ambiguous errors. + */ + // backup 'bool' before including '_cpu_dispatch.h', since it may not defiend as a compiler token. 
+ #define NPY__DISPATCH_DEFBOOL + typedef bool npy__dispatch_bkbool; + #endif + #include "_cpu_dispatch.h" + #ifdef NPY_HAVE_VSX + #undef bool + #undef vector + #undef pixel + #ifdef NPY__DISPATCH_DEFBOOL + #define bool npy__dispatch_bkbool + #endif + #endif +#endif // !NPY_DISABLE_OPTIMIZATION +/** + * Macro NPY_CPU_DISPATCH_CURFX(NAME) + * + * Returns @NAME suffixed with "_" + "the current target" during compiling + * the wrapped sources that generated from the dispatch-able sources according + * to the provided configuration statements. + * + * It also returns @NAME as-is without any suffix when it comes to the baseline or + * in case if the optimization is disabled. + * + * The idea behind this Macro is to allow exporting certain symbols and to + * avoid linking duplications due to the nature of the dispatch-able sources. + * + * Example: + * @targets baseline avx avx512_skx vsx3 asimdhp // configration statments + * + * void NPY_CPU_DISPATCH_CURFX(dispatch_me)(const int *src, int *dst) + * { + * // the kernel + * } + * + * By assuming the required optimizations are enabled via '--cpu-dspatch' and + * the compiler supported them too, then the generated symbols will be named as follows: + * + * - x86: + * dispatch_me(const int*, int*) // baseline + * dispatch_me_AVX(const int*, int*) + * dispatch_me_AVX512_SKX(const int*, int*) + * + * - ppc64: + * dispatch_me(const int*, int*) + * dispatch_me_VSX3(const int*, int*) + * + * - ARM: + * dispatch_me(const int*, int*) + * dispatch_me_ASIMHP(const int*, int*) + * + * - unsupported arch or when optimization is disabled: + * dispatch_me(const int*, int*) + * + * For forward declarations, see 'NPY_CPU_DISPATCH_DECLARE'. 
+ */ +#ifdef NPY__CPU_TARGET_CURRENT + // 'NPY__CPU_TARGET_CURRENT': only defined by the dispatch-able sources + #define NPY_CPU_DISPATCH_CURFX(NAME) NPY_CAT(NPY_CAT(NAME, _), NPY__CPU_TARGET_CURRENT) +#else + #define NPY_CPU_DISPATCH_CURFX(NAME) NPY_EXPAND(NAME) +#endif +/** + * Defining the default behavior for the configurable macros of dispatch-able sources, + * 'NPY__CPU_DISPATCH_CALL(...)' and 'NPY__CPU_DISPATCH_BASELINE_CALL(...)' + * + * These macros are defined inside the generated config files that been derived from + * the configuration statements of the dispatch-able sources. + * + * The generated config file takes the same name of the dispatch-able source with replacing + * the extension to '.h' instead of '.c', and it should be treated as a header template. + * + * For more clarification, please have a look at doc/reference/simd-optimizations.html. + */ +#ifndef NPY_DISABLE_OPTIMIZATION + #define NPY__CPU_DISPATCH_BASELINE_CALL(CB, ...) \ + &&"Expected config header of the dispatch-able source"; + #define NPY__CPU_DISPATCH_CALL(CHK, CB, ...) \ + &&"Expected config header of the dispatch-able source"; +#else + /** + * We assume by default that all configuration statements contains 'baseline' option however, + * if the dispatch-able source doesn't require it, then the dispatch-able source and following macros + * need to be guard it with '#ifndef NPY_DISABLE_OPTIMIZATION' + */ + #define NPY__CPU_DISPATCH_BASELINE_CALL(CB, ...) \ + NPY_EXPAND(CB(__VA_ARGS__)) + #define NPY__CPU_DISPATCH_CALL(CHK, CB, ...) +#endif // !NPY_DISABLE_OPTIMIZATION +/** + * Macro NPY_CPU_DISPATCH_DECLARE(LEFT, ...) is used to provide forward + * declarations for the exported variables and functions that defined inside + * the dispatch-able sources. + * + * The first argument should ends with the exported function or variable name, + * while the Macro pasting the extra arguments. 
+ * + * Examples: + * #ifndef NPY_DISABLE_OPTIMIZATION + * #include "dispatchable_source_name.dispatch.h" + * #endif + * + * NPY_CPU_DISPATCH_DECLARE(void dispatch_me, (const int*, int*)) + * NPY_CPU_DISPATCH_DECLARE(extern cb_type callback_tab, [TAB_SIZE]) + * + * By assuming the provided config header drived from a dispatch-able source, + * that configured with "@targets baseline sse41 vsx3 asimdhp", + * they supported by the compiler and enabled via '--cpu-dspatch', + * then the prototype declrations at the above example will equlivent to the follows: + * + * - x86: + * void dispatch_me(const int*, int*); // baseline + * void dispatch_me_SSE41(const int*, int*); + * + * extern cb_type callback_tab[TAB_SIZE]; + * extern cb_type callback_tab_SSE41[TAB_SIZE]; + * + * - ppc64: + * void dispatch_me(const int*, int*); + * void dispatch_me_VSX3(const int*, int*); + * + * extern cb_type callback_tab[TAB_SIZE]; + * extern cb_type callback_tab_VSX3[TAB_SIZE]; + * + * - ARM: + * void dispatch_me(const int*, int*); + * void dispatch_me_ASIMDHP(const int*, int*); + * + * extern cb_type callback_tab[TAB_SIZE]; + * extern cb_type callback_tab_ASIMDHP[TAB_SIZE]; + * + * - unsupported arch or when optimization is disabled: + * void dispatch_me(const int*, int*); + * extern cb_type callback_tab[TAB_SIZE]; + * + * For runtime dispatching, see 'NPY_CPU_DISPATCH_CALL' + */ +#define NPY_CPU_DISPATCH_DECLARE(...) \ + NPY__CPU_DISPATCH_CALL(NPY_CPU_DISPATCH_DECLARE_CHK_, NPY_CPU_DISPATCH_DECLARE_CB_, __VA_ARGS__) \ + NPY__CPU_DISPATCH_BASELINE_CALL(NPY_CPU_DISPATCH_DECLARE_BASE_CB_, __VA_ARGS__) +// Preprocessor callbacks +#define NPY_CPU_DISPATCH_DECLARE_CB_(DUMMY, TARGET_NAME, LEFT, ...) \ + NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__; +#define NPY_CPU_DISPATCH_DECLARE_BASE_CB_(LEFT, ...) \ + LEFT __VA_ARGS__; +// Dummy CPU runtime checking +#define NPY_CPU_DISPATCH_DECLARE_CHK_(FEATURE) +/** + * Macro NPY_CPU_DISPATCH_DECLARE_XB(LEFT, ...) 
+ * + * Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declration even + * if it was provided within the configration statments. + */ +#define NPY_CPU_DISPATCH_DECLARE_XB(...) \ + NPY__CPU_DISPATCH_CALL(NPY_CPU_DISPATCH_DECLARE_CHK_, NPY_CPU_DISPATCH_DECLARE_CB_, __VA_ARGS__) +/** + * Macro NPY_CPU_DISPATCH_CALL(LEFT, ...) is used for runtime dispatching + * of the exported functions and variables within the dispatch-able sources + * according to the highested interesed CPU features that supported by the + * running machine depending on the required optimizations. + * + * The first argument should ends with the exported function or variable name, + * while the Macro pasting the extra arguments. + * + * Example: + * Assume we have a dispatch-able source exporting the following function: + * + * @targets baseline avx2 avx512_skx // configration statments + * + * void NPY_CPU_DISPATCH_CURFX(dispatch_me)(const int *src, int *dst) + * { + * // the kernel + * } + * + * In order to call or to assign the pointer of it from outside the dispatch-able source, + * you have to use this Macro as follows: + * + * // bring the genreated config header of the dispatch-abel source + * #ifndef NPY_DISABLE_OPTIMIZATION + * #include "dispatchable_source_name.dispatch.h" + * #endif + * // forward declaration + * NPY_CPU_DISPATCH_DECLARE(dispatch_me, (const int *src, int *dst)) + * + * typedef void(*func_type)(const int*, int*); + * func_type the_callee(const int *src, int *dst, func_type *cb) + * { + * // direct call + * NPY_CPU_DISPATCH_CALL(dispatch_me, (src, dst)) + * // assign the pointer + * NPY_CPU_DISPATCH_CALL(*cb = dispatch_me) + * // return the pointer + * NPY_CPU_DISPATCH_CALL(return dispatch_me) + * } + */ +#define NPY_CPU_DISPATCH_CALL(...) 
\ + if (0) {/*DUMMY*/} \ + NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_CB_, __VA_ARGS__) \ + NPY__CPU_DISPATCH_BASELINE_CALL(NPY_CPU_DISPATCH_CALL_BASE_CB_, __VA_ARGS__) +// Preprocessor callbacks +#define NPY_CPU_DISPATCH_CALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \ + else if (TESTED_FEATURES) { NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__; } +#define NPY_CPU_DISPATCH_CALL_BASE_CB_(LEFT, ...) \ + else { LEFT __VA_ARGS__; } +/** + * Macro NPY_CPU_DISPATCH_CALL_XB(LEFT, ...) + * + * Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declration even + * if it was provided within the configration statments. + */ +#define NPY_CPU_DISPATCH_CALL_XB(...) \ + if (0) {/*DUMMY*/} \ + NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_CB_, __VA_ARGS__) +/** + * Macro NPY_CPU_DISPATCH_CALL_ALL(LEFT, ...) + * + * Same as `NPY_CPU_DISPATCH_CALL` but dispatching all the required optimizations for + * the exported functions and variables instead of highest interested one. + */ +#define NPY_CPU_DISPATCH_CALL_ALL(...) \ + NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_ALL_CB_, __VA_ARGS__) \ + NPY__CPU_DISPATCH_BASELINE_CALL(NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_, __VA_ARGS__) +// Preprocessor callbacks +#define NPY_CPU_DISPATCH_CALL_ALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \ + if (TESTED_FEATURES) { NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__; } +#define NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_(LEFT, ...) \ + { LEFT __VA_ARGS__; } + +#endif // NPY_CPU_DISPATCH_H_ diff --git a/numpy/core/src/common/npy_cpu_features.c.src b/numpy/core/src/common/npy_cpu_features.c.src index d35199760..facd27f3c 100644 --- a/numpy/core/src/common/npy_cpu_features.c.src +++ b/numpy/core/src/common/npy_cpu_features.c.src @@ -1,6 +1,7 @@ #include "npy_cpu_features.h" +#include "npy_cpu_dispatch.h" // To guarantee the CPU baseline definitions are in scope. 
#include "numpy/npy_common.h" // for NPY_INLINE -#include "numpy/npy_cpu.h" // To guarantee of having CPU definitions in scope. +#include "numpy/npy_cpu.h" // To guarantee the CPU definitions are in scope. /******************** Private Definitions *********************/ @@ -55,6 +56,44 @@ npy_cpu_features_dict(void) return dict; } +#define NPY__CPU_PYLIST_APPEND_CB(FEATURE, LIST) \ + item = PyUnicode_FromString(NPY_TOSTRING(FEATURE)); \ + if (item == NULL) { \ + Py_DECREF(LIST); \ + return NULL; \ + } \ + PyList_SET_ITEM(LIST, index++, item); + +NPY_VISIBILITY_HIDDEN PyObject * +npy_cpu_baseline_list(void) +{ +#if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_BASELINE_N > 0 + PyObject *list = PyList_New(NPY_WITH_CPU_BASELINE_N), *item; + int index = 0; + if (list != NULL) { + NPY_WITH_CPU_BASELINE_CALL(NPY__CPU_PYLIST_APPEND_CB, list) + } + return list; +#else + return PyList_New(0); +#endif +} + +NPY_VISIBILITY_HIDDEN PyObject * +npy_cpu_dispatch_list(void) +{ +#if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_DISPATCH_N > 0 + PyObject *list = PyList_New(NPY_WITH_CPU_DISPATCH_N), *item; + int index = 0; + if (list != NULL) { + NPY_WITH_CPU_DISPATCH_CALL(NPY__CPU_PYLIST_APPEND_CB, list) + } + return list; +#else + return PyList_New(0); +#endif +} + /**************************************************************** * This section is reserved to defining @npy__cpu_init_features * for each CPU architecture, please try to keep it clean. 
Ty @@ -366,7 +405,7 @@ npy__cpu_init_features(void) return; #endif // We have nothing else todo -#if defined(NPY_HAVE_NEON_ARM8) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8) +#if defined(NPY_HAVE_ASIMD) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8) #if defined(NPY_HAVE_FPHP) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) npy__cpu_have[NPY_CPU_FEATURE_FPHP] = 1; #endif diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h index 0e8901328..fffdef38e 100644 --- a/numpy/core/src/common/npy_cpu_features.h +++ b/numpy/core/src/common/npy_cpu_features.h @@ -109,6 +109,48 @@ npy_cpu_have(NPY_CPU_FEATURE_##FEATURE_NAME) */ NPY_VISIBILITY_HIDDEN PyObject * npy_cpu_features_dict(void); +/* + * Return a new a Python list contains the minimal set of required optimizations + * that supported by the compiler and platform according to the specified + * values to command argument '--cpu-baseline'. + * + * This function is mainly used to implement umath's attrbute '__cpu_baseline__', + * and the items are sorted from the lowest to highest interest. + * + * For example, according to the default build configuration and by assuming the compiler + * support all the involved optimizations then the returned list should equivalent to: + * + * On x86: ['SSE', 'SSE2'] + * On x64: ['SSE', 'SSE2', 'SSE3'] + * On armhf: [] + * On aarch64: ['NEON', 'NEON_FP16', 'NEON_VPFV4', 'ASIMD'] + * On ppc64: [] + * On ppc64le: ['VSX', 'VSX2'] + * On any other arch or if the optimization is disabled: [] + */ +NPY_VISIBILITY_HIDDEN PyObject * +npy_cpu_baseline_list(void); +/* + * Return a new a Python list contains the dispatched set of additional optimizations + * that supported by the compiler and platform according to the specified + * values to command argument '--cpu-dispatch'. 
+ * + * This function is mainly used to implement umath's attrbute '__cpu_dispatch__', + * and the items are sorted from the lowest to highest interest. + * + * For example, according to the default build configuration and by assuming the compiler + * support all the involved optimizations then the returned list should equivalent to: + * + * On x86: ['SSE3', 'SSSE3', 'SSE41', 'POPCNT', 'SSE42', 'AVX', 'F16C', 'FMA3', 'AVX2', 'AVX512F', ...] + * On x64: ['SSSE3', 'SSE41', 'POPCNT', 'SSE42', 'AVX', 'F16C', 'FMA3', 'AVX2', 'AVX512F', ...] + * On armhf: ['NEON', 'NEON_FP16', 'NEON_VPFV4', 'ASIMD', 'ASIMDHP', 'ASIMDDP', 'ASIMDFHM'] + * On aarch64: ['ASIMDHP', 'ASIMDDP', 'ASIMDFHM'] + * On ppc64: ['VSX', 'VSX2', 'VSX3'] + * On ppc64le: ['VSX3'] + * On any other arch or if the optimization is disabled: [] + */ +NPY_VISIBILITY_HIDDEN PyObject * +npy_cpu_dispatch_list(void); #ifdef __cplusplus } diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 84c22ba65..4190c53bd 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -4542,6 +4542,26 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) { } Py_DECREF(s); + s = npy_cpu_baseline_list(); + if (s == NULL) { + goto err; + } + if (PyDict_SetItemString(d, "__cpu_baseline__", s) < 0) { + Py_DECREF(s); + goto err; + } + Py_DECREF(s); + + s = npy_cpu_dispatch_list(); + if (s == NULL) { + goto err; + } + if (PyDict_SetItemString(d, "__cpu_dispatch__", s) < 0) { + Py_DECREF(s); + goto err; + } + Py_DECREF(s); + s = NpyCapsule_FromVoidPtr((void *)_datetime_strings, NULL); if (s == NULL) { goto err; diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src index abc8d78c4..d08aabd64 100644 --- a/numpy/core/src/umath/_umath_tests.c.src +++ b/numpy/core/src/umath/_umath_tests.c.src @@ -576,6 +576,51 @@ fail: return NULL; } +// Testing the utilites of the CPU dispatcher +#ifndef 
NPY_DISABLE_OPTIMIZATION + #include "_umath_tests.dispatch.h" +#endif +NPY_CPU_DISPATCH_DECLARE(extern const char *_umath_tests_dispatch_var) +NPY_CPU_DISPATCH_DECLARE(const char *_umath_tests_dispatch_func, (void)) +NPY_CPU_DISPATCH_DECLARE(void _umath_tests_dispatch_attach, (PyObject *list)) + +static PyObject * +UMath_Tests_test_dispatch(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(dummy2)) +{ + const char *highest_func, *highest_var; + NPY_CPU_DISPATCH_CALL(highest_func = _umath_tests_dispatch_func, ()) + NPY_CPU_DISPATCH_CALL(highest_var = _umath_tests_dispatch_var) + const char *highest_func_xb = "nobase", *highest_var_xb = "nobase"; + NPY_CPU_DISPATCH_CALL_XB(highest_func_xb = _umath_tests_dispatch_func, ()) + NPY_CPU_DISPATCH_CALL_XB(highest_var_xb = _umath_tests_dispatch_var) + + PyObject *dict = PyDict_New(), *item; + if (dict == NULL) { + return NULL; + } + /**begin repeat + * #str = func, var, func_xb, var_xb# + */ + item = PyUnicode_FromString(highest_@str@); + if (item == NULL || PyDict_SetItemString(dict, "@str@", item) < 0) { + goto err; + } + /**end repeat**/ + item = PyList_New(0); + if (item == NULL || PyDict_SetItemString(dict, "all", item) < 0) { + goto err; + } + NPY_CPU_DISPATCH_CALL_ALL(_umath_tests_dispatch_attach, (item)) + if (PyErr_Occurred()) { + goto err; + } + return dict; +err: + Py_XDECREF(item); + Py_DECREF(dict); + return NULL; +} + static PyMethodDef UMath_TestsMethods[] = { {"test_signature", UMath_Tests_test_signature, METH_VARARGS, "Test signature parsing of ufunc. \n" @@ -583,6 +628,7 @@ static PyMethodDef UMath_TestsMethods[] = { "If fails, it returns NULL. Otherwise it returns a tuple of ufunc " "internals. 
\n", }, + {"test_dispatch", UMath_Tests_test_dispatch, METH_NOARGS, NULL}, {NULL, NULL, 0, NULL} /* Sentinel */ }; @@ -604,6 +650,11 @@ PyMODINIT_FUNC PyInit__umath_tests(void) { PyObject *d; PyObject *version; + // Initialize CPU features + if (npy_cpu_init() < 0) { + return NULL; + } + m = PyModule_Create(&moduledef); if (m == NULL) { return NULL; @@ -632,6 +683,5 @@ PyMODINIT_FUNC PyInit__umath_tests(void) { "cannot load _umath_tests module."); return NULL; } - return m; } diff --git a/numpy/core/src/umath/_umath_tests.dispatch.c b/numpy/core/src/umath/_umath_tests.dispatch.c new file mode 100644 index 000000000..d86a54411 --- /dev/null +++ b/numpy/core/src/umath/_umath_tests.dispatch.c @@ -0,0 +1,33 @@ +/** + * Testing the utilites of the CPU dispatcher + * + * @targets $werror baseline + * SSE2 SSE41 AVX2 + * VSX VSX2 VSX3 + * NEON ASIMD ASIMDHP + */ +#include <Python.h> +#include "npy_cpu_dispatch.h" + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "_umath_tests.dispatch.h" +#endif + +NPY_CPU_DISPATCH_DECLARE(const char *_umath_tests_dispatch_func, (void)) +NPY_CPU_DISPATCH_DECLARE(extern const char *_umath_tests_dispatch_var) +NPY_CPU_DISPATCH_DECLARE(void _umath_tests_dispatch_attach, (PyObject *list)) + +const char *NPY_CPU_DISPATCH_CURFX(_umath_tests_dispatch_var) = NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(var)); +const char *NPY_CPU_DISPATCH_CURFX(_umath_tests_dispatch_func)(void) +{ + static const char *current = NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(func)); + return current; +} + +void NPY_CPU_DISPATCH_CURFX(_umath_tests_dispatch_attach)(PyObject *list) +{ + PyObject *item = PyUnicode_FromString(NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(func))); + if (item) { + PyList_Append(list, item); + } +} diff --git a/numpy/core/tests/test_cpu_dispatcher.py b/numpy/core/tests/test_cpu_dispatcher.py new file mode 100644 index 000000000..8712dee1a --- /dev/null +++ b/numpy/core/tests/test_cpu_dispatcher.py @@ -0,0 +1,42 @@ +from numpy.core._multiarray_umath import 
__cpu_features__, __cpu_baseline__, __cpu_dispatch__ +from numpy.core import _umath_tests +from numpy.testing import assert_equal + +def test_dispatcher(): + """ + Testing the utilites of the CPU dispatcher + """ + targets = ( + "SSE2", "SSE41", "AVX2", + "VSX", "VSX2", "VSX3", + "NEON", "ASIMD", "ASIMDHP" + ) + highest_sfx = "" # no suffix for the baseline + all_sfx = [] + for feature in reversed(targets): + # skip baseline features, by the default `CCompilerOpt` do not generate separated objects + # for the baseline, just one object combined all of them via 'baseline' option + # within the configuration statments. + if feature in __cpu_baseline__: + continue + # check compiler and running machine support + if feature not in __cpu_dispatch__ or not __cpu_features__[feature]: + continue + + if not highest_sfx: + highest_sfx = "_" + feature + all_sfx.append("func" + "_" + feature) + + test = _umath_tests.test_dispatch() + assert_equal(test["func"], "func" + highest_sfx) + assert_equal(test["var"], "var" + highest_sfx) + + if highest_sfx: + assert_equal(test["func_xb"], "func" + highest_sfx) + assert_equal(test["var_xb"], "var" + highest_sfx) + else: + assert_equal(test["func_xb"], "nobase") + assert_equal(test["var_xb"], "nobase") + + all_sfx.append("func") # add the baseline + assert_equal(test["all"], all_sfx) diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py new file mode 100644 index 000000000..0488173ca --- /dev/null +++ b/numpy/distutils/ccompiler_opt.py @@ -0,0 +1,2438 @@ +"""Provides the `CCompilerOpt` class, used for handling the CPU/hardware +optimization, starting from parsing the command arguments, to managing the +relation between the CPU baseline and dispatch-able features, +also generating the required C headers and ending with compiling +the sources with proper compiler's flags. 
+ +`CCompilerOpt` doesn't provide runtime detection for the CPU features, +instead only focuses on the compiler side, but it creates abstract C headers +that can be used later for the final runtime dispatching process.""" + +import sys, io, os, re, textwrap, pprint, inspect, atexit, subprocess + +class _Config: + """An abstract class holds all configurable attributes of `CCompilerOpt`, + these class attributes can be used to change the default behavior + of `CCompilerOpt` in order to fit other requirements. + + Attributes + ---------- + conf_nocache : bool + Set True to disable memory and file cache. + Default is False. + + conf_noopt : bool + Set True to forces the optimization to be disabled, + in this case `CCompilerOpt` tends to generate all + expected headers in order to 'not' break the build. + Default is False. + + conf_cache_factors : list + Add extra factors to the primary caching factors. The caching factors + are utilized to determine if there are changes had happened that + requires to discard the cache and re-updating it. The primary factors + are the arguments of `CCompilerOpt` and `CCompiler`'s properties(type, flags, etc). + Default is list of two items, containing the time of last modification + of `ccompiler_opt` and value of attribute "conf_noopt" + + conf_tmp_path : str, + The path of temporary directory. Default is auto-created + temporary directory via ``tempfile.mkdtemp()``. + + conf_check_path : str + The path of testing files. Each added CPU feature must have a + **C** source file contains at least one intrinsic or instruction that + related to this feature, so it can be tested against the compiler. + Default is ``./distutils/checks``. + + conf_target_groups : dict + Extra tokens that can be reached from dispatch-able sources through + the special mark ``@targets``. Default is an empty dictionary. 
+ + **Notes**: + - case-insensitive for tokens and group names + - sign '#' must stick in the begin of group name and only within ``@targets`` + + **Example**: + .. code-block:: console + + $ "@targets #avx_group other_tokens" > group_inside.c + + >>> CCompilerOpt.conf_target_groups["avx_group"] = \\ + "$werror $maxopt avx2 avx512f avx512_skx" + >>> cco = CCompilerOpt(cc_instance) + >>> cco.try_dispatch(["group_inside.c"]) + + conf_c_prefix : str + The prefix of public C definitions. Default is ``"NPY_"``. + + conf_c_prefix_ : str + The prefix of internal C definitions. Default is ``"NPY__"``. + + conf_cc_flags : dict + Nested dictionaries defining several compiler flags + that linked to some major functions, the main key + represent the compiler name and sub-keys represent + flags names. Default is already covers all supported + **C** compilers. + + Sub-keys explained as follows: + + "native": str or None + used by argument option `native`, to detect the current + machine support via the compiler. + "werror": str or None + utilized to treat warning as errors during testing CPU features + against the compiler and also for target's policy `$werror` + via dispatch-able sources. + "maxopt": str or None + utilized for target's policy '$maxopt' and the value should + contains the maximum acceptable optimization by the compiler. + e.g. in gcc `'-O3'` + + **Notes**: + * case-sensitive for compiler names and flags + * use space to separate multiple flags + * any flag will tested against the compiler and it will skipped + if it's not applicable. + + conf_min_features : dict + A dictionary defines the used CPU features for + argument option `'min'`, the key represent the CPU architecture + name e.g. `'x86'`. Default values provide the best effort + on wide range of users platforms. + + **Note**: case-sensitive for architecture names. + + conf_features : dict + Nested dictionaries used for identifying the CPU features. 
+ the primary key is represented as a feature name or group name + that gathers several features. Default values covers all + supported features but without the major options like "flags", + these undefined options handle it by method `conf_features_partial()`. + Default value is covers almost all CPU features for *X86*, *IBM/Power64* + and *ARM 7/8*. + + Sub-keys explained as follows: + + "implies" : str or list, optional, + List of CPU feature names to be implied by it, + the feature name must be defined within `conf_features`. + Default is None. + + "flags": str or list, optional + List of compiler flags. Default is None. + + "detect": str or list, optional + List of CPU feature names that required to be detected + in runtime. By default, its the feature name or features + in "group" if its specified. + + "implies_detect": bool, optional + If True, all "detect" of implied features will be combined. + Default is True. see `feature_detect()`. + + "group": str or list, optional + Same as "implies" but doesn't require the feature name to be + defined within `conf_features`. + + "interest": int, required + a key for sorting CPU features + + "headers": str or list, optional + intrinsics C header file + + "disable": str, optional + force disable feature, the string value should contains the + reason of disabling. + + "autovec": bool or None, optional + True or False to declare that CPU feature can be auto-vectorized + by the compiler. + By default(None), treated as True if the feature contains at + least one applicable flag. see `feature_can_autovec()` + + **NOTES**: + * space can be used as separator with options that supports "str or list" + * case-sensitive for all values and feature name must be in upper-case. 
+ * if flags aren't applicable, its will skipped rather than disable the + CPU feature + * the CPU feature will disabled if the compiler fail to compile + the test file + """ + conf_nocache = False + conf_noopt = False + conf_cache_factors = None + conf_tmp_path = None + conf_check_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "checks" + ) + conf_target_groups = {} + conf_c_prefix = 'NPY_' + conf_c_prefix_ = 'NPY__' + conf_cc_flags = dict( + gcc = dict( + # native should always fail on arm and ppc64, + # native usually works only with x86 + native = '-march=native', + opt = '-O3', + werror = '-Werror' + ), + clang = dict( + native = '-march=native', + opt = "-O3", + werror = '-Werror' + ), + icc = dict( + native = '-xHost', + opt = '-O3', + werror = '-Werror' + ), + iccw = dict( + native = '/QxHost', + opt = '/O3', + werror = '/Werror' + ), + msvc = dict( + native = None, + opt = '/O2', + werror = '/WX' + ) + ) + conf_min_features = dict( + x86 = "SSE SSE2", + x64 = "SSE SSE2 SSE3", + ppc64 = '', # play it safe + ppc64le = "VSX VSX2", + armhf = '', # play it safe + aarch64 = "NEON NEON_FP16 NEON_VFPV4 ASIMD" + ) + conf_features = dict( + # X86 + SSE = dict( + interest=1, headers="xmmintrin.h", + # enabling SSE without SSE2 is useless also + # it's non-optional for x86_64 + implies="SSE2" + ), + SSE2 = dict(interest=2, implies="SSE", headers="emmintrin.h"), + SSE3 = dict(interest=3, implies="SSE2", headers="pmmintrin.h"), + SSSE3 = dict(interest=4, implies="SSE3", headers="tmmintrin.h"), + SSE41 = dict(interest=5, implies="SSSE3", headers="smmintrin.h"), + POPCNT = dict(interest=6, implies="SSE41", headers="popcntintrin.h"), + SSE42 = dict(interest=7, implies="POPCNT"), + AVX = dict( + interest=8, implies="SSE42", headers="immintrin.h", + implies_detect=False + ), + XOP = dict(interest=9, implies="AVX", headers="x86intrin.h"), + FMA4 = dict(interest=10, implies="AVX", headers="x86intrin.h"), + F16C = dict(interest=11, implies="AVX"), + FMA3 = 
dict(interest=12, implies="F16C"), + AVX2 = dict(interest=13, implies="F16C"), + AVX512F = dict(interest=20, implies="FMA3 AVX2", implies_detect=False), + AVX512CD = dict(interest=21, implies="AVX512F"), + AVX512_KNL = dict( + interest=40, implies="AVX512CD", group="AVX512ER AVX512PF", + detect="AVX512_KNL", implies_detect=False + ), + AVX512_KNM = dict( + interest=41, implies="AVX512_KNL", + group="AVX5124FMAPS AVX5124VNNIW AVX512VPOPCNTDQ", + detect="AVX512_KNM", implies_detect=False + ), + AVX512_SKX = dict( + interest=42, implies="AVX512CD", group="AVX512VL AVX512BW AVX512DQ", + detect="AVX512_SKX", implies_detect=False + ), + AVX512_CLX = dict( + interest=43, implies="AVX512_SKX", group="AVX512VNNI", + detect="AVX512_CLX" + ), + AVX512_CNL = dict( + interest=44, implies="AVX512_SKX", group="AVX512IFMA AVX512VBMI", + detect="AVX512_CNL", implies_detect=False + ), + AVX512_ICL = dict( + interest=45, implies="AVX512_CLX AVX512_CNL", + group="AVX512VBMI2 AVX512BITALG AVX512VPOPCNTDQ", + detect="AVX512_ICL", implies_detect=False + ), + # IBM/Power + ## Power7/ISA 2.06 + VSX = dict(interest=1, headers="altivec.h"), + ## Power8/ISA 2.07 + VSX2 = dict(interest=2, implies="VSX", implies_detect=False), + ## Power9/ISA 3.00 + VSX3 = dict(interest=3, implies="VSX2", implies_detect=False), + # ARM + NEON = dict(interest=1, headers="arm_neon.h"), + NEON_FP16 = dict(interest=2, implies="NEON"), + ## FMA + NEON_VFPV4 = dict(interest=3, implies="NEON_FP16"), + ## Advanced SIMD + ASIMD = dict(interest=4, implies="NEON_FP16 NEON_VFPV4", implies_detect=False), + ## ARMv8.2 half-precision & vector arithm + ASIMDHP = dict(interest=5, implies="ASIMD"), + ## ARMv8.2 dot product + ASIMDDP = dict(interest=6, implies="ASIMD"), + ## ARMv8.2 Single & half-precision Multiply + ASIMDFHM = dict(interest=7, implies="ASIMDHP"), + ) + def conf_features_partial(self): + """Return a dictionary of supported CPU features by the platform, + and accumulate the rest of undefined options in 
`conf_features`, + the returned dict has same rules and notes in + class attribute `conf_features`, also its override + any options that been set in 'conf_features'. + """ + if self.cc_noopt: + # optimization is disabled + return {} + + on_x86 = self.cc_on_x86 or self.cc_on_x64 + is_unix = self.cc_is_gcc or self.cc_is_clang + + if on_x86 and is_unix: return dict( + SSE = dict(flags="-msse"), + SSE2 = dict(flags="-msse2"), + SSE3 = dict(flags="-msse3"), + SSSE3 = dict(flags="-mssse3"), + SSE41 = dict(flags="-msse4.1"), + POPCNT = dict(flags="-mpopcnt"), + SSE42 = dict(flags="-msse4.2"), + AVX = dict(flags="-mavx"), + F16C = dict(flags="-mf16c"), + XOP = dict(flags="-mxop"), + FMA4 = dict(flags="-mfma4"), + FMA3 = dict(flags="-mfma"), + AVX2 = dict(flags="-mavx2"), + AVX512F = dict(flags="-mavx512f"), + AVX512CD = dict(flags="-mavx512cd"), + AVX512_KNL = dict(flags="-mavx512er -mavx512pf"), + AVX512_KNM = dict( + flags="-mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq" + ), + AVX512_SKX = dict(flags="-mavx512vl -mavx512bw -mavx512dq"), + AVX512_CLX = dict(flags="-mavx512vnni"), + AVX512_CNL = dict(flags="-mavx512ifma -mavx512vbmi"), + AVX512_ICL = dict( + flags="-mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq" + ) + ) + if on_x86 and self.cc_is_icc: return dict( + SSE = dict(flags="-msse"), + SSE2 = dict(flags="-msse2"), + SSE3 = dict(flags="-msse3"), + SSSE3 = dict(flags="-mssse3"), + SSE41 = dict(flags="-msse4.1"), + POPCNT = {}, + SSE42 = dict(flags="-msse4.2"), + AVX = dict(flags="-mavx"), + F16C = {}, + XOP = dict(disable="Intel Compiler doesn't support it"), + FMA4 = dict(disable="Intel Compiler doesn't support it"), + # Intel Compiler doesn't support AVX2 or FMA3 independently + FMA3 = dict( + implies="F16C AVX2", flags="-march=core-avx2" + ), + AVX2 = dict(implies="FMA3", flags="-march=core-avx2"), + # Intel Compiler doesn't support AVX512F or AVX512CD independently + AVX512F = dict( + implies="AVX2 AVX512CD", flags="-march=common-avx512" + ), + AVX512CD = 
dict( + implies="AVX2 AVX512F", flags="-march=common-avx512" + ), + AVX512_KNL = dict(flags="-xKNL"), + AVX512_KNM = dict(flags="-xKNM"), + AVX512_SKX = dict(flags="-xSKYLAKE-AVX512"), + AVX512_CLX = dict(flags="-xCASCADELAKE"), + AVX512_CNL = dict(flags="-xCANNONLAKE"), + AVX512_ICL = dict(flags="-xICELAKE-CLIENT"), + ) + if on_x86 and self.cc_is_iccw: return dict( + SSE = dict(flags="/arch:SSE"), + SSE2 = dict(flags="/arch:SSE2"), + SSE3 = dict(flags="/arch:SSE3"), + SSSE3 = dict(flags="/arch:SSSE3"), + SSE41 = dict(flags="/arch:SSE4.1"), + POPCNT = {}, + SSE42 = dict(flags="/arch:SSE4.2"), + AVX = dict(flags="/arch:AVX"), + F16C = {}, + XOP = dict(disable="Intel Compiler doesn't support it"), + FMA4 = dict(disable="Intel Compiler doesn't support it"), + # Intel Compiler doesn't support FMA3 or AVX2 independently + FMA3 = dict( + implies="F16C AVX2", flags="/arch:CORE-AVX2" + ), + AVX2 = dict( + implies="FMA3", flags="/arch:CORE-AVX2" + ), + # Intel Compiler doesn't support AVX512F or AVX512CD independently + AVX512F = dict( + implies="AVX2 AVX512CD", flags="/Qx:COMMON-AVX512" + ), + AVX512CD = dict( + implies="AVX2 AVX512F", flags="/Qx:COMMON-AVX512" + ), + AVX512_KNL = dict(flags="/Qx:KNL"), + AVX512_KNM = dict(flags="/Qx:KNM"), + AVX512_SKX = dict(flags="/Qx:SKYLAKE-AVX512"), + AVX512_CLX = dict(flags="/Qx:CASCADELAKE"), + AVX512_CNL = dict(flags="/Qx:CANNONLAKE"), + AVX512_ICL = dict(flags="/Qx:ICELAKE-CLIENT") + ) + if on_x86 and self.cc_is_msvc: return dict( + SSE = dict(flags="/arch:SSE"), + SSE2 = dict(flags="/arch:SSE2"), + SSE3 = {}, + SSSE3 = {}, + SSE41 = {}, + POPCNT = dict(headers="nmmintrin.h"), + SSE42 = {}, + AVX = dict(flags="/arch:AVX"), + F16C = {}, + XOP = dict(headers="ammintrin.h"), + FMA4 = dict(headers="ammintrin.h"), + # MSVC doesn't support FMA3 or AVX2 independently + FMA3 = dict( + implies="F16C AVX2", flags="/arch:AVX2" + ), + AVX2 = dict( + implies="F16C FMA3", flags="/arch:AVX2" + ), + # MSVC doesn't support AVX512F or AVX512CD 
independently, + # always generate instructions belong to (VL/VW/DQ) + AVX512F = dict( + implies="AVX2 AVX512CD AVX512_SKX", flags="/arch:AVX512" + ), + AVX512CD = dict( + implies="AVX512F AVX512_SKX", flags="/arch:AVX512" + ), + AVX512_KNL = dict( + disable="MSVC compiler doesn't support it" + ), + AVX512_KNM = dict( + disable="MSVC compiler doesn't support it" + ), + AVX512_SKX = dict(flags="/arch:AVX512"), + AVX512_CLX = {}, + AVX512_CNL = {}, + AVX512_ICL = {} + ) + + on_power = self.cc_on_ppc64le or self.cc_on_ppc64 + if on_power: + partial = dict( + VSX = dict( + implies=("VSX2" if self.cc_on_ppc64le else ""), + flags="-mvsx" + ), + VSX2 = dict( + flags="-mcpu=power8", implies_detect=False + ), + VSX3 = dict( + flags="-mcpu=power9 -mtune=power9", implies_detect=False + ) + ) + if self.cc_is_clang: + partial["VSX"]["flags"] = "-maltivec -mvsx" + partial["VSX2"]["flags"] = "-mpower8-vector" + partial["VSX3"]["flags"] = "-mpower9-vector" + + return partial + + if self.cc_on_aarch64 and is_unix: return dict( + NEON = dict( + implies="NEON_FP16 NEON_VFPV4 ASIMD", autovec=True + ), + NEON_FP16 = dict( + implies="NEON NEON_VFPV4 ASIMD", autovec=True + ), + NEON_VFPV4 = dict( + implies="NEON NEON_FP16 ASIMD", autovec=True + ), + ASIMD = dict( + implies="NEON NEON_FP16 NEON_VFPV4", autovec=True + ), + ASIMDHP = dict( + flags="-march=armv8.2-a+fp16" + ), + ASIMDDP = dict( + flags="-march=armv8.2-a+dotprod" + ), + ASIMDFHM = dict( + flags="-march=armv8.2-a+fp16fml" + ), + ) + if self.cc_on_armhf and is_unix: return dict( + NEON = dict( + flags="-mfpu=neon" + ), + NEON_FP16 = dict( + flags="-mfpu=neon-fp16 -mfp16-format=ieee" + ), + NEON_VFPV4 = dict( + flags="-mfpu=neon-vfpv4", + ), + ASIMD = dict( + flags="-mfpu=neon-fp-armv8 -march=armv8-a+simd", + ), + ASIMDHP = dict( + flags="-march=armv8.2-a+fp16" + ), + ASIMDDP = dict( + flags="-march=armv8.2-a+dotprod", + ), + ASIMDFHM = dict( + flags="-march=armv8.2-a+fp16fml" + ) + ) + # TODO: ARM MSVC + return {} + + def 
__init__(self): + if self.conf_tmp_path is None: + import tempfile, shutil + tmp = tempfile.mkdtemp() + def rm_temp(): + try: + shutil.rmtree(tmp) + except IOError: + pass + atexit.register(rm_temp) + self.conf_tmp_path = tmp + + if self.conf_cache_factors is None: + self.conf_cache_factors = [ + os.path.getmtime(__file__), + self.conf_nocache + ] + +class _Distutils: + """A helper class that provides a collection of fundamental methods + implemented in a top of Python and NumPy Distutils. + + The idea behind this class is to gather all methods that it may + need to override in case of reuse 'CCompilerOpt' in environment + different than of what NumPy has. + + Parameters + ---------- + ccompiler : `CCompiler` + The generate instance that returned from `distutils.ccompiler.new_compiler()`. + """ + def __init__(self, ccompiler): + self._ccompiler = ccompiler + + def dist_compile(self, sources, flags, **kwargs): + """Wrap CCompiler.compile()""" + assert(isinstance(sources, list)) + assert(isinstance(flags, list)) + flags = kwargs.pop("extra_postargs", []) + flags + return self._ccompiler.compile( + sources, extra_postargs=flags, **kwargs + ) + + def dist_test(self, source, flags): + """Return True if 'CCompiler.compile()' able to compile + a source file with certain flags. 
+ """ + assert(isinstance(source, str)) + from distutils.errors import CompileError + cc = self._ccompiler; + bk_spawn = getattr(cc, 'spawn', None) + if bk_spawn: + cc_type = getattr(self._ccompiler, "compiler_type", "") + if cc_type in ("msvc",): + setattr(cc, 'spawn', self._dist_test_spawn_paths) + else: + setattr(cc, 'spawn', self._dist_test_spawn) + test = False + try: + self.dist_compile( + [source], flags, output_dir=self.conf_tmp_path + ) + test = True + except CompileError as e: + self.dist_log(str(e), stderr=True) + if bk_spawn: + setattr(cc, 'spawn', bk_spawn) + return test + + def dist_info(self): + """Return a string containing all environment information, required + by the abstract class '_CCompiler' to discovering the platform + environment, also used as a cache factor in order to detect + any changes from outside. + """ + if hasattr(self, "_dist_info"): + return self._dist_info + # play it safe + cc_info = "" + compiler = getattr(self._ccompiler, "compiler", None) + if compiler is not None: + if isinstance(compiler, str): + cc_info += compiler + elif hasattr(compiler, "__iter__"): + cc_info += ' '.join(compiler) + # in case if 'compiler' attribute doesn't provide anything + cc_type = getattr(self._ccompiler, "compiler_type", "") + if cc_type in ("intelem", "intelemw", "mingw64"): + cc_info += "x86_64" + elif cc_type in ("intel", "intelw", "intele"): + cc_info += "x86" + elif cc_type in ("msvc", "mingw32"): + import platform + if platform.architecture()[0] == "32bit": + cc_info += "x86" + else: + cc_info += "x86_64" + else: + # the last hope, too bad for cross-compiling + import platform + cc_info += platform.machine() + + cc_info += cc_type + cflags = os.environ.get("CFLAGS", "") + if cflags not in cc_info: + cc_info += cflags + + self._dist_info = cc_info + return cc_info + + @staticmethod + def dist_error(*args): + """Raise a compiler error""" + from distutils.errors import CompileError + raise CompileError(_Distutils._dist_str(*args)) + + 
@staticmethod + def dist_fatal(*args): + """Raise a distutils error""" + from distutils.errors import DistutilsError + raise DistutilsError(_Distutils._dist_str(*args)) + + @staticmethod + def dist_log(*args, stderr=False): + """Print a console message""" + from numpy.distutils import log + out = _Distutils._dist_str(*args) + if stderr: + log.warn(out) + else: + log.info(out) + + @staticmethod + def dist_load_module(name, path): + """Load a module from file, required by the abstract class '_Cache'.""" + from numpy.compat import npy_load_module + try: + return npy_load_module(name, path) + except Exception as e: + _Distutils.dist_log(e, stderr=True) + return None + + @staticmethod + def _dist_str(*args): + """Return a string to print by log and errors.""" + def to_str(arg): + if not isinstance(arg, str) and hasattr(arg, '__iter__'): + ret = [] + for a in arg: + ret.append(to_str(a)) + return '('+ ' '.join(ret) + ')' + return str(arg) + + stack = inspect.stack()[2] + start = "CCompilerOpt.%s[%d] : " % (stack.function, stack.lineno) + out = ' '.join([ + to_str(a) + for a in (*args,) + ]) + return start + out + + def _dist_test_spawn_paths(self, cmd, display=None): + """ + Fix msvc SDK ENV path same as distutils do + without it we get c1: fatal error C1356: unable to find mspdbcore.dll + """ + if not hasattr(self._ccompiler, "_paths"): + self._dist_test_spawn(cmd) + return + old_path = os.getenv("path") + try: + os.environ["path"] = self._ccompiler._paths + self._dist_test_spawn(cmd) + finally: + os.environ["path"] = old_path + + _dist_warn_regex = re.compile( + # intel and msvc compilers don't raise + # fatal errors when flags are wrong or unsupported + ".*(" + "ignoring unknown option|" # msvc + "invalid argument for option" # intel + ").*" + ) + @staticmethod + def _dist_test_spawn(cmd, display=None): + from distutils.errors import CompileError + try: + o = subprocess.check_output(cmd, stderr=subprocess.STDOUT) + if isinstance(o, bytes): + o = o.decode() + if o and 
re.match(_Distutils._dist_warn_regex, o): + _Distutils.dist_error( + "Flags in command", cmd ,"aren't supported by the compiler" + ", output -> \n%s" % o + ) + except subprocess.CalledProcessError as exc: + o = exc.output + s = exc.returncode + except OSError: + o = b'' + s = 127 + else: + return None + o = o.decode() + _Distutils.dist_error( + "Command", cmd, "failed with exit status %d output -> \n%s" % ( + s, o + )) + +_share_cache = {} +class _Cache: + """An abstract class handles caching functionality, provides two + levels of caching, in-memory by share instances attributes among + each other and by store attributes into files. + + **Note**: + any attributes that start with ``_`` or ``conf_`` will be ignored. + + Parameters + ---------- + cache_path: str or None + The path of cache file, if None then cache in file will disabled. + + *factors: + The caching factors that need to utilize next to `conf_cache_factors`. + + Attributes + ---------- + cache_private: set + Hold the attributes that need be skipped from "in-memory cache". + + cache_infile: bool + Utilized during initializing this class, to determine if the cache was able + to loaded from the specified cache path in 'cache_path'. 
+ """ + + # skip attributes from cache + _cache_ignore = re.compile("^(_|conf_)") + + def __init__(self, cache_path=None, *factors): + self.cache_me = {} + self.cache_private = set() + self.cache_infile = False + + if self.conf_nocache: + self.dist_log("cache is disabled by `Config`") + return + + chash = self.cache_hash(*factors, *self.conf_cache_factors) + if cache_path: + if os.path.exists(cache_path): + self.dist_log("load cache from file ->", cache_path) + cache_mod = self.dist_load_module("cache", cache_path) + if not cache_mod: + self.dist_log( + "unable to load the cache file as a module", + stderr=True + ) + elif not hasattr(cache_mod, "hash") or \ + not hasattr(cache_mod, "data"): + self.dist_log("invalid cache file", stderr=True) + elif chash == cache_mod.hash: + self.dist_log("hit the file cache") + for attr, val in cache_mod.data.items(): + setattr(self, attr, val) + self.cache_infile = True + else: + self.dist_log("miss the file cache") + + atexit.register(self._cache_write, cache_path, chash) + + if not self.cache_infile: + other_cache = _share_cache.get(chash) + if other_cache: + self.dist_log("hit the memory cache") + for attr, val in other_cache.__dict__.items(): + if attr in other_cache.cache_private or \ + re.match(self._cache_ignore, attr): + continue + setattr(self, attr, val) + + _share_cache[chash] = self + + def __del__(self): + # TODO: remove the cache form share on del + pass + + def _cache_write(self, cache_path, cache_hash): + # TODO: don't write if the cache doesn't change + self.dist_log("write cache to path ->", cache_path) + for attr in list(self.__dict__.keys()): + if re.match(self._cache_ignore, attr): + self.__dict__.pop(attr) + + d = os.path.dirname(cache_path) + if not os.path.exists(d): + os.makedirs(d) + + repr_dict = pprint.pformat(self.__dict__, compact=True) + with open(cache_path, "w") as f: + f.write(textwrap.dedent("""\ + # AUTOGENERATED DON'T EDIT + # Please make changes to the code generator \ + 
(distutils/ccompiler_opt.py) + hash = {} + data = \\ + """).format(cache_hash)) + f.write(repr_dict) + + def cache_hash(self, *factors): + # is there a built-in non-crypto hash? + # sdbm + chash = 0 + for f in factors: + for char in str(f): + chash = ord(char) + (chash << 6) + (chash << 16) - chash + chash &= 0xFFFFFFFF + return chash + + @staticmethod + def me(cb): + """ + A static method that can be treated as a decorator to + dynamically cache certain methods. + """ + def cache_wrap_me(self, *args, **kwargs): + # good for normal args + cache_key = str(( + cb.__name__, *args, *kwargs.keys(), *kwargs.values() + )) + if cache_key in self.cache_me: + return self.cache_me[cache_key] + ccb = cb(self, *args, **kwargs) + self.cache_me[cache_key] = ccb + return ccb + return cache_wrap_me + +class _CCompiler(object): + """A helper class for `CCompilerOpt` containing all utilities that + related to the fundamental compiler's functions. + + Attributes + ---------- + cc_on_x86 : bool + True when the target architecture is 32-bit x86 + cc_on_x64 : bool + True when the target architecture is 64-bit x86 + cc_on_ppc64 : bool + True when the target architecture is 64-bit big-endian PowerPC + cc_on_armhf : bool + True when the target architecture is 32-bit ARMv7+ + cc_on_aarch64 : bool + True when the target architecture is 64-bit Armv8-a+ + cc_on_noarch : bool + True when the target architecture is unknown or not supported + cc_is_gcc : bool + True if the compiler is GNU or + if the compiler is unknown + cc_is_clang : bool + True if the compiler is Clang + cc_is_icc : bool + True if the compiler is Intel compiler (unix like) + cc_is_iccw : bool + True if the compiler is Intel compiler (msvc like) + cc_is_nocc : bool + True if the compiler isn't supported directly, + Note: that cause a fail-back to gcc + cc_has_debug : bool + True if the compiler has debug flags + cc_has_native : bool + True if the compiler has native flags + cc_noopt : bool + True if the compiler has definition 
class _CCompiler(object):
    """A helper class for `CCompilerOpt` containing all utilities that
    are related to the fundamental compiler's functions.

    All attributes are derived from the string returned by `dist_info()`
    and from the `conf_*` options read from the instance.

    Attributes
    ----------
    cc_on_x86 : bool
        True when the target architecture is 32-bit x86
    cc_on_x64 : bool
        True when the target architecture is 64-bit x86
    cc_on_ppc64 : bool
        True when the target architecture is 64-bit big-endian PowerPC
    cc_on_ppc64le : bool
        True when the compiler info matches ``(powerpc|ppc)64(el|le)``
    cc_on_armhf : bool
        True when the target architecture is 32-bit ARMv7+
    cc_on_aarch64 : bool
        True when the target architecture is 64-bit Armv8-a+
    cc_on_noarch : bool
        True when the target architecture is unknown or not supported
    cc_is_gcc : bool
        True if the compiler is GNU or
        if the compiler is unknown
    cc_is_clang : bool
        True if the compiler is Clang
    cc_is_icc : bool
        True if the compiler is Intel compiler (unix like)
    cc_is_iccw : bool
        True if the compiler is Intel compiler (msvc like)
    cc_is_msvc : bool
        True if the compiler info matches ``msvc``
    cc_is_nocc : bool
        True if the compiler isn't supported directly,
        Note: that cause a fail-back to gcc
    cc_has_debug : bool
        True if the compiler has debug flags
    cc_has_native : bool
        True if the compiler has native flags
    cc_noopt : bool
        True if the compiler has definition 'DISABLE_OPT*',
        or 'cc_on_noarch' is True, or `conf_noopt` is True
    cc_march : str
        The target architecture name, or "unknown" if
        the architecture isn't supported
    cc_name : str
        The compiler name, or "unknown" if the compiler isn't supported
    cc_flags : dict
        Dictionary containing the initialized flags of `_Config.conf_cc_flags`
    """
    def __init__(self):
        # attributes may already have been restored from a cache
        if hasattr(self, "cc_is_cached"):
            return
        # Each inner tuple is one section of (attribute, regex) pairs,
        # only the first matching pair of a section is set to True.
        # An empty regex always matches and acts as the fallback.
        to_detect = (
            # attr            regex
            (
                ("cc_on_x64",      "^(x|x86_|amd)64"),
                ("cc_on_x86",      "^(x86|i386|i686)"),
                ("cc_on_ppc64le",  "^(powerpc|ppc)64(el|le)"),
                ("cc_on_ppc64",    "^(powerpc|ppc)64"),
                ("cc_on_armhf",    "^arm"),
                ("cc_on_aarch64",  "^aarch64"),
                # priority is given to first of string
                # if it fail we search in the rest, due
                # to append platform.machine() at the end,
                # check method 'dist_info()' for more clarification.
                ("cc_on_x64",      ".*(x|x86_|amd)64.*"),
                ("cc_on_x86",      ".*(x86|i386|i686).*"),
                ("cc_on_ppc64le",  ".*(powerpc|ppc)64(el|le).*"),
                ("cc_on_ppc64",    ".*(powerpc|ppc)64.*"),
                ("cc_on_armhf",    ".*arm.*"),
                ("cc_on_aarch64",  ".*aarch64.*"),
                # undefined platform
                ("cc_on_noarch",    ""),
            ),
            (
                ("cc_is_gcc",     r".*(gcc|gnu\-g).*"),
                ("cc_is_clang",    ".*clang.*"),
                ("cc_is_iccw",     ".*(intelw|intelemw|iccw).*"), # intel msvc like
                ("cc_is_icc",      ".*(intel|icc).*"), # intel unix like
                ("cc_is_msvc",     ".*msvc.*"),
                ("cc_is_nocc",     ""),
            ),
            (("cc_has_debug",  ".*(O0|Od|ggdb|coverage|debug:full).*"),),
            (("cc_has_native", ".*(-march=native|-xHost|/QxHost).*"),),
            # in case if the class run with -DNPY_DISABLE_OPTIMIZATION
            (("cc_noopt", ".*DISABLE_OPT.*"),),
        )
        # start with every attribute set to False
        for section in to_detect:
            for attr, rgex in section:
                setattr(self, attr, False)

        dist_info = self.dist_info()
        for section in to_detect:
            for attr, rgex in section:
                if rgex and not re.match(rgex, dist_info, re.IGNORECASE):
                    continue
                setattr(self, attr, True)
                break

        if self.cc_on_noarch:
            self.dist_log(
                "unable to detect CPU arch via compiler info, "
                "optimization is disabled \ninfo << %s >> " % dist_info,
                stderr=True
            )
            self.cc_noopt = True

        if self.conf_noopt:
            self.dist_log("Optimization is disabled by the Config", stderr=True)
            self.cc_noopt = True

        if self.cc_is_nocc:
            """
            mingw can be treated as a gcc, and also xlc even if it based on clang,
            but still has the same gcc optimization flags.
            """
            self.dist_log(
                "unable to detect compiler name via info <<\n%s\n>> "
                "treating it as a gcc" % dist_info,
                stderr=True
            )
            self.cc_is_gcc = True

        # name of the first 'cc_on_*' attribute that is True
        self.cc_march = "unknown"
        for arch in ("x86", "x64", "ppc64", "ppc64le", "armhf", "aarch64"):
            if getattr(self, "cc_on_" + arch):
                self.cc_march = arch
                break

        # name of the first 'cc_is_*' attribute that is True
        self.cc_name = "unknown"
        for name in ("gcc", "clang", "iccw", "icc", "msvc"):
            if getattr(self, "cc_is_" + name):
                self.cc_name = name
                break

        self.cc_flags = {}
        compiler_flags = self.conf_cc_flags.get(self.cc_name)
        if compiler_flags is None:
            self.dist_fatal(
                "undefined flag for compiler '%s', "
                "leave an empty dict instead" % self.cc_name
            )
        # every configured flag is tested on its own and only the
        # accepted ones are kept
        for name, flags in compiler_flags.items():
            self.cc_flags[name] = nflags = []
            if flags:
                assert(isinstance(flags, str))
                flags = flags.split()
                for f in flags:
                    if self.cc_test_flags([f]):
                        nflags.append(f)

        self.cc_is_cached = True

    @_Cache.me
    def cc_test_flags(self, flags):
        """
        Returns True if the compiler supports 'flags'.

        The flags are tested by compiling the file "test_flags.c" located
        in `conf_check_path`.
        """
        assert(isinstance(flags, list))
        self.dist_log("testing flags", flags)
        test_path = os.path.join(self.conf_check_path, "test_flags.c")
        test = self.dist_test(test_path, flags)
        if not test:
            self.dist_log("testing failed", stderr=True)
        return test

    def cc_normalize_flags(self, flags):
        """
        Remove the conflicts caused by gathering implied features flags.

        Parameters
        ----------
        'flags' list, compiler flags
            flags should be sorted from the lowest to the highest interest.

        Returns
        -------
        list, filtered from any conflicts. The list is returned unchanged
        if the compiler is none of gcc, clang, icc, msvc or iccw.

        Examples
        --------
        >>> self.cc_normalize_flags(['-march=armv8.2-a+fp16', '-march=armv8.2-a+dotprod'])
        ['-march=armv8.2-a+fp16+dotprod']

        >>> self.cc_normalize_flags(
            ['-msse', '-msse2', '-msse3', '-mssse3', '-msse4.1', '-msse4.2', '-mavx', '-march=core-avx2']
        )
        ['-march=core-avx2']
        """
        assert(isinstance(flags, list))
        if self.cc_is_gcc or self.cc_is_clang or self.cc_is_icc:
            return self._cc_normalize_unix(flags)

        if self.cc_is_msvc or self.cc_is_iccw:
            return self._cc_normalize_win(flags)
        return flags

    _cc_normalize_unix_mrgx = re.compile(
        # 1- to check the highest of
        r"^(-mcpu=|-march=|-x[A-Z0-9\-])"
    )
    _cc_normalize_unix_frgx = re.compile(
        # 2- to remove any flags starts with
        # -march, -mcpu, -x(INTEL) and '-m' without '='
        r"^(?!(-mcpu=|-march=|-x[A-Z0-9\-]))(?!-m[a-z0-9\-\.]*.$)"
    )
    _cc_normalize_unix_krgx = re.compile(
        # 3- keep only the highest of
        r"^(-mfpu|-mtune)"
    )
    _cc_normalize_arch_ver = re.compile(
        r"[0-9.]"
    )
    def _cc_normalize_unix(self, flags):
        """Normalize flags of gcc, clang and icc.

        The last flag matching `_cc_normalize_unix_mrgx` is kept, the flags
        before it are filtered through `_cc_normalize_unix_frgx`, and the
        '+' sub-flags of earlier flags having the same version number are
        merged into it. Finally, only the last occurrence of each
        '-mfpu'/'-mtune' flag is kept.
        """
        def ver_flags(f):
            # split a flag into (version, flag without sub-flags, sub-flags)
            #        arch ver  subflag
            # -march=armv8.2-a+fp16fml
            tokens = f.split('+')
            # the version is built from all digits and dots of the first token
            ver = float('0' + ''.join(
                re.findall(self._cc_normalize_arch_ver, tokens[0])
            ))
            return ver, tokens[0], tokens[1:]

        if len(flags) <= 1:
            return flags
        # get the highest matched flag
        for i, cur_flag in enumerate(reversed(flags)):
            if not re.match(self._cc_normalize_unix_mrgx, cur_flag):
                continue
            # 'i' counts from the end of the list
            lower_flags = flags[:-(i+1)]
            # only used when i > 0, since flags[-0:] is the whole list
            upper_flags = flags[-i:]
            filterd = list(filter(
                self._cc_normalize_unix_frgx.search, lower_flags
            ))
            # gather subflags
            ver, arch, subflags = ver_flags(cur_flag)
            if ver > 0 and len(subflags) > 0:
                for xflag in lower_flags:
                    xver, _, xsubflags = ver_flags(xflag)
                    if ver == xver:
                        subflags = xsubflags + subflags
                cur_flag = arch + '+' + '+'.join(subflags)

            flags = filterd + [cur_flag]
            if i > 0:
                flags += upper_flags
            break

        # to remove overridable flags
        final_flags = []
        matched = set()
        # walk backwards so the last occurrence wins
        for f in reversed(flags):
            match = re.match(self._cc_normalize_unix_krgx, f)
            if not match:
                pass
            elif match[0] in matched:
                continue
            else:
                matched.add(match[0])
            final_flags.insert(0, f)
        return final_flags

    _cc_normalize_win_frgx = re.compile(
        r"^(?!(/arch\:|/Qx\:))"
    )
    _cc_normalize_win_mrgx = re.compile(
        r"^(/arch|/Qx:)"
    )
    def _cc_normalize_win(self, flags):
        """Normalize flags of msvc and iccw.

        Every flag starting with '/arch:' or '/Qx:' that precedes the last
        flag matching `_cc_normalize_win_mrgx` is removed; the list is
        returned unchanged if there is no such flag.
        """
        for i, f in enumerate(reversed(flags)):
            if not re.match(self._cc_normalize_win_mrgx, f):
                continue
            # turn the reversed index into a distance from the end
            i += 1
            return list(filter(
                self._cc_normalize_win_frgx.search, flags[:-i]
            )) + flags[-i:]
        return flags
+ """ + def __init__(self): + if hasattr(self, "feature_is_cached"): + return + self.feature_supported = pfeatures = self.conf_features_partial() + for feature_name in list(pfeatures.keys()): + feature = pfeatures[feature_name] + cfeature = self.conf_features[feature_name] + feature.update({ + k:v for k,v in cfeature.items() if k not in feature + }) + disabled = feature.get("disable") + if disabled is not None: + pfeatures.pop(feature_name) + self.dist_log( + "feature '%s' is disabled," % feature_name, + disabled, stderr=True + ) + continue + # list is used internally for these options + for option in ( + "implies", "group", "detect", "headers", "flags" + ) : + oval = feature.get(option) + if isinstance(oval, str): + feature[option] = oval.split() + + self.feature_min = set() + min_f = self.conf_min_features.get(self.cc_march, "") + for F in min_f.upper().split(): + if F in self.feature_supported: + self.feature_min.add(F) + + self.feature_is_cached = True + + def feature_names(self, names=None, force_flags=None): + """ + Returns a set of CPU feature names that supported by platform and the **C** compiler. + + Parameters + ---------- + 'names': sequence or None, optional + Specify certain CPU features to test it against the **C** compiler. + if None(default), it will test all current supported features. + **Note**: feature names must be in upper-case. + + 'force_flags': list or None, optional + If None(default), default compiler flags for every CPU feature will be used + during the test. 
def feature_implies(self, names):
    """Return a set of CPU features that are implied by 'names'.

    Parameters
    ----------
    'names': str or sequence of str
        CPU feature name(s), every name and every implied name must
        exist in `feature_supported`.

    Returns
    -------
    set of all feature names that are implied directly or indirectly.
    A name passed in 'names' is part of the result only if another
    feature (or a cycle) implies it.
    """
    if isinstance(names, str):
        pending = [names]
    else:
        assert(hasattr(names, "__iter__"))
        pending = list(names)

    implies = set()
    # features can imply each other, so each one is expanded only once
    expanded = set()
    while pending:
        name = pending.pop()
        if name in expanded:
            continue
        expanded.add(name)
        for implied in self.feature_supported[name].get("implies", []):
            implies.add(implied)
            if implied not in expanded:
                pending.append(implied)
    return implies
def feature_ahead(self, names):
    """
    Return the features of 'names' that aren't implied by any
    other feature of 'names'.

    Parameters
    ----------
    'names': sequence
        sequence of CPU feature names in uppercase.

    Returns
    -------
    list of CPU features in the same order as 'names'. If every feature
    is implied by another one, the list only holds the feature with the
    highest interest.

    Examples
    --------
    >>> self.feature_ahead(["SSE2", "SSE3", "SSE41"])
    ["SSE41"]
    # assume AVX2 and FMA3 implies each other and AVX2
    # is the highest interest
    >>> self.feature_ahead(["SSE2", "SSE3", "SSE41", "AVX2", "FMA3"])
    ["AVX2"]
    # assume AVX2 and FMA3 don't implies each other
    >>> self.feature_ahead(["SSE2", "SSE3", "SSE41", "AVX2", "FMA3"])
    ["AVX2", "FMA3"]
    """
    assert(
        not isinstance(names, str)
        and hasattr(names, '__iter__')
    )
    implied = self.feature_implies(names)
    origins = []
    for feature in names:
        if feature in implied:
            continue
        origins.append(feature)
    if origins:
        return origins
    # all features imply each other,
    # fall back to the highest interested one
    return self.feature_sorted(names, reverse=True)[:1]
def feature_get_til(self, names, keyisfalse):
    """
    Same as `feature_implies_c()` but the collection of implied
    features stops at the first feature (walking from the highest to the
    lowest interest) whose option 'keyisfalse' is False; that feature is
    still kept. The returned features are sorted from the lowest to the
    highest interest.

    Parameters
    ----------
    'names': str or sequence
        CPU feature name(s).
    'keyisfalse': str
        Name of the feature option to look up, a missing option
        counts as True.
    """
    def collect(origin):
        # highest interest first, then cut right after the first feature
        # that has the option set to False
        ranked = self.feature_sorted(
            self.feature_implies_c(origin), reverse=True
        )
        for count, feature in enumerate(ranked, start=1):
            if not self.feature_supported[feature].get(keyisfalse, True):
                return ranked[:count]
        return ranked

    if not (isinstance(names, str) or len(names) <= 1):
        gathered = set()
        for origin in self.feature_ahead(names):
            gathered.update(collect(origin))
        return self.feature_sorted(gathered)

    ordered = collect(names)
    # normalize the sort
    ordered.reverse()
    return ordered
def feature_test(self, name, force_flags=None):
    """
    Test a certain CPU feature against the compiler through its own
    check file.

    The check file is looked up as "cpu_<name in lowercase>.c" inside
    `self.conf_check_path`; a missing file is reported through
    `dist_fatal()` together with the path that was looked up.

    Parameters
    ----------
    'name': str
        Supported CPU feature name.

    'force_flags': list or None, optional
        If None(default), the returned flags from `feature_flags()`
        will be used.

    Returns
    -------
    The result of `dist_test()` for the check file.
    """
    if force_flags is None:
        force_flags = self.feature_flags(name)

    self.dist_log(
        "testing feature '%s' with flags (%s)" % (
        name, ' '.join(force_flags)
    ))
    # Each CPU feature must have C source code contains at
    # least one intrinsic or instruction related to this feature.
    test_path = os.path.join(
        self.conf_check_path, "cpu_%s.c" % name.lower()
    )
    if not os.path.exists(test_path):
        # report the path that was actually probed (formerly an undefined
        # name was passed here, which raised NameError instead)
        self.dist_fatal("feature test file is not exist", test_path)

    # warnings are treated as errors while testing
    test = self.dist_test(test_path, force_flags + self.cc_flags["werror"])
    if not test:
        self.dist_log("testing failed", stderr=True)
    return test
def feature_c_preprocessor(self, feature_name, tabs=0):
    """
    Build the C preprocessor lines (definition and header includes)
    that belong to a CPU feature.

    Parameters
    ----------
    'feature_name': str
        CPU feature name in uppercase, must be a key of
        `self.feature_supported`.
    'tabs': int
        if > 0, every generated line is prefixed by that many tabs.

    Returns
    -------
    str, generated C preprocessor

    Examples
    --------
    >>> self.feature_c_preprocessor("SSE3")
    /** SSE3 **/
    #define NPY_HAVE_SSE3 1
    #include <pmmintrin.h>
    """
    assert(feature_name.isupper())
    feature = self.feature_supported.get(feature_name)
    assert(feature is not None)

    lines = ["/** %s **/" % feature_name]
    lines.append("#define %sHAVE_%s 1" % (self.conf_c_prefix, feature_name))
    for header in feature.get("headers", []):
        lines.append("#include <%s>" % header)
    for member in feature.get("group", []):
        # Guard features in case of duplicate definitions
        lines.extend((
            "#ifndef %sHAVE_%s" % (self.conf_c_prefix, member),
            "\t#define %sHAVE_%s 1" % (self.conf_c_prefix, member),
            "#endif",
        ))
    indent = '\t' * tabs if tabs > 0 else ''
    return '\n'.join([indent + line for line in lines])
+ + Parameters + ---------- + cpu_baseline: str or None + minimal set of required CPU features or special options. + + cpu_dispatch: str or None + dispatched set of additional CPU features or special options. + + Special options can be: + - **MIN**: Enables the minimum CPU features that utilized via `_Config.conf_min_features` + - **MAX**: Enables all supported CPU features by the Compiler and platform. + - **NATIVE**: Enables all CPU features that supported by the current machine. + - **NONE**: Enables nothing + - **Operand +/-**: remove or add features, useful with options **MAX**, **MIN** and **NATIVE**. + NOTE: operand + is only added for nominal reason. + + NOTES: + - Case-insensitive among all CPU features and special options. + - Comma or space can be used as a separator. + - If the CPU feature is not supported by the user platform or compiler, + it will be skipped rather than raising a fatal error. + - Any specified CPU features to 'cpu_dispatch' will be skipped if its part of CPU baseline features + - 'cpu_baseline' force enables implied features. + + Attributes + ---------- + parse_baseline_names : list + Final CPU baseline's feature names(sorted from low to high) + parse_baseline_flags : list + Compiler flags of baseline features + parse_dispatch_names : list + Final CPU dispatch-able feature names(sorted from low to high) + parse_target_groups : dict + Dictionary containing initialized target groups that configured + through class attribute `conf_target_groups`. + + The key is represent the group name and value is a tuple + contains three items : + - bool, True if group has the 'baseline' option. + - list, list of CPU features. + - list, list of extra compiler flags. 
+ + """ + def __init__(self, cpu_baseline, cpu_dispatch): + self._parse_policies = dict( + # POLICY NAME, (HAVE, NOT HAVE, [DEB]) + KEEP_BASELINE = ( + None, self._parse_policy_not_keepbase, + [] + ), + KEEP_SORT = ( + self._parse_policy_keepsort, + self._parse_policy_not_keepsort, + [] + ), + MAXOPT = ( + self._parse_policy_maxopt, None, + [] + ), + WERROR = ( + self._parse_policy_werror, None, + [] + ), + AUTOVEC = ( + self._parse_policy_autovec, None, + ["MAXOPT"] + ) + ) + if hasattr(self, "parse_is_cached"): + return + + self.parse_baseline_names = [] + self.parse_baseline_flags = [] + self.parse_dispatch_names = [] + self.parse_target_groups = {} + + if self.cc_noopt: + # skip parsing baseline and dispatch args and keep parsing target groups + cpu_baseline = cpu_dispatch = None + + self.dist_log("check requested baseline") + if cpu_baseline is not None: + cpu_baseline = self._parse_arg_features("cpu_baseline", cpu_baseline) + baseline_names = self.feature_names(cpu_baseline) + self.parse_baseline_flags = self.feature_flags(baseline_names) + self.parse_baseline_names = self.feature_sorted( + self.feature_implies_c(baseline_names) + ) + + self.dist_log("check requested dispatch-able features") + if cpu_dispatch is not None: + cpu_dispatch_ = self._parse_arg_features("cpu_dispatch", cpu_dispatch) + cpu_dispatch = { + f for f in cpu_dispatch_ + if f not in self.parse_baseline_names + } + conflict_baseline = cpu_dispatch_.difference(cpu_dispatch) + self.parse_dispatch_names = self.feature_sorted( + self.feature_names(cpu_dispatch) + ) + if len(conflict_baseline) > 0: + self.dist_log( + "skip features", conflict_baseline, "since its part of baseline" + ) + + self.dist_log("initialize targets groups") + for group_name, tokens in self.conf_target_groups.items(): + self.dist_log("parse target group", group_name) + GROUP_NAME = group_name.upper() + if not tokens or not tokens.strip(): + # allow empty groups, useful in case if there's a need + # to disable certain group 
since '_parse_target_tokens()' + # requires at least one valid target + self.parse_target_groups[GROUP_NAME] = ( + False, [], [] + ) + continue + has_baseline, features, extra_flags = \ + self._parse_target_tokens(tokens) + self.parse_target_groups[GROUP_NAME] = ( + has_baseline, features, extra_flags + ) + + self.parse_is_cached = True + + def parse_targets(self, source): + """ + Fetch and parse configuration statements that required for + defining the targeted CPU features, statements should be declared + in the top of source in between **C** comment and start + with a special mark **@targets**. + + Configuration statements are sort of keywords representing + CPU features names, group of statements and policies, combined + together to determine the required optimization. + + Parameters + ---------- + source: str + the path of **C** source file. + + Returns + ------- + - bool, True if group has the 'baseline' option + - list, list of CPU features + - list, list of extra compiler flags + """ + self.dist_log("looking for '@targets' inside -> ", source) + # get lines between /*@targets and */ + with open(source) as fd: + tokens = "" + max_to_reach = 1000 # good enough, isn't? 
+ start_with = "@targets" + start_pos = -1 + end_with = "*/" + end_pos = -1 + for current_line, line in enumerate(fd): + if current_line == max_to_reach: + self.dist_fatal("reached the max of lines") + break + if start_pos == -1: + start_pos = line.find(start_with) + if start_pos == -1: + continue + start_pos += len(start_with) + tokens += line + end_pos = line.find(end_with) + if end_pos != -1: + end_pos += len(tokens) - len(line) + break + + if start_pos == -1: + self.dist_fatal("expected to find '%s' within a C comment" % start_with) + if end_pos == -1: + self.dist_fatal("expected to end with '%s'" % end_with) + + tokens = tokens[start_pos:end_pos] + return self._parse_target_tokens(tokens) + + _parse_regex_arg = re.compile(r'\s|[,]|([+-])') + def _parse_arg_features(self, arg_name, req_features): + if not isinstance(req_features, str): + self.dist_fatal("expected a string in '%s'" % arg_name) + + final_features = set() + # space and comma can be used as a separator + tokens = list(filter(None, re.split(self._parse_regex_arg, req_features))) + append = True # append is the default + for tok in tokens: + if tok[0] in ("#", "$"): + self.dist_fatal( + arg_name, "target groups and policies " + "aren't allowed from arguments, " + "only from dispatch-able sources" + ) + if tok == '+': + append = True + continue + if tok == '-': + append = False + continue + + TOK = tok.upper() # we use upper-case internally + features_to = set() + if TOK == "NONE": + pass + elif TOK == "NATIVE": + native = self.cc_flags["native"] + if not native: + self.dist_fatal(arg_name, + "native option isn't supported by the compiler" + ) + features_to = self.feature_names(force_flags=native) + elif TOK == "MAX": + features_to = self.feature_supported.keys() + elif TOK == "MIN": + features_to = self.feature_min + else: + if TOK in self.feature_supported: + features_to.add(TOK) + else: + if not self.feature_is_exist(TOK): + self.dist_fatal(arg_name, + ", '%s' isn't a known feature or option" % tok 
+ ) + if append: + final_features = final_features.union(features_to) + else: + final_features = final_features.difference(features_to) + + append = True # back to default + + return final_features + + _parse_regex_target = re.compile(r'\s|[*,/]|([()])') + def _parse_target_tokens(self, tokens): + assert(isinstance(tokens, str)) + final_targets = [] # to keep it sorted as specified + extra_flags = [] + has_baseline = False + + skipped = set() + policies = set() + multi_target = None + + tokens = list(filter(None, re.split(self._parse_regex_target, tokens))) + if not tokens: + self.dist_fatal("expected one token at least") + + for tok in tokens: + TOK = tok.upper() + ch = tok[0] + if ch in ('+', '-'): + self.dist_fatal( + "+/- are 'not' allowed from target's groups or @targets, " + "only from cpu_baseline and cpu_dispatch parms" + ) + elif ch == '$': + if multi_target is not None: + self.dist_fatal( + "policies aren't allowed inside multi-target '()'" + ", only CPU features" + ) + policies.add(self._parse_token_policy(TOK)) + elif ch == '#': + if multi_target is not None: + self.dist_fatal( + "target groups aren't allowed inside multi-target '()'" + ", only CPU features" + ) + has_baseline, final_targets, extra_flags = \ + self._parse_token_group(TOK, has_baseline, final_targets, extra_flags) + elif ch == '(': + if multi_target is not None: + self.dist_fatal("unclosed multi-target, missing ')'") + multi_target = set() + elif ch == ')': + if multi_target is None: + self.dist_fatal("multi-target opener '(' wasn't found") + targets = self._parse_multi_target(multi_target) + if targets is None: + skipped.add(tuple(multi_target)) + else: + if len(targets) == 1: + targets = targets[0] + if targets and targets not in final_targets: + final_targets.append(targets) + multi_target = None # back to default + else: + if TOK == "BASELINE": + if multi_target is not None: + self.dist_fatal("baseline isn't allowed inside multi-target '()'") + has_baseline = True + continue + + if 
multi_target is not None: + multi_target.add(TOK) + continue + + if not self.feature_is_exist(TOK): + self.dist_fatal("invalid target name '%s'" % TOK) + + is_enabled = ( + TOK in self.parse_baseline_names or + TOK in self.parse_dispatch_names + ) + if is_enabled: + if TOK not in final_targets: + final_targets.append(TOK) + continue + + skipped.add(TOK) + + if multi_target is not None: + self.dist_fatal("unclosed multi-target, missing ')'") + if skipped: + self.dist_log( + "skip targets", skipped, + "not part of baseline or dispatch-able features" + ) + + final_targets = self.feature_untied(final_targets) + + # add polices dependencies + for p in list(policies): + _, _, deps = self._parse_policies[p] + for d in deps: + if d in policies: + continue + self.dist_log( + "policy '%s' force enables '%s'" % ( + p, d + )) + policies.add(d) + + # release policies filtrations + for p, (have, nhave, _) in self._parse_policies.items(): + func = None + if p in policies: + func = have + self.dist_log("policy '%s' is ON" % p) + else: + func = nhave + if not func: + continue + has_baseline, final_targets, extra_flags = func( + has_baseline, final_targets, extra_flags + ) + + return has_baseline, final_targets, extra_flags + + def _parse_token_policy(self, token): + """validate policy token""" + if len(token) <= 1 or token[-1:] == token[0]: + self.dist_fatal("'$' must stuck in the begin of policy name") + token = token[1:] + if token not in self._parse_policies: + self.dist_fatal( + "'%s' is an invalid policy name, available policies are" % token, + self._parse_policies.keys() + ) + return token + + def _parse_token_group(self, token, has_baseline, final_targets, extra_flags): + """validate group token""" + if len(token) <= 1 or token[-1:] == token[0]: + self.dist_fatal("'#' must stuck in the begin of group name") + + token = token[1:] + ghas_baseline, gtargets, gextra_flags = self.parse_target_groups.get( + token, (False, None, []) + ) + if gtargets is None: + self.dist_fatal( + 
"'%s' is an invalid target group name, " % token + \ + "available target groups are", + self.parse_target_groups.keys() + ) + if ghas_baseline: + has_baseline = True + # always keep sorting as specified + final_targets += [f for f in gtargets if f not in final_targets] + extra_flags += [f for f in gextra_flags if f not in extra_flags] + return has_baseline, final_targets, extra_flags + + def _parse_multi_target(self, targets): + """validate multi targets that defined between parentheses()""" + # remove any implied features and keep the origins + if not targets: + self.dist_fatal("empty multi-target '()'") + if not all([ + self.feature_is_exist(tar) for tar in targets + ]) : + self.dist_fatal("invalid target name in multi-target", targets) + if not all([ + ( + tar in self.parse_baseline_names or + tar in self.parse_dispatch_names + ) + for tar in targets + ]) : + return None + targets = self.feature_ahead(targets) + if not targets: + return None + # force sort multi targets, so it can be comparable + targets = self.feature_sorted(targets) + targets = tuple(targets) # hashable + return targets + + def _parse_policy_not_keepbase(self, has_baseline, final_targets, extra_flags): + """skip all baseline features""" + skipped = [] + for tar in final_targets[:]: + is_base = False + if isinstance(tar, str): + is_base = tar in self.parse_baseline_names + else: + # multi targets + is_base = all([ + f in self.parse_baseline_names + for f in tar + ]) + if is_base: + skipped.append(tar) + final_targets.remove(tar) + + if skipped: + self.dist_log("skip baseline features", skipped) + + return has_baseline, final_targets, extra_flags + + def _parse_policy_keepsort(self, has_baseline, final_targets, extra_flags): + """leave a notice that $keep_sort is on""" + self.dist_log( + "policy 'keep_sort' is on, dispatch-able targets", final_targets, "\n" + "are 'not' sorted depend on the highest interest but" + "as specified in the dispatch-able source or the extra group" + ) + return 
has_baseline, final_targets, extra_flags + + def _parse_policy_not_keepsort(self, has_baseline, final_targets, extra_flags): + """sorted depend on the highest interest""" + final_targets = self.feature_sorted(final_targets, reverse=True) + return has_baseline, final_targets, extra_flags + + def _parse_policy_maxopt(self, has_baseline, final_targets, extra_flags): + """append the compiler optimization flags""" + if self.cc_has_debug: + self.dist_log("debug mode is detected, policy 'maxopt' is skipped.") + elif self.cc_noopt: + self.dist_log("optimization is disabled, policy 'maxopt' is skipped.") + else: + flags = self.cc_flags["opt"] + if not flags: + self.dist_log( + "current compiler doesn't support optimization flags, " + "policy 'maxopt' is skipped", stderr=True + ) + else: + extra_flags += flags + return has_baseline, final_targets, extra_flags + + def _parse_policy_werror(self, has_baseline, final_targets, extra_flags): + """force warnings to treated as errors""" + flags = self.cc_flags["werror"] + if not flags: + self.dist_log( + "current compiler doesn't support werror flags, " + "warnings will 'not' treated as errors", stderr=True + ) + else: + self.dist_log("compiler warnings are treated as errors") + extra_flags += flags + return has_baseline, final_targets, extra_flags + + def _parse_policy_autovec(self, has_baseline, final_targets, extra_flags): + """skip features that has no auto-vectorized support by compiler""" + skipped = [] + for tar in final_targets[:]: + if isinstance(tar, str): + can = self.feature_can_autovec(tar) + else: # multiple target + can = all([ + self.feature_can_autovec(t) + for t in tar + ]) + if not can: + final_targets.remove(tar) + skipped.append(tar) + + if skipped: + self.dist_log("skip non auto-vectorized features", skipped) + + return has_baseline, final_targets, extra_flags + +class CCompilerOpt(_Config, _Distutils, _Cache, _CCompiler, _Feature, _Parse): + """ + A helper class for `CCompiler` aims to provide extra build 
options + to effectively control of compiler optimizations that are directly + related to CPU features. + """ + def __init__(self, ccompiler, cpu_baseline="min", cpu_dispatch="max", cache_path=None): + _Config.__init__(self) + _Distutils.__init__(self, ccompiler) + _Cache.__init__(self, cache_path, self.dist_info(), cpu_baseline, cpu_dispatch) + _CCompiler.__init__(self) + _Feature.__init__(self) + if not self.cc_noopt and self.cc_has_native: + self.dist_log( + "native flag is specified through environment variables. " + "force cpu-baseline='native'" + ) + cpu_baseline = "native" + _Parse.__init__(self, cpu_baseline, cpu_dispatch) + # keep the requested features untouched, need it later for report + # and trace purposes + self._requested_baseline = cpu_baseline + self._requested_dispatch = cpu_dispatch + # key is the dispatch-able source and value is a tuple + # contains two items (has_baseline[boolean], dispatched-features[list]) + self.sources_status = getattr(self, "sources_status", {}) + # every instance should has a separate one + self.cache_private.add("sources_status") + # set it at the end to make sure the cache writing was done after init + # this class + self.hit_cache = hasattr(self, "hit_cache") + + def is_cached(self): + """ + Returns True if the class loaded from the cache file + """ + return self.cache_infile and self.hit_cache + + def cpu_baseline_flags(self): + """ + Returns a list of final CPU baseline compiler flags + """ + return self.parse_baseline_flags + + def cpu_baseline_names(self): + """ + return a list of final CPU baseline feature names + """ + return self.parse_baseline_names + + def cpu_dispatch_names(self): + """ + return a list of final CPU dispatch feature names + """ + return self.parse_dispatch_names + + def try_dispatch(self, sources, src_dir=None, **kwargs): + """ + Compile one or more dispatch-able sources and generates object files, + also generates abstract C config headers and macros that + used later for the final runtime 
dispatching process. + + The mechanism behind it is to takes each source file that specified + in 'sources' and branching it into several files depend on + special configuration statements that must be declared in the + top of each source which contains targeted CPU features, + then it compiles every branched source with the proper compiler flags. + + Parameters + ---------- + sources : list + Must be a list of dispatch-able sources file paths, + and configuration statements must be declared inside + each file. + + src_dir : str + Path of parent directory for the generated headers and wrapped sources. + If None(default) the files will generated in-place. + + **kwargs : any + Arguments to pass on to the `CCompiler.compile()` + + Returns + ------- + list : generated object files + + Raises + ------ + CompileError + Raises by `CCompiler.compile()` on compiling failure. + DistutilsError + Some errors during checking the sanity of configuration statements. + + See Also + -------- + parse_targets() : + Parsing the configuration statements of dispatch-able sources. 
+ """ + to_compile = {} + baseline_flags = self.cpu_baseline_flags() + include_dirs = kwargs.setdefault("include_dirs", []) + + for src in sources: + output_dir = os.path.dirname(src) + if src_dir and not output_dir.startswith(src_dir): + output_dir = os.path.join(src_dir, output_dir) + if output_dir not in include_dirs: + include_dirs.append(output_dir) + + has_baseline, targets, extra_flags = self.parse_targets(src) + nochange = self._generate_config(output_dir, src, targets, has_baseline) + for tar in targets: + tar_src = self._wrap_target(output_dir, src, tar, nochange=nochange) + flags = tuple(extra_flags + self.feature_flags(tar)) + to_compile.setdefault(flags, []).append(tar_src) + + if has_baseline: + flags = tuple(extra_flags + baseline_flags) + to_compile.setdefault(flags, []).append(src) + + self.sources_status[src] = (has_baseline, targets) + + # For these reasons, the sources are compiled in a separate loop: + # - Gathering all sources with the same flags to benefit from + # the parallel compiling as much as possible. + # - To generate all config headers of the dispatchable sources, + # before the compilation in case if there are dependency relationships + # among them. + objects = [] + for flags, srcs in to_compile.items(): + objects += self.dist_compile(srcs, list(flags), **kwargs) + return objects + + def generate_dispatch_header(self, header_path): + """ + Generate the dispatch header which containing all definitions + and headers of instruction-sets for the enabled CPU baseline and + dispatch-able features. + + Its highly recommended to take a look at the generated header + also the generated source files via `try_dispatch()` + in order to get the full picture. 
+ """ + self.dist_log("generate CPU dispatch header: (%s)" % header_path) + + baseline_names = self.cpu_baseline_names() + dispatch_names = self.cpu_dispatch_names() + baseline_len = len(baseline_names) + dispatch_len = len(dispatch_names) + + with open(header_path, 'w') as f: + baseline_calls = ' \\\n'.join([ + ( + "\t%sWITH_CPU_EXPAND_(MACRO_TO_CALL(%s, __VA_ARGS__))" + ) % (self.conf_c_prefix, f) + for f in baseline_names + ]) + dispatch_calls = ' \\\n'.join([ + ( + "\t%sWITH_CPU_EXPAND_(MACRO_TO_CALL(%s, __VA_ARGS__))" + ) % (self.conf_c_prefix, f) + for f in dispatch_names + ]) + f.write(textwrap.dedent("""\ + /* + * AUTOGENERATED DON'T EDIT + * Please make changes to the code generator (distutils/ccompiler_opt.py) + */ + #define {pfx}WITH_CPU_BASELINE "{baseline_str}" + #define {pfx}WITH_CPU_DISPATCH "{dispatch_str}" + #define {pfx}WITH_CPU_BASELINE_N {baseline_len} + #define {pfx}WITH_CPU_DISPATCH_N {dispatch_len} + #define {pfx}WITH_CPU_EXPAND_(X) X + #define {pfx}WITH_CPU_BASELINE_CALL(MACRO_TO_CALL, ...) \\ + {baseline_calls} + #define {pfx}WITH_CPU_DISPATCH_CALL(MACRO_TO_CALL, ...) 
def report(self, full=False):
    """
    Build a plain-text report about the CPU baseline and dispatch-able
    features.

    For each of both sections the report holds what was requested and
    what got enabled (plus the compiler flags for the baseline). The
    dispatch section ends with the targets collected in
    `self.sources_status`.

    Parameters
    ----------
    'full': bool, optional
        If False(default) or no target was recorded, the generated
        targets are summarized in one row as "NAME[number of sources]".
        Otherwise each target gets its implied features, flags, detect
        list and sources.

    Returns
    -------
    str, the report
    """
    def requested_row(attr):
        if self.cc_noopt:
            note = "(unsupported arch)" if self.cc_on_noarch else ""
            return ("Requested", "optimization disabled %s" % note)
        return ("Requested", repr(getattr(self, attr)))

    def joined(items):
        return ' '.join(items) if items else "none"

    def label(target):
        # multi-targets are shown between parentheses
        if isinstance(target, str):
            return target
        return '(%s)' % ' '.join(target)

    ########## baseline ##########
    baseline_rows = [requested_row("_requested_baseline")]
    baseline_rows.append(("Enabled", joined(self.cpu_baseline_names())))
    baseline_rows.append(("Flags", joined(self.cpu_baseline_flags())))

    ########## dispatch ##########
    dispatch_rows = [requested_row("_requested_dispatch")]
    dispatch_rows.append(("Enabled", joined(self.cpu_dispatch_names())))

    ########## Generated ##########
    # TODO:
    # - collect object names from 'try_dispatch()'
    #   then get size of each object and printed
    # - give more details about the features that not
    #   generated due compiler support
    # - find a better output's design.
    #
    target_sources = {}
    for source, (_, targets) in self.sources_status.items():
        for tar in targets:
            target_sources.setdefault(tar, []).append(source)
    ordered = self.feature_sorted(target_sources)

    if full and target_sources:
        dispatch_rows.append(("Generated", ''))
        for tar in ordered:
            flags = ' '.join(self.feature_flags(tar))
            implies = ' '.join(self.feature_sorted(self.feature_implies(tar)))
            detect = ' '.join(self.feature_detect(tar))
            dispatch_rows += [
                ('', ''),
                (label(tar), implies),
                ("Flags", flags),
                ("Detect", detect),
            ]
            dispatch_rows += [("", src) for src in target_sources[tar]]
    else:
        counts = [
            "%s[%d]" % (label(tar), len(target_sources[tar]))
            for tar in ordered
        ]
        dispatch_rows.append(("Generated", joined(counts)))

    ###############################
    # TODO: add support for 'markdown' format
    sections = [
        ("CPU baseline", baseline_rows),
        ("", ""),
        ("CPU dispatch", dispatch_rows),
    ]
    widths = [len(title) for title, _ in sections]
    widths += [len(col) for _, rows in sections for col, _ in rows]
    pad = max(widths)
    tab = ' ' * 2

    text = []
    for title, rows in sections:
        if not title:
            text.append("")  # empty line
            continue
        text.append(title.ljust(pad) + tab + ': ')
        for col, val in rows:
            text.append(tab + col.ljust(pad) + ': ' + val)

    return '\n'.join(text)
target, nochange=False): + assert(isinstance(target, (str, tuple))) + if isinstance(target, str): + ext_name = target_name = target + else: + # multi-target + ext_name = '.'.join(target) + target_name = '__'.join(target) + + wrap_path = os.path.join(output_dir, os.path.basename(dispatch_src)) + wrap_path = "{0}.{2}{1}".format(*os.path.splitext(wrap_path), ext_name.lower()) + if nochange and os.path.exists(wrap_path): + return wrap_path + + self.dist_log("wrap dispatch-able target -> ", wrap_path) + # sorting for readability + features = self.feature_sorted(self.feature_implies_c(target)) + target_join = "#define %sCPU_TARGET_" % self.conf_c_prefix_ + target_defs = [target_join + f for f in features] + target_defs = '\n'.join(target_defs) + + with open(wrap_path, "w") as fd: + fd.write(textwrap.dedent("""\ + /** + * AUTOGENERATED DON'T EDIT + * Please make changes to the code generator \ + (distutils/ccompiler_opt.py) + */ + #define {pfx}CPU_TARGET_MODE + #define {pfx}CPU_TARGET_CURRENT {target_name} + {target_defs} + #include "{path}" + """).format( + pfx=self.conf_c_prefix_, target_name=target_name, + path=os.path.abspath(dispatch_src), target_defs=target_defs + )) + return wrap_path + + def _generate_config(self, output_dir, dispatch_src, targets, has_baseline=False): + config_path = os.path.basename(dispatch_src).replace(".c", ".h") + config_path = os.path.join(output_dir, config_path) + # check if targets didn't change to avoid recompiling + cache_hash = self.cache_hash(targets, has_baseline) + try: + with open(config_path) as f: + last_hash = f.readline().split("cache_hash:") + if len(last_hash) == 2 and int(last_hash[1]) == cache_hash: + return True + except IOError: + pass + + self.dist_log("generate dispatched config -> ", config_path) + dispatch_calls = [] + for tar in targets: + if isinstance(tar, str): + target_name = tar + else: # multi target + target_name = '__'.join([t for t in tar]) + req_detect = self.feature_detect(tar) + req_detect = '&&'.join([ 
+ "CHK(%s)" % f for f in req_detect + ]) + dispatch_calls.append( + "\t%sCPU_DISPATCH_EXPAND_(CB((%s), %s, __VA_ARGS__))" % ( + self.conf_c_prefix_, req_detect, target_name + )) + dispatch_calls = ' \\\n'.join(dispatch_calls) + + if has_baseline: + baseline_calls = ( + "\t%sCPU_DISPATCH_EXPAND_(CB(__VA_ARGS__))" + ) % self.conf_c_prefix_ + else: + baseline_calls = '' + + with open(config_path, "w") as fd: + fd.write(textwrap.dedent("""\ + // cache_hash:{cache_hash} + /** + * AUTOGENERATED DON'T EDIT + * Please make changes to the code generator (distutils/ccompiler_opt.py) + */ + #ifndef {pfx}CPU_DISPATCH_EXPAND_ + #define {pfx}CPU_DISPATCH_EXPAND_(X) X + #endif + #undef {pfx}CPU_DISPATCH_BASELINE_CALL + #undef {pfx}CPU_DISPATCH_CALL + #define {pfx}CPU_DISPATCH_BASELINE_CALL(CB, ...) \\ + {baseline_calls} + #define {pfx}CPU_DISPATCH_CALL(CHK, CB, ...) \\ + {dispatch_calls} + """).format( + pfx=self.conf_c_prefix_, baseline_calls=baseline_calls, + dispatch_calls=dispatch_calls, cache_hash=cache_hash + )) + return False + +def new_ccompiler_opt(compiler, **kwargs): + """ + Create a new instance of 'CCompilerOpt' and generate the dispatch header + inside NumPy source dir. 
+ + Parameters + ---------- + 'compiler' : CCompiler instance + '**kwargs': passed as-is to `CCompilerOpt(...)` + + Returns + ------- + new instance of CCompilerOpt + """ + opt = CCompilerOpt(compiler, **kwargs) + npy_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + header_dir = os.path.join(npy_path, *("core/src/common".split("/"))) + header_path = os.path.join(header_dir, "_cpu_dispatch.h") + if not os.path.exists(header_path) or not opt.is_cached(): + if not os.path.exists(header_dir): + opt.dist_log( + "dispatch header dir '%s' isn't exist, creating it" % header_dir, + stderr=True + ) + os.makedirs(header_dir) + opt.generate_dispatch_header(header_path) + return opt diff --git a/numpy/distutils/checks/cpu_asimd.c b/numpy/distutils/checks/cpu_asimd.c new file mode 100644 index 000000000..8df556b6c --- /dev/null +++ b/numpy/distutils/checks/cpu_asimd.c @@ -0,0 +1,25 @@ +#ifdef _MSC_VER + #include <Intrin.h> +#endif +#include <arm_neon.h> + +int main(void) +{ + float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f); + /* MAXMIN */ + int ret = (int)vgetq_lane_f32(vmaxnmq_f32(v1, v2), 0); + ret += (int)vgetq_lane_f32(vminnmq_f32(v1, v2), 0); + /* ROUNDING */ + ret += (int)vgetq_lane_f32(vrndq_f32(v1), 0); +#ifdef __aarch64__ + { + float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0); + /* MAXMIN */ + ret += (int)vgetq_lane_f64(vmaxnmq_f64(vd1, vd2), 0); + ret += (int)vgetq_lane_f64(vminnmq_f64(vd1, vd2), 0); + /* ROUNDING */ + ret += (int)vgetq_lane_f64(vrndq_f64(vd1), 0); + } +#endif + return ret; +} diff --git a/numpy/distutils/checks/cpu_asimddp.c b/numpy/distutils/checks/cpu_asimddp.c new file mode 100644 index 000000000..0158d1354 --- /dev/null +++ b/numpy/distutils/checks/cpu_asimddp.c @@ -0,0 +1,15 @@ +#ifdef _MSC_VER + #include <Intrin.h> +#endif +#include <arm_neon.h> + +int main(void) +{ + uint8x16_t v1 = vdupq_n_u8((unsigned char)1), v2 = vdupq_n_u8((unsigned char)2); + uint32x4_t va = vdupq_n_u32(3); + int ret = 
(int)vgetq_lane_u32(vdotq_u32(va, v1, v2), 0); +#ifdef __aarch64__ + ret += (int)vgetq_lane_u32(vdotq_laneq_u32(va, v1, v2, 0), 0); +#endif + return ret; +} diff --git a/numpy/distutils/checks/cpu_asimdfhm.c b/numpy/distutils/checks/cpu_asimdfhm.c new file mode 100644 index 000000000..bb437aa40 --- /dev/null +++ b/numpy/distutils/checks/cpu_asimdfhm.c @@ -0,0 +1,17 @@ +#ifdef _MSC_VER + #include <Intrin.h> +#endif +#include <arm_neon.h> + +int main(void) +{ + float16x8_t vhp = vdupq_n_f16((float16_t)1); + float16x4_t vlhp = vdup_n_f16((float16_t)1); + float32x4_t vf = vdupq_n_f32(1.0f); + float32x2_t vlf = vdup_n_f32(1.0f); + + int ret = (int)vget_lane_f32(vfmlal_low_u32(vlf, vlhp, vlhp), 0); + ret += (int)vgetq_lane_f32(vfmlslq_high_u32(vf, vhp, vhp), 0); + + return ret; +} diff --git a/numpy/distutils/checks/cpu_asimdhp.c b/numpy/distutils/checks/cpu_asimdhp.c new file mode 100644 index 000000000..80b94000f --- /dev/null +++ b/numpy/distutils/checks/cpu_asimdhp.c @@ -0,0 +1,14 @@ +#ifdef _MSC_VER + #include <Intrin.h> +#endif +#include <arm_neon.h> + +int main(void) +{ + float16x8_t vhp = vdupq_n_f16((float16_t)-1); + float16x4_t vlhp = vdup_n_f16((float16_t)-1); + + int ret = (int)vgetq_lane_f16(vabdq_f16(vhp, vhp), 0); + ret += (int)vget_lane_f16(vabd_f16(vlhp, vlhp), 0); + return ret; +} diff --git a/numpy/distutils/checks/cpu_avx.c b/numpy/distutils/checks/cpu_avx.c new file mode 100644 index 000000000..737c0d2e9 --- /dev/null +++ b/numpy/distutils/checks/cpu_avx.c @@ -0,0 +1,7 @@ +#include <immintrin.h> + +int main(void) +{ + __m256 a = _mm256_add_ps(_mm256_setzero_ps(), _mm256_setzero_ps()); + return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a)); +} diff --git a/numpy/distutils/checks/cpu_avx2.c b/numpy/distutils/checks/cpu_avx2.c new file mode 100644 index 000000000..dfb11fd79 --- /dev/null +++ b/numpy/distutils/checks/cpu_avx2.c @@ -0,0 +1,7 @@ +#include <immintrin.h> + +int main(void) +{ + __m256i a = _mm256_abs_epi16(_mm256_setzero_si256()); + return 
_mm_cvtsi128_si32(_mm256_castsi256_si128(a)); +} diff --git a/numpy/distutils/checks/cpu_avx512_clx.c b/numpy/distutils/checks/cpu_avx512_clx.c new file mode 100644 index 000000000..71dad83a7 --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512_clx.c @@ -0,0 +1,8 @@ +#include <immintrin.h> + +int main(void) +{ + /* VNNI */ + __m512i a = _mm512_dpbusd_epi32(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512()); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); +} diff --git a/numpy/distutils/checks/cpu_avx512_cnl.c b/numpy/distutils/checks/cpu_avx512_cnl.c new file mode 100644 index 000000000..dfab4436d --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512_cnl.c @@ -0,0 +1,10 @@ +#include <immintrin.h> + +int main(void) +{ + /* IFMA */ + __m512i a = _mm512_madd52hi_epu64(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512()); + /* VBMI */ + a = _mm512_permutex2var_epi8(a, _mm512_setzero_si512(), _mm512_setzero_si512()); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); +} diff --git a/numpy/distutils/checks/cpu_avx512_icl.c b/numpy/distutils/checks/cpu_avx512_icl.c new file mode 100644 index 000000000..cf2706b3b --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512_icl.c @@ -0,0 +1,12 @@ +#include <immintrin.h> + +int main(void) +{ + /* VBMI2 */ + __m512i a = _mm512_shrdv_epi64(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512()); + /* BITALG */ + a = _mm512_popcnt_epi8(a); + /* VPOPCNTDQ */ + a = _mm512_popcnt_epi64(a); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); +} diff --git a/numpy/distutils/checks/cpu_avx512_knl.c b/numpy/distutils/checks/cpu_avx512_knl.c new file mode 100644 index 000000000..0699f37a6 --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512_knl.c @@ -0,0 +1,11 @@ +#include <immintrin.h> + +int main(void) +{ + int base[128]; + /* ER */ + __m512i a = _mm512_castpd_si512(_mm512_exp2a23_pd(_mm512_setzero_pd())); + /* PF */ + _mm512_mask_prefetch_i64scatter_pd(base, 
_mm512_cmpeq_epi64_mask(a, a), a, 1, _MM_HINT_T1); + return base[0]; +} diff --git a/numpy/distutils/checks/cpu_avx512_knm.c b/numpy/distutils/checks/cpu_avx512_knm.c new file mode 100644 index 000000000..db61b4bfa --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512_knm.c @@ -0,0 +1,17 @@ +#include <immintrin.h> + +int main(void) +{ + __m512i a = _mm512_setzero_si512(); + __m512 b = _mm512_setzero_ps(); + + /* 4FMAPS */ + b = _mm512_4fmadd_ps(b, b, b, b, b, NULL); + /* 4VNNIW */ + a = _mm512_4dpwssd_epi32(a, a, a, a, a, NULL); + /* VPOPCNTDQ */ + a = _mm512_popcnt_epi64(a); + + a = _mm512_add_epi32(a, _mm512_castps_si512(b)); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); +} diff --git a/numpy/distutils/checks/cpu_avx512_skx.c b/numpy/distutils/checks/cpu_avx512_skx.c new file mode 100644 index 000000000..1d5e15b5e --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512_skx.c @@ -0,0 +1,12 @@ +#include <immintrin.h> + +int main(void) +{ + /* VL */ + __m256i a = _mm256_abs_epi64(_mm256_setzero_si256()); + /* DQ */ + __m512i b = _mm512_broadcast_i32x8(a); + /* BW */ + b = _mm512_abs_epi16(b); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(b)); +} diff --git a/numpy/distutils/checks/cpu_avx512cd.c b/numpy/distutils/checks/cpu_avx512cd.c new file mode 100644 index 000000000..61bef6b82 --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512cd.c @@ -0,0 +1,7 @@ +#include <immintrin.h> + +int main(void) +{ + __m512i a = _mm512_lzcnt_epi32(_mm512_setzero_si512()); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); +} diff --git a/numpy/distutils/checks/cpu_avx512f.c b/numpy/distutils/checks/cpu_avx512f.c new file mode 100644 index 000000000..f60cc09dd --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512f.c @@ -0,0 +1,7 @@ +#include <immintrin.h> + +int main(void) +{ + __m512i a = _mm512_abs_epi32(_mm512_setzero_si512()); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); +} diff --git a/numpy/distutils/checks/cpu_f16c.c b/numpy/distutils/checks/cpu_f16c.c 
new file mode 100644 index 000000000..a5a343e2d --- /dev/null +++ b/numpy/distutils/checks/cpu_f16c.c @@ -0,0 +1,9 @@ +#include <emmintrin.h> +#include <immintrin.h> + +int main(void) +{ + __m128 a = _mm_cvtph_ps(_mm_setzero_si128()); + __m256 a8 = _mm256_cvtph_ps(_mm_setzero_si128()); + return (int)(_mm_cvtss_f32(a) + _mm_cvtss_f32(_mm256_castps256_ps128(a8))); +} diff --git a/numpy/distutils/checks/cpu_fma3.c b/numpy/distutils/checks/cpu_fma3.c new file mode 100644 index 000000000..cf34c6cb1 --- /dev/null +++ b/numpy/distutils/checks/cpu_fma3.c @@ -0,0 +1,8 @@ +#include <xmmintrin.h> +#include <immintrin.h> + +int main(void) +{ + __m256 a = _mm256_fmadd_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps()); + return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a)); +} diff --git a/numpy/distutils/checks/cpu_fma4.c b/numpy/distutils/checks/cpu_fma4.c new file mode 100644 index 000000000..1ad717033 --- /dev/null +++ b/numpy/distutils/checks/cpu_fma4.c @@ -0,0 +1,12 @@ +#include <immintrin.h> +#ifdef _MSC_VER + #include <ammintrin.h> +#else + #include <x86intrin.h> +#endif + +int main(void) +{ + __m256 a = _mm256_macc_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps()); + return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a)); +} diff --git a/numpy/distutils/checks/cpu_neon.c b/numpy/distutils/checks/cpu_neon.c new file mode 100644 index 000000000..4eab1f384 --- /dev/null +++ b/numpy/distutils/checks/cpu_neon.c @@ -0,0 +1,15 @@ +#ifdef _MSC_VER + #include <Intrin.h> +#endif +#include <arm_neon.h> + +int main(void) +{ + float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f); + int ret = (int)vgetq_lane_f32(vmulq_f32(v1, v2), 0); +#ifdef __aarch64__ + float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0); + ret += (int)vgetq_lane_f64(vmulq_f64(vd1, vd2), 0); +#endif + return ret; +} diff --git a/numpy/distutils/checks/cpu_neon_fp16.c b/numpy/distutils/checks/cpu_neon_fp16.c new file mode 100644 index 000000000..745d2e793 --- /dev/null +++ 
b/numpy/distutils/checks/cpu_neon_fp16.c @@ -0,0 +1,11 @@ +#ifdef _MSC_VER + #include <Intrin.h> +#endif +#include <arm_neon.h> + +int main(void) +{ + short z4[] = {0, 0, 0, 0, 0, 0, 0, 0}; + float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16((const short*)z4)); + return (int)vgetq_lane_f32(v_z4, 0); +} diff --git a/numpy/distutils/checks/cpu_neon_vfpv4.c b/numpy/distutils/checks/cpu_neon_vfpv4.c new file mode 100644 index 000000000..45f7b5d69 --- /dev/null +++ b/numpy/distutils/checks/cpu_neon_vfpv4.c @@ -0,0 +1,19 @@ +#ifdef _MSC_VER + #include <Intrin.h> +#endif +#include <arm_neon.h> + +int main(void) +{ + float32x4_t v1 = vdupq_n_f32(1.0f); + float32x4_t v2 = vdupq_n_f32(2.0f); + float32x4_t v3 = vdupq_n_f32(3.0f); + int ret = (int)vgetq_lane_f32(vfmaq_f32(v1, v2, v3), 0); +#ifdef __aarch64__ + float64x2_t vd1 = vdupq_n_f64(1.0); + float64x2_t vd2 = vdupq_n_f64(2.0); + float64x2_t vd3 = vdupq_n_f64(3.0); + ret += (int)vgetq_lane_f64(vfmaq_f64(vd1, vd2, vd3), 0); +#endif + return ret; +} diff --git a/numpy/distutils/checks/cpu_popcnt.c b/numpy/distutils/checks/cpu_popcnt.c new file mode 100644 index 000000000..e6a80fb40 --- /dev/null +++ b/numpy/distutils/checks/cpu_popcnt.c @@ -0,0 +1,23 @@ +#ifdef _MSC_VER + #include <nmmintrin.h> +#else + #include <popcntintrin.h> +#endif + +int main(void) +{ + long long a = 0; + int b; +#ifdef _MSC_VER + #ifdef _M_X64 + a = _mm_popcnt_u64(1); + #endif + b = _mm_popcnt_u32(1); +#else + #ifdef __x86_64__ + a = __builtin_popcountll(1); + #endif + b = __builtin_popcount(1); +#endif + return (int)a + b; +} diff --git a/numpy/distutils/checks/cpu_sse.c b/numpy/distutils/checks/cpu_sse.c new file mode 100644 index 000000000..bb98bf63c --- /dev/null +++ b/numpy/distutils/checks/cpu_sse.c @@ -0,0 +1,7 @@ +#include <xmmintrin.h> + +int main(void) +{ + __m128 a = _mm_add_ps(_mm_setzero_ps(), _mm_setzero_ps()); + return (int)_mm_cvtss_f32(a); +} diff --git a/numpy/distutils/checks/cpu_sse2.c b/numpy/distutils/checks/cpu_sse2.c new 
file mode 100644 index 000000000..658afc9b4 --- /dev/null +++ b/numpy/distutils/checks/cpu_sse2.c @@ -0,0 +1,7 @@ +#include <emmintrin.h> + +int main(void) +{ + __m128i a = _mm_add_epi16(_mm_setzero_si128(), _mm_setzero_si128()); + return _mm_cvtsi128_si32(a); +} diff --git a/numpy/distutils/checks/cpu_sse3.c b/numpy/distutils/checks/cpu_sse3.c new file mode 100644 index 000000000..aece1e601 --- /dev/null +++ b/numpy/distutils/checks/cpu_sse3.c @@ -0,0 +1,7 @@ +#include <pmmintrin.h> + +int main(void) +{ + __m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps()); + return (int)_mm_cvtss_f32(a); +} diff --git a/numpy/distutils/checks/cpu_sse41.c b/numpy/distutils/checks/cpu_sse41.c new file mode 100644 index 000000000..bfdb9feac --- /dev/null +++ b/numpy/distutils/checks/cpu_sse41.c @@ -0,0 +1,7 @@ +#include <smmintrin.h> + +int main(void) +{ + __m128 a = _mm_floor_ps(_mm_setzero_ps()); + return (int)_mm_cvtss_f32(a); +} diff --git a/numpy/distutils/checks/cpu_sse42.c b/numpy/distutils/checks/cpu_sse42.c new file mode 100644 index 000000000..24f5d93fe --- /dev/null +++ b/numpy/distutils/checks/cpu_sse42.c @@ -0,0 +1,7 @@ +#include <smmintrin.h> + +int main(void) +{ + __m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps()); + return (int)_mm_cvtss_f32(a); +} diff --git a/numpy/distutils/checks/cpu_ssse3.c b/numpy/distutils/checks/cpu_ssse3.c new file mode 100644 index 000000000..ad0abc1e6 --- /dev/null +++ b/numpy/distutils/checks/cpu_ssse3.c @@ -0,0 +1,7 @@ +#include <tmmintrin.h> + +int main(void) +{ + __m128i a = _mm_hadd_epi16(_mm_setzero_si128(), _mm_setzero_si128()); + return (int)_mm_cvtsi128_si32(a); +} diff --git a/numpy/distutils/checks/cpu_vsx.c b/numpy/distutils/checks/cpu_vsx.c new file mode 100644 index 000000000..0b3f30d6a --- /dev/null +++ b/numpy/distutils/checks/cpu_vsx.c @@ -0,0 +1,21 @@ +#ifndef __VSX__ + #error "VSX is not supported" +#endif +#include <altivec.h> + +#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && 
!defined(__IBMC__)) + #define vsx_ld vec_vsx_ld + #define vsx_st vec_vsx_st +#else + #define vsx_ld vec_xl + #define vsx_st vec_xst +#endif + +int main(void) +{ + unsigned int zout[4]; + unsigned int z4[] = {0, 0, 0, 0}; + __vector unsigned int v_z4 = vsx_ld(0, z4); + vsx_st(v_z4, 0, zout); + return zout[0]; +} diff --git a/numpy/distutils/checks/cpu_vsx2.c b/numpy/distutils/checks/cpu_vsx2.c new file mode 100644 index 000000000..410fb29d6 --- /dev/null +++ b/numpy/distutils/checks/cpu_vsx2.c @@ -0,0 +1,13 @@ +#ifndef __VSX__ + #error "VSX is not supported" +#endif +#include <altivec.h> + +typedef __vector unsigned long long v_uint64x2; + +int main(void) +{ + v_uint64x2 z2 = (v_uint64x2){0, 0}; + z2 = (v_uint64x2)vec_cmpeq(z2, z2); + return (int)vec_extract(z2, 0); +} diff --git a/numpy/distutils/checks/cpu_vsx3.c b/numpy/distutils/checks/cpu_vsx3.c new file mode 100644 index 000000000..857526535 --- /dev/null +++ b/numpy/distutils/checks/cpu_vsx3.c @@ -0,0 +1,13 @@ +#ifndef __VSX__ + #error "VSX is not supported" +#endif +#include <altivec.h> + +typedef __vector unsigned int v_uint32x4; + +int main(void) +{ + v_uint32x4 z4 = (v_uint32x4){0, 0, 0, 0}; + z4 = vec_absd(z4, z4); + return (int)vec_extract(z4, 0); +} diff --git a/numpy/distutils/checks/cpu_xop.c b/numpy/distutils/checks/cpu_xop.c new file mode 100644 index 000000000..51d70cf2b --- /dev/null +++ b/numpy/distutils/checks/cpu_xop.c @@ -0,0 +1,12 @@ +#include <immintrin.h> +#ifdef _MSC_VER + #include <ammintrin.h> +#else + #include <x86intrin.h> +#endif + +int main(void) +{ + __m128i a = _mm_comge_epu32(_mm_setzero_si128(), _mm_setzero_si128()); + return _mm_cvtsi128_si32(a); +} diff --git a/numpy/distutils/checks/test_flags.c b/numpy/distutils/checks/test_flags.c new file mode 100644 index 000000000..4cd09d42a --- /dev/null +++ b/numpy/distutils/checks/test_flags.c @@ -0,0 +1 @@ +int test_flags; diff --git a/numpy/distutils/command/build.py b/numpy/distutils/command/build.py index a156a7c6e..60ba4c917 
100644 --- a/numpy/distutils/command/build.py +++ b/numpy/distutils/command/build.py @@ -16,6 +16,12 @@ class build(old_build): "specify the Fortran compiler type"), ('warn-error', None, "turn all warnings into errors (-Werror)"), + ('cpu-baseline=', None, + "specify a list of enabled baseline CPU optimizations"), + ('cpu-dispatch=', None, + "specify a list of dispatched CPU optimizations"), + ('disable-optimization', None, + "disable CPU optimized code(dispatch,simd,fast...)"), ] help_options = old_build.help_options + [ @@ -27,6 +33,9 @@ class build(old_build): old_build.initialize_options(self) self.fcompiler = None self.warn_error = False + self.cpu_baseline = "min" + self.cpu_dispatch = "max -xop -fma4" # drop AMD legacy features by default + self.disable_optimization = False def finalize_options(self): build_scripts = self.build_scripts diff --git a/numpy/distutils/command/build_clib.py b/numpy/distutils/command/build_clib.py index f6a84e351..87345adbc 100644 --- a/numpy/distutils/command/build_clib.py +++ b/numpy/distutils/command/build_clib.py @@ -13,6 +13,7 @@ from numpy.distutils.misc_util import ( filter_sources, get_lib_source_files, get_numpy_include_dirs, has_cxx_sources, has_f_sources, is_sequence ) +from numpy.distutils.ccompiler_opt import new_ccompiler_opt # Fix Python distutils bug sf #1718574: _l = old_build_clib.user_options @@ -34,9 +35,16 @@ class build_clib(old_build_clib): "number of parallel jobs"), ('warn-error', None, "turn all warnings into errors (-Werror)"), + ('cpu-baseline=', None, + "specify a list of enabled baseline CPU optimizations"), + ('cpu-dispatch=', None, + "specify a list of dispatched CPU optimizations"), + ('disable-optimization', None, + "disable CPU optimized code(dispatch,simd,fast...)"), ] - boolean_options = old_build_clib.boolean_options + ['inplace', 'warn-error'] + boolean_options = old_build_clib.boolean_options + \ + ['inplace', 'warn-error', 'disable-optimization'] def initialize_options(self): 
old_build_clib.initialize_options(self) @@ -44,6 +52,10 @@ class build_clib(old_build_clib): self.inplace = 0 self.parallel = None self.warn_error = None + self.cpu_baseline = None + self.cpu_dispatch = None + self.disable_optimization = None + def finalize_options(self): if self.parallel: @@ -55,6 +67,9 @@ class build_clib(old_build_clib): self.set_undefined_options('build', ('parallel', 'parallel'), ('warn_error', 'warn_error'), + ('cpu_baseline', 'cpu_baseline'), + ('cpu_dispatch', 'cpu_dispatch'), + ('disable_optimization', 'disable_optimization') ) def have_f_sources(self): @@ -102,6 +117,25 @@ class build_clib(old_build_clib): self.compiler.show_customization() + if not self.disable_optimization: + opt_cache_path = os.path.abspath( + os.path.join(self.build_temp, 'ccompiler_opt_cache_clib.py' + )) + self.compiler_opt = new_ccompiler_opt( + compiler=self.compiler, cpu_baseline=self.cpu_baseline, + cpu_dispatch=self.cpu_dispatch, cache_path=opt_cache_path + ) + if not self.compiler_opt.is_cached(): + log.info("Detected changes on compiler optimizations, force rebuilding") + self.force = True + + import atexit + def report(): + log.info("\n########### CLIB COMPILER OPTIMIZATION ###########") + log.info(self.compiler_opt.report(full=True)) + + atexit.register(report) + if self.have_f_sources(): from numpy.distutils.fcompiler import new_fcompiler self._f_compiler = new_fcompiler(compiler=self.fcompiler, @@ -211,6 +245,8 @@ class build_clib(old_build_clib): 'extra_f90_compile_args') or [] macros = build_info.get('macros') + if macros is None: + macros = [] include_dirs = build_info.get('include_dirs') if include_dirs is None: include_dirs = [] @@ -223,6 +259,31 @@ class build_clib(old_build_clib): if requiref90: self.mkpath(module_build_dir) + dispatch_objects = [] + if not self.disable_optimization: + dispatch_sources = [ + c_sources.pop(c_sources.index(src)) + for src in c_sources[:] if src.endswith(".dispatch.c") + ] + if dispatch_sources: + if not self.inplace: 
+ build_src = self.get_finalized_command("build_src").build_src + else: + build_src = None + dispatch_objects = self.compiler_opt.try_dispatch( + dispatch_sources, + output_dir=self.build_temp, + src_dir=build_src, + macros=macros, + include_dirs=include_dirs, + debug=self.debug, + extra_postargs=extra_postargs + ) + extra_args_baseopt = extra_postargs + self.compiler_opt.cpu_baseline_flags() + else: + extra_args_baseopt = extra_postargs + macros.append(("NPY_DISABLE_OPTIMIZATION", 1)) + if compiler.compiler_type == 'msvc': # this hack works around the msvc compiler attributes # problem, msvc uses its own convention :( @@ -237,7 +298,8 @@ class build_clib(old_build_clib): macros=macros, include_dirs=include_dirs, debug=self.debug, - extra_postargs=extra_postargs) + extra_postargs=extra_args_baseopt) + objects.extend(dispatch_objects) if cxx_sources: log.info("compiling C++ sources") diff --git a/numpy/distutils/command/build_ext.py b/numpy/distutils/command/build_ext.py index d53285c92..b6557fcf6 100644 --- a/numpy/distutils/command/build_ext.py +++ b/numpy/distutils/command/build_ext.py @@ -19,7 +19,7 @@ from numpy.distutils.misc_util import ( has_cxx_sources, has_f_sources, is_sequence ) from numpy.distutils.command.config_compiler import show_fortran_compilers - +from numpy.distutils.ccompiler_opt import new_ccompiler_opt class build_ext (old_build_ext): @@ -33,6 +33,12 @@ class build_ext (old_build_ext): "number of parallel jobs"), ('warn-error', None, "turn all warnings into errors (-Werror)"), + ('cpu-baseline=', None, + "specify a list of enabled baseline CPU optimizations"), + ('cpu-dispatch=', None, + "specify a list of dispatched CPU optimizations"), + ('disable-optimization', None, + "disable CPU optimized code(dispatch,simd,fast...)"), ] help_options = old_build_ext.help_options + [ @@ -40,13 +46,16 @@ class build_ext (old_build_ext): show_fortran_compilers), ] - boolean_options = old_build_ext.boolean_options + ['warn-error'] + boolean_options = 
old_build_ext.boolean_options + ['warn-error', 'disable-optimization'] def initialize_options(self): old_build_ext.initialize_options(self) self.fcompiler = None self.parallel = None self.warn_error = None + self.cpu_baseline = None + self.cpu_dispatch = None + self.disable_optimization = None def finalize_options(self): if self.parallel: @@ -75,6 +84,9 @@ class build_ext (old_build_ext): self.set_undefined_options('build', ('parallel', 'parallel'), ('warn_error', 'warn_error'), + ('cpu_baseline', 'cpu_baseline'), + ('cpu_dispatch', 'cpu_dispatch'), + ('disable_optimization', 'disable_optimization'), ) def run(self): @@ -129,6 +141,22 @@ class build_ext (old_build_ext): self.compiler.show_customization() + if not self.disable_optimization: + opt_cache_path = os.path.abspath(os.path.join(self.build_temp, 'ccompiler_opt_cache_ext.py')) + self.compiler_opt = new_ccompiler_opt(compiler=self.compiler, + cpu_baseline=self.cpu_baseline, + cpu_dispatch=self.cpu_dispatch, + cache_path=opt_cache_path) + if not self.compiler_opt.is_cached(): + log.info("Detected changes on compiler optimizations, force rebuilding") + self.force = True + + import atexit + def report(): + log.info("\n########### EXT COMPILER OPTIMIZATION ###########") + log.info(self.compiler_opt.report(full=True)) + atexit.register(report) + # Setup directory for storing generated extra DLL files on Windows self.extra_dll_dir = os.path.join(self.build_temp, '.libs') if not os.path.isdir(self.extra_dll_dir): @@ -378,6 +406,32 @@ class build_ext (old_build_ext): include_dirs = ext.include_dirs + get_numpy_include_dirs() + dispatch_objects = [] + if not self.disable_optimization: + dispatch_sources = [ + c_sources.pop(c_sources.index(src)) + for src in c_sources[:] if src.endswith(".dispatch.c") + ] + if dispatch_sources: + if not self.inplace: + build_src = self.get_finalized_command("build_src").build_src + else: + build_src = None + dispatch_objects = self.compiler_opt.try_dispatch( + dispatch_sources, + 
output_dir=output_dir, + src_dir=build_src, + macros=macros, + include_dirs=include_dirs, + debug=self.debug, + extra_postargs=extra_args, + **kws + ) + extra_args_baseopt = extra_args + self.compiler_opt.cpu_baseline_flags() + else: + extra_args_baseopt = extra_args + macros.append(("NPY_DISABLE_OPTIMIZATION", 1)) + c_objects = [] if c_sources: log.info("compiling C sources") @@ -386,8 +440,9 @@ class build_ext (old_build_ext): macros=macros, include_dirs=include_dirs, debug=self.debug, - extra_postargs=extra_args, + extra_postargs=extra_args_baseopt, **kws) + c_objects.extend(dispatch_objects) if cxx_sources: log.info("compiling C++ sources") diff --git a/numpy/distutils/setup.py b/numpy/distutils/setup.py index 88cd1a160..798c3686f 100644 --- a/numpy/distutils/setup.py +++ b/numpy/distutils/setup.py @@ -7,6 +7,7 @@ def configuration(parent_package='',top_path=None): config.add_subpackage('tests') config.add_data_files('site.cfg') config.add_data_files('mingw/gfortran_vs2003_hack.c') + config.add_data_dir('checks') config.make_config_py() return config diff --git a/numpy/distutils/tests/test_ccompiler_opt.py b/numpy/distutils/tests/test_ccompiler_opt.py new file mode 100644 index 000000000..a789be1ea --- /dev/null +++ b/numpy/distutils/tests/test_ccompiler_opt.py @@ -0,0 +1,787 @@ +import re, textwrap, os +from os import sys, path +from distutils.errors import DistutilsError + +is_standalone = __name__ == '__main__' and __package__ is None +if is_standalone: + import unittest, contextlib, tempfile, shutil + sys.path.append(path.abspath(path.join(path.dirname(__file__), ".."))) + from ccompiler_opt import CCompilerOpt + + # from numpy/testing/_private/utils.py + @contextlib.contextmanager + def tempdir(*args, **kwargs): + tmpdir = tempfile.mkdtemp(*args, **kwargs) + try: + yield tmpdir + finally: + shutil.rmtree(tmpdir) + + def assert_(expr, msg=''): + if not expr: + raise AssertionError(msg) +else: + from numpy.distutils.ccompiler_opt import CCompilerOpt + from 
numpy.testing import assert_, tempdir + +# architectures and compilers to test +arch_compilers = dict( + x86 = ("gcc", "clang", "icc", "iccw", "msvc"), + x64 = ("gcc", "clang", "icc", "iccw", "msvc"), + ppc64 = ("gcc", "clang"), + ppc64le = ("gcc", "clang"), + armhf = ("gcc", "clang"), + aarch64 = ("gcc", "clang"), + noarch = ("gcc",) +) + +class FakeCCompilerOpt(CCompilerOpt): + fake_info = "" + def __init__(self, trap_files="", trap_flags="", *args, **kwargs): + self.fake_trap_files = trap_files + self.fake_trap_flags = trap_flags + CCompilerOpt.__init__(self, None, **kwargs) + + def __repr__(self): + return textwrap.dedent("""\ + <<<< + march : {} + compiler : {} + ---------------- + {} + >>>> + """).format(self.cc_march, self.cc_name, self.report()) + + def dist_compile(self, sources, flags, **kwargs): + assert(isinstance(sources, list)) + assert(isinstance(flags, list)) + if self.fake_trap_files: + for src in sources: + if re.match(self.fake_trap_files, src): + self.dist_error("source is trapped by a fake interface") + if self.fake_trap_flags: + for f in flags: + if re.match(self.fake_trap_flags, f): + self.dist_error("flag is trapped by a fake interface") + # fake objects + return zip(sources, [' '.join(flags)] * len(sources)) + + def dist_info(self): + return FakeCCompilerOpt.fake_info + + @staticmethod + def dist_log(*args, stderr=False): + pass + +class _Test_CCompilerOpt(object): + arch = None # x86_64 + cc = None # gcc + + def setup(self): + FakeCCompilerOpt.conf_nocache = True + self._opt = None + + def nopt(self, *args, **kwargs): + FakeCCompilerOpt.fake_info = self.arch + '_' + self.cc + return FakeCCompilerOpt(*args, **kwargs) + + def opt(self): + if not self._opt: + self._opt = self.nopt() + return self._opt + + def march(self): + return self.opt().cc_march + + def cc_name(self): + return self.opt().cc_name + + def get_targets(self, targets, groups, **kwargs): + FakeCCompilerOpt.conf_target_groups = groups + opt = self.nopt( + 
cpu_baseline=kwargs.get("baseline", "min"), + cpu_dispatch=kwargs.get("dispatch", "max"), + trap_files=kwargs.get("trap_files", ""), + trap_flags=kwargs.get("trap_flags", "") + ) + with tempdir() as tmpdir: + file = os.path.join(tmpdir, "test_targets.c") + with open(file, 'w') as f: + f.write(targets) + gtargets = [] + gflags = {} + fake_objects = opt.try_dispatch([file]) + for source, flags in fake_objects: + gtar = source.split('.')[1:-1] + glen = len(gtar) + if glen == 0: + gtar = "baseline" + elif glen == 1: + gtar = gtar[0].upper() + else: + # converting multi-target into parentheses str format to be equivalent + # to the configuration statements syntax. + gtar = ('('+' '.join(gtar)+')').upper() + gtargets.append(gtar) + gflags[gtar] = flags + + has_baseline, targets = opt.sources_status[file] + targets = targets + ["baseline"] if has_baseline else targets + # convert tuple that represents multi-target into parentheses str format + targets = [ + '('+' '.join(tar)+')' if isinstance(tar, tuple) else tar + for tar in targets + ] + if len(targets) != len(gtargets) or not all(t in gtargets for t in targets): + raise AssertionError( + "'sources_status' returns different targets than the compiled targets\n" + "%s != %s" % (targets, gtargets) + ) + # return targets from 'sources_status' since the order matters + return targets, gflags + + def arg_regex(self, **kwargs): + map2origin = dict( + x64 = "x86", + ppc64le = "ppc64", + aarch64 = "armhf", + clang = "gcc", + ) + march = self.march(); cc_name = self.cc_name() + map_march = map2origin.get(march, march) + map_cc = map2origin.get(cc_name, cc_name) + for key in ( + march, cc_name, map_march, map_cc, + march + '_' + cc_name, + map_march + '_' + cc_name, + march + '_' + map_cc, + map_march + '_' + map_cc, + ) : + regex = kwargs.pop(key, None) + if regex is not None: + break + if regex: + if isinstance(regex, dict): + for k, v in regex.items(): + if v[-1:] not in ')}$?\\.+*': + regex[k] = v + '$' + else: + 
assert(isinstance(regex, str)) + if regex[-1:] not in ')}$?\\.+*': + regex += '$' + return regex + + def expect(self, dispatch, baseline="", **kwargs): + match = self.arg_regex(**kwargs) + if match is None: + return + opt = self.nopt( + cpu_baseline=baseline, cpu_dispatch=dispatch, + trap_files=kwargs.get("trap_files", ""), + trap_flags=kwargs.get("trap_flags", "") + ) + features = ' '.join(opt.cpu_dispatch_names()) + if not match: + if len(features) != 0: + raise AssertionError( + 'expected empty features, not "%s"' % features + ) + return + if not re.match(match, features, re.IGNORECASE): + raise AssertionError( + 'dispatch features "%s" not match "%s"' % (features, match) + ) + + def expect_baseline(self, baseline, dispatch="", **kwargs): + match = self.arg_regex(**kwargs) + if match is None: + return + opt = self.nopt( + cpu_baseline=baseline, cpu_dispatch=dispatch, + trap_files=kwargs.get("trap_files", ""), + trap_flags=kwargs.get("trap_flags", "") + ) + features = ' '.join(opt.cpu_baseline_names()) + if not match: + if len(features) != 0: + raise AssertionError( + 'expected empty features, not "%s"' % features + ) + return + if not re.match(match, features, re.IGNORECASE): + raise AssertionError( + 'baseline features "%s" not match "%s"' % (features, match) + ) + + def expect_flags(self, baseline, dispatch="", **kwargs): + match = self.arg_regex(**kwargs) + if match is None: + return + opt = self.nopt( + cpu_baseline=baseline, cpu_dispatch=dispatch, + trap_files=kwargs.get("trap_files", ""), + trap_flags=kwargs.get("trap_flags", "") + ) + flags = ' '.join(opt.cpu_baseline_flags()) + if not match: + if len(flags) != 0: + raise AssertionError( + 'expected empty flags not "%s"' % flags + ) + return + if not re.match(match, flags): + raise AssertionError( + 'flags "%s" not match "%s"' % (flags, match) + ) + + def expect_targets(self, targets, groups={}, **kwargs): + match = self.arg_regex(**kwargs) + if match is None: + return + targets, _ = 
self.get_targets(targets=targets, groups=groups, **kwargs) + targets = ' '.join(targets) + if not match: + if len(targets) != 0: + raise AssertionError( + 'expected empty targets, not "%s"' % targets + ) + return + if not re.match(match, targets, re.IGNORECASE): + raise AssertionError( + 'targets "%s" not match "%s"' % (targets, match) + ) + + def expect_target_flags(self, targets, groups={}, **kwargs): + match_dict = self.arg_regex(**kwargs) + if match_dict is None: + return + assert(isinstance(match_dict, dict)) + _, tar_flags = self.get_targets(targets=targets, groups=groups) + + for match_tar, match_flags in match_dict.items(): + if match_tar not in tar_flags: + raise AssertionError( + 'expected to find target "%s"' % match_tar + ) + flags = tar_flags[match_tar] + if not match_flags: + if len(flags) != 0: + raise AssertionError( + 'expected to find empty flags in target "%s"' % match_tar + ) + if not re.match(match_flags, flags): + raise AssertionError( + '"%s" flags "%s" not match "%s"' % (match_tar, flags, match_flags) + ) + + def test_interface(self): + wrong_arch = "ppc64" if self.arch != "ppc64" else "x86" + wrong_cc = "clang" if self.cc != "clang" else "icc" + opt = self.opt() + assert_(getattr(opt, "cc_on_" + self.arch)) + assert_(not getattr(opt, "cc_on_" + wrong_arch)) + assert_(getattr(opt, "cc_is_" + self.cc)) + assert_(not getattr(opt, "cc_is_" + wrong_cc)) + + def test_args_empty(self): + for baseline, dispatch in ( + ("", "none"), + (None, ""), + ("none +none", "none - none"), + ("none -max", "min - max"), + ("+vsx2 -VSX2", "vsx avx2 avx512f -max"), + ("max -vsx - avx + avx512f neon -MAX ", + "min -min + max -max -vsx + avx2 -avx2 +NONE") + ) : + opt = self.nopt(cpu_baseline=baseline, cpu_dispatch=dispatch) + assert(len(opt.cpu_baseline_names()) == 0) + assert(len(opt.cpu_dispatch_names()) == 0) + + def test_args_validation(self): + if self.march() == "unknown": + return + # check sanity of argument's validation + for baseline, dispatch in ( + 
("unkown_feature - max +min", "unknown max min"), # unknowing features + ("#avx2", "$vsx") # groups and polices aren't acceptable + ) : + try: + self.nopt(cpu_baseline=baseline, cpu_dispatch=dispatch) + raise AssertionError("excepted an exception for invalid arguments") + except DistutilsError: + pass + + def test_skip(self): + # only takes what platform supports and skip the others + # without casing exceptions + self.expect( + "sse vsx neon", + x86="sse", ppc64="vsx", armhf="neon", unknown="" + ) + self.expect( + "sse41 avx avx2 vsx2 vsx3 neon_vfpv4 asimd", + x86 = "sse41 avx avx2", + ppc64 = "vsx2 vsx3", + armhf = "neon_vfpv4 asimd", + unknown = "" + ) + # any features in cpu_dispatch must be ignored if it's part of baseline + self.expect( + "sse neon vsx", baseline="sse neon vsx", + x86="", ppc64="", armhf="" + ) + self.expect( + "avx2 vsx3 asimdhp", baseline="avx2 vsx3 asimdhp", + x86="", ppc64="", armhf="" + ) + + def test_implies(self): + # baseline combining implied features, so we count + # on it instead of testing 'feature_implies()'' directly + self.expect_baseline( + "fma3 avx2 asimd vsx3", + # .* between two spaces can validate features in between + x86 = "sse .* sse41 .* fma3.*avx2", + ppc64 = "vsx vsx2 vsx3", + armhf = "neon neon_fp16 neon_vfpv4 asimd" + ) + """ + special cases + """ + # in icc and msvc, FMA3 and AVX2 can't be separated + # both need to implies each other, same for avx512f & cd + for f0, f1 in ( + ("fma3", "avx2"), + ("avx512f", "avx512cd"), + ): + diff = ".* sse42 .* %s .*%s$" % (f0, f1) + self.expect_baseline(f0, + x86_gcc=".* sse42 .* %s$" % f0, + x86_icc=diff, x86_iccw=diff + ) + self.expect_baseline(f1, + x86_gcc=".* avx .* %s$" % f1, + x86_icc=diff, x86_iccw=diff + ) + # in msvc, following features can't be separated too + for f in (("fma3", "avx2"), ("avx512f", "avx512cd", "avx512_skx")): + for ff in f: + self.expect_baseline(ff, + x86_msvc=".*%s" % ' '.join(f) + ) + + # in ppc64le VSX and VSX2 can't be separated + 
self.expect_baseline("vsx", ppc64le="vsx vsx2") + # in aarch64 following features can't be separated + for f in ("neon", "neon_fp16", "neon_vfpv4", "asimd"): + self.expect_baseline(f, aarch64="neon neon_fp16 neon_vfpv4 asimd") + + def test_args_options(self): + # max & native + for o in ("max", "native"): + if o == "native" and self.cc_name() == "msvc": + continue + self.expect(o, + trap_files=".*cpu_(sse|vsx|neon).c", + x86="", ppc64="", armhf="" + ) + self.expect(o, + trap_files=".*cpu_(sse3|vsx2|neon_vfpv4).c", + x86="sse sse2", ppc64="vsx", armhf="neon neon_fp16", + aarch64="", ppc64le="" + ) + self.expect(o, + trap_files=".*cpu_(popcnt|vsx3).c", + x86="sse .* sse41", ppc64="vsx vsx2", + armhf="neon neon_fp16 .* asimd .*" + ) + self.expect(o, + x86_gcc=".* xop fma4 .* avx512f .* avx512_knl avx512_knm avx512_skx .*", + # in icc, xop and fam4 aren't supported + x86_icc=".* avx512f .* avx512_knl avx512_knm avx512_skx .*", + x86_iccw=".* avx512f .* avx512_knl avx512_knm avx512_skx .*", + # in msvc, avx512_knl avx512_knm aren't supported + x86_msvc=".* xop fma4 .* avx512f .* avx512_skx .*", + armhf=".* asimd asimdhp asimddp .*", + ppc64="vsx vsx2 vsx3.*" + ) + # min + self.expect("min", + x86="sse sse2", x64="sse sse2 sse3", + armhf="", aarch64="neon neon_fp16 .* asimd", + ppc64="", ppc64le="vsx vsx2" + ) + self.expect( + "min", trap_files=".*cpu_(sse2|vsx2).c", + x86="", ppc64le="" + ) + # an exception must triggered if native flag isn't supported + # when option "native" is activated through the args + try: + self.expect("native", + trap_flags=".*(-march=native|-xHost|/QxHost).*", + x86=".*", ppc64=".*", armhf=".*" + ) + if self.march() != "unknown": + raise AssertionError( + "excepted an exception for %s" % self.march() + ) + except DistutilsError: + if self.march() == "unknown": + raise AssertionError("excepted no exceptions") + + def test_flags(self): + self.expect_flags( + "sse sse2 vsx vsx2 neon neon_fp16", + x86_gcc="-msse -msse2", x86_icc="-msse -msse2", + 
x86_iccw="/arch:SSE2", x86_msvc="/arch:SSE2", + ppc64_gcc= "-mcpu=power8", + ppc64_clang="-maltivec -mvsx -mpower8-vector", + armhf_gcc="-mfpu=neon-fp16 -mfp16-format=ieee", + aarch64="" + ) + # testing normalize -march + self.expect_flags( + "asimd", + aarch64="", + armhf_gcc=r"-mfp16-format=ieee -mfpu=neon-fp-armv8 -march=armv8-a\+simd" + ) + self.expect_flags( + "asimdhp", + aarch64_gcc=r"-march=armv8.2-a\+fp16", + armhf_gcc=r"-mfp16-format=ieee -mfpu=neon-fp-armv8 -march=armv8.2-a\+fp16" + ) + self.expect_flags( + "asimddp", aarch64_gcc=r"-march=armv8.2-a\+dotprod" + ) + self.expect_flags( + # asimdfhm implies asimdhp + "asimdfhm", aarch64_gcc=r"-march=armv8.2-a\+fp16\+fp16fml" + ) + self.expect_flags( + "asimddp asimdhp asimdfhm", + aarch64_gcc=r"-march=armv8.2-a\+dotprod\+fp16\+fp16fml" + ) + + def test_targets_exceptions(self): + for targets in ( + "bla bla", "/*@targets", + "/*@targets */", + "/*@targets unknown */", + "/*@targets $unknown_policy avx2 */", + "/*@targets #unknown_group avx2 */", + "/*@targets $ */", + "/*@targets # vsx */", + "/*@targets #$ vsx */", + "/*@targets vsx avx2 ) */", + "/*@targets vsx avx2 (avx2 */", + "/*@targets vsx avx2 () */", + "/*@targets vsx avx2 ($autovec) */", # no features + "/*@targets vsx avx2 (xxx) */", + "/*@targets vsx avx2 (baseline) */", + ) : + try: + self.expect_targets( + targets, + x86="", armhf="", ppc64="" + ) + if self.march() != "unknown": + raise AssertionError( + "excepted an exception for %s" % self.march() + ) + except DistutilsError: + if self.march() == "unknown": + raise AssertionError("excepted no exceptions") + + def test_targets_syntax(self): + for targets in ( + "/*@targets $keep_baseline sse vsx neon*/", + "/*@targets,$keep_baseline,sse,vsx,neon*/", + "/*@targets*$keep_baseline*sse*vsx*neon*/", + """ + /* + ** @targets + ** $keep_baseline, sse vsx,neon + */ + """, + """ + /* + ************@targets************* + ** $keep_baseline, sse vsx, neon + ********************************* + */ + """, + 
""" + /* + /////////////@targets///////////////// + //$keep_baseline//sse//vsx//neon + ///////////////////////////////////// + */ + """, + """ + /* + @targets + $keep_baseline + SSE VSX NEON*/ + """ + ) : + self.expect_targets(targets, + x86="sse", ppc64="vsx", armhf="neon", unknown="" + ) + + def test_targets(self): + # test skipping baseline features + self.expect_targets( + """ + /*@targets + sse sse2 sse41 avx avx2 avx512f + vsx vsx2 vsx3 + neon neon_fp16 asimdhp asimddp + */ + """, + baseline="avx vsx2 asimd", + x86="avx512f avx2", armhf="asimddp asimdhp", ppc64="vsx3" + ) + # test skipping non-dispatch features + self.expect_targets( + """ + /*@targets + sse41 avx avx2 avx512f + vsx2 vsx3 + asimd asimdhp asimddp + */ + """, + baseline="", dispatch="sse41 avx2 vsx2 asimd asimddp", + x86="avx2 sse41", armhf="asimddp asimd", ppc64="vsx2" + ) + # test skipping features that not supported + self.expect_targets( + """ + /*@targets + sse2 sse41 avx2 avx512f + vsx2 vsx3 + neon asimdhp asimddp + */ + """, + baseline="", + trap_files=".*(avx2|avx512f|vsx3|asimddp).c", + x86="sse41 sse2", ppc64="vsx2", armhf="asimdhp neon" + ) + # test skipping features that implies each other + self.expect_targets( + """ + /*@targets + sse sse2 avx fma3 avx2 avx512f avx512cd + vsx vsx2 vsx3 + neon neon_vfpv4 neon_fp16 neon_fp16 asimd asimdhp + asimddp asimdfhm + */ + """, + baseline="", + x86_gcc="avx512cd avx512f avx2 fma3 avx sse2", + x86_msvc="avx512cd avx2 avx sse2", + x86_icc="avx512cd avx2 avx sse2", + x86_iccw="avx512cd avx2 avx sse2", + ppc64="vsx3 vsx2 vsx", + ppc64le="vsx3 vsx2", + armhf="asimdfhm asimddp asimdhp asimd neon_vfpv4 neon_fp16 neon", + aarch64="asimdfhm asimddp asimdhp asimd" + ) + + def test_targets_policies(self): + # 'keep_baseline', generate objects for baseline features + self.expect_targets( + """ + /*@targets + $keep_baseline + sse2 sse42 avx2 avx512f + vsx2 vsx3 + neon neon_vfpv4 asimd asimddp + */ + """, + baseline="sse41 avx2 vsx2 asimd vsx3", + 
x86="avx512f avx2 sse42 sse2", + ppc64="vsx3 vsx2", + armhf="asimddp asimd neon_vfpv4 neon", + # neon, neon_vfpv4, asimd implies each other + aarch64="asimddp asimd" + ) + # 'keep_sort', leave the sort as-is + self.expect_targets( + """ + /*@targets + $keep_baseline $keep_sort + avx512f sse42 avx2 sse2 + vsx2 vsx3 + asimd neon neon_vfpv4 asimddp + */ + """, + x86="avx512f sse42 avx2 sse2", + ppc64="vsx2 vsx3", + armhf="asimd neon neon_vfpv4 asimddp", + # neon, neon_vfpv4, asimd implies each other + aarch64="asimd asimddp" + ) + # 'autovec', skipping features that can't be + # vectorized by the compiler + self.expect_targets( + """ + /*@targets + $keep_baseline $keep_sort $autovec + avx512f avx2 sse42 sse41 sse2 + vsx3 vsx2 + asimddp asimd neon_vfpv4 neon + */ + """, + x86_gcc="avx512f avx2 sse42 sse41 sse2", + x86_icc="avx512f avx2 sse42 sse41 sse2", + x86_iccw="avx512f avx2 sse42 sse41 sse2", + x86_msvc="avx512f avx2 sse2", + ppc64="vsx3 vsx2", + armhf="asimddp asimd neon_vfpv4 neon", + # neon, neon_vfpv4, asimd implies each other + aarch64="asimddp asimd" + ) + for policy in ("$maxopt", "$autovec"): + # 'maxopt' and autovec set the max acceptable optimization flags + self.expect_target_flags( + "/*@targets baseline %s */" % policy, + gcc={"baseline":".*-O3.*"}, icc={"baseline":".*-O3.*"}, + iccw={"baseline":".*/O3.*"}, msvc={"baseline":".*/O2.*"}, + unknown={"baseline":".*"} + ) + + # 'werror', force compilers to treat warnings as errors + self.expect_target_flags( + "/*@targets baseline $werror */", + gcc={"baseline":".*-Werror.*"}, icc={"baseline":".*-Werror.*"}, + iccw={"baseline":".*/Werror.*"}, msvc={"baseline":".*/WX.*"}, + unknown={"baseline":".*"} + ) + + def test_targets_groups(self): + self.expect_targets( + """ + /*@targets $keep_baseline baseline #test_group */ + """, + groups=dict( + test_group=(""" + $keep_baseline + asimddp sse2 vsx2 avx2 vsx3 + avx512f asimdhp + """) + ), + x86="avx512f avx2 sse2 baseline", + ppc64="vsx3 vsx2 baseline", + 
armhf="asimddp asimdhp baseline" + ) + # test skip duplicating and sorting + self.expect_targets( + """ + /*@targets + * sse42 avx avx512f + * #test_group_1 + * vsx2 + * #test_group_2 + * asimddp asimdfhm + */ + """, + groups=dict( + test_group_1=(""" + VSX2 vsx3 asimd avx2 SSE41 + """), + test_group_2=(""" + vsx2 vsx3 asImd aVx2 sse41 + """) + ), + x86="avx512f avx2 avx sse42 sse41", + ppc64="vsx3 vsx2", + # vsx2 part of the default baseline of ppc64le, option ("min") + ppc64le="vsx3", + armhf="asimdfhm asimddp asimd", + # asimd part of the default baseline of aarch64, option ("min") + aarch64="asimdfhm asimddp" + ) + + def test_targets_multi(self): + self.expect_targets( + """ + /*@targets + (avx512_clx avx512_cnl) (asimdhp asimddp) + */ + """, + x86=r"\(avx512_clx avx512_cnl\)", + armhf=r"\(asimdhp asimddp\)", + ) + # test skipping implied features and auto-sort + self.expect_targets( + """ + /*@targets + f16c (sse41 avx sse42) (sse3 avx2 avx512f) + vsx2 (vsx vsx3 vsx2) + (neon neon_vfpv4 asimd asimdhp asimddp) + */ + """, + x86="avx512f f16c avx", + ppc64="vsx3 vsx2", + ppc64le="vsx3", # vsx2 part of baseline + armhf=r"\(asimdhp asimddp\)", + ) + # test skipping implied features and keep sort + self.expect_targets( + """ + /*@targets $keep_sort + (sse41 avx sse42) (sse3 avx2 avx512f) + (vsx vsx3 vsx2) + (asimddp neon neon_vfpv4 asimd asimdhp) + */ + """, + x86="avx avx512f", + ppc64="vsx3", + armhf=r"\(asimdhp asimddp\)", + ) + # test compiler variety and avoiding duplicating + self.expect_targets( + """ + /*@targets $keep_sort + fma3 avx2 (fma3 avx2) (avx2 fma3) avx2 fma3 + */ + """, + x86_gcc=r"fma3 avx2 \(fma3 avx2\)", + x86_icc="avx2", x86_iccw="avx2", + x86_msvc="avx2" + ) + +def new_test(arch, cc): + if is_standalone: return textwrap.dedent("""\ + class TestCCompilerOpt_{class_name}(_Test_CCompilerOpt, unittest.TestCase): + arch = '{arch}' + cc = '{cc}' + def __init__(self, methodName="runTest"): + unittest.TestCase.__init__(self, methodName) + 
self.setup() + """).format( + class_name=arch + '_' + cc, arch=arch, cc=cc + ) + return textwrap.dedent("""\ + class TestCCompilerOpt_{class_name}(_Test_CCompilerOpt): + arch = '{arch}' + cc = '{cc}' + """).format( + class_name=arch + '_' + cc, arch=arch, cc=cc + ) +""" +if 1 and is_standalone: + FakeCCompilerOpt.fake_info = "x86_icc" + cco = FakeCCompilerOpt(None, cpu_baseline="avx2") + print(' '.join(cco.cpu_baseline_names())) + print(cco.cpu_baseline_flags()) + unittest.main() + sys.exit() +""" +for arch, compilers in arch_compilers.items(): + for cc in compilers: + exec(new_test(arch, cc)) + +if is_standalone: + unittest.main() diff --git a/numpy/distutils/tests/test_ccompiler_opt_conf.py b/numpy/distutils/tests/test_ccompiler_opt_conf.py new file mode 100644 index 000000000..2f83a59e0 --- /dev/null +++ b/numpy/distutils/tests/test_ccompiler_opt_conf.py @@ -0,0 +1,169 @@ +import unittest +from os import sys, path + +is_standalone = __name__ == '__main__' and __package__ is None +if is_standalone: + sys.path.append(path.abspath(path.join(path.dirname(__file__), ".."))) + from ccompiler_opt import CCompilerOpt +else: + from numpy.distutils.ccompiler_opt import CCompilerOpt + +arch_compilers = dict( + x86 = ("gcc", "clang", "icc", "iccw", "msvc"), + x64 = ("gcc", "clang", "icc", "iccw", "msvc"), + ppc64 = ("gcc", "clang"), + ppc64le = ("gcc", "clang"), + armhf = ("gcc", "clang"), + aarch64 = ("gcc", "clang"), + narch = ("gcc",) +) + +class FakeCCompilerOpt(CCompilerOpt): + fake_info = "" + def __init__(self, *args, **kwargs): + CCompilerOpt.__init__(self, None, **kwargs) + def dist_compile(self, sources, flags, **kwargs): + return sources + def dist_info(self): + return FakeCCompilerOpt.fake_info + @staticmethod + def dist_log(*args, stderr=False): + pass + +class _TestConfFeatures(FakeCCompilerOpt): + """A hook to check the sanity of configured features +- before it called by the abstract class '_Feature' + """ + + def conf_features_partial(self): + conf_all = 
self.conf_features + for feature_name, feature in conf_all.items(): + self.test_feature( + "attribute conf_features", + conf_all, feature_name, feature + ) + + conf_partial = FakeCCompilerOpt.conf_features_partial(self) + for feature_name, feature in conf_partial.items(): + self.test_feature( + "conf_features_partial()", + conf_partial, feature_name, feature + ) + return conf_partial + + def test_feature(self, log, search_in, feature_name, feature_dict): + error_msg = ( + "during validate '{}' within feature '{}', " + "march '{}' and compiler '{}'\n>> " + ).format(log, feature_name, self.cc_march, self.cc_name) + + if not feature_name.isupper(): + raise AssertionError(error_msg + "feature name must be in uppercase") + + for option, val in feature_dict.items(): + self.test_option_types(error_msg, option, val) + self.test_duplicates(error_msg, option, val) + + self.test_implies(error_msg, search_in, feature_name, feature_dict) + self.test_group(error_msg, search_in, feature_name, feature_dict) + + def test_option_types(self, error_msg, option, val): + for tp, available in ( + ((str, list), ( + "implies", "headers", "flags", "group", "detect" + )), + ((str,), ("disable",)), + ((int,), ("interest",)), + ((bool,), ("implies_detect",)), + ((bool, type(None)), ("autovec",)), + ) : + found_it = option in available + if not found_it: + continue + if not isinstance(val, tp): + error_tp = [t.__name__ for t in (*tp,)] + error_tp = ' or '.join(error_tp) + raise AssertionError(error_msg + \ + "expected '%s' type for option '%s' not '%s'" % ( + error_tp, option, type(val).__name__ + )) + break + + if not found_it: + raise AssertionError(error_msg + \ + "invalid option name '%s'" % option + ) + + def test_duplicates(self, error_msg, option, val): + if option not in ( + "implies", "headers", "flags", "group", "detect" + ) : return + + if isinstance(val, str): + val = val.split() + + if len(val) != len(set(val)): + raise AssertionError(error_msg + \ + "duplicated values in option 
'%s'" % option + ) + + def test_implies(self, error_msg, search_in, feature_name, feature_dict): + if feature_dict.get("disabled") is not None: + return + implies = feature_dict.get("implies", "") + if not implies: + return + if isinstance(implies, str): + implies = implies.split() + + if feature_name in implies: + raise AssertionError(error_msg + \ + "feature implies itself" + ) + + for impl in implies: + impl_dict = search_in.get(impl) + if impl_dict is not None: + if "disable" in impl_dict: + raise AssertionError(error_msg + \ + "implies disabled feature '%s'" % impl + ) + continue + raise AssertionError(error_msg + \ + "implies non-exist feature '%s'" % impl + ) + + def test_group(self, error_msg, search_in, feature_name, feature_dict): + if feature_dict.get("disabled") is not None: + return + group = feature_dict.get("group", "") + if not group: + return + if isinstance(group, str): + group = group.split() + + for f in group: + impl_dict = search_in.get(f) + if not impl_dict or "disable" in impl_dict: + continue + raise AssertionError(error_msg + \ + "in option '%s', '%s' already exists as a feature name" % ( + option, f + )) + +class TestConfFeatures(unittest.TestCase): + def __init__(self, methodName="runTest"): + unittest.TestCase.__init__(self, methodName) + self.setup() + + def setup(self): + FakeCCompilerOpt.conf_nocache = True + + def test_features(self): + for arch, compilers in arch_compilers.items(): + for cc in compilers: + FakeCCompilerOpt.fake_info = arch + cc + _TestConfFeatures() + +if is_standalone: + unittest.main() diff --git a/numpy/tests/test_public_api.py b/numpy/tests/test_public_api.py index df0e04285..cc4c5d8c5 100644 --- a/numpy/tests/test_public_api.py +++ b/numpy/tests/test_public_api.py @@ -214,6 +214,7 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [ "core.umath", "core.umath_tests", "distutils.ccompiler", + 'distutils.ccompiler_opt', "distutils.command", "distutils.command.autodist", "distutils.command.bdist_rpm", |