summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
authorMatti Picus <matti.picus@gmail.com>2020-02-05 08:00:51 +0200
committerGitHub <noreply@github.com>2020-02-05 08:00:51 +0200
commitfed1fb49bc50fd2a44238b8250d13314a127ddcc (patch)
tree2765600c6e8cd4de41cf1af43cebcce38c493660 /numpy
parentb69cf68a24cffea0a884a102da1797755de8f022 (diff)
parent64f7074d243f72e33bfc74bdbc9fec1d85b117a1 (diff)
downloadnumpy-fed1fb49bc50fd2a44238b8250d13314a127ddcc.tar.gz
Merge pull request #13421 from seiko2plus/core_improve_infa_runtime
ENH: improve runtime detection of CPU features
Diffstat (limited to 'numpy')
-rw-r--r--numpy/core/code_generators/generate_umath.py3
-rw-r--r--numpy/core/setup.py2
-rw-r--r--numpy/core/setup_common.py5
-rw-r--r--numpy/core/src/common/npy_config.h1
-rw-r--r--numpy/core/src/common/npy_cpu_features.c.src404
-rw-r--r--numpy/core/src/common/npy_cpu_features.h117
-rw-r--r--numpy/core/src/multiarray/multiarraymodule.c15
-rw-r--r--numpy/core/src/umath/cpuid.c97
-rw-r--r--numpy/core/src/umath/cpuid.h9
-rw-r--r--numpy/core/tests/test_cpu_features.py104
10 files changed, 643 insertions, 114 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 1fd08241d..f22380ccb 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -1014,7 +1014,7 @@ def make_arrays(funcdict):
for vt in t.simd:
code2list.append(textwrap.dedent("""\
#ifdef HAVE_ATTRIBUTE_TARGET_{ISA}
- if (npy_cpu_supports("{isa}")) {{
+ if (NPY_CPU_HAVE({ISA})) {{
{fname}_functions[{idx}] = {type}_{fname}_{isa};
}}
#endif
@@ -1138,7 +1138,6 @@ def make_code(funcdict, filename):
Please make changes to the code generator program (%s)
**/
- #include "cpuid.h"
#include "ufunc_object.h"
#include "ufunc_type_resolution.h"
#include "loops.h"
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 66c1b782e..e15cbf7c2 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -745,6 +745,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'common', 'ucsnarrow.c'),
join('src', 'common', 'ufunc_override.c'),
join('src', 'common', 'numpyos.c'),
+ join('src', 'common', 'npy_cpu_features.c.src'),
]
if os.environ.get('NPY_USE_BLAS_ILP64', "0") != "0":
@@ -898,7 +899,6 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'clip.c.src'),
join('src', 'umath', 'ufunc_object.c'),
join('src', 'umath', 'extobj.c'),
- join('src', 'umath', 'cpuid.c'),
join('src', 'umath', 'scalarmath.c.src'),
join('src', 'umath', 'ufunc_type_resolution.c'),
join('src', 'umath', 'override.c'),
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 7cac66e61..63c4a76a9 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -132,11 +132,6 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
("__builtin_bswap64", '5u'),
("__builtin_expect", '5, 0'),
("__builtin_mul_overflow", '5, 5, (int*)5'),
- # broken on OSX 10.11, make sure its not optimized away
- ("volatile int r = __builtin_cpu_supports", '"sse"',
- "stdio.h", "__BUILTIN_CPU_SUPPORTS"),
- ("volatile int r = __builtin_cpu_supports", '"avx512f"',
- "stdio.h", "__BUILTIN_CPU_SUPPORTS_AVX512F"),
# MMX only needed for icc, but some clangs don't have it
("_m_from_int64", '0', "emmintrin.h"),
("_mm_load_ps", '(float*)0', "xmmintrin.h"), # SSE
diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h
index eedfbe364..aebe241a5 100644
--- a/numpy/core/src/common/npy_config.h
+++ b/numpy/core/src/common/npy_config.h
@@ -2,6 +2,7 @@
#define _NPY_NPY_CONFIG_H_
#include "config.h"
+#include "npy_cpu_features.h"
#include "numpy/numpyconfig.h"
#include "numpy/npy_cpu.h"
#include "numpy/npy_os.h"
diff --git a/numpy/core/src/common/npy_cpu_features.c.src b/numpy/core/src/common/npy_cpu_features.c.src
new file mode 100644
index 000000000..cbd99827b
--- /dev/null
+++ b/numpy/core/src/common/npy_cpu_features.c.src
@@ -0,0 +1,404 @@
+#include "npy_cpu_features.h"
+#include "numpy/npy_common.h" // for NPY_INLINE
+#include "numpy/npy_cpu.h" // To guarantee of having CPU definitions in scope.
+
+/******************** Private Definitions *********************/
+
+// Hold all CPU features boolean values
+static unsigned char npy__cpu_have[NPY_CPU_FEATURE_MAX];
+
+/******************** Private Declarations *********************/
+
+// Almost detect all CPU features in runtime
+static void
+npy__cpu_init_features(void);
+
+/******************** Public Definitions *********************/
+
+NPY_VISIBILITY_HIDDEN int
+npy_cpu_have(int feature_id)
+{
+ if (feature_id <= NPY_CPU_FEATURE_NONE || feature_id >= NPY_CPU_FEATURE_MAX)
+ return 0;
+ return npy__cpu_have[feature_id];
+}
+
+NPY_VISIBILITY_HIDDEN int
+npy_cpu_init(void)
+{
+ npy__cpu_init_features();
+ return 0;
+}
+
+NPY_VISIBILITY_HIDDEN PyObject *
+npy_cpu_features_dict(void)
+{
+ PyObject *dict = PyDict_New();
+ if (dict) {
+ /**begin repeat
+ * #feature = MMX, SSE, SSE2, SSE3, SSSE3, SSE41, POPCNT, SSE42,
+ * AVX, F16C, XOP, FMA4, FMA3, AVX2, AVX512F,
+ * AVX512CD, AVX512ER, AVX512PF, AVX5124FMAPS, AVX5124VNNIW,
+ * AVX512VPOPCNTDQ, AVX512VL, AVX512BW, AVX512DQ, AVX512VNNI,
+ * AVX512IFMA, AVX512VBMI, AVX512VBMI2, AVX512BITALG,
+ * AVX512_KNL, AVX512_KNM, AVX512_SKX, AVX512_CLX, AVX512_CNL, AVX512_ICL,
+ * VSX, VSX2, VSX3,
+ * NEON, NEON_FP16, NEON_VFPV4, ASIMD, FPHP, ASIMDHP, ASIMDDP, ASIMDFHM#
+ */
+ if (PyDict_SetItemString(dict, "@feature@",
+ npy__cpu_have[NPY_CPU_FEATURE_@feature@] ? Py_True : Py_False) < 0) {
+ Py_DECREF(dict);
+ return NULL;
+ }
+ /**end repeat**/
+ }
+ return dict;
+}
+
+/****************************************************************
+ * This section is reserved to defining @npy__cpu_init_features
+ * for each CPU architecture, please try to keep it clean. Ty
+ ****************************************************************/
+
+/***************** X86 ******************/
+
+#if defined(NPY_CPU_AMD64) || defined(NPY_CPU_X86)
+
+#ifdef _MSC_VER
+ #include <intrin.h>
+#elif defined(__INTEL_COMPILER)
+ #include <immintrin.h>
+#endif
+
+static int
+npy__cpu_getxcr0(void)
+{
+#if defined(_MSC_VER) || defined (__INTEL_COMPILER)
+ return _xgetbv(0);
+#elif defined(__GNUC__) || defined(__clang__)
+ unsigned int eax, edx;
+ __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0));
+ return (eax | (unsigned long long)edx << 32);
+#else
+ // TODO: handle other x86 compilers
+ return 0;
+#endif
+}
+
+static void
+npy__cpu_cpuid(int reg[4], int func_id)
+{
+#if defined(_MSC_VER)
+ __cpuidex(reg, func_id, 0);
+#elif defined(__INTEL_COMPILER)
+ __cpuid(reg, func_id);
+#elif defined(__GNUC__) || defined(__clang__)
+ #if defined(NPY_CPU_X86) && defined(__PIC__)
+ // %ebx may be the PIC register
+ #define NPY__CPUID_ASM \
+ "xchg{l}\t{%%}ebx, %1\n\t" \
+ "cpuid\n\t" \
+ "xchg{l}\t{%%}ebx, %1\n\t"
+ #else
+ #define NPY__CPUID_ASM "cpuid"
+ #endif
+ __asm__(NPY__CPUID_ASM : "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) : "a" (func_id), "c" (0) : );
+#else
+ // TODO: handle other x86 compilers
+ reg[0] = 0;
+#endif
+}
+
+static void
+npy__cpu_init_features(void)
+{
+ memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);
+
+ // validate platform support
+ int reg[] = {0, 0, 0, 0};
+ npy__cpu_cpuid(reg, 0);
+ if (reg[0] == 0)
+ return;
+
+ npy__cpu_cpuid(reg, 1);
+ npy__cpu_have[NPY_CPU_FEATURE_MMX] = (reg[3] & (1 << 23)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_SSE] = (reg[3] & (1 << 25)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_SSE2] = (reg[3] & (1 << 26)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_SSE3] = (reg[2] & (1 << 0)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_SSSE3] = (reg[2] & (1 << 9)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_SSE41] = (reg[2] & (1 << 19)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_POPCNT] = (reg[2] & (1 << 23)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_SSE42] = (reg[2] & (1 << 20)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_F16C] = (reg[2] & (1 << 29)) != 0;
+
+ // check OSXSAVE
+ if ((reg[2] & (1 << 27)) == 0)
+ return;
+ // check AVX OS support
+ int xcr = npy__cpu_getxcr0();
+ if ((xcr & 6) != 6)
+ return;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX] = (reg[2] & (1 << 28)) != 0;
+ if (!npy__cpu_have[NPY_CPU_FEATURE_AVX])
+ return;
+ npy__cpu_have[NPY_CPU_FEATURE_FMA3] = (reg[2] & (1 << 12)) != 0;
+
+ // second call to the cpuid to get extended AMD feature bits
+ npy__cpu_cpuid(reg, 0x80000001);
+ npy__cpu_have[NPY_CPU_FEATURE_XOP] = (reg[2] & (1 << 11)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_FMA4] = (reg[2] & (1 << 16)) != 0;
+
+ // third call to the cpuid to get extended AVX2 & AVX512 feature bits
+ npy__cpu_cpuid(reg, 7);
+ npy__cpu_have[NPY_CPU_FEATURE_AVX2] = (reg[1] & (1 << 5)) != 0;
+ if (!npy__cpu_have[NPY_CPU_FEATURE_AVX2])
+ return;
+ // detect AVX2 & FMA3
+ npy__cpu_have[NPY_CPU_FEATURE_FMA] = npy__cpu_have[NPY_CPU_FEATURE_FMA3];
+
+ // check AVX512 OS support
+ if ((xcr & 0xe6) != 0xe6)
+ return;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512F] = (reg[1] & (1 << 16)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512CD] = (reg[1] & (1 << 28)) != 0;
+ if (npy__cpu_have[NPY_CPU_FEATURE_AVX512F] && npy__cpu_have[NPY_CPU_FEATURE_AVX512CD]) {
+ // Knights Landing
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512PF] = (reg[1] & (1 << 26)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512ER] = (reg[1] & (1 << 27)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512_KNL] = npy__cpu_have[NPY_CPU_FEATURE_AVX512ER] &&
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512PF];
+ // Knights Mill
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ] = (reg[2] & (1 << 14)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX5124VNNIW] = (reg[3] & (1 << 2)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX5124FMAPS] = (reg[3] & (1 << 3)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512_KNM] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_KNL] &&
+ npy__cpu_have[NPY_CPU_FEATURE_AVX5124FMAPS] &&
+ npy__cpu_have[NPY_CPU_FEATURE_AVX5124VNNIW] &&
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ];
+
+ // Skylake-X
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512DQ] = (reg[1] & (1 << 17)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512BW] = (reg[1] & (1 << 30)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512VL] = (reg[1] & (1 << 31)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512_SKX] = npy__cpu_have[NPY_CPU_FEATURE_AVX512BW] &&
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512DQ] &&
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512VL];
+ // Cascade Lake
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512VNNI] = (reg[2] & (1 << 11)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512_CLX] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_SKX] &&
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512VNNI];
+
+ // Cannon Lake
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512IFMA] = (reg[1] & (1 << 21)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI] = (reg[2] & (1 << 1)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512_CNL] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_SKX] &&
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512IFMA] &&
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI];
+ // Ice Lake
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI2] = (reg[2] & (1 << 6)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512BITALG] = (reg[2] & (1 << 12)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512_ICL] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_CLX] &&
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512_CNL] &&
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI2] &&
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512BITALG] &&
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ];
+ }
+}
+
+/***************** POWER ******************/
+
+#elif defined(NPY_CPU_PPC64) || defined(NPY_CPU_PPC64LE)
+
+#ifdef __linux__
+ #include <sys/auxv.h>
+ #ifndef AT_HWCAP2
+ #define AT_HWCAP2 26
+ #endif
+ #ifndef PPC_FEATURE2_ARCH_3_00
+ #define PPC_FEATURE2_ARCH_3_00 0x00800000
+ #endif
+#endif
+
+static void
+npy__cpu_init_features(void)
+{
+ memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);
+#ifdef __linux__
+ unsigned int hwcap = getauxval(AT_HWCAP);
+ if ((hwcap & PPC_FEATURE_HAS_VSX) == 0)
+ return;
+
+ hwcap = getauxval(AT_HWCAP2);
+ if (hwcap & PPC_FEATURE2_ARCH_3_00)
+ {
+ npy__cpu_have[NPY_CPU_FEATURE_VSX] =
+ npy__cpu_have[NPY_CPU_FEATURE_VSX2] =
+ npy__cpu_have[NPY_CPU_FEATURE_VSX3] = 1;
+ return;
+ }
+ npy__cpu_have[NPY_CPU_FEATURE_VSX2] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_VSX] = 1;
+// TODO: AIX, FreeBSD
+#else
+ npy__cpu_have[NPY_CPU_FEATURE_VSX] = 1;
+ #if defined(NPY_CPU_PPC64LE) || defined(NPY_HAVE_VSX2)
+ npy__cpu_have[NPY_CPU_FEATURE_VSX2] = 1;
+ #endif
+ #ifdef NPY_HAVE_VSX3
+ npy__cpu_have[NPY_CPU_FEATURE_VSX3] = 1;
+ #endif
+#endif
+}
+
+/***************** ARM ******************/
+
+#elif defined(__arm__) || defined(__aarch64__)
+
+static NPY_INLINE void
+npy__cpu_init_features_arm8(void)
+{
+ npy__cpu_have[NPY_CPU_FEATURE_NEON] =
+ npy__cpu_have[NPY_CPU_FEATURE_NEON_FP16] =
+ npy__cpu_have[NPY_CPU_FEATURE_NEON_VFPV4] =
+ npy__cpu_have[NPY_CPU_FEATURE_ASIMD] = 1;
+}
+
+#ifdef __linux__
+/*
+ * we aren't sure of what kind kernel or clib we deal with
+ * so we play it safe
+*/
+#include <stdio.h>
+#include <fcntl.h>
+
+#define NPY__HWCAP 16
+#define NPY__HWCAP2 26
+
+// arch/arm/include/uapi/asm/hwcap.h
+#define NPY__HWCAP_HALF (1 << 1)
+#define NPY__HWCAP_NEON (1 << 12)
+#define NPY__HWCAP_VFPv3 (1 << 13)
+#define NPY__HWCAP_VFPv4 (1 << 16)
+#define NPY__HWCAP2_AES (1 << 0)
+#define NPY__HWCAP2_PMULL (1 << 1)
+#define NPY__HWCAP2_SHA1 (1 << 2)
+#define NPY__HWCAP2_SHA2 (1 << 3)
+#define NPY__HWCAP2_CRC32 (1 << 4)
+// arch/arm64/include/uapi/asm/hwcap.h
+#define NPY__HWCAP_FP (1 << 0)
+#define NPY__HWCAP_ASIMD (1 << 1)
+#define NPY__HWCAP_FPHP (1 << 9)
+#define NPY__HWCAP_ASIMDHP (1 << 10)
+#define NPY__HWCAP_ASIMDDP (1 << 20)
+#define NPY__HWCAP_ASIMDFHM (1 << 23)
+
+__attribute__((weak)) unsigned long getauxval(unsigned long); // linker should handle it
+static int
+npy__cpu_init_features_linux(void)
+{
+ unsigned long hwcap = 0, hwcap2 = 0;
+ if (getauxval != 0) {
+ hwcap = getauxval(NPY__HWCAP);
+ #ifdef __arm__
+ hwcap2 = getauxval(NPY__HWCAP2);
+ #endif
+ } else {
+ unsigned long auxv[2];
+ int fd = open("/proc/self/auxv", O_RDONLY);
+ if (fd >= 0) {
+ while (read(fd, &auxv, sizeof(auxv)) == sizeof(auxv)) {
+ if (auxv[0] == NPY__HWCAP) {
+ hwcap = auxv[1];
+ }
+ #ifdef __arm__
+ else if (auxv[0] == NPY__HWCAP2) {
+ hwcap2 = auxv[1];
+ }
+ #endif
+ // detect the end
+ else if (auxv[0] == 0 && auxv[1] == 0) {
+ break;
+ }
+ }
+ close(fd);
+ }
+ }
+ if (hwcap == 0 && hwcap2 == 0) {
+ /*
+ * FIXME: failback to compiler definitions,
+ * BTW we can parse /proc/cpuinfo for badly patched kernels
+ */
+ return 0;
+ }
+#ifdef __arm__
+ // Detect Arm8 (aarch32 state)
+ if ((hwcap2 & NPY__HWCAP2_AES) || (hwcap2 & NPY__HWCAP2_SHA1) ||
+ (hwcap2 & NPY__HWCAP2_SHA2) || (hwcap2 & NPY__HWCAP2_PMULL) ||
+ (hwcap2 & NPY__HWCAP2_CRC32))
+#else
+ if (1)
+#endif
+ {
+ if (!(hwcap & (NPY__HWCAP_FP | NPY__HWCAP_ASIMD))) {
+ // Is this could happen? maybe disabled by kernel
+ // BTW this will break the baseline of AARCH64
+ return 1;
+ }
+ npy__cpu_have[NPY_CPU_FEATURE_FPHP] = (hwcap & NPY__HWCAP_FPHP) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_ASIMDHP] = (hwcap & NPY__HWCAP_ASIMDHP) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_ASIMDDP] = (hwcap & NPY__HWCAP_ASIMDDP) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_ASIMDFHM] = (hwcap & NPY__HWCAP_ASIMDFHM) != 0;
+ npy__cpu_init_features_arm8();
+ } else {
+ npy__cpu_have[NPY_CPU_FEATURE_NEON] = (hwcap & NPY__HWCAP_NEON) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_NEON_FP16] = (hwcap & (NPY__HWCAP_NEON | NPY__HWCAP_VFPv3 |
+ NPY__HWCAP_HALF)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_NEON_VFPV4] = (hwcap & (NPY__HWCAP_NEON | NPY__HWCAP_VFPv4)) != 0;
+ }
+ return 1;
+}
+#endif
+
+static void
+npy__cpu_init_features(void)
+{
+ memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);
+#ifdef __linux__
+ if (npy__cpu_init_features_linux())
+ return;
+#endif
+ // We have nothing else todo
+#if defined(NPY_HAVE_NEON_ARM8) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
+ #if defined(NPY_HAVE_FPHP) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ npy__cpu_have[NPY_CPU_FEATURE_FPHP] = 1;
+ #endif
+ #if defined(NPY_HAVE_ASIMDHP) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ npy__cpu_have[NPY_CPU_FEATURE_ASIMDHP] = 1;
+ #endif
+ #if defined(NPY_HAVE_ASIMDDP) || defined(__ARM_FEATURE_DOTPROD)
+ npy__cpu_have[NPY_CPU_FEATURE_ASIMDDP] = 1;
+ #endif
+ #if defined(NPY_HAVE_ASIMDFHM) || defined(__ARM_FEATURE_FP16FML)
+ npy__cpu_have[NPY_CPU_FEATURE_ASIMDFHM] = 1;
+ #endif
+ npy__cpu_init_features_arm8();
+#else
+ #if defined(NPY_HAVE_NEON) || defined(__ARM_NEON__)
+ npy__cpu_have[NPY_CPU_FEATURE_NEON] = 1;
+ #endif
+ #if defined(NPY_HAVE_NEON_FP16) || defined(__ARM_FP16_FORMAT_IEEE) || (defined(__ARM_FP) && (__ARM_FP & 2))
+ npy__cpu_have[NPY_CPU_FEATURE_NEON_FP16] = npy__cpu_have[NPY_CPU_FEATURE_NEON];
+ #endif
+ #if defined(NPY_HAVE_NEON_VFPV4) || defined(__ARM_FEATURE_FMA)
+ npy__cpu_have[NPY_CPU_FEATURE_NEON_VFPV4] = npy__cpu_have[NPY_CPU_FEATURE_NEON];
+ #endif
+#endif
+}
+
+/*********** Unsupported ARCH ***********/
+#else
+static void
+npy__cpu_init_features(void)
+{
+}
+#endif
diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h
new file mode 100644
index 000000000..0e8901328
--- /dev/null
+++ b/numpy/core/src/common/npy_cpu_features.h
@@ -0,0 +1,117 @@
+#ifndef _NPY_CPU_FEATURES_H_
+#define _NPY_CPU_FEATURES_H_
+
+#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN
+#include <Python.h> // for PyObject
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum npy_cpu_features
+{
+ NPY_CPU_FEATURE_NONE = 0,
+ // X86
+ NPY_CPU_FEATURE_MMX = 1,
+ NPY_CPU_FEATURE_SSE = 2,
+ NPY_CPU_FEATURE_SSE2 = 3,
+ NPY_CPU_FEATURE_SSE3 = 4,
+ NPY_CPU_FEATURE_SSSE3 = 5,
+ NPY_CPU_FEATURE_SSE41 = 6,
+ NPY_CPU_FEATURE_POPCNT = 7,
+ NPY_CPU_FEATURE_SSE42 = 8,
+ NPY_CPU_FEATURE_AVX = 9,
+ NPY_CPU_FEATURE_F16C = 10,
+ NPY_CPU_FEATURE_XOP = 11,
+ NPY_CPU_FEATURE_FMA4 = 12,
+ NPY_CPU_FEATURE_FMA3 = 13,
+ NPY_CPU_FEATURE_AVX2 = 14,
+ NPY_CPU_FEATURE_FMA = 15, // AVX2 & FMA3, provides backward compatibility
+
+ NPY_CPU_FEATURE_AVX512F = 30,
+ NPY_CPU_FEATURE_AVX512CD = 31,
+ NPY_CPU_FEATURE_AVX512ER = 32,
+ NPY_CPU_FEATURE_AVX512PF = 33,
+ NPY_CPU_FEATURE_AVX5124FMAPS = 34,
+ NPY_CPU_FEATURE_AVX5124VNNIW = 35,
+ NPY_CPU_FEATURE_AVX512VPOPCNTDQ = 36,
+ NPY_CPU_FEATURE_AVX512BW = 37,
+ NPY_CPU_FEATURE_AVX512DQ = 38,
+ NPY_CPU_FEATURE_AVX512VL = 39,
+ NPY_CPU_FEATURE_AVX512IFMA = 40,
+ NPY_CPU_FEATURE_AVX512VBMI = 41,
+ NPY_CPU_FEATURE_AVX512VNNI = 42,
+ NPY_CPU_FEATURE_AVX512VBMI2 = 43,
+ NPY_CPU_FEATURE_AVX512BITALG = 44,
+
+ // X86 CPU Groups
+ // Knights Landing (F,CD,ER,PF)
+ NPY_CPU_FEATURE_AVX512_KNL = 101,
+ // Knights Mill (F,CD,ER,PF,4FMAPS,4VNNIW,VPOPCNTDQ)
+ NPY_CPU_FEATURE_AVX512_KNM = 102,
+ // Skylake-X (F,CD,BW,DQ,VL)
+ NPY_CPU_FEATURE_AVX512_SKX = 103,
+ // Cascade Lake (F,CD,BW,DQ,VL,VNNI)
+ NPY_CPU_FEATURE_AVX512_CLX = 104,
+ // Cannon Lake (F,CD,BW,DQ,VL,IFMA,VBMI)
+ NPY_CPU_FEATURE_AVX512_CNL = 105,
+ // Ice Lake (F,CD,BW,DQ,VL,IFMA,VBMI,VNNI,VBMI2,BITALG,VPOPCNTDQ)
+ NPY_CPU_FEATURE_AVX512_ICL = 106,
+
+ // IBM/POWER VSX
+ // POWER7
+ NPY_CPU_FEATURE_VSX = 200,
+ // POWER8
+ NPY_CPU_FEATURE_VSX2 = 201,
+ // POWER9
+ NPY_CPU_FEATURE_VSX3 = 202,
+
+ // ARM
+ NPY_CPU_FEATURE_NEON = 300,
+ NPY_CPU_FEATURE_NEON_FP16 = 301,
+ // FMA
+ NPY_CPU_FEATURE_NEON_VFPV4 = 302,
+ // Advanced SIMD
+ NPY_CPU_FEATURE_ASIMD = 303,
+ // ARMv8.2 half-precision
+ NPY_CPU_FEATURE_FPHP = 304,
+ // ARMv8.2 half-precision vector arithm
+ NPY_CPU_FEATURE_ASIMDHP = 305,
+ // ARMv8.2 dot product
+ NPY_CPU_FEATURE_ASIMDDP = 306,
+ // ARMv8.2 single&half-precision multiply
+ NPY_CPU_FEATURE_ASIMDFHM = 307,
+
+ NPY_CPU_FEATURE_MAX
+};
+
+/*
+ * Initialize CPU features
+ * return 0 on success otherwise return -1
+*/
+NPY_VISIBILITY_HIDDEN int
+npy_cpu_init(void);
+
+/*
+ * return 0 if CPU feature isn't available
+ * note: `npy_cpu_init` must be called first otherwise it will always return 0
+*/
+NPY_VISIBILITY_HIDDEN int
+npy_cpu_have(int feature_id);
+
+#define NPY_CPU_HAVE(FEATURE_NAME) \
+npy_cpu_have(NPY_CPU_FEATURE_##FEATURE_NAME)
+
+/*
+ * return a new dictionary contains CPU feature names
+ * with runtime availability.
+ * same as npy_cpu_have, `npy_cpu_init` must be called first.
+ */
+NPY_VISIBILITY_HIDDEN PyObject *
+npy_cpu_features_dict(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _NPY_CPU_FEATURES_H_
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index c2e597385..7792fcdcb 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -4388,6 +4388,11 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
PyObject *m, *d, *s;
PyObject *c_api;
+ /* Initialize CPU features */
+ if (npy_cpu_init() < 0) {
+ goto err;
+ }
+
/* Create the module and add the functions */
m = PyModule_Create(&moduledef);
if (!m) {
@@ -4513,6 +4518,16 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
PyDict_SetItemString(d, "__version__", s);
Py_DECREF(s);
+ s = npy_cpu_features_dict();
+ if (s == NULL) {
+ goto err;
+ }
+ if (PyDict_SetItemString(d, "__cpu_features__", s) < 0) {
+ Py_DECREF(s);
+ goto err;
+ }
+ Py_DECREF(s);
+
s = NpyCapsule_FromVoidPtr((void *)_datetime_strings, NULL);
if (s == NULL) {
goto err;
diff --git a/numpy/core/src/umath/cpuid.c b/numpy/core/src/umath/cpuid.c
deleted file mode 100644
index 72c6493e8..000000000
--- a/numpy/core/src/umath/cpuid.c
+++ /dev/null
@@ -1,97 +0,0 @@
-#define _UMATHMODULE
-#define _MULTIARRAYMODULE
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#include <Python.h>
-
-#include "npy_config.h"
-
-#include "cpuid.h"
-
-#define XCR_XFEATURE_ENABLED_MASK 0x0
-#define XSTATE_SSE 0x2
-#define XSTATE_YMM 0x4
-#define XSTATE_ZMM 0x70
-
-/*
- * verify the OS supports avx instructions
- * it can be disabled in some OS, e.g. with the nosavex boot option of linux
- */
-static NPY_INLINE
-int os_avx_support(void)
-{
-#if HAVE_XGETBV
- /*
- * use bytes for xgetbv to avoid issues with compiler not knowing the
- * instruction
- */
- unsigned int eax, edx;
- unsigned int ecx = XCR_XFEATURE_ENABLED_MASK;
- __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx));
- return (eax & (XSTATE_SSE | XSTATE_YMM)) == (XSTATE_SSE | XSTATE_YMM);
-#else
- return 0;
-#endif
-}
-
-static NPY_INLINE
-int os_avx512_support(void)
-{
-#if HAVE_XGETBV
- unsigned int eax, edx;
- unsigned int ecx = XCR_XFEATURE_ENABLED_MASK;
- unsigned int xcr0 = XSTATE_ZMM | XSTATE_YMM | XSTATE_SSE;
- __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx));
- return (eax & xcr0) == xcr0;
-#else
- return 0;
-#endif
-}
-
-static NPY_INLINE
-int cpu_supports_fma(void)
-{
-#ifdef __x86_64__
- unsigned int feature = 0x01;
- unsigned int a, b, c, d;
- __asm__ volatile (
- "cpuid" "\n\t"
- : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
- : "a" (feature));
- /*
- * FMA is the 12th bit of ECX
- */
- return (c >> 12) & 1;
-#else
- return 0;
-#endif
-}
-
-/*
- * Primitive cpu feature detect function
- * Currently only supports checking for avx on gcc compatible compilers.
- */
-NPY_NO_EXPORT int
-npy_cpu_supports(const char * feature)
-{
-#ifdef HAVE___BUILTIN_CPU_SUPPORTS
- if (strcmp(feature, "avx512f") == 0) {
-#ifdef HAVE___BUILTIN_CPU_SUPPORTS_AVX512F
- return __builtin_cpu_supports("avx512f") && os_avx512_support();
-#else
- return 0;
-#endif
- }
- else if (strcmp(feature, "fma") == 0) {
- return cpu_supports_fma() && __builtin_cpu_supports("avx2") && os_avx_support();
- }
- else if (strcmp(feature, "avx2") == 0) {
- return __builtin_cpu_supports("avx2") && os_avx_support();
- }
- else if (strcmp(feature, "avx") == 0) {
- return __builtin_cpu_supports("avx") && os_avx_support();
- }
-#endif
-
- return 0;
-}
diff --git a/numpy/core/src/umath/cpuid.h b/numpy/core/src/umath/cpuid.h
deleted file mode 100644
index 33702ed41..000000000
--- a/numpy/core/src/umath/cpuid.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _NPY_PRIVATE__CPUID_H_
-#define _NPY_PRIVATE__CPUID_H_
-
-#include <numpy/ndarraytypes.h> /* for NPY_NO_EXPORT */
-
-NPY_NO_EXPORT int
-npy_cpu_supports(const char * feature);
-
-#endif
diff --git a/numpy/core/tests/test_cpu_features.py b/numpy/core/tests/test_cpu_features.py
new file mode 100644
index 000000000..3b5cb3157
--- /dev/null
+++ b/numpy/core/tests/test_cpu_features.py
@@ -0,0 +1,104 @@
+import sys, platform, re, pytest
+
+from numpy.testing import assert_equal
+from numpy.core._multiarray_umath import __cpu_features__
+
+class AbstractTest(object):
+ features = []
+ features_groups = {}
+ features_map = {}
+ features_flags = set()
+
+ def load_flags(self):
+ # a hook
+ pass
+
+ def test_features(self):
+ self.load_flags()
+ for gname, features in self.features_groups.items():
+ test_features = [self.features_map.get(f, f) in self.features_flags for f in features]
+ assert_equal(__cpu_features__.get(gname), all(test_features))
+
+ for feature_name in self.features:
+ map_name = self.features_map.get(feature_name, feature_name)
+ cpu_have = map_name in self.features_flags
+ npy_have = __cpu_features__.get(feature_name)
+ assert_equal(npy_have, cpu_have)
+
+ def load_flags_proc(self, magic_key):
+ with open('/proc/cpuinfo') as fd:
+ for line in fd:
+ if not line.startswith(magic_key):
+ continue
+ flags_value = [s.strip() for s in line.split(':', 1)]
+ if len(flags_value) == 2:
+ self.features_flags = self.features_flags.union(flags_value[1].upper().split())
+
+ def load_flags_auxv(self):
+ import subprocess
+ auxv = subprocess.check_output(['/bin/true'], env=dict(LD_SHOW_AUXV="1"))
+ for at in auxv.split(b'\n'):
+ if not at.startswith(b"AT_HWCAP"):
+ continue
+ hwcap_value = [s.strip() for s in at.split(b':', 1)]
+ if len(hwcap_value) == 2:
+ self.features_flags = self.features_flags.union(
+ hwcap_value[1].upper().decode().split()
+ )
+
+is_linux = sys.platform.startswith('linux')
+machine = platform.machine()
+is_x86 = re.match("^(amd64|x86|i386|i686)", machine, re.IGNORECASE)
+@pytest.mark.skipif(not is_linux or not is_x86, reason="Only for Linux and x86")
+class Test_X86_Features(AbstractTest):
+ features = [
+ "MMX", "SSE", "SSE2", "SSE3", "SSSE3", "SSE41", "POPCNT", "SSE42",
+ "AVX", "F16C", "XOP", "FMA4", "FMA3", "AVX2", "AVX512F", "AVX512CD",
+ "AVX512ER", "AVX512PF", "AVX5124FMAPS", "AVX5124VNNIW", "AVX512VPOPCNTDQ",
+ "AVX512VL", "AVX512BW", "AVX512DQ", "AVX512VNNI", "AVX512IFMA",
+ "AVX512VBMI", "AVX512VBMI2", "AVX512BITALG",
+ ]
+ features_groups = dict(
+ AVX512_KNL = ["AVX512F", "AVX512CD", "AVX512ER", "AVX512PF"],
+ AVX512_KNM = ["AVX512F", "AVX512CD", "AVX512ER", "AVX512PF", "AVX5124FMAPS",
+ "AVX5124VNNIW", "AVX512VPOPCNTDQ"],
+ AVX512_SKX = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ", "AVX512VL"],
+ AVX512_CLX = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ", "AVX512VL", "AVX512VNNI"],
+ AVX512_CNL = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ", "AVX512VL", "AVX512IFMA",
+ "AVX512VBMI"],
+ AVX512_ICL = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ", "AVX512VL", "AVX512IFMA",
+ "AVX512VBMI", "AVX512VNNI", "AVX512VBMI2", "AVX512BITALG", "AVX512VPOPCNTDQ"],
+ )
+ features_map = dict(
+ SSE3="PNI", SSE41="SSE4_1", SSE42="SSE4_2", FMA3="FMA",
+ AVX512VNNI="AVX512_VNNI", AVX512BITALG="AVX512_BITALG", AVX512VBMI2="AVX512_VBMI2",
+ AVX5124FMAPS="AVX512_4FMAPS", AVX5124VNNIW="AVX512_4VNNIW", AVX512VPOPCNTDQ="AVX512_VPOPCNTDQ",
+ )
+ def load_flags(self):
+ self.load_flags_proc("flags")
+
+is_power = re.match("^(powerpc|ppc)64", machine, re.IGNORECASE)
+@pytest.mark.skipif(not is_linux or not is_power, reason="Only for Linux and Power")
+class Test_POWER_Features(AbstractTest):
+ features = ["VSX", "VSX2", "VSX3"]
+ features_map = dict(VSX2="ARCH_2_07", VSX3="ARCH_3_00")
+
+ def load_flags(self):
+ self.load_flags_auxv()
+
+is_arm = re.match("^(arm|aarch64)", machine, re.IGNORECASE)
+@pytest.mark.skipif(not is_linux or not is_arm, reason="Only for Linux and ARM")
+class Test_ARM_Features(AbstractTest):
+ features = [
+ "NEON", "ASIMD", "FPHP", "ASIMDHP", "ASIMDDP", "ASIMDFHM"
+ ]
+ features_groups = dict(
+ NEON_FP16 = ["NEON", "HALF"],
+ NEON_VFPV4 = ["NEON", "VFPV4"],
+ )
+ def load_flags(self):
+ self.load_flags_proc("Features")
+ if re.match("^(aarch64|AARCH64)", platform.machine()):
+ self.features_map = dict(
+ NEON="ASIMD", HALF="ASIMD", VFPV4="ASIMD"
+ )