diff options
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/code_generators/generate_umath.py | 16 | ||||
-rw-r--r-- | numpy/core/setup.py | 2 | ||||
-rw-r--r-- | numpy/core/setup_common.py | 5 | ||||
-rw-r--r-- | numpy/core/src/common/npy_config.h | 1 | ||||
-rw-r--r-- | numpy/core/src/common/npy_cpu_features.c.src | 404 | ||||
-rw-r--r-- | numpy/core/src/common/npy_cpu_features.h | 117 | ||||
-rw-r--r-- | numpy/core/src/multiarray/arrayobject.c | 2 | ||||
-rw-r--r-- | numpy/core/src/multiarray/arraytypes.c.src | 6 | ||||
-rw-r--r-- | numpy/core/src/multiarray/descriptor.c | 6 | ||||
-rw-r--r-- | numpy/core/src/multiarray/methods.c | 2 | ||||
-rw-r--r-- | numpy/core/src/multiarray/multiarraymodule.c | 15 | ||||
-rw-r--r-- | numpy/core/src/multiarray/refcount.c | 6 | ||||
-rw-r--r-- | numpy/core/src/multiarray/shape.c | 2 | ||||
-rw-r--r-- | numpy/core/src/umath/cpuid.c | 97 | ||||
-rw-r--r-- | numpy/core/src/umath/cpuid.h | 9 | ||||
-rw-r--r-- | numpy/core/src/umath/loops.c.src | 43 | ||||
-rw-r--r-- | numpy/core/src/umath/loops.h.src | 29 | ||||
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 348 | ||||
-rw-r--r-- | numpy/core/tests/test_cpu_features.py | 104 | ||||
-rw-r--r-- | numpy/core/tests/test_umath_complex.py | 39 | ||||
-rw-r--r-- | numpy/lib/arraysetops.py | 4 |
21 files changed, 1116 insertions, 141 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 1fd08241d..c14711d16 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -233,6 +233,7 @@ flts = 'efdg' fltsO = flts + O fltsP = flts + P cmplx = 'FDG' +cmplxvec = 'FD' cmplxO = cmplx + O cmplxP = cmplx + P inexact = flts + cmplx @@ -268,7 +269,7 @@ defdict = { Ufunc(2, 1, Zero, docstrings.get('numpy.core.umath.add'), 'PyUFunc_AdditionTypeResolver', - TD(notimes_or_obj, simd=[('avx2', ints)]), + TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]), [TypeDescription('M', FullTypeDescr, 'Mm', 'M'), TypeDescription('m', FullTypeDescr, 'mm', 'm'), TypeDescription('M', FullTypeDescr, 'mM', 'M'), @@ -279,7 +280,7 @@ defdict = { Ufunc(2, 1, None, # Zero is only a unit to the right, not the left docstrings.get('numpy.core.umath.subtract'), 'PyUFunc_SubtractionTypeResolver', - TD(ints + inexact, simd=[('avx2', ints)]), + TD(ints + inexact, simd=[('avx512f', cmplxvec),('avx2', ints)]), [TypeDescription('M', FullTypeDescr, 'Mm', 'M'), TypeDescription('m', FullTypeDescr, 'mm', 'm'), TypeDescription('M', FullTypeDescr, 'MM', 'm'), @@ -290,7 +291,7 @@ defdict = { Ufunc(2, 1, One, docstrings.get('numpy.core.umath.multiply'), 'PyUFunc_MultiplicationTypeResolver', - TD(notimes_or_obj, simd=[('avx2', ints)]), + TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]), [TypeDescription('m', FullTypeDescr, 'mq', 'm'), TypeDescription('m', FullTypeDescr, 'qm', 'm'), TypeDescription('m', FullTypeDescr, 'md', 'm'), @@ -325,7 +326,7 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.conjugate'), None, - TD(ints+flts+cmplx, simd=[('avx2', ints)]), + TD(ints+flts+cmplx, simd=[('avx2', ints), ('avx512f', cmplxvec)]), TD(P, f='conjugate'), ), 'fmod': @@ -340,7 +341,7 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.square'), None, - TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'fd')]), + TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'FDfd')]), TD(O, f='Py_square'), ), 'reciprocal': @@ -378,7 +379,7 @@ defdict = { docstrings.get('numpy.core.umath.absolute'), 'PyUFunc_AbsoluteTypeResolver', TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]), - TD(cmplx, out=('f', 'd', 'g')), + TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')), TD(O, f='PyNumber_Absolute'), ), '_arg': @@ -1014,7 +1015,7 @@ def make_arrays(funcdict): for vt in t.simd: code2list.append(textwrap.dedent("""\ #ifdef HAVE_ATTRIBUTE_TARGET_{ISA} - if (npy_cpu_supports("{isa}")) {{ + if (NPY_CPU_HAVE({ISA})) {{ {fname}_functions[{idx}] = {type}_{fname}_{isa}; }} #endif @@ -1138,7 +1139,6 @@ def make_code(funcdict, filename): Please make changes to the code generator program (%s) **/ - #include "cpuid.h" #include "ufunc_object.h" #include "ufunc_type_resolution.h" #include "loops.h" diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 66c1b782e..e15cbf7c2 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -745,6 +745,7 @@ def configuration(parent_package='',top_path=None): join('src', 'common', 'ucsnarrow.c'), join('src', 'common', 'ufunc_override.c'), join('src', 'common', 'numpyos.c'), + join('src', 'common', 'npy_cpu_features.c.src'), ] if os.environ.get('NPY_USE_BLAS_ILP64', "0") != "0": @@ -898,7 +899,6 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'clip.c.src'), join('src', 'umath', 'ufunc_object.c'), join('src', 'umath', 'extobj.c'), - join('src', 'umath', 'cpuid.c'), join('src', 'umath', 'scalarmath.c.src'), join('src', 'umath', 'ufunc_type_resolution.c'), join('src', 'umath', 'override.c'), diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py index 7cac66e61..63c4a76a9 100644 --- a/numpy/core/setup_common.py +++ b/numpy/core/setup_common.py @@ -132,11 +132,6 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'), ("__builtin_bswap64", '5u'), ("__builtin_expect", '5, 0'), ("__builtin_mul_overflow", '5, 5, (int*)5'), - # broken on OSX 10.11, make sure its not optimized away - ("volatile int r = __builtin_cpu_supports", '"sse"', - "stdio.h", "__BUILTIN_CPU_SUPPORTS"), - ("volatile int r = __builtin_cpu_supports", '"avx512f"', - "stdio.h", "__BUILTIN_CPU_SUPPORTS_AVX512F"), # MMX only needed for icc, but some clangs don't have it ("_m_from_int64", '0', "emmintrin.h"), ("_mm_load_ps", '(float*)0', "xmmintrin.h"), # SSE diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h index eedfbe364..aebe241a5 100644 --- a/numpy/core/src/common/npy_config.h +++ b/numpy/core/src/common/npy_config.h @@ -2,6 +2,7 @@ #define _NPY_NPY_CONFIG_H_ #include "config.h" +#include "npy_cpu_features.h" #include "numpy/numpyconfig.h" #include "numpy/npy_cpu.h" #include "numpy/npy_os.h" diff --git a/numpy/core/src/common/npy_cpu_features.c.src b/numpy/core/src/common/npy_cpu_features.c.src new file mode 100644 index 000000000..cbd99827b --- /dev/null +++ b/numpy/core/src/common/npy_cpu_features.c.src @@ -0,0 +1,404 @@ +#include "npy_cpu_features.h" +#include "numpy/npy_common.h" // for NPY_INLINE +#include "numpy/npy_cpu.h" // To guarantee of having CPU definitions in scope. + +/******************** Private Definitions *********************/ + +// Hold all CPU features boolean values +static unsigned char npy__cpu_have[NPY_CPU_FEATURE_MAX]; + +/******************** Private Declarations *********************/ + +// Almost detect all CPU features in runtime +static void +npy__cpu_init_features(void); + +/******************** Public Definitions *********************/ + +NPY_VISIBILITY_HIDDEN int +npy_cpu_have(int feature_id) +{ + if (feature_id <= NPY_CPU_FEATURE_NONE || feature_id >= NPY_CPU_FEATURE_MAX) + return 0; + return npy__cpu_have[feature_id]; +} + +NPY_VISIBILITY_HIDDEN int +npy_cpu_init(void) +{ + npy__cpu_init_features(); + return 0; +} + +NPY_VISIBILITY_HIDDEN PyObject * +npy_cpu_features_dict(void) +{ + PyObject *dict = PyDict_New(); + if (dict) { + /**begin repeat + * #feature = MMX, SSE, SSE2, SSE3, SSSE3, SSE41, POPCNT, SSE42, + * AVX, F16C, XOP, FMA4, FMA3, AVX2, AVX512F, + * AVX512CD, AVX512ER, AVX512PF, AVX5124FMAPS, AVX5124VNNIW, + * AVX512VPOPCNTDQ, AVX512VL, AVX512BW, AVX512DQ, AVX512VNNI, + * AVX512IFMA, AVX512VBMI, AVX512VBMI2, AVX512BITALG, + * AVX512_KNL, AVX512_KNM, AVX512_SKX, AVX512_CLX, AVX512_CNL, AVX512_ICL, + * VSX, VSX2, VSX3, + * NEON, NEON_FP16, NEON_VFPV4, ASIMD, FPHP, ASIMDHP, ASIMDDP, ASIMDFHM# + */ + if (PyDict_SetItemString(dict, "@feature@", + npy__cpu_have[NPY_CPU_FEATURE_@feature@] ? Py_True : Py_False) < 0) { + Py_DECREF(dict); + return NULL; + } + /**end repeat**/ + } + return dict; +} + +/**************************************************************** + * This section is reserved to defining @npy__cpu_init_features + * for each CPU architecture, please try to keep it clean. Ty + ****************************************************************/ + +/***************** X86 ******************/ + +#if defined(NPY_CPU_AMD64) || defined(NPY_CPU_X86) + +#ifdef _MSC_VER + #include <intrin.h> +#elif defined(__INTEL_COMPILER) + #include <immintrin.h> +#endif + +static int +npy__cpu_getxcr0(void) +{ +#if defined(_MSC_VER) || defined (__INTEL_COMPILER) + return _xgetbv(0); +#elif defined(__GNUC__) || defined(__clang__) + unsigned int eax, edx; + __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + return (eax | (unsigned long long)edx << 32); +#else + // TODO: handle other x86 compilers + return 0; +#endif +} + +static void +npy__cpu_cpuid(int reg[4], int func_id) +{ +#if defined(_MSC_VER) + __cpuidex(reg, func_id, 0); +#elif defined(__INTEL_COMPILER) + __cpuid(reg, func_id); +#elif defined(__GNUC__) || defined(__clang__) + #if defined(NPY_CPU_X86) && defined(__PIC__) + // %ebx may be the PIC register + #define NPY__CPUID_ASM \ + "xchg{l}\t{%%}ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchg{l}\t{%%}ebx, %1\n\t" + #else + #define NPY__CPUID_ASM "cpuid" + #endif + __asm__(NPY__CPUID_ASM : "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) : "a" (func_id), "c" (0) : ); +#else + // TODO: handle other x86 compilers + reg[0] = 0; +#endif +} + +static void +npy__cpu_init_features(void) +{ + memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX); + + // validate platform support + int reg[] = {0, 0, 0, 0}; + npy__cpu_cpuid(reg, 0); + if (reg[0] == 0) + return; + + npy__cpu_cpuid(reg, 1); + npy__cpu_have[NPY_CPU_FEATURE_MMX] = (reg[3] & (1 << 23)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_SSE] = (reg[3] & (1 << 25)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_SSE2] = (reg[3] & (1 << 26)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_SSE3] = (reg[2] & (1 << 0)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_SSSE3] = (reg[2] & (1 << 9)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_SSE41] = (reg[2] & (1 << 19)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_POPCNT] = (reg[2] & (1 << 23)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_SSE42] = (reg[2] & (1 << 20)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_F16C] = (reg[2] & (1 << 29)) != 0; + + // check OSXSAVE + if ((reg[2] & (1 << 27)) == 0) + return; + // check AVX OS support + int xcr = npy__cpu_getxcr0(); + if ((xcr & 6) != 6) + return; + npy__cpu_have[NPY_CPU_FEATURE_AVX] = (reg[2] & (1 << 28)) != 0; + if (!npy__cpu_have[NPY_CPU_FEATURE_AVX]) + return; + npy__cpu_have[NPY_CPU_FEATURE_FMA3] = (reg[2] & (1 << 12)) != 0; + + // second call to the cpuid to get extended AMD feature bits + npy__cpu_cpuid(reg, 0x80000001); + npy__cpu_have[NPY_CPU_FEATURE_XOP] = (reg[2] & (1 << 11)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_FMA4] = (reg[2] & (1 << 16)) != 0; + + // third call to the cpuid to get extended AVX2 & AVX512 feature bits + npy__cpu_cpuid(reg, 7); + npy__cpu_have[NPY_CPU_FEATURE_AVX2] = (reg[1] & (1 << 5)) != 0; + if (!npy__cpu_have[NPY_CPU_FEATURE_AVX2]) + return; + // detect AVX2 & FMA3 + npy__cpu_have[NPY_CPU_FEATURE_FMA] = npy__cpu_have[NPY_CPU_FEATURE_FMA3]; + + // check AVX512 OS support + if ((xcr & 0xe6) != 0xe6) + return; + npy__cpu_have[NPY_CPU_FEATURE_AVX512F] = (reg[1] & (1 << 16)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512CD] = (reg[1] & (1 << 28)) != 0; + if (npy__cpu_have[NPY_CPU_FEATURE_AVX512F] && npy__cpu_have[NPY_CPU_FEATURE_AVX512CD]) { + // Knights Landing + npy__cpu_have[NPY_CPU_FEATURE_AVX512PF] = (reg[1] & (1 << 26)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512ER] = (reg[1] & (1 << 27)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512_KNL] = npy__cpu_have[NPY_CPU_FEATURE_AVX512ER] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512PF]; + // Knights Mill + npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ] = (reg[2] & (1 << 14)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX5124VNNIW] = (reg[3] & (1 << 2)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX5124FMAPS] = (reg[3] & (1 << 3)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512_KNM] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_KNL] && + npy__cpu_have[NPY_CPU_FEATURE_AVX5124FMAPS] && + npy__cpu_have[NPY_CPU_FEATURE_AVX5124VNNIW] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ]; + + // Skylake-X + npy__cpu_have[NPY_CPU_FEATURE_AVX512DQ] = (reg[1] & (1 << 17)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512BW] = (reg[1] & (1 << 30)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512VL] = (reg[1] & (1 << 31)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512_SKX] = npy__cpu_have[NPY_CPU_FEATURE_AVX512BW] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512DQ] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512VL]; + // Cascade Lake + npy__cpu_have[NPY_CPU_FEATURE_AVX512VNNI] = (reg[2] & (1 << 11)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512_CLX] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_SKX] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512VNNI]; + + // Cannon Lake + npy__cpu_have[NPY_CPU_FEATURE_AVX512IFMA] = (reg[1] & (1 << 21)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI] = (reg[2] & (1 << 1)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512_CNL] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_SKX] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512IFMA] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI]; + // Ice Lake + npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI2] = (reg[2] & (1 << 6)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512BITALG] = (reg[2] & (1 << 12)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512_ICL] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_CLX] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512_CNL] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI2] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512BITALG] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ]; + } +} + +/***************** POWER ******************/ + +#elif defined(NPY_CPU_PPC64) || defined(NPY_CPU_PPC64LE) + +#ifdef __linux__ + #include <sys/auxv.h> + #ifndef AT_HWCAP2 + #define AT_HWCAP2 26 + #endif + #ifndef PPC_FEATURE2_ARCH_3_00 + #define PPC_FEATURE2_ARCH_3_00 0x00800000 + #endif +#endif + +static void +npy__cpu_init_features(void) +{ + memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX); +#ifdef __linux__ + unsigned int hwcap = getauxval(AT_HWCAP); + if ((hwcap & PPC_FEATURE_HAS_VSX) == 0) + return; + + hwcap = getauxval(AT_HWCAP2); + if (hwcap & PPC_FEATURE2_ARCH_3_00) + { + npy__cpu_have[NPY_CPU_FEATURE_VSX] = + npy__cpu_have[NPY_CPU_FEATURE_VSX2] = + npy__cpu_have[NPY_CPU_FEATURE_VSX3] = 1; + return; + } + npy__cpu_have[NPY_CPU_FEATURE_VSX2] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0; + npy__cpu_have[NPY_CPU_FEATURE_VSX] = 1; +// TODO: AIX, FreeBSD +#else + npy__cpu_have[NPY_CPU_FEATURE_VSX] = 1; + #if defined(NPY_CPU_PPC64LE) || defined(NPY_HAVE_VSX2) + npy__cpu_have[NPY_CPU_FEATURE_VSX2] = 1; + #endif + #ifdef NPY_HAVE_VSX3 + npy__cpu_have[NPY_CPU_FEATURE_VSX3] = 1; + #endif +#endif +} + +/***************** ARM ******************/ + +#elif defined(__arm__) || defined(__aarch64__) + +static NPY_INLINE void +npy__cpu_init_features_arm8(void) +{ + npy__cpu_have[NPY_CPU_FEATURE_NEON] = + npy__cpu_have[NPY_CPU_FEATURE_NEON_FP16] = + npy__cpu_have[NPY_CPU_FEATURE_NEON_VFPV4] = + npy__cpu_have[NPY_CPU_FEATURE_ASIMD] = 1; +} + +#ifdef __linux__ +/* + * we aren't sure of what kind kernel or clib we deal with + * so we play it safe +*/ +#include <stdio.h> +#include <fcntl.h> + +#define NPY__HWCAP 16 +#define NPY__HWCAP2 26 + +// arch/arm/include/uapi/asm/hwcap.h +#define NPY__HWCAP_HALF (1 << 1) +#define NPY__HWCAP_NEON (1 << 12) +#define NPY__HWCAP_VFPv3 (1 << 13) +#define NPY__HWCAP_VFPv4 (1 << 16) +#define NPY__HWCAP2_AES (1 << 0) +#define NPY__HWCAP2_PMULL (1 << 1) +#define NPY__HWCAP2_SHA1 (1 << 2) +#define NPY__HWCAP2_SHA2 (1 << 3) +#define NPY__HWCAP2_CRC32 (1 << 4) +// arch/arm64/include/uapi/asm/hwcap.h +#define NPY__HWCAP_FP (1 << 0) +#define NPY__HWCAP_ASIMD (1 << 1) +#define NPY__HWCAP_FPHP (1 << 9) +#define NPY__HWCAP_ASIMDHP (1 << 10) +#define NPY__HWCAP_ASIMDDP (1 << 20) +#define NPY__HWCAP_ASIMDFHM (1 << 23) + +__attribute__((weak)) unsigned long getauxval(unsigned long); // linker should handle it +static int +npy__cpu_init_features_linux(void) +{ + unsigned long hwcap = 0, hwcap2 = 0; + if (getauxval != 0) { + hwcap = getauxval(NPY__HWCAP); + #ifdef __arm__ + hwcap2 = getauxval(NPY__HWCAP2); + #endif + } else { + unsigned long auxv[2]; + int fd = open("/proc/self/auxv", O_RDONLY); + if (fd >= 0) { + while (read(fd, &auxv, sizeof(auxv)) == sizeof(auxv)) { + if (auxv[0] == NPY__HWCAP) { + hwcap = auxv[1]; + } + #ifdef __arm__ + else if (auxv[0] == NPY__HWCAP2) { + hwcap2 = auxv[1]; + } + #endif + // detect the end + else if (auxv[0] == 0 && auxv[1] == 0) { + break; + } + } + close(fd); + } + } + if (hwcap == 0 && hwcap2 == 0) { + /* + * FIXME: failback to compiler definitions, + * BTW we can parse /proc/cpuinfo for badly patched kernels + */ + return 0; + } +#ifdef __arm__ + // Detect Arm8 (aarch32 state) + if ((hwcap2 & NPY__HWCAP2_AES) || (hwcap2 & NPY__HWCAP2_SHA1) || + (hwcap2 & NPY__HWCAP2_SHA2) || (hwcap2 & NPY__HWCAP2_PMULL) || + (hwcap2 & NPY__HWCAP2_CRC32)) +#else + if (1) +#endif + { + if (!(hwcap & (NPY__HWCAP_FP | NPY__HWCAP_ASIMD))) { + // Is this could happen? maybe disabled by kernel + // BTW this will break the baseline of AARCH64 + return 1; + } + npy__cpu_have[NPY_CPU_FEATURE_FPHP] = (hwcap & NPY__HWCAP_FPHP) != 0; + npy__cpu_have[NPY_CPU_FEATURE_ASIMDHP] = (hwcap & NPY__HWCAP_ASIMDHP) != 0; + npy__cpu_have[NPY_CPU_FEATURE_ASIMDDP] = (hwcap & NPY__HWCAP_ASIMDDP) != 0; + npy__cpu_have[NPY_CPU_FEATURE_ASIMDFHM] = (hwcap & NPY__HWCAP_ASIMDFHM) != 0; + npy__cpu_init_features_arm8(); + } else { + npy__cpu_have[NPY_CPU_FEATURE_NEON] = (hwcap & NPY__HWCAP_NEON) != 0; + npy__cpu_have[NPY_CPU_FEATURE_NEON_FP16] = (hwcap & (NPY__HWCAP_NEON | NPY__HWCAP_VFPv3 | + NPY__HWCAP_HALF)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_NEON_VFPV4] = (hwcap & (NPY__HWCAP_NEON | NPY__HWCAP_VFPv4)) != 0; + } + return 1; +} +#endif + +static void +npy__cpu_init_features(void) +{ + memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX); +#ifdef __linux__ + if (npy__cpu_init_features_linux()) + return; +#endif + // We have nothing else todo +#if defined(NPY_HAVE_NEON_ARM8) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8) + #if defined(NPY_HAVE_FPHP) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + npy__cpu_have[NPY_CPU_FEATURE_FPHP] = 1; + #endif + #if defined(NPY_HAVE_ASIMDHP) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + npy__cpu_have[NPY_CPU_FEATURE_ASIMDHP] = 1; + #endif + #if defined(NPY_HAVE_ASIMDDP) || defined(__ARM_FEATURE_DOTPROD) + npy__cpu_have[NPY_CPU_FEATURE_ASIMDDP] = 1; + #endif + #if defined(NPY_HAVE_ASIMDFHM) || defined(__ARM_FEATURE_FP16FML) + npy__cpu_have[NPY_CPU_FEATURE_ASIMDFHM] = 1; + #endif + npy__cpu_init_features_arm8(); +#else + #if defined(NPY_HAVE_NEON) || defined(__ARM_NEON__) + npy__cpu_have[NPY_CPU_FEATURE_NEON] = 1; + #endif + #if defined(NPY_HAVE_NEON_FP16) || defined(__ARM_FP16_FORMAT_IEEE) || (defined(__ARM_FP) && (__ARM_FP & 2)) + npy__cpu_have[NPY_CPU_FEATURE_NEON_FP16] = npy__cpu_have[NPY_CPU_FEATURE_NEON]; + #endif + #if defined(NPY_HAVE_NEON_VFPV4) || defined(__ARM_FEATURE_FMA) + npy__cpu_have[NPY_CPU_FEATURE_NEON_VFPV4] = npy__cpu_have[NPY_CPU_FEATURE_NEON]; + #endif +#endif +} + +/*********** Unsupported ARCH ***********/ +#else +static void +npy__cpu_init_features(void) +{ +} +#endif diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h new file mode 100644 index 000000000..0e8901328 --- /dev/null +++ b/numpy/core/src/common/npy_cpu_features.h @@ -0,0 +1,117 @@ +#ifndef _NPY_CPU_FEATURES_H_ +#define _NPY_CPU_FEATURES_H_ + +#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN +#include <Python.h> // for PyObject + +#ifdef __cplusplus +extern "C" { +#endif + +enum npy_cpu_features +{ + NPY_CPU_FEATURE_NONE = 0, + // X86 + NPY_CPU_FEATURE_MMX = 1, + NPY_CPU_FEATURE_SSE = 2, + NPY_CPU_FEATURE_SSE2 = 3, + NPY_CPU_FEATURE_SSE3 = 4, + NPY_CPU_FEATURE_SSSE3 = 5, + NPY_CPU_FEATURE_SSE41 = 6, + NPY_CPU_FEATURE_POPCNT = 7, + NPY_CPU_FEATURE_SSE42 = 8, + NPY_CPU_FEATURE_AVX = 9, + NPY_CPU_FEATURE_F16C = 10, + NPY_CPU_FEATURE_XOP = 11, + NPY_CPU_FEATURE_FMA4 = 12, + NPY_CPU_FEATURE_FMA3 = 13, + NPY_CPU_FEATURE_AVX2 = 14, + NPY_CPU_FEATURE_FMA = 15, // AVX2 & FMA3, provides backward compatibility + + NPY_CPU_FEATURE_AVX512F = 30, + NPY_CPU_FEATURE_AVX512CD = 31, + NPY_CPU_FEATURE_AVX512ER = 32, + NPY_CPU_FEATURE_AVX512PF = 33, + NPY_CPU_FEATURE_AVX5124FMAPS = 34, + NPY_CPU_FEATURE_AVX5124VNNIW = 35, + NPY_CPU_FEATURE_AVX512VPOPCNTDQ = 36, + NPY_CPU_FEATURE_AVX512BW = 37, + NPY_CPU_FEATURE_AVX512DQ = 38, + NPY_CPU_FEATURE_AVX512VL = 39, + NPY_CPU_FEATURE_AVX512IFMA = 40, + NPY_CPU_FEATURE_AVX512VBMI = 41, + NPY_CPU_FEATURE_AVX512VNNI = 42, + NPY_CPU_FEATURE_AVX512VBMI2 = 43, + NPY_CPU_FEATURE_AVX512BITALG = 44, + + // X86 CPU Groups + // Knights Landing (F,CD,ER,PF) + NPY_CPU_FEATURE_AVX512_KNL = 101, + // Knights Mill (F,CD,ER,PF,4FMAPS,4VNNIW,VPOPCNTDQ) + NPY_CPU_FEATURE_AVX512_KNM = 102, + // Skylake-X (F,CD,BW,DQ,VL) + NPY_CPU_FEATURE_AVX512_SKX = 103, + // Cascade Lake (F,CD,BW,DQ,VL,VNNI) + NPY_CPU_FEATURE_AVX512_CLX = 104, + // Cannon Lake (F,CD,BW,DQ,VL,IFMA,VBMI) + NPY_CPU_FEATURE_AVX512_CNL = 105, + // Ice Lake (F,CD,BW,DQ,VL,IFMA,VBMI,VNNI,VBMI2,BITALG,VPOPCNTDQ) + NPY_CPU_FEATURE_AVX512_ICL = 106, + + // IBM/POWER VSX + // POWER7 + NPY_CPU_FEATURE_VSX = 200, + // POWER8 + NPY_CPU_FEATURE_VSX2 = 201, + // POWER9 + NPY_CPU_FEATURE_VSX3 = 202, + + // ARM + NPY_CPU_FEATURE_NEON = 300, + NPY_CPU_FEATURE_NEON_FP16 = 301, + // FMA + NPY_CPU_FEATURE_NEON_VFPV4 = 302, + // Advanced SIMD + NPY_CPU_FEATURE_ASIMD = 303, + // ARMv8.2 half-precision + NPY_CPU_FEATURE_FPHP = 304, + // ARMv8.2 half-precision vector arithm + NPY_CPU_FEATURE_ASIMDHP = 305, + // ARMv8.2 dot product + NPY_CPU_FEATURE_ASIMDDP = 306, + // ARMv8.2 single&half-precision multiply + NPY_CPU_FEATURE_ASIMDFHM = 307, + + NPY_CPU_FEATURE_MAX +}; + +/* + * Initialize CPU features + * return 0 on success otherwise return -1 +*/ +NPY_VISIBILITY_HIDDEN int +npy_cpu_init(void); + +/* + * return 0 if CPU feature isn't available + * note: `npy_cpu_init` must be called first otherwise it will always return 0 +*/ +NPY_VISIBILITY_HIDDEN int +npy_cpu_have(int feature_id); + +#define NPY_CPU_HAVE(FEATURE_NAME) \ +npy_cpu_have(NPY_CPU_FEATURE_##FEATURE_NAME) + +/* + * return a new dictionary contains CPU feature names + * with runtime availability. + * same as npy_cpu_have, `npy_cpu_init` must be called first. + */ +NPY_VISIBILITY_HIDDEN PyObject * +npy_cpu_features_dict(void); + +#ifdef __cplusplus +} +#endif + +#endif // _NPY_CPU_FEATURES_H_ diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c index cf8f9ddae..16896aa12 100644 --- a/numpy/core/src/multiarray/arrayobject.c +++ b/numpy/core/src/multiarray/arrayobject.c @@ -1123,7 +1123,7 @@ _void_compare(PyArrayObject *self, PyArrayObject *other, int cmp_op) op = (cmp_op == Py_EQ ? n_ops.logical_and : n_ops.logical_or); while (PyDict_Next(PyArray_DESCR(self)->fields, &pos, &key, &value)) { - if NPY_TITLE_KEY(key, value) { + if (NPY_TITLE_KEY(key, value)) { continue; } a = array_subscript_asarray(self, key); diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index 61270dbef..ce288d62e 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -2290,6 +2290,7 @@ static void STRING_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride, npy_intp n, int NPY_UNUSED(swap), PyArrayObject *arr) { + assert(arr != NULL); if (arr == NULL) { return; } @@ -2304,6 +2305,7 @@ VOID_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride, { PyArray_Descr *descr; + assert(arr != NULL); if (arr == NULL) { return; } @@ -2394,6 +2396,7 @@ VOID_copyswap (char *dst, char *src, int swap, PyArrayObject *arr) { PyArray_Descr *descr; + assert(arr != NULL); if (arr == NULL) { return; } @@ -2475,6 +2478,7 @@ UNICODE_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride, { int itemsize; + assert(arr != NULL); if (arr == NULL) { return; } @@ -2502,6 +2506,7 @@ UNICODE_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride, static void STRING_copyswap(char *dst, char *src, int NPY_UNUSED(swap), PyArrayObject *arr) { + assert(arr != NULL); if (arr == NULL) { return; } @@ -2514,6 +2519,7 @@ UNICODE_copyswap (char *dst, char *src, int swap, PyArrayObject *arr) { int itemsize; + assert(arr != NULL); if (arr == NULL) { return; } diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c index bfbf67ff9..215c8b0ab 100644 --- a/numpy/core/src/multiarray/descriptor.c +++ b/numpy/core/src/multiarray/descriptor.c @@ -1998,7 +1998,7 @@ _arraydescr_isnative(PyArray_Descr *self) int offset; Py_ssize_t pos = 0; while (PyDict_Next(self->fields, &pos, &key, &value)) { - if NPY_TITLE_KEY(key, value) { + if (NPY_TITLE_KEY(key, value)) { continue; } if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset, &title)) { @@ -2508,7 +2508,7 @@ _descr_find_object(PyArray_Descr *self) Py_ssize_t pos = 0; while (PyDict_Next(self->fields, &pos, &key, &value)) { - if NPY_TITLE_KEY(key, value) { + if (NPY_TITLE_KEY(key, value)) { continue; } if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset, &title)) { @@ -2982,7 +2982,7 @@ PyArray_DescrNewByteorder(PyArray_Descr *self, char newendian) newfields = PyDict_New(); /* make new dictionary with replaced PyArray_Descr Objects */ while (PyDict_Next(self->fields, &pos, &key, &value)) { - if NPY_TITLE_KEY(key, value) { + if (NPY_TITLE_KEY(key, value)) { continue; } if (!PyUString_Check(key) || !PyTuple_Check(value) || diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c index 7b9aa4794..ebdf8a4cd 100644 --- a/numpy/core/src/multiarray/methods.c +++ b/numpy/core/src/multiarray/methods.c @@ -1503,7 +1503,7 @@ _deepcopy_call(char *iptr, char *optr, PyArray_Descr *dtype, int offset; Py_ssize_t pos = 0; while (PyDict_Next(dtype->fields, &pos, &key, &value)) { - if NPY_TITLE_KEY(key, value) { + if (NPY_TITLE_KEY(key, value)) { continue; } if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset, diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index c2e597385..7792fcdcb 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -4388,6 +4388,11 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) { PyObject *m, *d, *s; PyObject *c_api; + /* Initialize CPU features */ + if (npy_cpu_init() < 0) { + goto err; + } + /* Create the module and add the functions */ m = PyModule_Create(&moduledef); if (!m) { @@ -4513,6 +4518,16 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) { PyDict_SetItemString(d, "__version__", s); Py_DECREF(s); + s = npy_cpu_features_dict(); + if (s == NULL) { + goto err; + } + if (PyDict_SetItemString(d, "__cpu_features__", s) < 0) { + Py_DECREF(s); + goto err; + } + Py_DECREF(s); + s = NpyCapsule_FromVoidPtr((void *)_datetime_strings, NULL); if (s == NULL) { goto err; diff --git a/numpy/core/src/multiarray/refcount.c b/numpy/core/src/multiarray/refcount.c index 6033929d9..c869b5eea 100644 --- a/numpy/core/src/multiarray/refcount.c +++ b/numpy/core/src/multiarray/refcount.c @@ -46,7 +46,7 @@ PyArray_Item_INCREF(char *data, PyArray_Descr *descr) Py_ssize_t pos = 0; while (PyDict_Next(descr->fields, &pos, &key, &value)) { - if NPY_TITLE_KEY(key, value) { + if (NPY_TITLE_KEY(key, value)) { continue; } if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset, @@ -108,7 +108,7 @@ PyArray_Item_XDECREF(char *data, PyArray_Descr *descr) Py_ssize_t pos = 0; while (PyDict_Next(descr->fields, &pos, &key, &value)) { - if NPY_TITLE_KEY(key, value) { + if (NPY_TITLE_KEY(key, value)) { continue; } if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset, @@ -318,7 +318,7 @@ _fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype) Py_ssize_t pos = 0; while (PyDict_Next(dtype->fields, &pos, &key, &value)) { - if NPY_TITLE_KEY(key, value) { + if (NPY_TITLE_KEY(key, value)) { continue; } if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset, &title)) { diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c index 127ac5134..30507112d 100644 --- a/numpy/core/src/multiarray/shape.c +++ b/numpy/core/src/multiarray/shape.c @@ -317,7 +317,7 @@ _putzero(char *optr, PyObject *zero, PyArray_Descr *dtype) int offset; Py_ssize_t pos = 0; while (PyDict_Next(dtype->fields, &pos, &key, &value)) { - if NPY_TITLE_KEY(key, value) { + if (NPY_TITLE_KEY(key, value)) { continue; } if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset, &title)) { diff --git a/numpy/core/src/umath/cpuid.c b/numpy/core/src/umath/cpuid.c deleted file mode 100644 index 72c6493e8..000000000 --- a/numpy/core/src/umath/cpuid.c +++ /dev/null @@ -1,97 +0,0 @@ -#define _UMATHMODULE -#define _MULTIARRAYMODULE -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#include <Python.h> - -#include "npy_config.h" - -#include "cpuid.h" - -#define XCR_XFEATURE_ENABLED_MASK 0x0 -#define XSTATE_SSE 0x2 -#define XSTATE_YMM 0x4 -#define XSTATE_ZMM 0x70 - -/* - * verify the OS supports avx instructions - * it can be disabled in some OS, e.g. with the nosavex boot option of linux - */ -static NPY_INLINE -int os_avx_support(void) -{ -#if HAVE_XGETBV - /* - * use bytes for xgetbv to avoid issues with compiler not knowing the - * instruction - */ - unsigned int eax, edx; - unsigned int ecx = XCR_XFEATURE_ENABLED_MASK; - __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx)); - return (eax & (XSTATE_SSE | XSTATE_YMM)) == (XSTATE_SSE | XSTATE_YMM); -#else - return 0; -#endif -} - -static NPY_INLINE -int os_avx512_support(void) -{ -#if HAVE_XGETBV - unsigned int eax, edx; - unsigned int ecx = XCR_XFEATURE_ENABLED_MASK; - unsigned int xcr0 = XSTATE_ZMM | XSTATE_YMM | XSTATE_SSE; - __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx)); - return (eax & xcr0) == xcr0; -#else - return 0; -#endif -} - -static NPY_INLINE -int cpu_supports_fma(void) -{ -#ifdef __x86_64__ - unsigned int feature = 0x01; - unsigned int a, b, c, d; - __asm__ volatile ( - "cpuid" "\n\t" - : "=a" (a), "=b" (b), "=c" (c), "=d" (d) - : "a" (feature)); - /* - * FMA is the 12th bit of ECX - */ - return (c >> 12) & 1; -#else - return 0; -#endif -} - -/* - * Primitive cpu feature detect function - * Currently only supports checking for avx on gcc compatible compilers. - */ -NPY_NO_EXPORT int -npy_cpu_supports(const char * feature) -{ -#ifdef HAVE___BUILTIN_CPU_SUPPORTS - if (strcmp(feature, "avx512f") == 0) { -#ifdef HAVE___BUILTIN_CPU_SUPPORTS_AVX512F - return __builtin_cpu_supports("avx512f") && os_avx512_support(); -#else - return 0; -#endif - } - else if (strcmp(feature, "fma") == 0) { - return cpu_supports_fma() && __builtin_cpu_supports("avx2") && os_avx_support(); - } - else if (strcmp(feature, "avx2") == 0) { - return __builtin_cpu_supports("avx2") && os_avx_support(); - } - else if (strcmp(feature, "avx") == 0) { - return __builtin_cpu_supports("avx") && os_avx_support(); - } -#endif - - return 0; -} diff --git a/numpy/core/src/umath/cpuid.h b/numpy/core/src/umath/cpuid.h deleted file mode 100644 index 33702ed41..000000000 --- a/numpy/core/src/umath/cpuid.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _NPY_PRIVATE__CPUID_H_ -#define _NPY_PRIVATE__CPUID_H_ - -#include <numpy/ndarraytypes.h> /* for NPY_NO_EXPORT */ - -NPY_NO_EXPORT int -npy_cpu_supports(const char * feature); - -#endif diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index b310d73ff..9b43824cb 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -2509,6 +2509,7 @@ HALF_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, * #ftype = npy_float, npy_double, npy_longdouble# * #c = f, , l# * #C = F, , L# + * #SIMD = 1, 1, 0# */ /* similar to pairwise sum of real floats */ @@ -2584,6 +2585,7 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, } } + /**begin repeat1 * arithmetic * #kind = add, subtract# @@ -2662,6 +2664,32 @@ NPY_NO_EXPORT void } } +#if @SIMD@ +NPY_NO_EXPORT void +@TYPE@_add_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func) +{ + if (IS_BINARY_REDUCE) { + @TYPE@_add(args, dimensions, steps, func); + } + else if (!run_binary_avx512f_add_@TYPE@(args, dimensions, steps)) { + @TYPE@_add(args, dimensions, steps, func); + } +} + +/**begin repeat1 + * arithmetic + * #kind = subtract, multiply# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func) +{ + if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) { + @TYPE@_@kind@(args, dimensions, steps, func); + } +} +/**end repeat1**/ +#endif + NPY_NO_EXPORT void @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -2819,6 +2847,21 @@ NPY_NO_EXPORT void } } +#if @SIMD@ +/**begin repeat1 + * arithmetic + * #kind = conjugate, square, absolute# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func) +{ + if (!run_unary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) { + @TYPE@_@kind@(args, dimensions, steps, func); + } +} +/**end repeat1**/ +#endif + NPY_NO_EXPORT void @TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 6c89627ca..e9d0b4c62 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -356,20 +356,27 @@ NPY_NO_EXPORT void * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #c = f, , l# * #C = F, , L# + * #IFSIMD = 1, 1, 0# */ /**begin repeat1 + * #isa = , _avx512f# + */ + +/**begin repeat2 * arithmetic * #kind = add, subtract# * #OP = +, -# */ + NPY_NO_EXPORT void -C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +C@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ +/**end repeat2**/ NPY_NO_EXPORT void -C@TYPE@_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +C@TYPE@_multiply@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ NPY_NO_EXPORT void C@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); @@ -409,19 +416,24 @@ C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, v /**end repeat1**/ NPY_NO_EXPORT void -C@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); NPY_NO_EXPORT void C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); +/**begin repeat1 + * #isa = , _avx512f# + */ + NPY_NO_EXPORT void -C@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void -C@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); +/**end repeat1**/ NPY_NO_EXPORT void C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); @@ -444,7 +456,6 @@ C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, v NPY_NO_EXPORT void C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat1**/ - #define C@TYPE@_true_divide C@TYPE@_divide /**end repeat**/ diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index cd485034e..7ec90f9c8 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -55,6 +55,13 @@ abs_ptrdiff(char *a, char *b) return (a > b) ? (a - b) : (b - a); } +#define IS_BINARY_STRIDE_ONE(esize, vsize) \ + ((steps[0] == esize) && \ + (steps[1] == esize) && \ + (steps[2] == esize) && \ + (abs_ptrdiff(args[2], args[0]) >= vsize) && \ + (abs_ptrdiff(args[2], args[1]) >= vsize)) + /* * stride is equal to element size and input and destination are equal or * don't overlap within one register. The check of the steps against @@ -158,6 +165,71 @@ abs_ptrdiff(char *a, char *b) /* ***************************************************************************** + ** CMPLX DISPATCHERS + ***************************************************************************** + */ + +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + * #type= npy_float, npy_double# + * #esize = 8, 16# + */ + +/**begin repeat1 + * #func = add, subtract, multiply# + */ + +#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS +static NPY_INLINE NPY_GCC_TARGET_AVX512F void +AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps); +#endif + +static NPY_INLINE int +run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) +{ +#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS + if (IS_BINARY_STRIDE_ONE(@esize@, 64)) { + AVX512F_@func@_@TYPE@(args, dimensions, steps); + return 1; + } + else + return 0; +#endif + return 0; +} + +/**end repeat1**/ + +/**begin repeat1 + * #func = square, absolute, conjugate# + * #outsize = 1, 2, 1# + * #max_stride = 2, 8, 8# + */ + +#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS +static NPY_INLINE NPY_GCC_TARGET_AVX512F void +AVX512F_@func@_@TYPE@(@type@*, @type@*, const npy_intp n, const npy_intp stride); +#endif + +static NPY_INLINE int +run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) +{ +#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS + if ((IS_OUTPUT_BLOCKABLE_UNARY((npy_uint)(@esize@/@outsize@), 64)) && (labs(steps[0]) < 2*@max_stride@*@esize@)) { + AVX512F_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]); + return 1; + } + else + return 0; +#endif + return 0; +} + +/**end repeat1**/ +/**end repeat**/ + +/* + ***************************************************************************** ** FLOAT DISPATCHERS ***************************************************************************** */ @@ -1591,9 +1663,17 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant) } /**begin repeat * #vsub = ps, pd# + * #type= npy_float, npy_double# * #epi_vsub = epi32, epi64# * #vtype = __m512, __m512d# + * #mask = __mmask16, __mmask8# * #and_const = 0x7fffffff, 0x7fffffffffffffffLL# + * #neg_mask = 0x80000000, 0x8000000000000000# + * #perm_ = 0xb1, 0x55# + * #cmpx_img_mask = 0xAAAA, 0xAA# + * #cmpx_re_mask = 0x5555, 0x55# + * #INF = NPY_INFINITYF, NPY_INFINITY# + * #NAN = NPY_NANF, NPY_NAN# */ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ avx512_abs_@vsub@(@vtype@ x) @@ -1631,6 +1711,96 @@ avx512_trunc_@vsub@(@vtype@ x) { return _mm512_roundscale_@vsub@(x, 0x0B); } + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ +avx512_hadd_@vsub@(const @vtype@ x) +{ + return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ +avx512_hsub_@vsub@(const @vtype@ x) +{ + return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ +avx512_cabsolute_@vsub@(const @vtype@ x1, + const @vtype@ x2, + const __m512i re_indices, + const __m512i im_indices) +{ + @vtype@ inf = _mm512_set1_@vsub@(@INF@); + @vtype@ nan = _mm512_set1_@vsub@(@NAN@); + @vtype@ x1_abs = avx512_abs_@vsub@(x1); + @vtype@ x2_abs = avx512_abs_@vsub@(x2); + @vtype@ re = _mm512_permutex2var_@vsub@(x1_abs, re_indices, x2_abs); + @vtype@ im = _mm512_permutex2var_@vsub@(x1_abs, im_indices , x2_abs); + /* + * If real or imag = INF, then convert it to inf + j*inf + * Handles: inf + j*nan, nan + j*inf + */ + @mask@ re_infmask = _mm512_cmp_@vsub@_mask(re, inf, _CMP_EQ_OQ); + @mask@ im_infmask = _mm512_cmp_@vsub@_mask(im, inf, _CMP_EQ_OQ); + im = _mm512_mask_mov_@vsub@(im, re_infmask, inf); + re = _mm512_mask_mov_@vsub@(re, im_infmask, inf); + + /* + * If real or imag = NAN, then convert it to nan + j*nan + * Handles: x + j*nan, nan + j*x + */ + @mask@ re_nanmask = _mm512_cmp_@vsub@_mask(re, re, _CMP_NEQ_UQ); + @mask@ im_nanmask = _mm512_cmp_@vsub@_mask(im, im, _CMP_NEQ_UQ); + im = _mm512_mask_mov_@vsub@(im, re_nanmask, nan); + re = _mm512_mask_mov_@vsub@(re, im_nanmask, nan); + + @vtype@ larger = _mm512_max_@vsub@(re, im); + @vtype@ smaller = _mm512_min_@vsub@(im, re); + + /* + * Calculate div_mask to prevent 0./0. and inf/inf operations in div + */ + @mask@ zeromask = _mm512_cmp_@vsub@_mask(larger, _mm512_setzero_@vsub@(), _CMP_EQ_OQ); + @mask@ infmask = _mm512_cmp_@vsub@_mask(smaller, inf, _CMP_EQ_OQ); + @mask@ div_mask = _mm512_knot(_mm512_kor(zeromask, infmask)); + @vtype@ ratio = _mm512_maskz_div_@vsub@(div_mask, smaller, larger); + @vtype@ hypot = _mm512_sqrt_@vsub@(_mm512_fmadd_@vsub@( + ratio, ratio, _mm512_set1_@vsub@(1.0f))); + return _mm512_mul_@vsub@(hypot, larger); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ +avx512_conjugate_@vsub@(const @vtype@ x) +{ + /* + * __mm512_mask_xor_ps/pd requires AVX512DQ. We cast it to __m512i and + * use the xor_epi32/64 uinstruction instead. Cast is a zero latency instruction + */ + __m512i cast_x = _mm512_cast@vsub@_si512(x); + __m512i res = _mm512_mask_xor_@epi_vsub@(cast_x, @cmpx_img_mask@, + cast_x, _mm512_set1_@epi_vsub@(@neg_mask@)); + return _mm512_castsi512_@vsub@(res); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ +avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2) +{ + // x1 = r1, i1 + // x2 = r2, i2 + @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2 + @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2 + @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1 + @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2 + @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2 + return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ +avx512_csquare_@vsub@(@vtype@ x) +{ + return avx512_cmul_@vsub@(x, x); +} + /**end repeat**/ #endif @@ -2450,6 +2620,184 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void #endif /**end repeat**/ +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + * #type = npy_float, npy_double# + * #num_lanes = 16, 8# + * #vsuffix = ps, pd# + * #epi_vsub = epi32, epi64# + * #mask = __mmask16, __mmask8# + * #vtype = __m512, __m512d# + * #scale = 4, 8# + * #vindextype = __m512i, __m256i# + * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256# + * #storemask = 0xFF, 0xF# + * #IS_FLOAT = 1, 0# + */ + +/**begin repeat1 + * #func = add, subtract, multiply# + * #vectorf = _mm512_add, _mm512_sub, avx512_cmul# + */ + +#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS +static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void +AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) +{ + const npy_intp array_size = dimensions[0]; + npy_intp num_remaining_elements = 2*array_size; + @type@* ip1 = (@type@*) args[0]; + @type@* ip2 = (@type@*) args[1]; + @type@* op = (@type@*) args[2]; + + @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); + + while (num_remaining_elements > 0) { + if (num_remaining_elements < @num_lanes@) { + load_mask = avx512_get_partial_load_mask_@vsuffix@( + num_remaining_elements, @num_lanes@); + } + @vtype@ x1, x2; + x1 = avx512_masked_load_@vsuffix@(load_mask, ip1); + x2 = avx512_masked_load_@vsuffix@(load_mask, ip2); + + @vtype@ out = @vectorf@_@vsuffix@(x1, x2); + + _mm512_mask_storeu_@vsuffix@(op, load_mask, out); + + ip1 += @num_lanes@; + ip2 += @num_lanes@; + op += @num_lanes@; + num_remaining_elements -= @num_lanes@; + } +} +#endif +/**end repeat1**/ + +/**begin repeat1 + * #func = square, conjugate# + * #vectorf = avx512_csquare, avx512_conjugate# + */ + +#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS +static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void +AVX512F_@func@_@TYPE@(@type@ * op, + @type@ * ip, + const npy_intp array_size, + const npy_intp steps) +{ + npy_intp num_remaining_elements = 2*array_size; + const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2; + + /* + * Note: while generally indices are npy_intp, we ensure that our maximum index + * will fit in an int32 as a precondition for this function via max_stride + */ + npy_int32 index_ip1[16]; + for (npy_int32 ii = 0; ii < @num_lanes@; ii=ii+2) { + index_ip1[ii] = ii*stride_ip1; + index_ip1[ii+1] = ii*stride_ip1 + 1; + } + @vindextype@ vindex = @vindexload@((@vindextype@*)index_ip1); + @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); + @vtype@ zeros = _mm512_setzero_@vsuffix@(); + + while (num_remaining_elements > 0) { + if (num_remaining_elements < @num_lanes@) { + load_mask = avx512_get_partial_load_mask_@vsuffix@( + num_remaining_elements, @num_lanes@); + } + @vtype@ x1; + if (stride_ip1 == 1) { + x1 = avx512_masked_load_@vsuffix@(load_mask, ip); + } + else { + x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex, load_mask); + } + + @vtype@ out = @vectorf@_@vsuffix@(x1); + + _mm512_mask_storeu_@vsuffix@(op, load_mask, out); + op += @num_lanes@; + ip += @num_lanes@*stride_ip1; + num_remaining_elements -= @num_lanes@; + } +} +#endif +/**end repeat1**/ + +#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS +static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void +AVX512F_absolute_@TYPE@(@type@ * op, + @type@ * ip, + const npy_intp array_size, + const npy_intp steps) +{ + npy_intp num_remaining_elements = 2*array_size; + const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2; + + /* + * Note: while generally indices are npy_intp, we ensure that our maximum index + * will fit in an int32 as a precondition for this function via max_stride + */ + npy_int32 index_ip[32]; + for (npy_int32 ii = 0; ii < 2*@num_lanes@; ii=ii+2) { + index_ip[ii] = ii*stride_ip1; + index_ip[ii+1] = ii*stride_ip1 + 1; + } + @vindextype@ vindex1 = @vindexload@((@vindextype@*)index_ip); + @vindextype@ vindex2 = @vindexload@((@vindextype@*)(index_ip+@num_lanes@)); + + @mask@ load_mask1 = avx512_get_full_load_mask_@vsuffix@(); + @mask@ load_mask2 = avx512_get_full_load_mask_@vsuffix@(); + @mask@ store_mask = avx512_get_full_load_mask_@vsuffix@(); + @vtype@ zeros = _mm512_setzero_@vsuffix@(); + +#if @IS_FLOAT@ + __m512i re_index = _mm512_set_epi32(30,28,26,24,22,20,18,16,14,12,10,8,6,4,2,0); + __m512i im_index = _mm512_set_epi32(31,29,27,25,23,21,19,17,15,13,11,9,7,5,3,1); +#else + __m512i re_index = _mm512_set_epi64(14,12,10,8,6,4,2,0); + __m512i im_index = _mm512_set_epi64(15,13,11,9,7,5,3,1); +#endif + + while (num_remaining_elements > 0) { + if (num_remaining_elements < @num_lanes@) { + load_mask1 = avx512_get_partial_load_mask_@vsuffix@( + num_remaining_elements, @num_lanes@); + load_mask2 = 0x0000; + store_mask = avx512_get_partial_load_mask_@vsuffix@( + num_remaining_elements/2, @num_lanes@); + } else if (num_remaining_elements < 2*@num_lanes@) { + load_mask1 = avx512_get_full_load_mask_@vsuffix@(); + load_mask2 = avx512_get_partial_load_mask_@vsuffix@( + num_remaining_elements - @num_lanes@, @num_lanes@); + store_mask = avx512_get_partial_load_mask_@vsuffix@( + num_remaining_elements/2, @num_lanes@); + } + @vtype@ x1, x2; + if (stride_ip1 == 1) { + x1 = avx512_masked_load_@vsuffix@(load_mask1, ip); + x2 = avx512_masked_load_@vsuffix@(load_mask2, ip+@num_lanes@); + } + else { + x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex1, load_mask1); + x2 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex2, load_mask2); + } + + @vtype@ out = avx512_cabsolute_@vsuffix@(x1, x2, re_index, im_index); + + _mm512_mask_storeu_@vsuffix@(op, store_mask, out); + op += @num_lanes@; + ip += 2*@num_lanes@*stride_ip1; + num_remaining_elements -= 2*@num_lanes@; + } + npy_clear_floatstatus_barrier((char*)op); +} + +#endif +/**end repeat**/ + /* ***************************************************************************** ** BOOL LOOPS diff --git a/numpy/core/tests/test_cpu_features.py b/numpy/core/tests/test_cpu_features.py new file mode 100644 index 000000000..3b5cb3157 --- /dev/null +++ b/numpy/core/tests/test_cpu_features.py @@ -0,0 +1,104 @@ +import sys, platform, re, pytest + +from numpy.testing import assert_equal +from numpy.core._multiarray_umath import __cpu_features__ + +class AbstractTest(object): + features = [] + features_groups = {} + features_map = {} + features_flags = set() + + def load_flags(self): + # a hook + pass + + def test_features(self): + self.load_flags() + for gname, features in self.features_groups.items(): + test_features = [self.features_map.get(f, f) in self.features_flags for f in features] + assert_equal(__cpu_features__.get(gname), all(test_features)) + + for feature_name in self.features: + map_name = self.features_map.get(feature_name, feature_name) + cpu_have = map_name in self.features_flags + npy_have = __cpu_features__.get(feature_name) + assert_equal(npy_have, cpu_have) + + def load_flags_proc(self, magic_key): + with open('/proc/cpuinfo') as fd: + for line in fd: + if not line.startswith(magic_key): + continue + flags_value = [s.strip() for s in line.split(':', 1)] + if len(flags_value) == 2: + self.features_flags = self.features_flags.union(flags_value[1].upper().split()) + + def load_flags_auxv(self): + import subprocess + auxv = subprocess.check_output(['/bin/true'], env=dict(LD_SHOW_AUXV="1")) + for at in auxv.split(b'\n'): + if not at.startswith(b"AT_HWCAP"): + continue + hwcap_value = [s.strip() for s in at.split(b':', 1)] + if len(hwcap_value) == 2: + self.features_flags = self.features_flags.union( + hwcap_value[1].upper().decode().split() + ) + +is_linux = sys.platform.startswith('linux') +machine = platform.machine() +is_x86 = re.match("^(amd64|x86|i386|i686)", machine, re.IGNORECASE) +@pytest.mark.skipif(not is_linux or not is_x86, reason="Only for Linux and x86") +class Test_X86_Features(AbstractTest): + features = [ + "MMX", "SSE", "SSE2", "SSE3", "SSSE3", "SSE41", "POPCNT", "SSE42", + "AVX", "F16C", "XOP", "FMA4", "FMA3", "AVX2", "AVX512F", "AVX512CD", + "AVX512ER", "AVX512PF", "AVX5124FMAPS", "AVX5124VNNIW", "AVX512VPOPCNTDQ", + "AVX512VL", "AVX512BW", "AVX512DQ", "AVX512VNNI", "AVX512IFMA", + "AVX512VBMI", "AVX512VBMI2", "AVX512BITALG", + ] + features_groups = dict( + AVX512_KNL = ["AVX512F", "AVX512CD", "AVX512ER", "AVX512PF"], + AVX512_KNM = ["AVX512F", "AVX512CD", "AVX512ER", "AVX512PF", "AVX5124FMAPS", + "AVX5124VNNIW", "AVX512VPOPCNTDQ"], + AVX512_SKX = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ", "AVX512VL"], + AVX512_CLX = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ", "AVX512VL", "AVX512VNNI"], + AVX512_CNL = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ", "AVX512VL", "AVX512IFMA", + "AVX512VBMI"], + AVX512_ICL = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ", "AVX512VL", "AVX512IFMA", + "AVX512VBMI", "AVX512VNNI", "AVX512VBMI2", "AVX512BITALG", "AVX512VPOPCNTDQ"], + ) + features_map = dict( + SSE3="PNI", SSE41="SSE4_1", SSE42="SSE4_2", FMA3="FMA", + AVX512VNNI="AVX512_VNNI", AVX512BITALG="AVX512_BITALG", AVX512VBMI2="AVX512_VBMI2", + AVX5124FMAPS="AVX512_4FMAPS", AVX5124VNNIW="AVX512_4VNNIW", AVX512VPOPCNTDQ="AVX512_VPOPCNTDQ", + ) + def load_flags(self): + self.load_flags_proc("flags") + +is_power = re.match("^(powerpc|ppc)64", machine, re.IGNORECASE) +@pytest.mark.skipif(not is_linux or not is_power, reason="Only for Linux and Power") +class Test_POWER_Features(AbstractTest): + features = ["VSX", "VSX2", "VSX3"] + features_map = dict(VSX2="ARCH_2_07", VSX3="ARCH_3_00") + + def load_flags(self): + self.load_flags_auxv() + +is_arm = re.match("^(arm|aarch64)", machine, re.IGNORECASE) +@pytest.mark.skipif(not is_linux or not is_arm, reason="Only for Linux and ARM") +class Test_ARM_Features(AbstractTest): + features = [ + "NEON", "ASIMD", "FPHP", "ASIMDHP", "ASIMDDP", "ASIMDFHM" + ] + features_groups = dict( + NEON_FP16 = ["NEON", "HALF"], + NEON_VFPV4 = ["NEON", "VFPV4"], + ) + def load_flags(self): + self.load_flags_proc("Features") + if re.match("^(aarch64|AARCH64)", platform.machine()): + self.features_map = dict( + NEON="ASIMD", HALF="ASIMD", VFPV4="ASIMD" + ) diff --git a/numpy/core/tests/test_umath_complex.py b/numpy/core/tests/test_umath_complex.py index 5e5ced85c..a21158420 100644 --- a/numpy/core/tests/test_umath_complex.py +++ b/numpy/core/tests/test_umath_complex.py @@ -6,7 +6,7 @@ import numpy as np # import the c-extension module directly since _arg is not exported via umath import numpy.core._multiarray_umath as ncu from numpy.testing import ( - assert_raises, assert_equal, assert_array_equal, assert_almost_equal + assert_raises, assert_equal, assert_array_equal, assert_almost_equal, assert_array_max_ulp ) # TODO: branch cuts (use Pauli code) @@ -540,3 +540,40 @@ def check_complex_value(f, x1, y1, x2, y2, exact=True): assert_equal(f(z1), z2) else: assert_almost_equal(f(z1), z2) + +class TestSpecialComplexAVX(object): + @pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4]) + @pytest.mark.parametrize("astype", [np.complex64, np.complex128]) + def test_array(self, stride, astype): + arr = np.array([np.complex(np.nan , np.nan), + np.complex(np.nan , np.inf), + np.complex(np.inf , np.nan), + np.complex(np.inf , np.inf), + np.complex(0. , np.inf), + np.complex(np.inf , 0.), + np.complex(0. , 0.), + np.complex(0. , np.nan), + np.complex(np.nan , 0.)], dtype=astype) + abs_true = np.array([np.nan, np.inf, np.inf, np.inf, np.inf, np.inf, 0., np.nan, np.nan], dtype=arr.real.dtype) + sq_true = np.array([np.complex(np.nan, np.nan), + np.complex(np.nan, np.nan), + np.complex(np.nan, np.nan), + np.complex(np.nan, np.inf), + np.complex(-np.inf, np.nan), + np.complex(np.inf, np.nan), + np.complex(0., 0.), + np.complex(np.nan, np.nan), + np.complex(np.nan, np.nan)], dtype=astype) + assert_equal(np.abs(arr[::stride]), abs_true[::stride]) + with np.errstate(invalid='ignore'): + assert_equal(np.square(arr[::stride]), sq_true[::stride]) + +class TestComplexAbsoluteAVX(object): + @pytest.mark.parametrize("arraysize", [1,2,3,4,5,6,7,8,9,10,11,13,15,17,18,19]) + @pytest.mark.parametrize("stride", [-4,-3,-2,-1,1,2,3,4]) + @pytest.mark.parametrize("astype", [np.complex64, np.complex128]) + # test to ensure masking and strides work as intended in the AVX implementation + def test_array(self, arraysize, stride, astype): + arr = np.ones(arraysize, dtype=astype) + abs_true = np.ones(arraysize, dtype=arr.real.dtype) + assert_equal(np.abs(arr[::stride]), abs_true[::stride]) diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py index ad508e85d..0f2d082c5 100644 --- a/numpy/lib/arraysetops.py +++ b/numpy/lib/arraysetops.py @@ -251,9 +251,9 @@ def unique(ar, return_index=False, return_inverse=False, >>> u array([1, 2, 3, 4, 6]) >>> indices - array([0, 1, 4, ..., 1, 2, 1]) + array([0, 1, 4, 3, 1, 2, 1]) >>> u[indices] - array([1, 2, 6, ..., 2, 3, 2]) + array([1, 2, 6, 4, 2, 3, 2]) """ ar = np.asanyarray(ar) |