21 files changed, 1116 insertions, 141 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 1fd08241d..c14711d16 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -233,6 +233,7 @@ flts = 'efdg'
 fltsO = flts + O
 fltsP = flts + P
 cmplx = 'FDG'
+cmplxvec = 'FD'
 cmplxO = cmplx + O
 cmplxP = cmplx + P
 inexact = flts + cmplx
@@ -268,7 +269,7 @@ defdict = {
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.add'),
           'PyUFunc_AdditionTypeResolver',
-          TD(notimes_or_obj, simd=[('avx2', ints)]),
+          TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
            TypeDescription('M', FullTypeDescr, 'mM', 'M'),
@@ -279,7 +280,7 @@ defdict = {
     Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
           docstrings.get('numpy.core.umath.subtract'),
           'PyUFunc_SubtractionTypeResolver',
-          TD(ints + inexact, simd=[('avx2', ints)]),
+          TD(ints + inexact, simd=[('avx512f', cmplxvec),('avx2', ints)]),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
            TypeDescription('M', FullTypeDescr, 'MM', 'm'),
@@ -290,7 +291,7 @@ defdict = {
     Ufunc(2, 1, One,
           docstrings.get('numpy.core.umath.multiply'),
           'PyUFunc_MultiplicationTypeResolver',
-          TD(notimes_or_obj, simd=[('avx2', ints)]),
+          TD(notimes_or_obj, simd=[('avx512f', cmplxvec),('avx2', ints)]),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'qm', 'm'),
            TypeDescription('m', FullTypeDescr, 'md', 'm'),
@@ -325,7 +326,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.conjugate'),
           None,
-          TD(ints+flts+cmplx, simd=[('avx2', ints)]),
+          TD(ints+flts+cmplx, simd=[('avx2', ints), ('avx512f', cmplxvec)]),
           TD(P, f='conjugate'),
           ),
 'fmod':
@@ -340,7 +341,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.square'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'fd')]),
+          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'FDfd')]),
           TD(O, f='Py_square'),
           ),
 'reciprocal':
@@ -378,7 +379,7 @@ defdict = {
           docstrings.get('numpy.core.umath.absolute'),
           'PyUFunc_AbsoluteTypeResolver',
           TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
-          TD(cmplx, out=('f', 'd', 'g')),
+          TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')),
           TD(O, f='PyNumber_Absolute'),
           ),
 '_arg':
@@ -1014,7 +1015,7 @@ def make_arrays(funcdict):
                     for vt in t.simd:
                         code2list.append(textwrap.dedent("""\
                         #ifdef HAVE_ATTRIBUTE_TARGET_{ISA}
-                        if (npy_cpu_supports("{isa}")) {{
+                        if (NPY_CPU_HAVE({ISA})) {{
                             {fname}_functions[{idx}] = {type}_{fname}_{isa};
                         }}
                         #endif
@@ -1138,7 +1139,6 @@ def make_code(funcdict, filename):
 
         Please make changes to the code generator program (%s)
     **/
-    #include "cpuid.h"
     #include "ufunc_object.h"
     #include "ufunc_type_resolution.h"
     #include "loops.h"
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 66c1b782e..e15cbf7c2 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -745,6 +745,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'common', 'ucsnarrow.c'),
             join('src', 'common', 'ufunc_override.c'),
             join('src', 'common', 'numpyos.c'),
+            join('src', 'common', 'npy_cpu_features.c.src'),
             ]
 
     if os.environ.get('NPY_USE_BLAS_ILP64', "0") != "0":
@@ -898,7 +899,6 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'clip.c.src'),
             join('src', 'umath', 'ufunc_object.c'),
             join('src', 'umath', 'extobj.c'),
-            join('src', 'umath', 'cpuid.c'),
             join('src', 'umath', 'scalarmath.c.src'),
             join('src', 'umath', 'ufunc_type_resolution.c'),
             join('src', 'umath', 'override.c'),
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 7cac66e61..63c4a76a9 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -132,11 +132,6 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
                        ("__builtin_bswap64", '5u'),
                        ("__builtin_expect", '5, 0'),
                        ("__builtin_mul_overflow", '5, 5, (int*)5'),
-                       # broken on OSX 10.11, make sure its not optimized away
-                       ("volatile int r = __builtin_cpu_supports", '"sse"',
-                        "stdio.h", "__BUILTIN_CPU_SUPPORTS"),
-                       ("volatile int r = __builtin_cpu_supports", '"avx512f"',
-                        "stdio.h", "__BUILTIN_CPU_SUPPORTS_AVX512F"),
                        # MMX only needed for icc, but some clangs don't have it
                        ("_m_from_int64", '0', "emmintrin.h"),
                        ("_mm_load_ps", '(float*)0', "xmmintrin.h"),  # SSE
diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h
index eedfbe364..aebe241a5 100644
--- a/numpy/core/src/common/npy_config.h
+++ b/numpy/core/src/common/npy_config.h
@@ -2,6 +2,7 @@
 #define _NPY_NPY_CONFIG_H_
 
 #include "config.h"
+#include "npy_cpu_features.h"
 #include "numpy/numpyconfig.h"
 #include "numpy/npy_cpu.h"
 #include "numpy/npy_os.h"
diff --git a/numpy/core/src/common/npy_cpu_features.c.src b/numpy/core/src/common/npy_cpu_features.c.src
new file mode 100644
index 000000000..cbd99827b
--- /dev/null
+++ b/numpy/core/src/common/npy_cpu_features.c.src
@@ -0,0 +1,404 @@
+#include "npy_cpu_features.h"
+#include "numpy/npy_common.h" // for NPY_INLINE
+#include "numpy/npy_cpu.h" // To guarantee of having CPU definitions in scope.
+
+/******************** Private Definitions *********************/
+
+// Hold all CPU features boolean values
+static unsigned char npy__cpu_have[NPY_CPU_FEATURE_MAX];
+
+/******************** Private Declarations *********************/
+
+// Almost detect all CPU features in runtime
+static void
+npy__cpu_init_features(void);
+
+/******************** Public Definitions *********************/
+
+NPY_VISIBILITY_HIDDEN int
+npy_cpu_have(int feature_id)
+{
+    if (feature_id <= NPY_CPU_FEATURE_NONE || feature_id >= NPY_CPU_FEATURE_MAX)
+        return 0;
+    return npy__cpu_have[feature_id];
+}
+
+NPY_VISIBILITY_HIDDEN int
+npy_cpu_init(void)
+{
+    npy__cpu_init_features();
+    return 0;
+}
+
+NPY_VISIBILITY_HIDDEN PyObject *
+npy_cpu_features_dict(void)
+{
+    PyObject *dict = PyDict_New();
+    if (dict) {
+    /**begin repeat
+     * #feature = MMX, SSE, SSE2, SSE3, SSSE3, SSE41, POPCNT, SSE42,
+     *            AVX, F16C, XOP, FMA4, FMA3, AVX2, AVX512F,
+     *            AVX512CD, AVX512ER, AVX512PF, AVX5124FMAPS, AVX5124VNNIW,
+     *            AVX512VPOPCNTDQ, AVX512VL, AVX512BW, AVX512DQ, AVX512VNNI,
+     *            AVX512IFMA, AVX512VBMI, AVX512VBMI2, AVX512BITALG,
+     *            AVX512_KNL, AVX512_KNM, AVX512_SKX, AVX512_CLX, AVX512_CNL, AVX512_ICL,
+     *            VSX, VSX2, VSX3,
+     *            NEON, NEON_FP16, NEON_VFPV4, ASIMD, FPHP, ASIMDHP, ASIMDDP, ASIMDFHM#
+    */
+        if (PyDict_SetItemString(dict, "@feature@",
+            npy__cpu_have[NPY_CPU_FEATURE_@feature@] ? Py_True : Py_False) < 0) {
+            Py_DECREF(dict);
+            return NULL;
+        }
+    /**end repeat**/
+    }
+    return dict;
+}
+
+/****************************************************************
+ * This section is reserved to defining @npy__cpu_init_features
+ * for each CPU architecture, please try to keep it clean. Ty
+ ****************************************************************/
+
+/***************** X86 ******************/
+
+#if defined(NPY_CPU_AMD64) || defined(NPY_CPU_X86)
+
+#ifdef _MSC_VER
+    #include <intrin.h>
+#elif defined(__INTEL_COMPILER)
+    #include <immintrin.h>
+#endif
+
+static int
+npy__cpu_getxcr0(void)
+{
+#if defined(_MSC_VER) || defined (__INTEL_COMPILER)
+    return _xgetbv(0);
+#elif defined(__GNUC__) || defined(__clang__)
+    unsigned int eax, edx;
+    __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0));
+    return (eax | (unsigned long long)edx << 32);
+#else
+    // TODO: handle other x86 compilers
+    return 0;
+#endif
+}
+
+static void
+npy__cpu_cpuid(int reg[4], int func_id)
+{
+#if defined(_MSC_VER)
+    __cpuidex(reg, func_id, 0);
+#elif defined(__INTEL_COMPILER)
+    __cpuid(reg, func_id);
+#elif defined(__GNUC__) || defined(__clang__)
+    #if defined(NPY_CPU_X86) && defined(__PIC__)
+    // %ebx may be the PIC register
+        #define NPY__CPUID_ASM \
+            "xchg{l}\t{%%}ebx, %1\n\t" \
+            "cpuid\n\t"                \
+            "xchg{l}\t{%%}ebx, %1\n\t"
+    #else
+        #define NPY__CPUID_ASM "cpuid"
+    #endif
+    __asm__(NPY__CPUID_ASM : "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) : "a" (func_id), "c" (0) : );
+#else
+    // TODO: handle other x86 compilers
+    reg[0] = 0;
+#endif
+}
+
+static void
+npy__cpu_init_features(void)
+{
+    memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);
+
+    // validate platform support
+    int reg[] = {0, 0, 0, 0};
+    npy__cpu_cpuid(reg, 0);
+    if (reg[0] == 0)
+        return;
+
+    npy__cpu_cpuid(reg, 1);
+    npy__cpu_have[NPY_CPU_FEATURE_MMX]    = (reg[3] & (1 << 23)) != 0;
+    npy__cpu_have[NPY_CPU_FEATURE_SSE]    = (reg[3] & (1 << 25)) != 0;
+    npy__cpu_have[NPY_CPU_FEATURE_SSE2]   = (reg[3] & (1 << 26)) != 0;
+    npy__cpu_have[NPY_CPU_FEATURE_SSE3]   = (reg[2] & (1 << 0))  != 0;
+    npy__cpu_have[NPY_CPU_FEATURE_SSSE3]  = (reg[2] & (1 << 9))  != 0;
+    npy__cpu_have[NPY_CPU_FEATURE_SSE41]  = (reg[2] & (1 << 19)) != 0;
+    npy__cpu_have[NPY_CPU_FEATURE_POPCNT] = (reg[2] & (1 << 23)) != 0;
+    npy__cpu_have[NPY_CPU_FEATURE_SSE42]  = (reg[2] & (1 << 20)) != 0;
+    npy__cpu_have[NPY_CPU_FEATURE_F16C]   = (reg[2] & (1 << 29)) != 0;
+
+    // check OSXSAVE
+    if ((reg[2] & (1 << 27)) == 0)
+        return;
+    // check AVX OS support
+    int xcr = npy__cpu_getxcr0();
+    if ((xcr & 6) != 6)
+        return;
+    npy__cpu_have[NPY_CPU_FEATURE_AVX]    = (reg[2] & (1 << 28)) != 0;
+    if (!npy__cpu_have[NPY_CPU_FEATURE_AVX])
+        return;
+    npy__cpu_have[NPY_CPU_FEATURE_FMA3]   = (reg[2] & (1 << 12)) != 0;
+
+    // second call to the cpuid to get extended AMD feature bits
+    npy__cpu_cpuid(reg, 0x80000001);
+    npy__cpu_have[NPY_CPU_FEATURE_XOP]    = (reg[2] & (1 << 11)) != 0;
+    npy__cpu_have[NPY_CPU_FEATURE_FMA4]   = (reg[2] & (1 << 16)) != 0;
+
+    // third call to the cpuid to get extended AVX2 & AVX512 feature bits
+    npy__cpu_cpuid(reg, 7);
+    npy__cpu_have[NPY_CPU_FEATURE_AVX2]   = (reg[1] & (1 << 5))  != 0;
+    if (!npy__cpu_have[NPY_CPU_FEATURE_AVX2])
+        return;
+    // detect AVX2 & FMA3
+    npy__cpu_have[NPY_CPU_FEATURE_FMA]    = npy__cpu_have[NPY_CPU_FEATURE_FMA3];
+
+    // check AVX512 OS support
+    if ((xcr & 0xe6) != 0xe6)
+        return;
+    npy__cpu_have[NPY_CPU_FEATURE_AVX512F]  = (reg[1] & (1 << 16)) != 0;
+    npy__cpu_have[NPY_CPU_FEATURE_AVX512CD] = (reg[1] & (1 << 28)) != 0;
+    if (npy__cpu_have[NPY_CPU_FEATURE_AVX512F] && npy__cpu_have[NPY_CPU_FEATURE_AVX512CD]) {
+        // Knights Landing
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512PF]        = (reg[1] & (1 << 26)) != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512ER]        = (reg[1] & (1 << 27)) != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512_KNL]      = npy__cpu_have[NPY_CPU_FEATURE_AVX512ER] &&
+                                                         npy__cpu_have[NPY_CPU_FEATURE_AVX512PF];
+        // Knights Mill
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ] = (reg[2] & (1 << 14)) != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_AVX5124VNNIW]    = (reg[3] & (1 << 2))  != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_AVX5124FMAPS]    = (reg[3] & (1 << 3))  != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512_KNM]      = npy__cpu_have[NPY_CPU_FEATURE_AVX512_KNL] &&
+                                                         npy__cpu_have[NPY_CPU_FEATURE_AVX5124FMAPS] &&
+                                                         npy__cpu_have[NPY_CPU_FEATURE_AVX5124VNNIW] &&
+                                                         npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ];
+
+        // Skylake-X
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512DQ]        = (reg[1] & (1 << 17)) != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512BW]        = (reg[1] & (1 << 30)) != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512VL]        = (reg[1] & (1 << 31)) != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512_SKX]      = npy__cpu_have[NPY_CPU_FEATURE_AVX512BW] &&
+                                                         npy__cpu_have[NPY_CPU_FEATURE_AVX512DQ] &&
+                                                         npy__cpu_have[NPY_CPU_FEATURE_AVX512VL];
+        // Cascade Lake
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512VNNI]      = (reg[2] & (1 << 11)) != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512_CLX]      = npy__cpu_have[NPY_CPU_FEATURE_AVX512_SKX] &&
+                                                         npy__cpu_have[NPY_CPU_FEATURE_AVX512VNNI];
+
+        // Cannon Lake
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512IFMA]      = (reg[1] & (1 << 21)) != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI]      = (reg[2] & (1 << 1))  != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512_CNL]      = npy__cpu_have[NPY_CPU_FEATURE_AVX512_SKX] &&
+                                                         npy__cpu_have[NPY_CPU_FEATURE_AVX512IFMA] &&
+                                                         npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI];
+        // Ice Lake
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI2]     = (reg[2] & (1 << 6))  != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512BITALG]    = (reg[2] & (1 << 12)) != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512_ICL]      = npy__cpu_have[NPY_CPU_FEATURE_AVX512_CLX] &&
+                                                         npy__cpu_have[NPY_CPU_FEATURE_AVX512_CNL] &&
+                                                         npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI2] &&
+                                                         npy__cpu_have[NPY_CPU_FEATURE_AVX512BITALG] &&
+                                                         npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ];
+    }
+}
+
+/***************** POWER ******************/
+
+#elif defined(NPY_CPU_PPC64) || defined(NPY_CPU_PPC64LE)
+
+#ifdef __linux__
+    #include <sys/auxv.h>
+    #ifndef AT_HWCAP2
+        #define AT_HWCAP2 26
+    #endif
+    #ifndef PPC_FEATURE2_ARCH_3_00
+        #define PPC_FEATURE2_ARCH_3_00 0x00800000
+    #endif
+#endif
+
+static void
+npy__cpu_init_features(void)
+{
+    memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);
+#ifdef __linux__
+    unsigned int hwcap = getauxval(AT_HWCAP);
+    if ((hwcap & PPC_FEATURE_HAS_VSX) == 0)
+        return;
+
+    hwcap = getauxval(AT_HWCAP2);
+    if (hwcap & PPC_FEATURE2_ARCH_3_00)
+    {
+        npy__cpu_have[NPY_CPU_FEATURE_VSX]  =
+        npy__cpu_have[NPY_CPU_FEATURE_VSX2] =
+        npy__cpu_have[NPY_CPU_FEATURE_VSX3] = 1;
+        return;
+    }
+    npy__cpu_have[NPY_CPU_FEATURE_VSX2] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0;
+    npy__cpu_have[NPY_CPU_FEATURE_VSX]  = 1;
+// TODO: AIX, FreeBSD
+#else
+    npy__cpu_have[NPY_CPU_FEATURE_VSX]  = 1;
+    #if defined(NPY_CPU_PPC64LE) || defined(NPY_HAVE_VSX2)
+    npy__cpu_have[NPY_CPU_FEATURE_VSX2] = 1;
+    #endif
+    #ifdef NPY_HAVE_VSX3
+    npy__cpu_have[NPY_CPU_FEATURE_VSX3] = 1;
+    #endif
+#endif
+}
+
+/***************** ARM ******************/
+
+#elif defined(__arm__) || defined(__aarch64__)
+
+static NPY_INLINE void
+npy__cpu_init_features_arm8(void)
+{
+    npy__cpu_have[NPY_CPU_FEATURE_NEON]       =
+    npy__cpu_have[NPY_CPU_FEATURE_NEON_FP16]  =
+    npy__cpu_have[NPY_CPU_FEATURE_NEON_VFPV4] =
+    npy__cpu_have[NPY_CPU_FEATURE_ASIMD]      = 1;
+}
+
+#ifdef __linux__
+/*
+ * we aren't sure of what kind kernel or clib we deal with
+ * so we play it safe
+*/
+#include <stdio.h>
+#include <fcntl.h>
+
+#define NPY__HWCAP  16
+#define NPY__HWCAP2 26
+
+// arch/arm/include/uapi/asm/hwcap.h
+#define NPY__HWCAP_HALF   (1 << 1)
+#define NPY__HWCAP_NEON   (1 << 12)
+#define NPY__HWCAP_VFPv3  (1 << 13)
+#define NPY__HWCAP_VFPv4  (1 << 16)
+#define NPY__HWCAP2_AES   (1 << 0)
+#define NPY__HWCAP2_PMULL (1 << 1)
+#define NPY__HWCAP2_SHA1  (1 << 2)
+#define NPY__HWCAP2_SHA2  (1 << 3)
+#define NPY__HWCAP2_CRC32 (1 << 4)
+// arch/arm64/include/uapi/asm/hwcap.h
+#define NPY__HWCAP_FP       (1 << 0)
+#define NPY__HWCAP_ASIMD    (1 << 1)
+#define NPY__HWCAP_FPHP     (1 << 9)
+#define NPY__HWCAP_ASIMDHP  (1 << 10)
+#define NPY__HWCAP_ASIMDDP  (1 << 20)
+#define NPY__HWCAP_ASIMDFHM (1 << 23)
+
+__attribute__((weak)) unsigned long getauxval(unsigned long); // linker should handle it
+static int
+npy__cpu_init_features_linux(void)
+{
+    unsigned long hwcap = 0, hwcap2 = 0;
+    if (getauxval != 0) {
+        hwcap = getauxval(NPY__HWCAP);
+    #ifdef __arm__
+        hwcap2 = getauxval(NPY__HWCAP2);
+    #endif
+    } else {
+        unsigned long auxv[2];
+        int fd = open("/proc/self/auxv", O_RDONLY);
+        if (fd >= 0) {
+            while (read(fd, &auxv, sizeof(auxv)) == sizeof(auxv)) {
+                if (auxv[0] == NPY__HWCAP) {
+                    hwcap = auxv[1];
+                }
+            #ifdef __arm__
+                else if (auxv[0] == NPY__HWCAP2) {
+                    hwcap2 = auxv[1];
+                }
+            #endif
+                // detect the end
+                else if (auxv[0] == 0 && auxv[1] == 0) {
+                    break;
+                }
+            }
+            close(fd);
+        }
+    }
+    if (hwcap == 0 && hwcap2 == 0) {
+        /*
+         * FIXME: failback to compiler definitions,
+         * BTW we can parse /proc/cpuinfo for badly patched kernels
+        */
+        return 0;
+    }
+#ifdef __arm__
+    // Detect Arm8 (aarch32 state)
+    if ((hwcap2 & NPY__HWCAP2_AES)  || (hwcap2 & NPY__HWCAP2_SHA1)  ||
+        (hwcap2 & NPY__HWCAP2_SHA2) || (hwcap2 & NPY__HWCAP2_PMULL) ||
+        (hwcap2 & NPY__HWCAP2_CRC32))
+#else
+    if (1)
+#endif
+    {
+        if (!(hwcap & (NPY__HWCAP_FP | NPY__HWCAP_ASIMD))) {
+            // Is this could happen? maybe disabled by kernel
+            // BTW this will break the baseline of AARCH64
+            return 1;
+        }
+        npy__cpu_have[NPY_CPU_FEATURE_FPHP]       = (hwcap & NPY__HWCAP_FPHP)     != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_ASIMDHP]    = (hwcap & NPY__HWCAP_ASIMDHP)  != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_ASIMDDP]    = (hwcap & NPY__HWCAP_ASIMDDP)  != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_ASIMDFHM]   = (hwcap & NPY__HWCAP_ASIMDFHM) != 0;
+        npy__cpu_init_features_arm8();
+    } else {
+        npy__cpu_have[NPY_CPU_FEATURE_NEON]       = (hwcap & NPY__HWCAP_NEON)   != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_NEON_FP16]  = (hwcap & (NPY__HWCAP_NEON | NPY__HWCAP_VFPv3 |
+                                                              NPY__HWCAP_HALF)) != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_NEON_VFPV4] = (hwcap & (NPY__HWCAP_NEON | NPY__HWCAP_VFPv4)) != 0;
+    }
+    return 1;
+}
+#endif
+
+static void
+npy__cpu_init_features(void)
+{
+    memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);
+#ifdef __linux__
+    if (npy__cpu_init_features_linux())
+        return;
+#endif
+    // We have nothing else todo
+#if defined(NPY_HAVE_NEON_ARM8) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
+    #if defined(NPY_HAVE_FPHP) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+    npy__cpu_have[NPY_CPU_FEATURE_FPHP] = 1;
+    #endif
+    #if defined(NPY_HAVE_ASIMDHP) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+    npy__cpu_have[NPY_CPU_FEATURE_ASIMDHP] = 1;
+    #endif
+    #if defined(NPY_HAVE_ASIMDDP) || defined(__ARM_FEATURE_DOTPROD)
+    npy__cpu_have[NPY_CPU_FEATURE_ASIMDDP] = 1;
+    #endif
+    #if defined(NPY_HAVE_ASIMDFHM) || defined(__ARM_FEATURE_FP16FML)
+    npy__cpu_have[NPY_CPU_FEATURE_ASIMDFHM] = 1;
+    #endif
+    npy__cpu_init_features_arm8();
+#else
+    #if defined(NPY_HAVE_NEON) || defined(__ARM_NEON__)
+        npy__cpu_have[NPY_CPU_FEATURE_NEON] = 1;
+    #endif
+    #if defined(NPY_HAVE_NEON_FP16) || defined(__ARM_FP16_FORMAT_IEEE) || (defined(__ARM_FP) && (__ARM_FP & 2))
+        npy__cpu_have[NPY_CPU_FEATURE_NEON_FP16] = npy__cpu_have[NPY_CPU_FEATURE_NEON];
+    #endif
+    #if defined(NPY_HAVE_NEON_VFPV4) || defined(__ARM_FEATURE_FMA)
+        npy__cpu_have[NPY_CPU_FEATURE_NEON_VFPV4] = npy__cpu_have[NPY_CPU_FEATURE_NEON];
+    #endif
+#endif
+}
+
+/*********** Unsupported ARCH ***********/
+#else
+static void
+npy__cpu_init_features(void)
+{
+}
+#endif
diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h
new file mode 100644
index 000000000..0e8901328
--- /dev/null
+++ b/numpy/core/src/common/npy_cpu_features.h
@@ -0,0 +1,117 @@
+#ifndef _NPY_CPU_FEATURES_H_
+#define _NPY_CPU_FEATURES_H_
+
+#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN
+#include <Python.h> // for PyObject
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum npy_cpu_features
+{
+    NPY_CPU_FEATURE_NONE = 0,
+    // X86
+    NPY_CPU_FEATURE_MMX               = 1,
+    NPY_CPU_FEATURE_SSE               = 2,
+    NPY_CPU_FEATURE_SSE2              = 3,
+    NPY_CPU_FEATURE_SSE3              = 4,
+    NPY_CPU_FEATURE_SSSE3             = 5,
+    NPY_CPU_FEATURE_SSE41             = 6,
+    NPY_CPU_FEATURE_POPCNT            = 7,
+    NPY_CPU_FEATURE_SSE42             = 8,
+    NPY_CPU_FEATURE_AVX               = 9,
+    NPY_CPU_FEATURE_F16C              = 10,
+    NPY_CPU_FEATURE_XOP               = 11,
+    NPY_CPU_FEATURE_FMA4              = 12,
+    NPY_CPU_FEATURE_FMA3              = 13,
+    NPY_CPU_FEATURE_AVX2              = 14,
+    NPY_CPU_FEATURE_FMA               = 15, // AVX2 & FMA3, provides backward compatibility
+
+    NPY_CPU_FEATURE_AVX512F           = 30,
+    NPY_CPU_FEATURE_AVX512CD          = 31,
+    NPY_CPU_FEATURE_AVX512ER          = 32,
+    NPY_CPU_FEATURE_AVX512PF          = 33,
+    NPY_CPU_FEATURE_AVX5124FMAPS      = 34,
+    NPY_CPU_FEATURE_AVX5124VNNIW      = 35,
+    NPY_CPU_FEATURE_AVX512VPOPCNTDQ   = 36,
+    NPY_CPU_FEATURE_AVX512BW          = 37,
+    NPY_CPU_FEATURE_AVX512DQ          = 38,
+    NPY_CPU_FEATURE_AVX512VL          = 39,
+    NPY_CPU_FEATURE_AVX512IFMA        = 40,
+    NPY_CPU_FEATURE_AVX512VBMI        = 41,
+    NPY_CPU_FEATURE_AVX512VNNI        = 42,
+    NPY_CPU_FEATURE_AVX512VBMI2       = 43,
+    NPY_CPU_FEATURE_AVX512BITALG      = 44,
+
+    // X86 CPU Groups
+    // Knights Landing (F,CD,ER,PF)
+    NPY_CPU_FEATURE_AVX512_KNL        = 101,
+    // Knights Mill    (F,CD,ER,PF,4FMAPS,4VNNIW,VPOPCNTDQ)
+    NPY_CPU_FEATURE_AVX512_KNM        = 102,
+    // Skylake-X       (F,CD,BW,DQ,VL)
+    NPY_CPU_FEATURE_AVX512_SKX        = 103,
+    // Cascade Lake    (F,CD,BW,DQ,VL,VNNI)
+    NPY_CPU_FEATURE_AVX512_CLX        = 104,
+    // Cannon Lake     (F,CD,BW,DQ,VL,IFMA,VBMI)
+    NPY_CPU_FEATURE_AVX512_CNL        = 105,
+    // Ice Lake        (F,CD,BW,DQ,VL,IFMA,VBMI,VNNI,VBMI2,BITALG,VPOPCNTDQ)
+    NPY_CPU_FEATURE_AVX512_ICL        = 106,
+
+    // IBM/POWER VSX
+    // POWER7
+    NPY_CPU_FEATURE_VSX               = 200,
+    // POWER8
+    NPY_CPU_FEATURE_VSX2              = 201,
+    // POWER9
+    NPY_CPU_FEATURE_VSX3              = 202,
+
+    // ARM
+    NPY_CPU_FEATURE_NEON              = 300,
+    NPY_CPU_FEATURE_NEON_FP16         = 301,
+    // FMA
+    NPY_CPU_FEATURE_NEON_VFPV4        = 302,
+    // Advanced SIMD
+    NPY_CPU_FEATURE_ASIMD             = 303,
+    // ARMv8.2 half-precision
+    NPY_CPU_FEATURE_FPHP              = 304,
+    // ARMv8.2 half-precision vector arithm
+    NPY_CPU_FEATURE_ASIMDHP           = 305,
+    // ARMv8.2 dot product
+    NPY_CPU_FEATURE_ASIMDDP           = 306,
+    // ARMv8.2 single&half-precision multiply
+    NPY_CPU_FEATURE_ASIMDFHM          = 307,
+
+    NPY_CPU_FEATURE_MAX
+};
+
+/*
+ * Initialize CPU features
+ * return 0 on success otherwise return -1
+*/
+NPY_VISIBILITY_HIDDEN int
+npy_cpu_init(void);
+
+/*
+ * return 0 if CPU feature isn't available
+ * note: `npy_cpu_init` must be called first otherwise it will always return 0
+*/
+NPY_VISIBILITY_HIDDEN int
+npy_cpu_have(int feature_id);
+
+#define NPY_CPU_HAVE(FEATURE_NAME) \
+npy_cpu_have(NPY_CPU_FEATURE_##FEATURE_NAME)
+
+/*
+ * return a new dictionary contains CPU feature names
+ * with runtime availability.
+ * same as npy_cpu_have, `npy_cpu_init` must be called first.
+ */
+NPY_VISIBILITY_HIDDEN PyObject *
+npy_cpu_features_dict(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _NPY_CPU_FEATURES_H_
diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c
index cf8f9ddae..16896aa12 100644
--- a/numpy/core/src/multiarray/arrayobject.c
+++ b/numpy/core/src/multiarray/arrayobject.c
@@ -1123,7 +1123,7 @@ _void_compare(PyArrayObject *self, PyArrayObject *other, int cmp_op)
 
         op = (cmp_op == Py_EQ ? n_ops.logical_and : n_ops.logical_or);
         while (PyDict_Next(PyArray_DESCR(self)->fields, &pos, &key, &value)) {
-            if NPY_TITLE_KEY(key, value) {
+            if (NPY_TITLE_KEY(key, value)) {
                 continue;
             }
             a = array_subscript_asarray(self, key);
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 61270dbef..ce288d62e 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -2290,6 +2290,7 @@ static void
 STRING_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride,
                   npy_intp n, int NPY_UNUSED(swap), PyArrayObject *arr)
 {
+    assert(arr != NULL);
     if (arr == NULL) {
         return;
     }
@@ -2304,6 +2305,7 @@ VOID_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride,
 {
     PyArray_Descr *descr;
 
+    assert(arr != NULL);
     if (arr == NULL) {
         return;
     }
@@ -2394,6 +2396,7 @@ VOID_copyswap (char *dst, char *src, int swap, PyArrayObject *arr)
 {
     PyArray_Descr *descr;
 
+    assert(arr != NULL);
     if (arr == NULL) {
         return;
     }
@@ -2475,6 +2478,7 @@ UNICODE_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride,
 {
     int itemsize;
 
+    assert(arr != NULL);
     if (arr == NULL) {
         return;
     }
@@ -2502,6 +2506,7 @@ UNICODE_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride,
 static void
 STRING_copyswap(char *dst, char *src, int NPY_UNUSED(swap), PyArrayObject *arr)
 {
+    assert(arr != NULL);
     if (arr == NULL) {
         return;
     }
@@ -2514,6 +2519,7 @@ UNICODE_copyswap (char *dst, char *src, int swap, PyArrayObject *arr)
 {
     int itemsize;
 
+    assert(arr != NULL);
     if (arr == NULL) {
         return;
     }
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index bfbf67ff9..215c8b0ab 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -1998,7 +1998,7 @@ _arraydescr_isnative(PyArray_Descr *self)
         int offset;
         Py_ssize_t pos = 0;
         while (PyDict_Next(self->fields, &pos, &key, &value)) {
-            if NPY_TITLE_KEY(key, value) {
+            if (NPY_TITLE_KEY(key, value)) {
                 continue;
             }
             if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset, &title)) {
@@ -2508,7 +2508,7 @@ _descr_find_object(PyArray_Descr *self)
         Py_ssize_t pos = 0;
 
         while (PyDict_Next(self->fields, &pos, &key, &value)) {
-            if NPY_TITLE_KEY(key, value) {
+            if (NPY_TITLE_KEY(key, value)) {
                 continue;
             }
             if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset, &title)) {
@@ -2982,7 +2982,7 @@ PyArray_DescrNewByteorder(PyArray_Descr *self, char newendian)
         newfields = PyDict_New();
         /* make new dictionary with replaced PyArray_Descr Objects */
         while (PyDict_Next(self->fields, &pos, &key, &value)) {
-            if NPY_TITLE_KEY(key, value) {
+            if (NPY_TITLE_KEY(key, value)) {
                 continue;
             }
             if (!PyUString_Check(key) || !PyTuple_Check(value) ||
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 7b9aa4794..ebdf8a4cd 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -1503,7 +1503,7 @@ _deepcopy_call(char *iptr, char *optr, PyArray_Descr *dtype,
         int offset;
         Py_ssize_t pos = 0;
         while (PyDict_Next(dtype->fields, &pos, &key, &value)) {
-            if NPY_TITLE_KEY(key, value) {
+            if (NPY_TITLE_KEY(key, value)) {
                 continue;
             }
             if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset,
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index c2e597385..7792fcdcb 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -4388,6 +4388,11 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
     PyObject *m, *d, *s;
     PyObject *c_api;
 
+    /* Initialize CPU features */
+    if (npy_cpu_init() < 0) {
+        goto err;
+    }
+
     /* Create the module and add the functions */
     m = PyModule_Create(&moduledef);
     if (!m) {
@@ -4513,6 +4518,16 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
     PyDict_SetItemString(d, "__version__", s);
     Py_DECREF(s);
 
+    s = npy_cpu_features_dict();
+    if (s == NULL) {
+        goto err;
+    }
+    if (PyDict_SetItemString(d, "__cpu_features__", s) < 0) {
+        Py_DECREF(s);
+        goto err;
+    }
+    Py_DECREF(s);
+
     s = NpyCapsule_FromVoidPtr((void *)_datetime_strings, NULL);
     if (s == NULL) {
         goto err;
diff --git a/numpy/core/src/multiarray/refcount.c b/numpy/core/src/multiarray/refcount.c
index 6033929d9..c869b5eea 100644
--- a/numpy/core/src/multiarray/refcount.c
+++ b/numpy/core/src/multiarray/refcount.c
@@ -46,7 +46,7 @@ PyArray_Item_INCREF(char *data, PyArray_Descr *descr)
         Py_ssize_t pos = 0;
 
         while (PyDict_Next(descr->fields, &pos, &key, &value)) {
-            if NPY_TITLE_KEY(key, value) {
+            if (NPY_TITLE_KEY(key, value)) {
                 continue;
             }
             if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset,
@@ -108,7 +108,7 @@ PyArray_Item_XDECREF(char *data, PyArray_Descr *descr)
             Py_ssize_t pos = 0;
 
             while (PyDict_Next(descr->fields, &pos, &key, &value)) {
-                if NPY_TITLE_KEY(key, value) {
+                if (NPY_TITLE_KEY(key, value)) {
                     continue;
                 }
                 if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset,
@@ -318,7 +318,7 @@ _fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype)
         Py_ssize_t pos = 0;
 
         while (PyDict_Next(dtype->fields, &pos, &key, &value)) {
-            if NPY_TITLE_KEY(key, value) {
+            if (NPY_TITLE_KEY(key, value)) {
                 continue;
             }
             if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset, &title)) {
diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c
index 127ac5134..30507112d 100644
--- a/numpy/core/src/multiarray/shape.c
+++ b/numpy/core/src/multiarray/shape.c
@@ -317,7 +317,7 @@ _putzero(char *optr, PyObject *zero, PyArray_Descr *dtype)
         int offset;
         Py_ssize_t pos = 0;
         while (PyDict_Next(dtype->fields, &pos, &key, &value)) {
-            if NPY_TITLE_KEY(key, value) {
+            if (NPY_TITLE_KEY(key, value)) {
                 continue;
             }
             if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset, &title)) {
diff --git a/numpy/core/src/umath/cpuid.c b/numpy/core/src/umath/cpuid.c
deleted file mode 100644
index 72c6493e8..000000000
--- a/numpy/core/src/umath/cpuid.c
+++ /dev/null
@@ -1,97 +0,0 @@
-#define _UMATHMODULE
-#define _MULTIARRAYMODULE
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#include <Python.h>
-
-#include "npy_config.h"
-
-#include "cpuid.h"
-
-#define XCR_XFEATURE_ENABLED_MASK 0x0
-#define XSTATE_SSE 0x2
-#define XSTATE_YMM 0x4
-#define XSTATE_ZMM 0x70
-
-/*
- * verify the OS supports avx instructions
- * it can be disabled in some OS, e.g. with the nosavex boot option of linux
- */
-static NPY_INLINE
-int os_avx_support(void)
-{
-#if HAVE_XGETBV
-    /*
-     * use bytes for xgetbv to avoid issues with compiler not knowing the
-     * instruction
-     */
-    unsigned int eax, edx;
-    unsigned int ecx = XCR_XFEATURE_ENABLED_MASK;
-    __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx));
-    return (eax & (XSTATE_SSE | XSTATE_YMM)) == (XSTATE_SSE | XSTATE_YMM);
-#else
-    return 0;
-#endif
-}
-
-static NPY_INLINE
-int os_avx512_support(void)
-{
-#if HAVE_XGETBV
-    unsigned int eax, edx;
-    unsigned int ecx = XCR_XFEATURE_ENABLED_MASK;
-    unsigned int xcr0 = XSTATE_ZMM | XSTATE_YMM | XSTATE_SSE;
-    __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx));
-    return (eax & xcr0) == xcr0;
-#else
-    return 0;
-#endif
-}
-
-static NPY_INLINE
-int cpu_supports_fma(void)
-{
-#ifdef __x86_64__
-    unsigned int feature = 0x01;
-    unsigned int a, b, c, d;
-    __asm__ volatile (
-        "cpuid"				"\n\t"
-	: "=a" (a), "=b" (b), "=c" (c), "=d" (d)
-	: "a" (feature));
-    /*
-     * FMA is the 12th bit of ECX
-     */
-    return (c >> 12) & 1;
-#else
-    return 0;
-#endif
-}
-
-/*
- * Primitive cpu feature detect function
- * Currently only supports checking for avx on gcc compatible compilers.
- */
-NPY_NO_EXPORT int
-npy_cpu_supports(const char * feature)
-{
-#ifdef HAVE___BUILTIN_CPU_SUPPORTS
-    if (strcmp(feature, "avx512f") == 0) {
-#ifdef HAVE___BUILTIN_CPU_SUPPORTS_AVX512F
-        return __builtin_cpu_supports("avx512f") && os_avx512_support();
-#else
-        return 0;
-#endif
-    }
-    else if (strcmp(feature, "fma") == 0) {
-        return cpu_supports_fma() && __builtin_cpu_supports("avx2") && os_avx_support();
-    }
-    else if (strcmp(feature, "avx2") == 0) {
-        return __builtin_cpu_supports("avx2") && os_avx_support();
-    }
-    else if (strcmp(feature, "avx") == 0) {
-        return __builtin_cpu_supports("avx") && os_avx_support();
-    }
-#endif
-
-    return 0;
-}
diff --git a/numpy/core/src/umath/cpuid.h b/numpy/core/src/umath/cpuid.h
deleted file mode 100644
index 33702ed41..000000000
--- a/numpy/core/src/umath/cpuid.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _NPY_PRIVATE__CPUID_H_
-#define _NPY_PRIVATE__CPUID_H_
-
-#include <numpy/ndarraytypes.h>  /* for NPY_NO_EXPORT */
-
-NPY_NO_EXPORT int
-npy_cpu_supports(const char * feature);
-
-#endif
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index b310d73ff..9b43824cb 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -2509,6 +2509,7 @@ HALF_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps,
  * #ftype = npy_float, npy_double, npy_longdouble#
  * #c = f, , l#
  * #C = F, , L#
+ * #SIMD = 1, 1, 0#
  */
 
 /* similar to pairwise sum of real floats */
@@ -2584,6 +2585,7 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n,
     }
 }
 
+
 /**begin repeat1
  * arithmetic
  * #kind = add, subtract#
@@ -2662,6 +2664,32 @@ NPY_NO_EXPORT void
     }
 }
 
+#if @SIMD@
+NPY_NO_EXPORT void
+@TYPE@_add_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
+{
+    if (IS_BINARY_REDUCE) {
+        @TYPE@_add(args, dimensions, steps, func);
+    }
+    else if (!run_binary_avx512f_add_@TYPE@(args, dimensions, steps)) {
+        @TYPE@_add(args, dimensions, steps, func);
+    }
+}
+
+/**begin repeat1
+ * arithmetic
+ * #kind = subtract, multiply#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
+{
+    if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
+        @TYPE@_@kind@(args, dimensions, steps, func);
+    }
+}
+/**end repeat1**/
+#endif
+
 NPY_NO_EXPORT void
 @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
@@ -2819,6 +2847,21 @@ NPY_NO_EXPORT void
     }
 }
 
+#if @SIMD@
+/**begin repeat1
+ * arithmetic
+ * #kind = conjugate, square, absolute#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
+{
+    if (!run_unary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
+        @TYPE@_@kind@(args, dimensions, steps, func);
+    }
+}
+/**end repeat1**/
+#endif
+
 NPY_NO_EXPORT void
 @TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 6c89627ca..e9d0b4c62 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -356,20 +356,27 @@ NPY_NO_EXPORT void
  * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
  * #c = f, , l#
  * #C = F, , L#
+ * #IFSIMD = 1, 1, 0#
  */
 
 /**begin repeat1
+ * #isa = , _avx512f#
+ */
+
+/**begin repeat2
  * arithmetic
  * #kind = add, subtract#
  * #OP = +, -#
  */
+
 NPY_NO_EXPORT void
-C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+C@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
-/**end repeat1**/
+/**end repeat2**/
 
 NPY_NO_EXPORT void
-C@TYPE@_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+C@TYPE@_multiply@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat1**/
 
 NPY_NO_EXPORT void
 C@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -409,19 +416,24 @@ C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, v
 /**end repeat1**/
 
 NPY_NO_EXPORT void
-C@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-NPY_NO_EXPORT void
 C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
 
+/**begin repeat1
+ * #isa = , _avx512f#
+ */
+
 NPY_NO_EXPORT void
-C@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
-C@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
+/**end repeat1**/
 
 NPY_NO_EXPORT void
 C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -444,7 +456,6 @@ C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, v
 NPY_NO_EXPORT void
 C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 /**end repeat1**/
-
 #define C@TYPE@_true_divide C@TYPE@_divide
 
 /**end repeat**/
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index cd485034e..7ec90f9c8 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -55,6 +55,13 @@ abs_ptrdiff(char *a, char *b)
     return (a > b) ? (a - b) : (b - a);
 }
 
+#define IS_BINARY_STRIDE_ONE(esize, vsize) \
+    ((steps[0] == esize) && \
+     (steps[1] == esize) && \
+     (steps[2] == esize) && \
+     (abs_ptrdiff(args[2], args[0]) >= vsize) && \
+     (abs_ptrdiff(args[2], args[1]) >= vsize))
+
 /*
  * stride is equal to element size and input and destination are equal or
  * don't overlap within one register. The check of the steps against
@@ -158,6 +165,71 @@ abs_ptrdiff(char *a, char *b)
 
 /*
  *****************************************************************************
+ **                           CMPLX DISPATCHERS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #TYPE = CFLOAT, CDOUBLE#
+ * #type= npy_float, npy_double#
+ * #esize = 8, 16#
+ */
+
+/**begin repeat1
+ *  #func = add, subtract, multiply#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps);
+#endif
+
+static NPY_INLINE int
+run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+    if (IS_BINARY_STRIDE_ONE(@esize@, 64)) {
+        AVX512F_@func@_@TYPE@(args, dimensions, steps);
+        return 1;
+    }
+    else
+        return 0;
+#endif
+    return 0;
+}
+
+/**end repeat1**/
+
+/**begin repeat1
+ *  #func = square, absolute, conjugate#
+ *  #outsize = 1, 2, 1#
+ *  #max_stride = 2, 8, 8#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(@type@*, @type@*, const npy_intp n, const npy_intp stride);
+#endif
+
+static NPY_INLINE int
+run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+    if ((IS_OUTPUT_BLOCKABLE_UNARY((npy_uint)(@esize@/@outsize@), 64)) && (labs(steps[0]) < 2*@max_stride@*@esize@)) {
+        AVX512F_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
+        return 1;
+    }
+    else
+        return 0;
+#endif
+    return 0;
+}
+
+/**end repeat1**/
+/**end repeat**/
+
+/*
+ *****************************************************************************
  **                           FLOAT DISPATCHERS
  *****************************************************************************
  */
@@ -1591,9 +1663,17 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
 }
 /**begin repeat
  *  #vsub  = ps, pd#
+ *  #type= npy_float, npy_double#
  *  #epi_vsub  = epi32, epi64#
  *  #vtype = __m512, __m512d#
+ *  #mask = __mmask16, __mmask8#
  *  #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
+ *  #neg_mask = 0x80000000, 0x8000000000000000#
+ *  #perm_ = 0xb1, 0x55#
+ *  #cmpx_img_mask = 0xAAAA, 0xAA#
+ *  #cmpx_re_mask = 0x5555, 0x55#
+ *  #INF = NPY_INFINITYF, NPY_INFINITY#
+ *  #NAN = NPY_NANF, NPY_NAN#
  */
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_abs_@vsub@(@vtype@ x)
@@ -1631,6 +1711,96 @@ avx512_trunc_@vsub@(@vtype@ x)
 {
     return _mm512_roundscale_@vsub@(x, 0x0B);
 }
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_hadd_@vsub@(const @vtype@ x)
+{
+    return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_hsub_@vsub@(const @vtype@ x)
+{
+    return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_cabsolute_@vsub@(const @vtype@ x1,
+                        const @vtype@ x2,
+                        const __m512i re_indices,
+                        const __m512i im_indices)
+{
+    @vtype@ inf = _mm512_set1_@vsub@(@INF@);
+    @vtype@ nan = _mm512_set1_@vsub@(@NAN@);
+    @vtype@ x1_abs = avx512_abs_@vsub@(x1);
+    @vtype@ x2_abs = avx512_abs_@vsub@(x2);
+    @vtype@ re = _mm512_permutex2var_@vsub@(x1_abs, re_indices, x2_abs);
+    @vtype@ im = _mm512_permutex2var_@vsub@(x1_abs, im_indices , x2_abs);
+    /*
+     * If real or imag = INF, then convert it to inf + j*inf
+     * Handles: inf + j*nan, nan + j*inf
+     */
+    @mask@ re_infmask = _mm512_cmp_@vsub@_mask(re, inf, _CMP_EQ_OQ);
+    @mask@ im_infmask = _mm512_cmp_@vsub@_mask(im, inf, _CMP_EQ_OQ);
+    im = _mm512_mask_mov_@vsub@(im, re_infmask, inf);
+    re = _mm512_mask_mov_@vsub@(re, im_infmask, inf);
+
+    /*
+     * If real or imag = NAN, then convert it to nan + j*nan
+     * Handles: x + j*nan, nan + j*x
+     */
+    @mask@ re_nanmask = _mm512_cmp_@vsub@_mask(re, re, _CMP_NEQ_UQ);
+    @mask@ im_nanmask = _mm512_cmp_@vsub@_mask(im, im, _CMP_NEQ_UQ);
+    im = _mm512_mask_mov_@vsub@(im, re_nanmask, nan);
+    re = _mm512_mask_mov_@vsub@(re, im_nanmask, nan);
+
+    @vtype@ larger  = _mm512_max_@vsub@(re, im);
+    @vtype@ smaller = _mm512_min_@vsub@(im, re);
+
+    /*
+     * Calculate div_mask to prevent 0./0. and inf/inf operations in div
+     */
+    @mask@ zeromask = _mm512_cmp_@vsub@_mask(larger, _mm512_setzero_@vsub@(), _CMP_EQ_OQ);
+    @mask@ infmask = _mm512_cmp_@vsub@_mask(smaller, inf, _CMP_EQ_OQ);
+    @mask@ div_mask = _mm512_knot(_mm512_kor(zeromask, infmask));
+    @vtype@ ratio = _mm512_maskz_div_@vsub@(div_mask, smaller, larger);
+    @vtype@ hypot = _mm512_sqrt_@vsub@(_mm512_fmadd_@vsub@(
+                                        ratio, ratio, _mm512_set1_@vsub@(1.0f)));
+    return _mm512_mul_@vsub@(hypot, larger);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_conjugate_@vsub@(const @vtype@ x)
+{
+    /*
+     * __mm512_mask_xor_ps/pd requires AVX512DQ. We cast it to __m512i and
+     * use the xor_epi32/64 uinstruction instead. Cast is a zero latency instruction
+     */
+    __m512i cast_x = _mm512_cast@vsub@_si512(x);
+    __m512i res = _mm512_mask_xor_@epi_vsub@(cast_x, @cmpx_img_mask@,
+                                        cast_x, _mm512_set1_@epi_vsub@(@neg_mask@));
+    return _mm512_castsi512_@vsub@(res);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
+{
+    // x1 = r1, i1
+    // x2 = r2, i2
+    @vtype@ x3  = _mm512_permute_@vsub@(x2, @perm_@);   // i2, r2
+    @vtype@ x12 = _mm512_mul_@vsub@(x1, x2);            // r1*r2, i1*i2
+    @vtype@ x13 = _mm512_mul_@vsub@(x1, x3);            // r1*i2, r2*i1
+    @vtype@ outreal = avx512_hsub_@vsub@(x12);          // r1*r2 - i1*i2, r1*r2 - i1*i2
+    @vtype@ outimg  = avx512_hadd_@vsub@(x13);          // r1*i2 + i1*r2, r1*i2 + i1*r2
+    return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_csquare_@vsub@(@vtype@ x)
+{
+    return avx512_cmul_@vsub@(x, x);
+}
+
 /**end repeat**/
 #endif
 
@@ -2450,6 +2620,184 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 #endif
 /**end repeat**/
 
+/**begin repeat
+ * #TYPE = CFLOAT, CDOUBLE#
+ * #type = npy_float, npy_double#
+ * #num_lanes = 16, 8#
+ * #vsuffix = ps, pd#
+ * #epi_vsub  = epi32, epi64#
+ * #mask = __mmask16, __mmask8#
+ * #vtype = __m512, __m512d#
+ * #scale = 4, 8#
+ * #vindextype = __m512i, __m256i#
+ * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
+ * #storemask = 0xFF, 0xF#
+ * #IS_FLOAT = 1, 0#
+ */
+
+/**begin repeat1
+ *  #func = add, subtract, multiply#
+ *  #vectorf = _mm512_add, _mm512_sub, avx512_cmul#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
+{
+    const npy_intp array_size = dimensions[0];
+    npy_intp num_remaining_elements = 2*array_size;
+    @type@* ip1 = (@type@*) args[0];
+    @type@* ip2 = (@type@*) args[1];
+    @type@* op  = (@type@*) args[2];
+
+    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < @num_lanes@) {
+            load_mask = avx512_get_partial_load_mask_@vsuffix@(
+                                    num_remaining_elements, @num_lanes@);
+        }
+        @vtype@ x1, x2;
+        x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
+        x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
+
+        @vtype@ out = @vectorf@_@vsuffix@(x1, x2);
+
+        _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
+
+        ip1 += @num_lanes@;
+        ip2 += @num_lanes@;
+        op += @num_lanes@;
+        num_remaining_elements -= @num_lanes@;
+    }
+}
+#endif
+/**end repeat1**/
+
+/**begin repeat1
+ *  #func = square, conjugate#
+ *  #vectorf = avx512_csquare, avx512_conjugate#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(@type@ * op,
+                      @type@ * ip,
+                      const npy_intp array_size,
+                      const npy_intp steps)
+{
+    npy_intp num_remaining_elements = 2*array_size;
+    const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
+
+     /*
+      * Note: while generally indices are npy_intp, we ensure that our maximum index
+      * will fit in an int32 as a precondition for this function via max_stride
+      */
+    npy_int32 index_ip1[16];
+    for (npy_int32 ii = 0; ii < @num_lanes@; ii=ii+2) {
+        index_ip1[ii] = ii*stride_ip1;
+        index_ip1[ii+1] = ii*stride_ip1 + 1;
+    }
+    @vindextype@ vindex = @vindexload@((@vindextype@*)index_ip1);
+    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+    @vtype@ zeros = _mm512_setzero_@vsuffix@();
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < @num_lanes@) {
+            load_mask = avx512_get_partial_load_mask_@vsuffix@(
+                                    num_remaining_elements, @num_lanes@);
+        }
+        @vtype@ x1;
+        if (stride_ip1 == 1) {
+            x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
+        }
+        else {
+            x1  = avx512_masked_gather_@vsuffix@(zeros, ip, vindex, load_mask);
+        }
+
+        @vtype@ out = @vectorf@_@vsuffix@(x1);
+
+        _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
+        op += @num_lanes@;
+        ip += @num_lanes@*stride_ip1;
+        num_remaining_elements -= @num_lanes@;
+    }
+}
+#endif
+/**end repeat1**/
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_absolute_@TYPE@(@type@ * op,
+                        @type@ * ip,
+                        const npy_intp array_size,
+                        const npy_intp steps)
+{
+    npy_intp num_remaining_elements = 2*array_size;
+    const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via max_stride
+     */
+    npy_int32 index_ip[32];
+    for (npy_int32 ii = 0; ii < 2*@num_lanes@; ii=ii+2) {
+        index_ip[ii] = ii*stride_ip1;
+        index_ip[ii+1] = ii*stride_ip1 + 1;
+    }
+    @vindextype@ vindex1 = @vindexload@((@vindextype@*)index_ip);
+    @vindextype@ vindex2 = @vindexload@((@vindextype@*)(index_ip+@num_lanes@));
+
+    @mask@ load_mask1 = avx512_get_full_load_mask_@vsuffix@();
+    @mask@ load_mask2 = avx512_get_full_load_mask_@vsuffix@();
+    @mask@ store_mask = avx512_get_full_load_mask_@vsuffix@();
+    @vtype@ zeros = _mm512_setzero_@vsuffix@();
+
+#if @IS_FLOAT@
+    __m512i re_index = _mm512_set_epi32(30,28,26,24,22,20,18,16,14,12,10,8,6,4,2,0);
+    __m512i im_index  = _mm512_set_epi32(31,29,27,25,23,21,19,17,15,13,11,9,7,5,3,1);
+#else
+    __m512i re_index = _mm512_set_epi64(14,12,10,8,6,4,2,0);
+    __m512i im_index  = _mm512_set_epi64(15,13,11,9,7,5,3,1);
+#endif
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < @num_lanes@) {
+            load_mask1 = avx512_get_partial_load_mask_@vsuffix@(
+                                    num_remaining_elements, @num_lanes@);
+            load_mask2 = 0x0000;
+            store_mask = avx512_get_partial_load_mask_@vsuffix@(
+                                    num_remaining_elements/2, @num_lanes@);
+        } else if (num_remaining_elements < 2*@num_lanes@) {
+            load_mask1 = avx512_get_full_load_mask_@vsuffix@();
+            load_mask2 = avx512_get_partial_load_mask_@vsuffix@(
+                                    num_remaining_elements - @num_lanes@, @num_lanes@);
+            store_mask = avx512_get_partial_load_mask_@vsuffix@(
+                                    num_remaining_elements/2, @num_lanes@);
+        }
+        @vtype@ x1, x2;
+        if (stride_ip1 == 1) {
+            x1 = avx512_masked_load_@vsuffix@(load_mask1, ip);
+            x2 = avx512_masked_load_@vsuffix@(load_mask2, ip+@num_lanes@);
+        }
+        else {
+            x1  = avx512_masked_gather_@vsuffix@(zeros, ip, vindex1, load_mask1);
+            x2  = avx512_masked_gather_@vsuffix@(zeros, ip, vindex2, load_mask2);
+        }
+
+        @vtype@ out = avx512_cabsolute_@vsuffix@(x1, x2, re_index, im_index);
+
+        _mm512_mask_storeu_@vsuffix@(op, store_mask, out);
+        op += @num_lanes@;
+        ip += 2*@num_lanes@*stride_ip1;
+        num_remaining_elements -= 2*@num_lanes@;
+    }
+    npy_clear_floatstatus_barrier((char*)op);
+}
+
+#endif
+/**end repeat**/
+
 /*
  *****************************************************************************
  **                           BOOL LOOPS
diff --git a/numpy/core/tests/test_cpu_features.py b/numpy/core/tests/test_cpu_features.py
new file mode 100644
index 000000000..3b5cb3157
--- /dev/null
+++ b/numpy/core/tests/test_cpu_features.py
@@ -0,0 +1,104 @@
+import sys, platform, re, pytest
+
+from numpy.testing import assert_equal
+from numpy.core._multiarray_umath import __cpu_features__
+
+class AbstractTest(object):
+    features = []
+    features_groups = {}
+    features_map = {}
+    features_flags = set()
+
+    def load_flags(self):
+        # a hook
+        pass
+
+    def test_features(self):
+        self.load_flags()
+        for gname, features in self.features_groups.items():
+            test_features = [self.features_map.get(f, f) in self.features_flags for f in features]
+            assert_equal(__cpu_features__.get(gname), all(test_features))
+
+        for feature_name in self.features:
+            map_name = self.features_map.get(feature_name, feature_name)
+            cpu_have = map_name in self.features_flags
+            npy_have = __cpu_features__.get(feature_name)
+            assert_equal(npy_have, cpu_have)
+
+    def load_flags_proc(self, magic_key):
+        with open('/proc/cpuinfo') as fd:
+            for line in fd:
+                if not line.startswith(magic_key):
+                    continue
+                flags_value = [s.strip() for s in line.split(':', 1)]
+                if len(flags_value) == 2:
+                    self.features_flags = self.features_flags.union(flags_value[1].upper().split())
+
+    def load_flags_auxv(self):
+        import subprocess
+        auxv = subprocess.check_output(['/bin/true'], env=dict(LD_SHOW_AUXV="1"))
+        for at in auxv.split(b'\n'):
+            if not at.startswith(b"AT_HWCAP"):
+                continue
+            hwcap_value = [s.strip() for s in at.split(b':', 1)]
+            if len(hwcap_value) == 2:
+                self.features_flags = self.features_flags.union(
+                    hwcap_value[1].upper().decode().split()
+                )
+
+is_linux = sys.platform.startswith('linux')
+machine  = platform.machine()
+is_x86   = re.match("^(amd64|x86|i386|i686)", machine, re.IGNORECASE)
+@pytest.mark.skipif(not is_linux or not is_x86, reason="Only for Linux and x86")
+class Test_X86_Features(AbstractTest):
+    features = [
+        "MMX", "SSE", "SSE2", "SSE3", "SSSE3", "SSE41", "POPCNT", "SSE42",
+        "AVX", "F16C", "XOP", "FMA4", "FMA3", "AVX2", "AVX512F", "AVX512CD",
+        "AVX512ER", "AVX512PF", "AVX5124FMAPS", "AVX5124VNNIW", "AVX512VPOPCNTDQ",
+        "AVX512VL", "AVX512BW", "AVX512DQ", "AVX512VNNI", "AVX512IFMA",
+        "AVX512VBMI", "AVX512VBMI2", "AVX512BITALG",
+    ]
+    features_groups = dict(
+        AVX512_KNL = ["AVX512F", "AVX512CD", "AVX512ER", "AVX512PF"],
+        AVX512_KNM = ["AVX512F", "AVX512CD", "AVX512ER", "AVX512PF", "AVX5124FMAPS",
+                      "AVX5124VNNIW", "AVX512VPOPCNTDQ"],
+        AVX512_SKX = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ", "AVX512VL"],
+        AVX512_CLX = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ", "AVX512VL", "AVX512VNNI"],
+        AVX512_CNL = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ", "AVX512VL", "AVX512IFMA",
+                      "AVX512VBMI"],
+        AVX512_ICL = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ", "AVX512VL", "AVX512IFMA",
+                      "AVX512VBMI", "AVX512VNNI", "AVX512VBMI2", "AVX512BITALG", "AVX512VPOPCNTDQ"],
+    )
+    features_map = dict(
+        SSE3="PNI", SSE41="SSE4_1", SSE42="SSE4_2", FMA3="FMA",
+        AVX512VNNI="AVX512_VNNI", AVX512BITALG="AVX512_BITALG", AVX512VBMI2="AVX512_VBMI2",
+        AVX5124FMAPS="AVX512_4FMAPS", AVX5124VNNIW="AVX512_4VNNIW", AVX512VPOPCNTDQ="AVX512_VPOPCNTDQ",
+    )
+    def load_flags(self):
+        self.load_flags_proc("flags")
+
+is_power = re.match("^(powerpc|ppc)64", machine, re.IGNORECASE)
+@pytest.mark.skipif(not is_linux or not is_power, reason="Only for Linux and Power")
+class Test_POWER_Features(AbstractTest):
+    features = ["VSX", "VSX2", "VSX3"]
+    features_map = dict(VSX2="ARCH_2_07", VSX3="ARCH_3_00")
+
+    def load_flags(self):
+        self.load_flags_auxv()
+
+is_arm = re.match("^(arm|aarch64)", machine, re.IGNORECASE)
+@pytest.mark.skipif(not is_linux or not is_arm, reason="Only for Linux and ARM")
+class Test_ARM_Features(AbstractTest):
+    features = [
+        "NEON", "ASIMD", "FPHP", "ASIMDHP", "ASIMDDP", "ASIMDFHM"
+    ]
+    features_groups = dict(
+        NEON_FP16  = ["NEON", "HALF"],
+        NEON_VFPV4 = ["NEON", "VFPV4"],
+    )
+    def load_flags(self):
+        self.load_flags_proc("Features")
+        if re.match("^(aarch64|AARCH64)", platform.machine()):
+            self.features_map = dict(
+                NEON="ASIMD", HALF="ASIMD", VFPV4="ASIMD"
+            )
diff --git a/numpy/core/tests/test_umath_complex.py b/numpy/core/tests/test_umath_complex.py
index 5e5ced85c..a21158420 100644
--- a/numpy/core/tests/test_umath_complex.py
+++ b/numpy/core/tests/test_umath_complex.py
@@ -6,7 +6,7 @@ import numpy as np
 # import the c-extension module directly since _arg is not exported via umath
 import numpy.core._multiarray_umath as ncu
 from numpy.testing import (
-    assert_raises, assert_equal, assert_array_equal, assert_almost_equal
+    assert_raises, assert_equal, assert_array_equal, assert_almost_equal, assert_array_max_ulp
     )
 
 # TODO: branch cuts (use Pauli code)
@@ -540,3 +540,40 @@ def check_complex_value(f, x1, y1, x2, y2, exact=True):
             assert_equal(f(z1), z2)
         else:
             assert_almost_equal(f(z1), z2)
+
+class TestSpecialComplexAVX(object):
+    @pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
+    @pytest.mark.parametrize("astype", [np.complex64, np.complex128])
+    def test_array(self, stride, astype):
+        arr = np.array([np.complex(np.nan , np.nan),
+                        np.complex(np.nan , np.inf),
+                        np.complex(np.inf , np.nan),
+                        np.complex(np.inf , np.inf),
+                        np.complex(0.     , np.inf),
+                        np.complex(np.inf , 0.),
+                        np.complex(0.     , 0.),
+                        np.complex(0.     , np.nan),
+                        np.complex(np.nan , 0.)], dtype=astype)
+        abs_true = np.array([np.nan, np.inf, np.inf, np.inf, np.inf, np.inf, 0., np.nan, np.nan], dtype=arr.real.dtype)
+        sq_true = np.array([np.complex(np.nan,  np.nan),
+                            np.complex(np.nan,  np.nan),
+                            np.complex(np.nan,  np.nan),
+                            np.complex(np.nan,  np.inf),
+                            np.complex(-np.inf, np.nan),
+                            np.complex(np.inf,  np.nan),
+                            np.complex(0.,     0.),
+                            np.complex(np.nan, np.nan),
+                            np.complex(np.nan, np.nan)], dtype=astype)
+        assert_equal(np.abs(arr[::stride]), abs_true[::stride])
+        with np.errstate(invalid='ignore'):
+            assert_equal(np.square(arr[::stride]), sq_true[::stride])
+
+class TestComplexAbsoluteAVX(object):
+    @pytest.mark.parametrize("arraysize", [1,2,3,4,5,6,7,8,9,10,11,13,15,17,18,19])
+    @pytest.mark.parametrize("stride", [-4,-3,-2,-1,1,2,3,4])
+    @pytest.mark.parametrize("astype", [np.complex64, np.complex128])
+    # test to ensure masking and strides work as intended in the AVX implementation
+    def test_array(self, arraysize, stride, astype):
+        arr = np.ones(arraysize, dtype=astype)
+        abs_true = np.ones(arraysize, dtype=arr.real.dtype)
+        assert_equal(np.abs(arr[::stride]), abs_true[::stride])
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
index ad508e85d..0f2d082c5 100644
--- a/numpy/lib/arraysetops.py
+++ b/numpy/lib/arraysetops.py
@@ -251,9 +251,9 @@ def unique(ar, return_index=False, return_inverse=False,
     >>> u
     array([1, 2, 3, 4, 6])
     >>> indices
-    array([0, 1, 4, ..., 1, 2, 1])
+    array([0, 1, 4, 3, 1, 2, 1])
     >>> u[indices]
-    array([1, 2, 6, ..., 2, 3, 2])
+    array([1, 2, 6, 4, 2, 3, 2])
 
     """
     ar = np.asanyarray(ar)