ENH: [1/7] enable multi-platform SIMD compiler optimizations

Implement new distutils class `CCompilerOpt`, used for handling the CPU/hardware optimization, starting from parsing the command arguments, to managing the relationship between the CPU baseline and dispatch-able features, also generating the required C headers and ending with compiling the sources with proper compiler's flags. `CCompilerOpt` mainly used as a helper class for `CCompiler`, and doesn't provide any runtime detection for the CPU features, instead only focuses on the compiler side, but it generates abstract C headers that can be used later for the final runtime dispatching process.
author: Sayed Adel <seiko@imavr.com> 2020-06-13 18:15:25 +0200
committer: Sayed Adel <seiko@imavr.com> 2020-06-15 22:48:29 +0200
commit: da21d28ef69e65c5bfef8dc22840fe16fec52540 (patch)
tree: dcff74da7f8712f0328d3c1ed49527a0fd6f131d /numpy/distutils/checks
parent: 5345c2575a28fa2dfbbec83c99636669476c2745 (diff)
download: numpy-da21d28ef69e65c5bfef8dc22840fe16fec52540.tar.gz
32 files changed, 368 insertions, 0 deletions
diff --git a/numpy/distutils/checks/cpu_asimd.c b/numpy/distutils/checks/cpu_asimd.c
new file mode 100644
index 000000000..8df556b6c
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimd.c
@@ -0,0 +1,25 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f);
+    /* MAXMIN */
+    int ret  = (int)vgetq_lane_f32(vmaxnmq_f32(v1, v2), 0);
+        ret += (int)vgetq_lane_f32(vminnmq_f32(v1, v2), 0);
+    /* ROUNDING */
+    ret += (int)vgetq_lane_f32(vrndq_f32(v1), 0);
+#ifdef __aarch64__
+    {
+        float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0);
+        /* MAXMIN */
+        ret += (int)vgetq_lane_f64(vmaxnmq_f64(vd1, vd2), 0);
+        ret += (int)vgetq_lane_f64(vminnmq_f64(vd1, vd2), 0);
+        /* ROUNDING */
+        ret += (int)vgetq_lane_f64(vrndq_f64(vd1), 0);
+    }
+#endif
+    return ret;
+}
diff --git a/numpy/distutils/checks/cpu_asimddp.c b/numpy/distutils/checks/cpu_asimddp.c
new file mode 100644
index 000000000..0158d1354
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimddp.c
@@ -0,0 +1,15 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    uint8x16_t v1 = vdupq_n_u8((unsigned char)1), v2 = vdupq_n_u8((unsigned char)2);
+    uint32x4_t va = vdupq_n_u32(3);
+    int ret = (int)vgetq_lane_u32(vdotq_u32(va, v1, v2), 0);
+#ifdef __aarch64__
+    ret += (int)vgetq_lane_u32(vdotq_laneq_u32(va, v1, v2, 0), 0);
+#endif
+    return ret;
+}
diff --git a/numpy/distutils/checks/cpu_asimdfhm.c b/numpy/distutils/checks/cpu_asimdfhm.c
new file mode 100644
index 000000000..bb437aa40
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimdfhm.c
@@ -0,0 +1,17 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    float16x8_t vhp  = vdupq_n_f16((float16_t)1);
+    float16x4_t vlhp = vdup_n_f16((float16_t)1);
+    float32x4_t vf   = vdupq_n_f32(1.0f);
+    float32x2_t vlf  = vdup_n_f32(1.0f);
+
+    int ret  = (int)vget_lane_f32(vfmlal_low_u32(vlf, vlhp, vlhp), 0);
+        ret += (int)vgetq_lane_f32(vfmlslq_high_u32(vf, vhp, vhp), 0);
+
+    return ret;
+}
diff --git a/numpy/distutils/checks/cpu_asimdhp.c b/numpy/distutils/checks/cpu_asimdhp.c
new file mode 100644
index 000000000..80b94000f
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimdhp.c
@@ -0,0 +1,14 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    float16x8_t vhp  = vdupq_n_f16((float16_t)-1);
+    float16x4_t vlhp = vdup_n_f16((float16_t)-1);
+
+    int ret  =  (int)vgetq_lane_f16(vabdq_f16(vhp, vhp), 0);
+        ret  += (int)vget_lane_f16(vabd_f16(vlhp, vlhp), 0);
+    return ret;
+}
diff --git a/numpy/distutils/checks/cpu_avx.c b/numpy/distutils/checks/cpu_avx.c
new file mode 100644
index 000000000..737c0d2e9
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    __m256 a = _mm256_add_ps(_mm256_setzero_ps(), _mm256_setzero_ps());
+    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx2.c b/numpy/distutils/checks/cpu_avx2.c
new file mode 100644
index 000000000..dfb11fd79
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx2.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    __m256i a = _mm256_abs_epi16(_mm256_setzero_si256());
+    return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_clx.c b/numpy/distutils/checks/cpu_avx512_clx.c
new file mode 100644
index 000000000..71dad83a7
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_clx.c
@@ -0,0 +1,8 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    /* VNNI */
+    __m512i a = _mm512_dpbusd_epi32(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512());
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_cnl.c b/numpy/distutils/checks/cpu_avx512_cnl.c
new file mode 100644
index 000000000..dfab4436d
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_cnl.c
@@ -0,0 +1,10 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    /* IFMA */
+    __m512i a = _mm512_madd52hi_epu64(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512());
+    /* VMBI */
+    a = _mm512_permutex2var_epi8(a, _mm512_setzero_si512(), _mm512_setzero_si512());
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_icl.c b/numpy/distutils/checks/cpu_avx512_icl.c
new file mode 100644
index 000000000..cf2706b3b
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_icl.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    /* VBMI2 */
+    __m512i a = _mm512_shrdv_epi64(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512());
+    /* BITLAG */
+    a = _mm512_popcnt_epi8(a);
+    /* VPOPCNTDQ */
+    a = _mm512_popcnt_epi64(a);
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_knl.c b/numpy/distutils/checks/cpu_avx512_knl.c
new file mode 100644
index 000000000..0699f37a6
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_knl.c
@@ -0,0 +1,11 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    int base[128];
+    /* ER */
+    __m512i a = _mm512_castpd_si512(_mm512_exp2a23_pd(_mm512_setzero_pd()));
+    /* PF */
+    _mm512_mask_prefetch_i64scatter_pd(base, _mm512_cmpeq_epi64_mask(a, a), a, 1, _MM_HINT_T1);
+    return base[0];
+}
diff --git a/numpy/distutils/checks/cpu_avx512_knm.c b/numpy/distutils/checks/cpu_avx512_knm.c
new file mode 100644
index 000000000..db61b4bfa
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_knm.c
@@ -0,0 +1,17 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    __m512i a = _mm512_setzero_si512();
+    __m512 b = _mm512_setzero_ps();
+
+    /* 4FMAPS */
+    b = _mm512_4fmadd_ps(b, b, b, b, b, NULL);
+    /* 4VNNIW */
+    a = _mm512_4dpwssd_epi32(a, a, a, a, a, NULL);
+    /* VPOPCNTDQ */
+    a = _mm512_popcnt_epi64(a);
+
+    a = _mm512_add_epi32(a, _mm512_castps_si512(b));
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_skx.c b/numpy/distutils/checks/cpu_avx512_skx.c
new file mode 100644
index 000000000..1d5e15b5e
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_skx.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    /* VL */
+    __m256i a = _mm256_abs_epi64(_mm256_setzero_si256());
+    /* DQ */
+    __m512i b = _mm512_broadcast_i32x8(a);
+    /* BW */
+    b = _mm512_abs_epi16(b);
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(b));
+}
diff --git a/numpy/distutils/checks/cpu_avx512cd.c b/numpy/distutils/checks/cpu_avx512cd.c
new file mode 100644
index 000000000..61bef6b82
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512cd.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    __m512i a = _mm512_lzcnt_epi32(_mm512_setzero_si512());
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512f.c b/numpy/distutils/checks/cpu_avx512f.c
new file mode 100644
index 000000000..f60cc09dd
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512f.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    __m512i a = _mm512_abs_epi32(_mm512_setzero_si512());
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_f16c.c b/numpy/distutils/checks/cpu_f16c.c
new file mode 100644
index 000000000..a5a343e2d
--- /dev/null
+++ b/numpy/distutils/checks/cpu_f16c.c
@@ -0,0 +1,9 @@
+#include <emmintrin.h>
+#include <immintrin.h>
+
+int main(void)
+{
+    __m128 a  = _mm_cvtph_ps(_mm_setzero_si128());
+    __m256 a8 = _mm256_cvtph_ps(_mm_setzero_si128());
+    return (int)(_mm_cvtss_f32(a) + _mm_cvtss_f32(_mm256_castps256_ps128(a8)));
+}
diff --git a/numpy/distutils/checks/cpu_fma3.c b/numpy/distutils/checks/cpu_fma3.c
new file mode 100644
index 000000000..cf34c6cb1
--- /dev/null
+++ b/numpy/distutils/checks/cpu_fma3.c
@@ -0,0 +1,8 @@
+#include <xmmintrin.h>
+#include <immintrin.h>
+
+int main(void)
+{
+    __m256 a = _mm256_fmadd_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps());
+    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
diff --git a/numpy/distutils/checks/cpu_fma4.c b/numpy/distutils/checks/cpu_fma4.c
new file mode 100644
index 000000000..1ad717033
--- /dev/null
+++ b/numpy/distutils/checks/cpu_fma4.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+#ifdef _MSC_VER
+    #include <ammintrin.h>
+#else
+    #include <x86intrin.h>
+#endif
+
+int main(void)
+{
+    __m256 a = _mm256_macc_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps());
+    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
diff --git a/numpy/distutils/checks/cpu_neon.c b/numpy/distutils/checks/cpu_neon.c
new file mode 100644
index 000000000..4eab1f384
--- /dev/null
+++ b/numpy/distutils/checks/cpu_neon.c
@@ -0,0 +1,15 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f);
+    int ret = (int)vgetq_lane_f32(vmulq_f32(v1, v2), 0);
+#ifdef __aarch64__
+    float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0);
+    ret += (int)vgetq_lane_f64(vmulq_f64(vd1, vd2), 0);
+#endif
+    return ret;
+}
diff --git a/numpy/distutils/checks/cpu_neon_fp16.c b/numpy/distutils/checks/cpu_neon_fp16.c
new file mode 100644
index 000000000..745d2e793
--- /dev/null
+++ b/numpy/distutils/checks/cpu_neon_fp16.c
@@ -0,0 +1,11 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    short z4[] = {0, 0, 0, 0, 0, 0, 0, 0};
+    float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16((const short*)z4));
+    return (int)vgetq_lane_f32(v_z4, 0);
+}
diff --git a/numpy/distutils/checks/cpu_neon_vfpv4.c b/numpy/distutils/checks/cpu_neon_vfpv4.c
new file mode 100644
index 000000000..45f7b5d69
--- /dev/null
+++ b/numpy/distutils/checks/cpu_neon_vfpv4.c
@@ -0,0 +1,19 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    float32x4_t v1 = vdupq_n_f32(1.0f);
+    float32x4_t v2 = vdupq_n_f32(2.0f);
+    float32x4_t v3 = vdupq_n_f32(3.0f);
+    int ret = (int)vgetq_lane_f32(vfmaq_f32(v1, v2, v3), 0);
+#ifdef __aarch64__
+    float64x2_t vd1 = vdupq_n_f64(1.0);
+    float64x2_t vd2 = vdupq_n_f64(2.0);
+    float64x2_t vd3 = vdupq_n_f64(3.0);
+    ret += (int)vgetq_lane_f64(vfmaq_f64(vd1, vd2, vd3), 0);
+#endif
+    return ret;
+}
diff --git a/numpy/distutils/checks/cpu_popcnt.c b/numpy/distutils/checks/cpu_popcnt.c
new file mode 100644
index 000000000..e6a80fb40
--- /dev/null
+++ b/numpy/distutils/checks/cpu_popcnt.c
@@ -0,0 +1,23 @@
+#ifdef _MSC_VER
+    #include <nmmintrin.h>
+#else
+    #include <popcntintrin.h>
+#endif
+
+int main(void)
+{
+    long long a = 0;
+    int b;
+#ifdef _MSC_VER
+    #ifdef _M_X64
+    a = _mm_popcnt_u64(1);
+    #endif
+    b = _mm_popcnt_u32(1);
+#else
+    #ifdef __x86_64__
+    a = __builtin_popcountll(1);
+    #endif
+    b = __builtin_popcount(1);
+#endif
+    return (int)a + b;
+}
diff --git a/numpy/distutils/checks/cpu_sse.c b/numpy/distutils/checks/cpu_sse.c
new file mode 100644
index 000000000..bb98bf63c
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse.c
@@ -0,0 +1,7 @@
+#include <xmmintrin.h>
+
+int main(void)
+{
+    __m128 a = _mm_add_ps(_mm_setzero_ps(), _mm_setzero_ps());
+    return (int)_mm_cvtss_f32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse2.c b/numpy/distutils/checks/cpu_sse2.c
new file mode 100644
index 000000000..658afc9b4
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse2.c
@@ -0,0 +1,7 @@
+#include <emmintrin.h>
+
+int main(void)
+{
+    __m128i a = _mm_add_epi16(_mm_setzero_si128(), _mm_setzero_si128());
+    return _mm_cvtsi128_si32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse3.c b/numpy/distutils/checks/cpu_sse3.c
new file mode 100644
index 000000000..aece1e601
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse3.c
@@ -0,0 +1,7 @@
+#include <pmmintrin.h>
+
+int main(void)
+{
+    __m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps());
+    return (int)_mm_cvtss_f32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse41.c b/numpy/distutils/checks/cpu_sse41.c
new file mode 100644
index 000000000..bfdb9feac
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse41.c
@@ -0,0 +1,7 @@
+#include <smmintrin.h>
+
+int main(void)
+{
+    __m128 a = _mm_floor_ps(_mm_setzero_ps());
+    return (int)_mm_cvtss_f32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse42.c b/numpy/distutils/checks/cpu_sse42.c
new file mode 100644
index 000000000..24f5d93fe
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse42.c
@@ -0,0 +1,7 @@
+#include <smmintrin.h>
+
+int main(void)
+{
+    __m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps());
+    return (int)_mm_cvtss_f32(a);
+}
diff --git a/numpy/distutils/checks/cpu_ssse3.c b/numpy/distutils/checks/cpu_ssse3.c
new file mode 100644
index 000000000..ad0abc1e6
--- /dev/null
+++ b/numpy/distutils/checks/cpu_ssse3.c
@@ -0,0 +1,7 @@
+#include <tmmintrin.h>
+
+int main(void)
+{
+    __m128i a = _mm_hadd_epi16(_mm_setzero_si128(), _mm_setzero_si128());
+    return (int)_mm_cvtsi128_si32(a);
+}
diff --git a/numpy/distutils/checks/cpu_vsx.c b/numpy/distutils/checks/cpu_vsx.c
new file mode 100644
index 000000000..0b3f30d6a
--- /dev/null
+++ b/numpy/distutils/checks/cpu_vsx.c
@@ -0,0 +1,21 @@
+#ifndef __VSX__
+    #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
+    #define vsx_ld  vec_vsx_ld
+    #define vsx_st  vec_vsx_st
+#else
+    #define vsx_ld  vec_xl
+    #define vsx_st  vec_xst
+#endif
+
+int main(void)
+{
+    unsigned int zout[4];
+    unsigned int z4[] = {0, 0, 0, 0};
+    __vector unsigned int v_z4 = vsx_ld(0, z4);
+    vsx_st(v_z4, 0, zout);
+    return zout[0];
+}
diff --git a/numpy/distutils/checks/cpu_vsx2.c b/numpy/distutils/checks/cpu_vsx2.c
new file mode 100644
index 000000000..410fb29d6
--- /dev/null
+++ b/numpy/distutils/checks/cpu_vsx2.c
@@ -0,0 +1,13 @@
+#ifndef __VSX__
+    #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+typedef __vector unsigned long long v_uint64x2;
+
+int main(void)
+{
+    v_uint64x2 z2 = (v_uint64x2){0, 0};
+    z2 = (v_uint64x2)vec_cmpeq(z2, z2);
+    return (int)vec_extract(z2, 0);
+}
diff --git a/numpy/distutils/checks/cpu_vsx3.c b/numpy/distutils/checks/cpu_vsx3.c
new file mode 100644
index 000000000..857526535
--- /dev/null
+++ b/numpy/distutils/checks/cpu_vsx3.c
@@ -0,0 +1,13 @@
+#ifndef __VSX__
+    #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+typedef __vector unsigned int v_uint32x4;
+
+int main(void)
+{
+    v_uint32x4 z4 = (v_uint32x4){0, 0, 0, 0};
+    z4 = vec_absd(z4, z4);
+    return (int)vec_extract(z4, 0);
+}
diff --git a/numpy/distutils/checks/cpu_xop.c b/numpy/distutils/checks/cpu_xop.c
new file mode 100644
index 000000000..51d70cf2b
--- /dev/null
+++ b/numpy/distutils/checks/cpu_xop.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+#ifdef _MSC_VER
+    #include <ammintrin.h>
+#else
+    #include <x86intrin.h>
+#endif
+
+int main(void)
+{
+    __m128i a = _mm_comge_epu32(_mm_setzero_si128(), _mm_setzero_si128());
+    return _mm_cvtsi128_si32(a);
+}
diff --git a/numpy/distutils/checks/test_flags.c b/numpy/distutils/checks/test_flags.c
new file mode 100644
index 000000000..4cd09d42a
--- /dev/null
+++ b/numpy/distutils/checks/test_flags.c
@@ -0,0 +1 @@
+int test_flags;
author	Sayed Adel <seiko@imavr.com>	2020-06-13 18:15:25 +0200
committer	Sayed Adel <seiko@imavr.com>	2020-06-15 22:48:29 +0200
commit	da21d28ef69e65c5bfef8dc22840fe16fec52540 (patch)
tree	dcff74da7f8712f0328d3c1ed49527a0fd6f131d /numpy/distutils/checks
parent	5345c2575a28fa2dfbbec83c99636669476c2745 (diff)
download	numpy-da21d28ef69e65c5bfef8dc22840fe16fec52540.tar.gz