diff options
| author | Sayed Adel <seiko@imavr.com> | 2020-06-13 18:15:25 +0200 |
|---|---|---|
| committer | Sayed Adel <seiko@imavr.com> | 2020-06-15 22:48:29 +0200 |
| commit | da21d28ef69e65c5bfef8dc22840fe16fec52540 (patch) | |
| tree | dcff74da7f8712f0328d3c1ed49527a0fd6f131d /numpy/distutils/checks | |
| parent | 5345c2575a28fa2dfbbec83c99636669476c2745 (diff) | |
| download | numpy-da21d28ef69e65c5bfef8dc22840fe16fec52540.tar.gz | |
ENH: [1/7] enable multi-platform SIMD compiler optimizations
Implement new distutils class `CCompilerOpt`, used for handling
the CPU/hardware optimization, starting from parsing the
command arguments, to managing the relationship between the CPU baseline
and dispatch-able features, also generating the required C headers
and ending with compiling the sources with proper compiler's flags.
`CCompilerOpt` mainly used as a helper class for `CCompiler`,
and doesn't provide any runtime detection for the CPU features,
instead only focuses on the compiler side, but it generates
abstract C headers that can be used later for the final
runtime dispatching process.
Diffstat (limited to 'numpy/distutils/checks')
32 files changed, 368 insertions, 0 deletions
diff --git a/numpy/distutils/checks/cpu_asimd.c b/numpy/distutils/checks/cpu_asimd.c new file mode 100644 index 000000000..8df556b6c --- /dev/null +++ b/numpy/distutils/checks/cpu_asimd.c @@ -0,0 +1,25 @@ +#ifdef _MSC_VER + #include <Intrin.h> +#endif +#include <arm_neon.h> + +int main(void) +{ + float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f); + /* MAXMIN */ + int ret = (int)vgetq_lane_f32(vmaxnmq_f32(v1, v2), 0); + ret += (int)vgetq_lane_f32(vminnmq_f32(v1, v2), 0); + /* ROUNDING */ + ret += (int)vgetq_lane_f32(vrndq_f32(v1), 0); +#ifdef __aarch64__ + { + float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0); + /* MAXMIN */ + ret += (int)vgetq_lane_f64(vmaxnmq_f64(vd1, vd2), 0); + ret += (int)vgetq_lane_f64(vminnmq_f64(vd1, vd2), 0); + /* ROUNDING */ + ret += (int)vgetq_lane_f64(vrndq_f64(vd1), 0); + } +#endif + return ret; +} diff --git a/numpy/distutils/checks/cpu_asimddp.c b/numpy/distutils/checks/cpu_asimddp.c new file mode 100644 index 000000000..0158d1354 --- /dev/null +++ b/numpy/distutils/checks/cpu_asimddp.c @@ -0,0 +1,15 @@ +#ifdef _MSC_VER + #include <Intrin.h> +#endif +#include <arm_neon.h> + +int main(void) +{ + uint8x16_t v1 = vdupq_n_u8((unsigned char)1), v2 = vdupq_n_u8((unsigned char)2); + uint32x4_t va = vdupq_n_u32(3); + int ret = (int)vgetq_lane_u32(vdotq_u32(va, v1, v2), 0); +#ifdef __aarch64__ + ret += (int)vgetq_lane_u32(vdotq_laneq_u32(va, v1, v2, 0), 0); +#endif + return ret; +} diff --git a/numpy/distutils/checks/cpu_asimdfhm.c b/numpy/distutils/checks/cpu_asimdfhm.c new file mode 100644 index 000000000..bb437aa40 --- /dev/null +++ b/numpy/distutils/checks/cpu_asimdfhm.c @@ -0,0 +1,17 @@ +#ifdef _MSC_VER + #include <Intrin.h> +#endif +#include <arm_neon.h> + +int main(void) +{ + float16x8_t vhp = vdupq_n_f16((float16_t)1); + float16x4_t vlhp = vdup_n_f16((float16_t)1); + float32x4_t vf = vdupq_n_f32(1.0f); + float32x2_t vlf = vdup_n_f32(1.0f); + + int ret = (int)vget_lane_f32(vfmlal_low_u32(vlf, vlhp, vlhp), 0); + ret += (int)vgetq_lane_f32(vfmlslq_high_u32(vf, vhp, vhp), 0); + + return ret; +} diff --git a/numpy/distutils/checks/cpu_asimdhp.c b/numpy/distutils/checks/cpu_asimdhp.c new file mode 100644 index 000000000..80b94000f --- /dev/null +++ b/numpy/distutils/checks/cpu_asimdhp.c @@ -0,0 +1,14 @@ +#ifdef _MSC_VER + #include <Intrin.h> +#endif +#include <arm_neon.h> + +int main(void) +{ + float16x8_t vhp = vdupq_n_f16((float16_t)-1); + float16x4_t vlhp = vdup_n_f16((float16_t)-1); + + int ret = (int)vgetq_lane_f16(vabdq_f16(vhp, vhp), 0); + ret += (int)vget_lane_f16(vabd_f16(vlhp, vlhp), 0); + return ret; +} diff --git a/numpy/distutils/checks/cpu_avx.c b/numpy/distutils/checks/cpu_avx.c new file mode 100644 index 000000000..737c0d2e9 --- /dev/null +++ b/numpy/distutils/checks/cpu_avx.c @@ -0,0 +1,7 @@ +#include <immintrin.h> + +int main(void) +{ + __m256 a = _mm256_add_ps(_mm256_setzero_ps(), _mm256_setzero_ps()); + return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a)); +} diff --git a/numpy/distutils/checks/cpu_avx2.c b/numpy/distutils/checks/cpu_avx2.c new file mode 100644 index 000000000..dfb11fd79 --- /dev/null +++ b/numpy/distutils/checks/cpu_avx2.c @@ -0,0 +1,7 @@ +#include <immintrin.h> + +int main(void) +{ + __m256i a = _mm256_abs_epi16(_mm256_setzero_si256()); + return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); +} diff --git a/numpy/distutils/checks/cpu_avx512_clx.c b/numpy/distutils/checks/cpu_avx512_clx.c new file mode 100644 index 000000000..71dad83a7 --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512_clx.c @@ -0,0 +1,8 @@ +#include <immintrin.h> + +int main(void) +{ + /* VNNI */ + __m512i a = _mm512_dpbusd_epi32(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512()); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); +} diff --git a/numpy/distutils/checks/cpu_avx512_cnl.c b/numpy/distutils/checks/cpu_avx512_cnl.c new file mode 100644 index 000000000..dfab4436d --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512_cnl.c @@ -0,0 +1,10 @@ +#include <immintrin.h> + +int main(void) +{ + /* IFMA */ + __m512i a = _mm512_madd52hi_epu64(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512()); + /* VMBI */ + a = _mm512_permutex2var_epi8(a, _mm512_setzero_si512(), _mm512_setzero_si512()); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); +} diff --git a/numpy/distutils/checks/cpu_avx512_icl.c b/numpy/distutils/checks/cpu_avx512_icl.c new file mode 100644 index 000000000..cf2706b3b --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512_icl.c @@ -0,0 +1,12 @@ +#include <immintrin.h> + +int main(void) +{ + /* VBMI2 */ + __m512i a = _mm512_shrdv_epi64(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512()); + /* BITLAG */ + a = _mm512_popcnt_epi8(a); + /* VPOPCNTDQ */ + a = _mm512_popcnt_epi64(a); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); +} diff --git a/numpy/distutils/checks/cpu_avx512_knl.c b/numpy/distutils/checks/cpu_avx512_knl.c new file mode 100644 index 000000000..0699f37a6 --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512_knl.c @@ -0,0 +1,11 @@ +#include <immintrin.h> + +int main(void) +{ + int base[128]; + /* ER */ + __m512i a = _mm512_castpd_si512(_mm512_exp2a23_pd(_mm512_setzero_pd())); + /* PF */ + _mm512_mask_prefetch_i64scatter_pd(base, _mm512_cmpeq_epi64_mask(a, a), a, 1, _MM_HINT_T1); + return base[0]; +} diff --git a/numpy/distutils/checks/cpu_avx512_knm.c b/numpy/distutils/checks/cpu_avx512_knm.c new file mode 100644 index 000000000..db61b4bfa --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512_knm.c @@ -0,0 +1,17 @@ +#include <immintrin.h> + +int main(void) +{ + __m512i a = _mm512_setzero_si512(); + __m512 b = _mm512_setzero_ps(); + + /* 4FMAPS */ + b = _mm512_4fmadd_ps(b, b, b, b, b, NULL); + /* 4VNNIW */ + a = _mm512_4dpwssd_epi32(a, a, a, a, a, NULL); + /* VPOPCNTDQ */ + a = _mm512_popcnt_epi64(a); + + a = _mm512_add_epi32(a, _mm512_castps_si512(b)); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); +} diff --git a/numpy/distutils/checks/cpu_avx512_skx.c b/numpy/distutils/checks/cpu_avx512_skx.c new file mode 100644 index 000000000..1d5e15b5e --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512_skx.c @@ -0,0 +1,12 @@ +#include <immintrin.h> + +int main(void) +{ + /* VL */ + __m256i a = _mm256_abs_epi64(_mm256_setzero_si256()); + /* DQ */ + __m512i b = _mm512_broadcast_i32x8(a); + /* BW */ + b = _mm512_abs_epi16(b); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(b)); +} diff --git a/numpy/distutils/checks/cpu_avx512cd.c b/numpy/distutils/checks/cpu_avx512cd.c new file mode 100644 index 000000000..61bef6b82 --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512cd.c @@ -0,0 +1,7 @@ +#include <immintrin.h> + +int main(void) +{ + __m512i a = _mm512_lzcnt_epi32(_mm512_setzero_si512()); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); +} diff --git a/numpy/distutils/checks/cpu_avx512f.c b/numpy/distutils/checks/cpu_avx512f.c new file mode 100644 index 000000000..f60cc09dd --- /dev/null +++ b/numpy/distutils/checks/cpu_avx512f.c @@ -0,0 +1,7 @@ +#include <immintrin.h> + +int main(void) +{ + __m512i a = _mm512_abs_epi32(_mm512_setzero_si512()); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); +} diff --git a/numpy/distutils/checks/cpu_f16c.c b/numpy/distutils/checks/cpu_f16c.c new file mode 100644 index 000000000..a5a343e2d --- /dev/null +++ b/numpy/distutils/checks/cpu_f16c.c @@ -0,0 +1,9 @@ +#include <emmintrin.h> +#include <immintrin.h> + +int main(void) +{ + __m128 a = _mm_cvtph_ps(_mm_setzero_si128()); + __m256 a8 = _mm256_cvtph_ps(_mm_setzero_si128()); + return (int)(_mm_cvtss_f32(a) + _mm_cvtss_f32(_mm256_castps256_ps128(a8))); +} diff --git a/numpy/distutils/checks/cpu_fma3.c b/numpy/distutils/checks/cpu_fma3.c new file mode 100644 index 000000000..cf34c6cb1 --- /dev/null +++ b/numpy/distutils/checks/cpu_fma3.c @@ -0,0 +1,8 @@ +#include <xmmintrin.h> +#include <immintrin.h> + +int main(void) +{ + __m256 a = _mm256_fmadd_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps()); + return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a)); +} diff --git a/numpy/distutils/checks/cpu_fma4.c b/numpy/distutils/checks/cpu_fma4.c new file mode 100644 index 000000000..1ad717033 --- /dev/null +++ b/numpy/distutils/checks/cpu_fma4.c @@ -0,0 +1,12 @@ +#include <immintrin.h> +#ifdef _MSC_VER + #include <ammintrin.h> +#else + #include <x86intrin.h> +#endif + +int main(void) +{ + __m256 a = _mm256_macc_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps()); + return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a)); +} diff --git a/numpy/distutils/checks/cpu_neon.c b/numpy/distutils/checks/cpu_neon.c new file mode 100644 index 000000000..4eab1f384 --- /dev/null +++ b/numpy/distutils/checks/cpu_neon.c @@ -0,0 +1,15 @@ +#ifdef _MSC_VER + #include <Intrin.h> +#endif +#include <arm_neon.h> + +int main(void) +{ + float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f); + int ret = (int)vgetq_lane_f32(vmulq_f32(v1, v2), 0); +#ifdef __aarch64__ + float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0); + ret += (int)vgetq_lane_f64(vmulq_f64(vd1, vd2), 0); +#endif + return ret; +} diff --git a/numpy/distutils/checks/cpu_neon_fp16.c b/numpy/distutils/checks/cpu_neon_fp16.c new file mode 100644 index 000000000..745d2e793 --- /dev/null +++ b/numpy/distutils/checks/cpu_neon_fp16.c @@ -0,0 +1,11 @@ +#ifdef _MSC_VER + #include <Intrin.h> +#endif +#include <arm_neon.h> + +int main(void) +{ + short z4[] = {0, 0, 0, 0, 0, 0, 0, 0}; + float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16((const short*)z4)); + return (int)vgetq_lane_f32(v_z4, 0); +} diff --git a/numpy/distutils/checks/cpu_neon_vfpv4.c b/numpy/distutils/checks/cpu_neon_vfpv4.c new file mode 100644 index 000000000..45f7b5d69 --- /dev/null +++ b/numpy/distutils/checks/cpu_neon_vfpv4.c @@ -0,0 +1,19 @@ +#ifdef _MSC_VER + #include <Intrin.h> +#endif +#include <arm_neon.h> + +int main(void) +{ + float32x4_t v1 = vdupq_n_f32(1.0f); + float32x4_t v2 = vdupq_n_f32(2.0f); + float32x4_t v3 = vdupq_n_f32(3.0f); + int ret = (int)vgetq_lane_f32(vfmaq_f32(v1, v2, v3), 0); +#ifdef __aarch64__ + float64x2_t vd1 = vdupq_n_f64(1.0); + float64x2_t vd2 = vdupq_n_f64(2.0); + float64x2_t vd3 = vdupq_n_f64(3.0); + ret += (int)vgetq_lane_f64(vfmaq_f64(vd1, vd2, vd3), 0); +#endif + return ret; +} diff --git a/numpy/distutils/checks/cpu_popcnt.c b/numpy/distutils/checks/cpu_popcnt.c new file mode 100644 index 000000000..e6a80fb40 --- /dev/null +++ b/numpy/distutils/checks/cpu_popcnt.c @@ -0,0 +1,23 @@ +#ifdef _MSC_VER + #include <nmmintrin.h> +#else + #include <popcntintrin.h> +#endif + +int main(void) +{ + long long a = 0; + int b; +#ifdef _MSC_VER + #ifdef _M_X64 + a = _mm_popcnt_u64(1); + #endif + b = _mm_popcnt_u32(1); +#else + #ifdef __x86_64__ + a = __builtin_popcountll(1); + #endif + b = __builtin_popcount(1); +#endif + return (int)a + b; +} diff --git a/numpy/distutils/checks/cpu_sse.c b/numpy/distutils/checks/cpu_sse.c new file mode 100644 index 000000000..bb98bf63c --- /dev/null +++ b/numpy/distutils/checks/cpu_sse.c @@ -0,0 +1,7 @@ +#include <xmmintrin.h> + +int main(void) +{ + __m128 a = _mm_add_ps(_mm_setzero_ps(), _mm_setzero_ps()); + return (int)_mm_cvtss_f32(a); +} diff --git a/numpy/distutils/checks/cpu_sse2.c b/numpy/distutils/checks/cpu_sse2.c new file mode 100644 index 000000000..658afc9b4 --- /dev/null +++ b/numpy/distutils/checks/cpu_sse2.c @@ -0,0 +1,7 @@ +#include <emmintrin.h> + +int main(void) +{ + __m128i a = _mm_add_epi16(_mm_setzero_si128(), _mm_setzero_si128()); + return _mm_cvtsi128_si32(a); +} diff --git a/numpy/distutils/checks/cpu_sse3.c b/numpy/distutils/checks/cpu_sse3.c new file mode 100644 index 000000000..aece1e601 --- /dev/null +++ b/numpy/distutils/checks/cpu_sse3.c @@ -0,0 +1,7 @@ +#include <pmmintrin.h> + +int main(void) +{ + __m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps()); + return (int)_mm_cvtss_f32(a); +} diff --git a/numpy/distutils/checks/cpu_sse41.c b/numpy/distutils/checks/cpu_sse41.c new file mode 100644 index 000000000..bfdb9feac --- /dev/null +++ b/numpy/distutils/checks/cpu_sse41.c @@ -0,0 +1,7 @@ +#include <smmintrin.h> + +int main(void) +{ + __m128 a = _mm_floor_ps(_mm_setzero_ps()); + return (int)_mm_cvtss_f32(a); +} diff --git a/numpy/distutils/checks/cpu_sse42.c b/numpy/distutils/checks/cpu_sse42.c new file mode 100644 index 000000000..24f5d93fe --- /dev/null +++ b/numpy/distutils/checks/cpu_sse42.c @@ -0,0 +1,7 @@ +#include <smmintrin.h> + +int main(void) +{ + __m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps()); + return (int)_mm_cvtss_f32(a); +} diff --git a/numpy/distutils/checks/cpu_ssse3.c b/numpy/distutils/checks/cpu_ssse3.c new file mode 100644 index 000000000..ad0abc1e6 --- /dev/null +++ b/numpy/distutils/checks/cpu_ssse3.c @@ -0,0 +1,7 @@ +#include <tmmintrin.h> + +int main(void) +{ + __m128i a = _mm_hadd_epi16(_mm_setzero_si128(), _mm_setzero_si128()); + return (int)_mm_cvtsi128_si32(a); +} diff --git a/numpy/distutils/checks/cpu_vsx.c b/numpy/distutils/checks/cpu_vsx.c new file mode 100644 index 000000000..0b3f30d6a --- /dev/null +++ b/numpy/distutils/checks/cpu_vsx.c @@ -0,0 +1,21 @@ +#ifndef __VSX__ + #error "VSX is not supported" +#endif +#include <altivec.h> + +#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__)) + #define vsx_ld vec_vsx_ld + #define vsx_st vec_vsx_st +#else + #define vsx_ld vec_xl + #define vsx_st vec_xst +#endif + +int main(void) +{ + unsigned int zout[4]; + unsigned int z4[] = {0, 0, 0, 0}; + __vector unsigned int v_z4 = vsx_ld(0, z4); + vsx_st(v_z4, 0, zout); + return zout[0]; +} diff --git a/numpy/distutils/checks/cpu_vsx2.c b/numpy/distutils/checks/cpu_vsx2.c new file mode 100644 index 000000000..410fb29d6 --- /dev/null +++ b/numpy/distutils/checks/cpu_vsx2.c @@ -0,0 +1,13 @@ +#ifndef __VSX__ + #error "VSX is not supported" +#endif +#include <altivec.h> + +typedef __vector unsigned long long v_uint64x2; + +int main(void) +{ + v_uint64x2 z2 = (v_uint64x2){0, 0}; + z2 = (v_uint64x2)vec_cmpeq(z2, z2); + return (int)vec_extract(z2, 0); +} diff --git a/numpy/distutils/checks/cpu_vsx3.c b/numpy/distutils/checks/cpu_vsx3.c new file mode 100644 index 000000000..857526535 --- /dev/null +++ b/numpy/distutils/checks/cpu_vsx3.c @@ -0,0 +1,13 @@ +#ifndef __VSX__ + #error "VSX is not supported" +#endif +#include <altivec.h> + +typedef __vector unsigned int v_uint32x4; + +int main(void) +{ + v_uint32x4 z4 = (v_uint32x4){0, 0, 0, 0}; + z4 = vec_absd(z4, z4); + return (int)vec_extract(z4, 0); +} diff --git a/numpy/distutils/checks/cpu_xop.c b/numpy/distutils/checks/cpu_xop.c new file mode 100644 index 000000000..51d70cf2b --- /dev/null +++ b/numpy/distutils/checks/cpu_xop.c @@ -0,0 +1,12 @@ +#include <immintrin.h> +#ifdef _MSC_VER + #include <ammintrin.h> +#else + #include <x86intrin.h> +#endif + +int main(void) +{ + __m128i a = _mm_comge_epu32(_mm_setzero_si128(), _mm_setzero_si128()); + return _mm_cvtsi128_si32(a); +} diff --git a/numpy/distutils/checks/test_flags.c b/numpy/distutils/checks/test_flags.c new file mode 100644 index 000000000..4cd09d42a --- /dev/null +++ b/numpy/distutils/checks/test_flags.c @@ -0,0 +1 @@ +int test_flags; |
