diff options
| -rw-r--r-- | numpy/core/src/common/simd/avx2/avx2.h | 1 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/avx2/math.h | 40 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/avx512/avx512.h | 1 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/avx512/math.h | 49 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/neon/arithmetic.h | 22 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/neon/math.h | 86 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/neon/neon.h | 1 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/sse/math.h | 40 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/sse/sse.h | 1 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/vsx/math.h | 36 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/vsx/vsx.h | 1 |
11 files changed, 273 insertions, 5 deletions
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h index 0641f2314..6f0d3c0d9 100644 --- a/numpy/core/src/common/simd/avx2/avx2.h +++ b/numpy/core/src/common/simd/avx2/avx2.h @@ -67,3 +67,4 @@ typedef struct { __m256d val[3]; } npyv_f64x3; #include "operators.h" #include "conversion.h" #include "arithmetic.h" +#include "math.h" diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h new file mode 100644 index 000000000..b3eba6f5f --- /dev/null +++ b/numpy/core/src/common/simd/avx2/math.h @@ -0,0 +1,40 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX2_MATH_H +#define _NPY_SIMD_AVX2_MATH_H +/*************************** + * Elementary + ***************************/ +// Square root +#define npyv_sqrt_f32 _mm256_sqrt_ps +#define npyv_sqrt_f64 _mm256_sqrt_pd + +// Reciprocal +NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) +{ return _mm256_div_ps(_mm256_set1_ps(1.0f), a); } +NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) +{ return _mm256_div_pd(_mm256_set1_pd(1.0), a); } + +// Absolute +NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a) +{ + return _mm256_and_ps( + a, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)) + ); +} +NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a) +{ + return _mm256_and_pd( + a, _mm256_castsi256_pd(npyv_setall_s64(0x7fffffffffffffffLL)) + ); +} + +// Square +NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) +{ return _mm256_mul_ps(a, a); } +NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) +{ return _mm256_mul_pd(a, a); } + +#endif diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h index b09d772f2..2de33765a 100644 --- a/numpy/core/src/common/simd/avx512/avx512.h +++ b/numpy/core/src/common/simd/avx512/avx512.h @@ -72,3 +72,4 @@ typedef struct { __m512d val[3]; } npyv_f64x3; #include "operators.h" #include "conversion.h" #include "arithmetic.h" +#include "math.h" diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h new file mode 100644 index 000000000..1db710670 --- /dev/null +++ b/numpy/core/src/common/simd/avx512/math.h @@ -0,0 +1,49 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX512_MATH_H +#define _NPY_SIMD_AVX512_MATH_H + +/*************************** + * Elementary + ***************************/ +// Square root +#define npyv_sqrt_f32 _mm512_sqrt_ps +#define npyv_sqrt_f64 _mm512_sqrt_pd + +// Reciprocal +NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) +{ return _mm512_div_ps(_mm512_set1_ps(1.0f), a); } +NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) +{ return _mm512_div_pd(_mm512_set1_pd(1.0), a); } + +// Absolute +NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a) +{ +#if 0 // def NPY_HAVE_AVX512DQ + return _mm512_range_ps(a, a, 8); +#else + return npyv_and_f32( + a, _mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)) + ); +#endif +} +NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a) +{ +#if 0 // def NPY_HAVE_AVX512DQ + return _mm512_range_pd(a, a, 8); +#else + return npyv_and_f64( + a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL)) + ); +#endif +} + +// Square +NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) +{ return _mm512_mul_ps(a, a); } +NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) +{ return _mm512_mul_pd(a, a); } + +#endif diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h index bc14ffb75..87e00d5d1 100644 --- a/numpy/core/src/common/simd/neon/arithmetic.h +++ b/numpy/core/src/common/simd/neon/arithmetic.h @@ -63,14 +63,26 @@ /*************************** * Division ***************************/ -#ifdef __aarch64__ +#if NPY_SIMD_F64 #define npyv_div_f32 vdivq_f32 #else - NPY_FINLINE float32x4_t npyv_div_f32(float32x4_t a, float32x4_t b) + NPY_FINLINE npyv_f32 npyv_div_f32(npyv_f32 a, npyv_f32 b) { - float32x4_t recip = vrecpeq_f32(b); - recip = vmulq_f32(vrecpsq_f32(b, recip), recip); - return vmulq_f32(a, recip); + // Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI + // estimate to 1/b + npyv_f32 recipe = vrecpeq_f32(b); + /** + * Newton-Raphson iteration: + * x[n+1] = x[n] * (2-d * x[n]) + * converges to (1/d) if x0 is the result of VRECPE applied to d. + * + * NOTE: at least 3 iterations is needed to improve precision + */ + recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe); + recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe); + recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe); + // a/b = a*recip(b) + return vmulq_f32(a, recipe); } #endif #define npyv_div_f64 vdivq_f64 diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h new file mode 100644 index 000000000..a2bbdf2a5 --- /dev/null +++ b/numpy/core/src/common/simd/neon/math.h @@ -0,0 +1,86 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_NEON_MATH_H +#define _NPY_SIMD_NEON_MATH_H + +/*************************** + * Elementary + ***************************/ +// Absolute +#define npyv_abs_f32 vabsq_f32 +#define npyv_abs_f64 vabsq_f64 + +// Square +NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) +{ return vmulq_f32(a, a); } +#if NPY_SIMD_F64 + NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) + { return vmulq_f64(a, a); } +#endif + +// Square root +#if NPY_SIMD_F64 + #define npyv_sqrt_f32 vsqrtq_f32 + #define npyv_sqrt_f64 vsqrtq_f64 +#else + // Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI + NPY_FINLINE npyv_f32 npyv_sqrt_f32(npyv_f32 a) + { + const npyv_f32 zero = vdupq_n_f32(0.0f); + const npyv_u32 pinf = vdupq_n_u32(0x7f800000); + npyv_u32 is_zero = vceqq_f32(a, zero), is_inf = vceqq_u32(vreinterpretq_u32_f32(a), pinf); + // guard agianst floating-point division-by-zero error + npyv_f32 guard_byz = vbslq_f32(is_zero, vreinterpretq_f32_u32(pinf), a); + // estimate to (1/√a) + npyv_f32 rsqrte = vrsqrteq_f32(guard_byz); + /** + * Newton-Raphson iteration: + * x[n+1] = x[n] * (3-d * (x[n]*x[n]) )/2) + * converges to (1/√d)if x0 is the result of VRSQRTE applied to d. + * + * NOTE: at least 3 iterations is needed to improve precision + */ + rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte); + rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte); + rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte); + // a * (1/√a) + npyv_f32 sqrt = vmulq_f32(a, rsqrte); + // return zero if the a is zero + // - return zero if a is zero. + // - return positive infinity if a is positive infinity + return vbslq_f32(vorrq_u32(is_zero, is_inf), a, sqrt); + } +#endif // NPY_SIMD_F64 + +// Reciprocal +NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) +{ +#if NPY_SIMD_F64 + const npyv_f32 one = vdupq_n_f32(1.0f); + return npyv_div_f32(one, a); +#else + npyv_f32 recipe = vrecpeq_f32(a); + /** + * Newton-Raphson iteration: + * x[n+1] = x[n] * (2-d * x[n]) + * converges to (1/d) if x0 is the result of VRECPE applied to d. + * + * NOTE: at least 3 iterations is needed to improve precision + */ + recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe); + recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe); + recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe); + return recipe; +#endif +} +#if NPY_SIMD_F64 + NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) + { + const npyv_f64 one = vdupq_n_f64(1.0); + return npyv_div_f64(one, a); + } +#endif // NPY_SIMD_F64 + +#endif // _NPY_SIMD_SSE_MATH_H diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h index 280a34297..c8ddc92ad 100644 --- a/numpy/core/src/common/simd/neon/neon.h +++ b/numpy/core/src/common/simd/neon/neon.h @@ -72,3 +72,4 @@ typedef float64x2x3_t npyv_f64x3; #include "operators.h" #include "conversion.h" #include "arithmetic.h" +#include "math.h" diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h new file mode 100644 index 000000000..b7203cd89 --- /dev/null +++ b/numpy/core/src/common/simd/sse/math.h @@ -0,0 +1,40 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_SSE_MATH_H +#define _NPY_SIMD_SSE_MATH_H +/*************************** + * Elementary + ***************************/ +// Square root +#define npyv_sqrt_f32 _mm_sqrt_ps +#define npyv_sqrt_f64 _mm_sqrt_pd + +// Reciprocal +NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) +{ return _mm_div_ps(_mm_set1_ps(1.0f), a); } +NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) +{ return _mm_div_pd(_mm_set1_pd(1.0), a); } + +// Absolute +NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a) +{ + return _mm_and_ps( + a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)) + ); +} +NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a) +{ + return _mm_and_pd( + a, _mm_castsi128_pd(npyv_setall_s64(0x7fffffffffffffffLL)) + ); +} + +// Square +NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) +{ return _mm_mul_ps(a, a); } +NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) +{ return _mm_mul_pd(a, a); } + +#endif diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h index 364b4baf1..132d3d347 100644 --- a/numpy/core/src/common/simd/sse/sse.h +++ b/numpy/core/src/common/simd/sse/sse.h @@ -64,3 +64,4 @@ typedef struct { __m128d val[3]; } npyv_f64x3; #include "operators.h" #include "conversion.h" #include "arithmetic.h" +#include "math.h" diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h new file mode 100644 index 000000000..7c8610b19 --- /dev/null +++ b/numpy/core/src/common/simd/vsx/math.h @@ -0,0 +1,36 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_VSX_MATH_H +#define _NPY_SIMD_VSX_MATH_H +/*************************** + * Elementary + ***************************/ +// Square root +#define npyv_sqrt_f32 vec_sqrt +#define npyv_sqrt_f64 vec_sqrt + +// Reciprocal +NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) +{ + const npyv_f32 one = npyv_setall_f32(1.0f); + return vec_div(one, a); +} +NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) +{ + const npyv_f64 one = npyv_setall_f64(1.0); + return vec_div(one, a); +} + +// Absolute +#define npyv_abs_f32 vec_abs +#define npyv_abs_f64 vec_abs + +// Square +NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) +{ return vec_mul(a, a); } +NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) +{ return vec_mul(a, a); } + +#endif // _NPY_SIMD_VSX_MATH_H diff --git a/numpy/core/src/common/simd/vsx/vsx.h b/numpy/core/src/common/simd/vsx/vsx.h index 5525dc1e6..27dde98e7 100644 --- a/numpy/core/src/common/simd/vsx/vsx.h +++ b/numpy/core/src/common/simd/vsx/vsx.h @@ -62,3 +62,4 @@ typedef struct { npyv_f64 val[3]; } npyv_f64x3; #include "operators.h" #include "conversion.h" #include "arithmetic.h" +#include "math.h" |
