summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--numpy/core/src/common/simd/avx2/avx2.h1
-rw-r--r--numpy/core/src/common/simd/avx2/math.h40
-rw-r--r--numpy/core/src/common/simd/avx512/avx512.h1
-rw-r--r--numpy/core/src/common/simd/avx512/math.h49
-rw-r--r--numpy/core/src/common/simd/neon/arithmetic.h22
-rw-r--r--numpy/core/src/common/simd/neon/math.h86
-rw-r--r--numpy/core/src/common/simd/neon/neon.h1
-rw-r--r--numpy/core/src/common/simd/sse/math.h40
-rw-r--r--numpy/core/src/common/simd/sse/sse.h1
-rw-r--r--numpy/core/src/common/simd/vsx/math.h36
-rw-r--r--numpy/core/src/common/simd/vsx/vsx.h1
11 files changed, 273 insertions, 5 deletions
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
index 0641f2314..6f0d3c0d9 100644
--- a/numpy/core/src/common/simd/avx2/avx2.h
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -67,3 +67,4 @@ typedef struct { __m256d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h
new file mode 100644
index 000000000..b3eba6f5f
--- /dev/null
+++ b/numpy/core/src/common/simd/avx2/math.h
@@ -0,0 +1,40 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX2_MATH_H
+#define _NPY_SIMD_AVX2_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm256_sqrt_ps
+#define npyv_sqrt_f64 _mm256_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm256_div_ps(_mm256_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm256_div_pd(_mm256_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+ return _mm256_and_ps(
+ a, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))
+ );
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+ return _mm256_and_pd(
+ a, _mm256_castsi256_pd(npyv_setall_s64(0x7fffffffffffffffLL))
+ );
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm256_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm256_mul_pd(a, a); }
+
+#endif
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
index b09d772f2..2de33765a 100644
--- a/numpy/core/src/common/simd/avx512/avx512.h
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -72,3 +72,4 @@ typedef struct { __m512d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h
new file mode 100644
index 000000000..1db710670
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/math.h
@@ -0,0 +1,49 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX512_MATH_H
+#define _NPY_SIMD_AVX512_MATH_H
+
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm512_sqrt_ps
+#define npyv_sqrt_f64 _mm512_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm512_div_ps(_mm512_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm512_div_pd(_mm512_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+#if 0 // def NPY_HAVE_AVX512DQ
+ return _mm512_range_ps(a, a, 8);
+#else
+ return npyv_and_f32(
+ a, _mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff))
+ );
+#endif
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+#if 0 // def NPY_HAVE_AVX512DQ
+ return _mm512_range_pd(a, a, 8);
+#else
+ return npyv_and_f64(
+ a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL))
+ );
+#endif
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm512_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm512_mul_pd(a, a); }
+
+#endif
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index bc14ffb75..87e00d5d1 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -63,14 +63,26 @@
/***************************
* Division
***************************/
-#ifdef __aarch64__
+#if NPY_SIMD_F64
#define npyv_div_f32 vdivq_f32
#else
- NPY_FINLINE float32x4_t npyv_div_f32(float32x4_t a, float32x4_t b)
+ NPY_FINLINE npyv_f32 npyv_div_f32(npyv_f32 a, npyv_f32 b)
{
- float32x4_t recip = vrecpeq_f32(b);
- recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
- return vmulq_f32(a, recip);
+ // Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI
+ // estimate to 1/b
+ npyv_f32 recipe = vrecpeq_f32(b);
+ /**
+ * Newton-Raphson iteration:
+ * x[n+1] = x[n] * (2-d * x[n])
+ * converges to (1/d) if x0 is the result of VRECPE applied to d.
+ *
+ * NOTE: at least 3 iterations is needed to improve precision
+ */
+ recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+ // a/b = a*recip(b)
+ return vmulq_f32(a, recipe);
}
#endif
#define npyv_div_f64 vdivq_f64
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
new file mode 100644
index 000000000..a2bbdf2a5
--- /dev/null
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -0,0 +1,86 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_NEON_MATH_H
+#define _NPY_SIMD_NEON_MATH_H
+
+/***************************
+ * Elementary
+ ***************************/
+// Absolute
+#define npyv_abs_f32 vabsq_f32
+#define npyv_abs_f64 vabsq_f64
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return vmulq_f32(a, a); }
+#if NPY_SIMD_F64
+ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+ { return vmulq_f64(a, a); }
+#endif
+
+// Square root
+#if NPY_SIMD_F64
+ #define npyv_sqrt_f32 vsqrtq_f32
+ #define npyv_sqrt_f64 vsqrtq_f64
+#else
+ // Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI
+ NPY_FINLINE npyv_f32 npyv_sqrt_f32(npyv_f32 a)
+ {
+ const npyv_f32 zero = vdupq_n_f32(0.0f);
+ const npyv_u32 pinf = vdupq_n_u32(0x7f800000);
+ npyv_u32 is_zero = vceqq_f32(a, zero), is_inf = vceqq_u32(vreinterpretq_u32_f32(a), pinf);
+ // guard agianst floating-point division-by-zero error
+ npyv_f32 guard_byz = vbslq_f32(is_zero, vreinterpretq_f32_u32(pinf), a);
+ // estimate to (1/√a)
+ npyv_f32 rsqrte = vrsqrteq_f32(guard_byz);
+ /**
+ * Newton-Raphson iteration:
+ * x[n+1] = x[n] * (3-d * (x[n]*x[n]) )/2)
+ * converges to (1/√d)if x0 is the result of VRSQRTE applied to d.
+ *
+ * NOTE: at least 3 iterations is needed to improve precision
+ */
+ rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+ rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+ rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+ // a * (1/√a)
+ npyv_f32 sqrt = vmulq_f32(a, rsqrte);
+ // return zero if the a is zero
+ // - return zero if a is zero.
+ // - return positive infinity if a is positive infinity
+ return vbslq_f32(vorrq_u32(is_zero, is_inf), a, sqrt);
+ }
+#endif // NPY_SIMD_F64
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{
+#if NPY_SIMD_F64
+ const npyv_f32 one = vdupq_n_f32(1.0f);
+ return npyv_div_f32(one, a);
+#else
+ npyv_f32 recipe = vrecpeq_f32(a);
+ /**
+ * Newton-Raphson iteration:
+ * x[n+1] = x[n] * (2-d * x[n])
+ * converges to (1/d) if x0 is the result of VRECPE applied to d.
+ *
+ * NOTE: at least 3 iterations is needed to improve precision
+ */
+ recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+ return recipe;
+#endif
+}
+#if NPY_SIMD_F64
+ NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+ {
+ const npyv_f64 one = vdupq_n_f64(1.0);
+ return npyv_div_f64(one, a);
+ }
+#endif // NPY_SIMD_F64
+
+#endif // _NPY_SIMD_SSE_MATH_H
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
index 280a34297..c8ddc92ad 100644
--- a/numpy/core/src/common/simd/neon/neon.h
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -72,3 +72,4 @@ typedef float64x2x3_t npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
new file mode 100644
index 000000000..b7203cd89
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -0,0 +1,40 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_MATH_H
+#define _NPY_SIMD_SSE_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm_sqrt_ps
+#define npyv_sqrt_f64 _mm_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm_div_ps(_mm_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm_div_pd(_mm_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+ return _mm_and_ps(
+ a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))
+ );
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+ return _mm_and_pd(
+ a, _mm_castsi128_pd(npyv_setall_s64(0x7fffffffffffffffLL))
+ );
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm_mul_pd(a, a); }
+
+#endif
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index 364b4baf1..132d3d347 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -64,3 +64,4 @@ typedef struct { __m128d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h
new file mode 100644
index 000000000..7c8610b19
--- /dev/null
+++ b/numpy/core/src/common/simd/vsx/math.h
@@ -0,0 +1,36 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_VSX_MATH_H
+#define _NPY_SIMD_VSX_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 vec_sqrt
+#define npyv_sqrt_f64 vec_sqrt
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{
+ const npyv_f32 one = npyv_setall_f32(1.0f);
+ return vec_div(one, a);
+}
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{
+ const npyv_f64 one = npyv_setall_f64(1.0);
+ return vec_div(one, a);
+}
+
+// Absolute
+#define npyv_abs_f32 vec_abs
+#define npyv_abs_f64 vec_abs
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return vec_mul(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return vec_mul(a, a); }
+
+#endif // _NPY_SIMD_VSX_MATH_H
diff --git a/numpy/core/src/common/simd/vsx/vsx.h b/numpy/core/src/common/simd/vsx/vsx.h
index 5525dc1e6..27dde98e7 100644
--- a/numpy/core/src/common/simd/vsx/vsx.h
+++ b/numpy/core/src/common/simd/vsx/vsx.h
@@ -62,3 +62,4 @@ typedef struct { npyv_f64 val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"