-rw-r--r--  numpy/core/code_generators/generate_umath.py          8
-rw-r--r--  numpy/core/include/numpy/npy_common.h                 6
-rw-r--r--  numpy/core/include/numpy/npy_math.h                   6
-rw-r--r--  numpy/core/setup.py                                   1
-rw-r--r--  numpy/core/src/_simd/_simd.dispatch.c.src            24
-rw-r--r--  numpy/core/src/common/npy_cpu_features.h              2
-rw-r--r--  numpy/core/src/common/simd/avx2/avx2.h                1
-rw-r--r--  numpy/core/src/common/simd/avx2/math.h               40
-rw-r--r--  numpy/core/src/common/simd/avx512/avx512.h            1
-rw-r--r--  numpy/core/src/common/simd/avx512/math.h             49
-rw-r--r--  numpy/core/src/common/simd/neon/arithmetic.h         22
-rw-r--r--  numpy/core/src/common/simd/neon/math.h               86
-rw-r--r--  numpy/core/src/common/simd/neon/neon.h                1
-rw-r--r--  numpy/core/src/common/simd/sse/math.h                40
-rw-r--r--  numpy/core/src/common/simd/sse/sse.h                  1
-rw-r--r--  numpy/core/src/common/simd/vsx/math.h                36
-rw-r--r--  numpy/core/src/common/simd/vsx/vsx.h                  1
-rw-r--r--  numpy/core/src/umath/loops.c.src                    144
-rw-r--r--  numpy/core/src/umath/loops.h.src                     31
-rw-r--r--  numpy/core/src/umath/loops_unary_fp.dispatch.c.src  219
-rw-r--r--  numpy/core/src/umath/loops_utils.h                   42
-rw-r--r--  numpy/core/src/umath/simd.inc.src                   104
-rw-r--r--  numpy/core/tests/test_simd.py                        73
23 files changed, 702 insertions, 236 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index dc5c2577a..cb1147b93 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -352,14 +352,14 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.square'),
None,
- TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'FDfd')]),
+ TD(ints+inexact, simd=[('avx2', ints), ('avx512f', 'FD')], dispatch=[('loops_unary_fp', 'fd')]),
TD(O, f='Py_square'),
),
'reciprocal':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.reciprocal'),
None,
- TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]),
+ TD(ints+inexact, simd=[('avx2', ints)], dispatch=[('loops_unary_fp', 'fd')]),
TD(O, f='Py_reciprocal'),
),
# This is no longer used as numpy.ones_like, however it is
@@ -389,7 +389,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.absolute'),
'PyUFunc_AbsoluteTypeResolver',
- TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+ TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd')]),
TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')),
TD(O, f='PyNumber_Absolute'),
),
@@ -767,7 +767,7 @@ defdict = {
docstrings.get('numpy.core.umath.sqrt'),
None,
TD('e', f='sqrt', astype={'e':'f'}),
- TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+ TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
TD('fdg' + cmplx, f='sqrt'),
TD(P, f='sqrt'),
),
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 560b82912..78d6d3120 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -1,15 +1,15 @@
#ifndef _NPY_COMMON_H_
#define _NPY_COMMON_H_
+/* need Python.h for npy_intp, npy_uintp */
+#include <Python.h>
+
/* numpconfig.h is auto-generated */
#include "numpyconfig.h"
#ifdef HAVE_NPY_CONFIG_H
#include <npy_config.h>
#endif
-/* need Python.h for npy_intp, npy_uintp */
-#include <Python.h>
-
/*
* using static inline modifiers when defining npy_math functions
* allows the compiler to make optimizations when possible
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index a07f49501..dbade058f 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -5,14 +5,12 @@
extern "C" {
#endif
+#include <numpy/npy_common.h>
+
#include <math.h>
#ifdef __SUNPRO_CC
#include <sunmath.h>
#endif
-#ifdef HAVE_NPY_CONFIG_H
-#include <npy_config.h>
-#endif
-#include <numpy/npy_common.h>
/* By adding static inline specifiers to npy_math function definitions when
appropriate, compiler is given the opportunity to optimize */
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 68aa0a851..6ada03f73 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -904,6 +904,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'simd.inc.src'),
join('src', 'umath', 'loops.h.src'),
join('src', 'umath', 'loops.c.src'),
+ join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'clip.h.src'),
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 3d7af2333..18c383871 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -16,6 +16,7 @@
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
@@ -356,6 +357,17 @@ SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_1(sum_@sfx@, @sfx@, v@sfx@)
#endif // sum_sup
+/***************************
+ * Math
+ ***************************/
+#if @fp_only@
+/**begin repeat1
+ * #intrin = sqrt, recip, abs, square#
+ */
+SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+#endif
+
#endif // simd_sup
/**end repeat**/
/***************************
@@ -371,6 +383,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
@@ -494,6 +507,17 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
SIMD_INTRIN_DEF(sum_@sfx@)
#endif // sum_sup
+/***************************
+ * Math
+ ***************************/
+#if @fp_only@
+/**begin repeat1
+ * #intrin = sqrt, recip, abs, square#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif
+
#endif // simd_sup
/**end repeat**/
diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h
index 693a9857d..28dd00032 100644
--- a/numpy/core/src/common/npy_cpu_features.h
+++ b/numpy/core/src/common/npy_cpu_features.h
@@ -1,8 +1,8 @@
#ifndef _NPY_CPU_FEATURES_H_
#define _NPY_CPU_FEATURES_H_
-#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN
#include <Python.h> // for PyObject
+#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN
#ifdef __cplusplus
extern "C" {
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
index 0641f2314..6f0d3c0d9 100644
--- a/numpy/core/src/common/simd/avx2/avx2.h
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -67,3 +67,4 @@ typedef struct { __m256d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h
new file mode 100644
index 000000000..b3eba6f5f
--- /dev/null
+++ b/numpy/core/src/common/simd/avx2/math.h
@@ -0,0 +1,40 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX2_MATH_H
+#define _NPY_SIMD_AVX2_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm256_sqrt_ps
+#define npyv_sqrt_f64 _mm256_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm256_div_ps(_mm256_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm256_div_pd(_mm256_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+ return _mm256_and_ps(
+ a, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))
+ );
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+ return _mm256_and_pd(
+ a, _mm256_castsi256_pd(npyv_setall_s64(0x7fffffffffffffffLL))
+ );
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm256_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm256_mul_pd(a, a); }
+
+#endif
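
Both npyv_abs_f32 and npyv_abs_f64 above clear the IEEE-754 sign bit with a bitwise AND (0x7fffffff for singles, 0x7fffffffffffffff for doubles) instead of calling into libm. A minimal scalar sketch of the same bit trick (standalone illustration, not part of the patch):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Clear the sign bit of an IEEE-754 single, the same operation
 * npyv_abs_f32 applies to every lane at once. */
static float bitwise_abs_f32(float a)
{
    uint32_t bits;
    memcpy(&bits, &a, sizeof(bits));    /* type-pun without UB */
    bits &= 0x7fffffffu;                /* drop the sign bit */
    memcpy(&a, &bits, sizeof(a));
    return a;
}

int main(void)
{
    /* prints 1.500000 0.000000 2.000000 */
    printf("%f %f %f\n", bitwise_abs_f32(-1.5f),
           bitwise_abs_f32(-0.0f), bitwise_abs_f32(2.0f));
    return 0;
}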
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
index b09d772f2..2de33765a 100644
--- a/numpy/core/src/common/simd/avx512/avx512.h
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -72,3 +72,4 @@ typedef struct { __m512d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h
new file mode 100644
index 000000000..1db710670
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/math.h
@@ -0,0 +1,49 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX512_MATH_H
+#define _NPY_SIMD_AVX512_MATH_H
+
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm512_sqrt_ps
+#define npyv_sqrt_f64 _mm512_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm512_div_ps(_mm512_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm512_div_pd(_mm512_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+#if 0 // def NPY_HAVE_AVX512DQ
+ return _mm512_range_ps(a, a, 8);
+#else
+ return npyv_and_f32(
+ a, _mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff))
+ );
+#endif
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+#if 0 // def NPY_HAVE_AVX512DQ
+ return _mm512_range_pd(a, a, 8);
+#else
+ return npyv_and_f64(
+ a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL))
+ );
+#endif
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm512_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm512_mul_pd(a, a); }
+
+#endif
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index bc14ffb75..87e00d5d1 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -63,14 +63,26 @@
/***************************
* Division
***************************/
-#ifdef __aarch64__
+#if NPY_SIMD_F64
#define npyv_div_f32 vdivq_f32
#else
- NPY_FINLINE float32x4_t npyv_div_f32(float32x4_t a, float32x4_t b)
+ NPY_FINLINE npyv_f32 npyv_div_f32(npyv_f32 a, npyv_f32 b)
{
- float32x4_t recip = vrecpeq_f32(b);
- recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
- return vmulq_f32(a, recip);
+ // Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI
+        // initial estimate of 1/b
+        npyv_f32 recipe = vrecpeq_f32(b);
+        /**
+         * Newton-Raphson iteration:
+         *  x[n+1] = x[n] * (2 - d * x[n])
+         * converges to (1/d) if x0 is the result of VRECPE applied to d.
+         *
+         * NOTE: at least 3 iterations are needed to improve precision
+         */
+ recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+ // a/b = a*recip(b)
+ return vmulq_f32(a, recipe);
}
#endif
#define npyv_div_f64 vdivq_f64
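
The vrecpeq_f32 estimate above is only a low-precision approximation of 1/b; each vrecpsq_f32/vmulq_f32 pair applies one Newton-Raphson step x[n+1] = x[n] * (2 - d * x[n]). A scalar sketch of the recurrence (illustrative starting value standing in for VRECPE; not part of the patch):

#include <stdio.h>

int main(void)
{
    const double d = 3.0;   /* want 1/d */
    double x = 0.3;         /* crude initial estimate, as VRECPE would give */
    for (int i = 0; i < 3; i++) {
        x = x * (2.0 - d * x);              /* one VRECPS + VMUL round */
        printf("iter %d: x = %.17g\n", i, x);
    }
    return 0;   /* converges toward 1/3 = 0.333... */
}

Each iteration roughly doubles the number of correct bits, which is why the patch settles on three rounds for float32.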
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
new file mode 100644
index 000000000..a2bbdf2a5
--- /dev/null
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -0,0 +1,86 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_NEON_MATH_H
+#define _NPY_SIMD_NEON_MATH_H
+
+/***************************
+ * Elementary
+ ***************************/
+// Absolute
+#define npyv_abs_f32 vabsq_f32
+#define npyv_abs_f64 vabsq_f64
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return vmulq_f32(a, a); }
+#if NPY_SIMD_F64
+ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+ { return vmulq_f64(a, a); }
+#endif
+
+// Square root
+#if NPY_SIMD_F64
+ #define npyv_sqrt_f32 vsqrtq_f32
+ #define npyv_sqrt_f64 vsqrtq_f64
+#else
+ // Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI
+ NPY_FINLINE npyv_f32 npyv_sqrt_f32(npyv_f32 a)
+ {
+ const npyv_f32 zero = vdupq_n_f32(0.0f);
+ const npyv_u32 pinf = vdupq_n_u32(0x7f800000);
+ npyv_u32 is_zero = vceqq_f32(a, zero), is_inf = vceqq_u32(vreinterpretq_u32_f32(a), pinf);
+        // guard against floating-point division-by-zero error
+ npyv_f32 guard_byz = vbslq_f32(is_zero, vreinterpretq_f32_u32(pinf), a);
+ // estimate to (1/√a)
+ npyv_f32 rsqrte = vrsqrteq_f32(guard_byz);
+        /**
+         * Newton-Raphson iteration:
+         *  x[n+1] = x[n] * (3 - d * x[n] * x[n]) / 2
+         * converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
+         *
+         * NOTE: at least 3 iterations are needed to improve precision
+         */
+ rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+ rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+ rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+ // a * (1/√a)
+ npyv_f32 sqrt = vmulq_f32(a, rsqrte);
+        // handle the special cases:
+        // - return zero if a is zero.
+        // - return positive infinity if a is positive infinity
+ return vbslq_f32(vorrq_u32(is_zero, is_inf), a, sqrt);
+ }
+#endif // NPY_SIMD_F64
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{
+#if NPY_SIMD_F64
+ const npyv_f32 one = vdupq_n_f32(1.0f);
+ return npyv_div_f32(one, a);
+#else
+ npyv_f32 recipe = vrecpeq_f32(a);
+    /**
+     * Newton-Raphson iteration:
+     *  x[n+1] = x[n] * (2 - d * x[n])
+     * converges to (1/d) if x0 is the result of VRECPE applied to d.
+     *
+     * NOTE: at least 3 iterations are needed to improve precision
+     */
+ recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+ return recipe;
+#endif
+}
+#if NPY_SIMD_F64
+ NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+ {
+ const npyv_f64 one = vdupq_n_f64(1.0);
+ return npyv_div_f64(one, a);
+ }
+#endif // NPY_SIMD_F64
+
+#endif // _NPY_SIMD_NEON_MATH_H
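
When vsqrtq_f32 is unavailable, the fallback above computes sqrt(a) as a * (1/√a), refining the VRSQRTE estimate with x[n+1] = x[n] * (3 - d * x[n] * x[n]) / 2, where each vrsqrtsq_f32 supplies the correction factor. The same recurrence in scalar form (illustrative starting value; not part of the patch):

#include <stdio.h>

int main(void)
{
    const double d = 2.0;   /* want 1/sqrt(d) */
    double x = 0.7;         /* crude initial estimate, as VRSQRTE would give */
    for (int i = 0; i < 3; i++) {
        x = x * (3.0 - d * x * x) / 2.0;    /* one VRSQRTS + VMUL round */
        printf("iter %d: 1/sqrt ~= %.17g, sqrt ~= %.17g\n", i, x, d * x);
    }
    return 0;   /* d * x converges toward sqrt(2) = 1.41421356... */
}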
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
index 280a34297..c8ddc92ad 100644
--- a/numpy/core/src/common/simd/neon/neon.h
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -72,3 +72,4 @@ typedef float64x2x3_t npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
new file mode 100644
index 000000000..b7203cd89
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -0,0 +1,40 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_MATH_H
+#define _NPY_SIMD_SSE_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm_sqrt_ps
+#define npyv_sqrt_f64 _mm_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm_div_ps(_mm_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm_div_pd(_mm_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+ return _mm_and_ps(
+ a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))
+ );
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+ return _mm_and_pd(
+ a, _mm_castsi128_pd(npyv_setall_s64(0x7fffffffffffffffLL))
+ );
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm_mul_pd(a, a); }
+
+#endif
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index 364b4baf1..132d3d347 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -64,3 +64,4 @@ typedef struct { __m128d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h
new file mode 100644
index 000000000..7c8610b19
--- /dev/null
+++ b/numpy/core/src/common/simd/vsx/math.h
@@ -0,0 +1,36 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_VSX_MATH_H
+#define _NPY_SIMD_VSX_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 vec_sqrt
+#define npyv_sqrt_f64 vec_sqrt
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{
+ const npyv_f32 one = npyv_setall_f32(1.0f);
+ return vec_div(one, a);
+}
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{
+ const npyv_f64 one = npyv_setall_f64(1.0);
+ return vec_div(one, a);
+}
+
+// Absolute
+#define npyv_abs_f32 vec_abs
+#define npyv_abs_f64 vec_abs
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return vec_mul(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return vec_mul(a, a); }
+
+#endif // _NPY_SIMD_VSX_MATH_H
diff --git a/numpy/core/src/common/simd/vsx/vsx.h b/numpy/core/src/common/simd/vsx/vsx.h
index 5525dc1e6..27dde98e7 100644
--- a/numpy/core/src/common/simd/vsx/vsx.h
+++ b/numpy/core/src/common/simd/vsx/vsx.h
@@ -62,3 +62,4 @@ typedef struct { npyv_f64 val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index ef3d5a21a..c9efdeb4e 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1491,26 +1491,6 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
*/
/**begin repeat
- * Float types
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #scalarf = npy_sqrtf, npy_sqrt#
- */
-
-NPY_NO_EXPORT void
-@TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (!run_unary_simd_sqrt_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = @scalarf@(in1);
- }
- }
-}
-
-/**end repeat**/
-
-/**begin repeat
* #func = rint, ceil, floor, trunc#
* #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
*/
@@ -1579,53 +1559,6 @@ DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void
* #typesub = f, #
*/
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_sqrt_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = npy_sqrt@typesub@(in1);
- }
- }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_absolute_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- const @type@ tmp = in1 > 0 ? in1 : -in1;
- /* add 0 to clear -0.0 */
- *((@type@ *)op1) = tmp + 0;
- }
- }
- npy_clear_floatstatus_barrier((char*)dimensions);
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_square_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = in1*in1;
- }
- }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_reciprocal_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = 1.0f/in1;
- }
- }
-}
-
/**begin repeat2
* #func = rint, ceil, floor, trunc#
* #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
@@ -2048,33 +1981,6 @@ NPY_NO_EXPORT void
}
NPY_NO_EXPORT void
-@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- char * margs[] = {args[0], args[0], args[1]};
- npy_intp msteps[] = {steps[0], steps[0], steps[1]};
- if (!run_binary_simd_multiply_@TYPE@(margs, dimensions, msteps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *((@type@ *)op1) = in1*in1;
- }
- }
-}
-
-NPY_NO_EXPORT void
-@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- @type@ one = 1.@c@;
- char * margs[] = {(char*)&one, args[0], args[1]};
- npy_intp msteps[] = {0, steps[0], steps[1]};
- if (!run_binary_simd_divide_@TYPE@(margs, dimensions, msteps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *((@type@ *)op1) = 1/in1;
- }
- }
-}
-
-NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
OUTPUT_LOOP {
@@ -2092,20 +1998,6 @@ NPY_NO_EXPORT void
}
NPY_NO_EXPORT void
-@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (!run_unary_simd_absolute_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- const @type@ tmp = in1 > 0 ? in1 : -in1;
- /* add 0 to clear -0.0 */
- *((@type@ *)op1) = tmp + 0;
- }
- }
- npy_clear_floatstatus_barrier((char*)dimensions);
-}
-
-NPY_NO_EXPORT void
@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
if (!run_unary_simd_negative_@TYPE@(args, dimensions, steps)) {
@@ -2216,6 +2108,42 @@ NPY_NO_EXPORT void
/*
*****************************************************************************
+ ** LONGDOUBLE LOOPS **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void
+LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP {
+ const npy_longdouble in1 = *(npy_longdouble*)ip1;
+ *((npy_longdouble *)op1) = 1/in1;
+ }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const npy_longdouble in1 = *(npy_longdouble *)ip1;
+ const npy_longdouble tmp = in1 > 0 ? in1 : -in1;
+ /* add 0 to clear -0.0 */
+ *((npy_longdouble *)op1) = tmp + 0;
+ }
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP {
+ const npy_longdouble in1 = *(npy_longdouble *)ip1;
+ *((npy_longdouble *)op1) = in1*in1;
+ }
+}
+
+/*
+ *****************************************************************************
** HALF-FLOAT LOOPS **
*****************************************************************************
*/
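
The LONGDOUBLE kernels added above stay scalar (the universal intrinsics have no long-double lanes) and rely on the UNARY_LOOP macro from loops.h.src. Expanded by hand, LONGDOUBLE_square amounts to the following (a sketch with npy_intp replaced by long so it compiles standalone; the real macro lives in loops.h.src):

/* Illustrative hand-expansion of UNARY_LOOP for LONGDOUBLE_square. */
static void LONGDOUBLE_square_expanded(char **args, const long *dimensions,
                                       const long *steps)
{
    char *ip1 = args[0], *op1 = args[1];   /* input/output byte pointers */
    long is1 = steps[0], os1 = steps[1];   /* byte strides */
    long n = dimensions[0];                /* element count */
    for (long i = 0; i < n; i++, ip1 += is1, op1 += os1) {
        const long double in1 = *(long double *)ip1;
        *(long double *)op1 = in1 * in1;
    }
}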
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 6b8b77e99..a0b68d168 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -6,6 +6,10 @@
#ifndef _NPY_UMATH_LOOPS_H_
#define _NPY_UMATH_LOOPS_H_
+#ifndef NPY_NO_EXPORT
+ #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
+#endif
+
#define BOOL_invert BOOL_logical_not
#define BOOL_add BOOL_logical_or
#define BOOL_bitwise_and BOOL_logical_and
@@ -167,32 +171,29 @@ NPY_NO_EXPORT void
** FLOAT LOOPS **
*****************************************************************************
*/
-
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_unary_fp.dispatch.h"
+#endif
/**begin repeat
* #TYPE = FLOAT, DOUBLE#
*/
-NPY_NO_EXPORT void
-@TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
/**begin repeat1
- * #func = maximum, minimum#
+ * #kind = sqrt, absolute, square, reciprocal#
*/
-NPY_NO_EXPORT void
-@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
/**end repeat1**/
+/**end repeat**/
-/**begin repeat1
- * #isa = avx512f, fma#
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
*/
-
-/**begin repeat2
- * #func = sqrt, absolute, square, reciprocal#
+/**begin repeat1
+ * #func = maximum, minimum#
*/
NPY_NO_EXPORT void
-@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-/**end repeat2**/
/**end repeat1**/
/**end repeat**/
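
NPY_CPU_DISPATCH_DECLARE works together with the generated loops_unary_fp.dispatch.h: it declares one symbol per enabled build target plus a baseline, and the dispatcher picks the best variant the CPU supports at runtime. Conceptually the declaration above expands to something like the following (a simplified sketch with illustrative target names, not the actual macro output):

/* hypothetical expansion when, e.g., SSE2 is the baseline and AVX2 is enabled */
NPY_NO_EXPORT void FLOAT_sqrt_AVX2
    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *data);
NPY_NO_EXPORT void FLOAT_sqrt    /* baseline fallback */
    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *data);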
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
new file mode 100644
index 000000000..3a1ea82f9
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -0,0 +1,219 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 vsx2 neon
+ **/
+/**
+ * Force the use of SSE only on x86, even if AVX2 or AVX512F are enabled
+ * through the baseline, since scatter(AVX512F) and gather are very costly
+ * for handling non-contiguous memory access compared with SSE, for the
+ * small operations that this file covers.
+*/
+#define NPY_SIMD_FORCE_128
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+/**********************************************************
+ ** Scalars
+ **********************************************************/
+#if !NPY_SIMD
+NPY_FINLINE float c_recip_f32(float a)
+{ return 1.0f / a; }
+NPY_FINLINE float c_abs_f32(float a)
+{
+ const float tmp = a > 0 ? a : -a;
+ /* add 0 to clear -0.0 */
+ return tmp + 0;
+}
+NPY_FINLINE float c_square_f32(float a)
+{ return a * a; }
+#endif // !NPY_SIMD
+
+#if !NPY_SIMD_F64
+NPY_FINLINE double c_recip_f64(double a)
+{ return 1.0 / a; }
+NPY_FINLINE double c_abs_f64(double a)
+{
+ const double tmp = a > 0 ? a : -a;
+ /* add 0 to clear -0.0 */
+ return tmp + 0;
+}
+NPY_FINLINE double c_square_f64(double a)
+{ return a * a; }
+#endif // !NPY_SIMD_F64
+/**
+ * MSVC (32-bit mode) requires an explicitly contiguous loop
+ * in order to use SSE; otherwise it falls back to a software square root
+ * that doesn't raise a domain error.
+ */
+#if defined(_MSC_VER) && defined(_M_IX86) && !NPY_SIMD
+ #include <emmintrin.h>
+ NPY_FINLINE float c_sqrt_f32(float _a)
+ {
+ __m128 a = _mm_load_ss(&_a);
+ __m128 lower = _mm_sqrt_ss(a);
+ return _mm_cvtss_f32(lower);
+ }
+ NPY_FINLINE double c_sqrt_f64(double _a)
+ {
+ __m128d a = _mm_load_sd(&_a);
+ __m128d lower = _mm_sqrt_pd(a);
+ return _mm_cvtsd_f64(lower);
+ }
+#else
+ #define c_sqrt_f32 npy_sqrtf
+ #define c_sqrt_f64 npy_sqrt
+#endif
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/** Notes:
+ * - avoid the use of libmath to unify fp/domain errors
+ *   for both scalars and vectors among all compilers/architectures.
+ * - use the intrinsics npyv_load_till_* instead of npyv_load_tillz_*
+ *   to fill the remaining lanes with 1.0, avoiding a divide-by-zero
+ *   fp exception in reciprocal.
+ */
+#define CONTIG 0
+#define NCONTIG 1
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ */
+#if @VCHK@
+/**begin repeat1
+ * #kind = sqrt, absolute, square, reciprocal#
+ * #intr = sqrt, abs, square, recip#
+ * #repl_0w1 = 0, 0, 0, 1#
+ */
+/**begin repeat2
+ * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
+ * #DTYPE = CONTIG, CONTIG, NCONTIG, NCONTIG#
+ * #unroll = 4, 4, 2, 2#
+ */
+static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+ const npyv_lanetype_@sfx@ *src = _src;
+ npyv_lanetype_@sfx@ *dst = _dst;
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * @unroll@;
+ for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+ /**begin repeat3
+ * #N = 0, 1, 2, 3#
+ */
+ #if @unroll@ > @N@
+ #if @STYPE@ == CONTIG
+ npyv_@sfx@ v_src@N@ = npyv_load_@sfx@(src + vstep*@N@);
+ #else
+ npyv_@sfx@ v_src@N@ = npyv_loadn_@sfx@(src + ssrc*vstep*@N@, ssrc);
+ #endif
+ npyv_@sfx@ v_unary@N@ = npyv_@intr@_@sfx@(v_src@N@);
+ #endif
+ /**end repeat3**/
+ /**begin repeat3
+ * #N = 0, 1, 2, 3#
+ */
+ #if @unroll@ > @N@
+ #if @DTYPE@ == CONTIG
+ npyv_store_@sfx@(dst + vstep*@N@, v_unary@N@);
+ #else
+ npyv_storen_@sfx@(dst + sdst*vstep*@N@, sdst, v_unary@N@);
+ #endif
+ #endif
+ /**end repeat3**/
+ }
+ for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ #if @STYPE@ == CONTIG
+ #if @repl_0w1@
+ npyv_@sfx@ v_src0 = npyv_load_till_@sfx@(src, len, 1);
+ #else
+ npyv_@sfx@ v_src0 = npyv_load_tillz_@sfx@(src, len);
+ #endif
+ #else
+ #if @repl_0w1@
+ npyv_@sfx@ v_src0 = npyv_loadn_till_@sfx@(src, ssrc, len, 1);
+ #else
+ npyv_@sfx@ v_src0 = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+ #endif
+ #endif
+ npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
+ #if @DTYPE@ == CONTIG
+ npyv_store_till_@sfx@(dst, len, v_unary0);
+ #else
+ npyv_storen_till_@sfx@(dst, sdst, len, v_unary0);
+ #endif
+ }
+ npyv_cleanup();
+}
+/**end repeat2**/
+/**end repeat1**/
+#endif // @VCHK@
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ */
+/**begin repeat1
+ * #kind = sqrt, absolute, square, reciprocal#
+ * #intr = sqrt, abs, square, recip#
+ * #clear = 0, 1, 0, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ const char *src = args[0]; char *dst = args[1];
+ const npy_intp src_step = steps[0];
+ const npy_intp dst_step = steps[1];
+ npy_intp len = dimensions[0];
+#if @VCHK@
+ const int lsize = sizeof(npyv_lanetype_@sfx@);
+ assert(src_step % lsize == 0 && dst_step % lsize == 0);
+ if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+ goto no_unroll;
+ }
+ const npy_intp ssrc = src_step / lsize;
+ const npy_intp sdst = dst_step / lsize;
+ if (!npyv_loadable_stride_@sfx@(ssrc) || !npyv_storable_stride_@sfx@(sdst)) {
+ goto no_unroll;
+ }
+ if (ssrc == 1 && sdst == 1) {
+ simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 1, dst, 1, len);
+ }
+ else if (sdst == 1) {
+ simd_@TYPE@_@kind@_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+ }
+ else if (ssrc == 1) {
+ simd_@TYPE@_@kind@_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+ } else {
+ simd_@TYPE@_@kind@_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+ }
+ goto clear;
+no_unroll:
+#endif // @VCHK@
+ for (; len > 0; --len, src += src_step, dst += dst_step) {
+ #if @VCHK@
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+ simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 0, dst, 0, 1);
+ #else
+ const npyv_lanetype_@sfx@ src0 = *(npyv_lanetype_@sfx@*)src;
+ *(npyv_lanetype_@sfx@*)dst = c_@intr@_@sfx@(src0);
+ #endif
+ }
+#if @VCHK@
+clear:;
+#endif
+#if @clear@
+ npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat1**/
+/**end repeat**/
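
The tail loop above depends on the load_till contract spelled out in the notes: for reciprocal (#repl_0w1 = 1) the lanes past len are filled with 1.0 so the vector division never traps on the padding. A scalar model of that contract (hypothetical helper mirroring npyv_load_till_f32, not the real intrinsic):

#include <stdio.h>

#define NLANES 4

/* Scalar model of npyv_load_till_f32: read `len` real elements and
 * fill the remaining lanes with `fill` (1.0 in the reciprocal path). */
static void load_till(float dst[NLANES], const float *src, int len, float fill)
{
    for (int i = 0; i < NLANES; i++)
        dst[i] = (i < len) ? src[i] : fill;
}

int main(void)
{
    const float tail[2] = {4.0f, 8.0f};    /* only 2 elements remain */
    float lanes[NLANES];
    load_till(lanes, tail, 2, 1.0f);
    for (int i = 0; i < NLANES; i++)
        printf("1/%g = %g\n", lanes[i], 1.0f / lanes[i]);
    return 0;   /* padded lanes compute 1/1.0, never 1/0.0 */
}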
diff --git a/numpy/core/src/umath/loops_utils.h b/numpy/core/src/umath/loops_utils.h
new file mode 100644
index 000000000..f5540bdae
--- /dev/null
+++ b/numpy/core/src/umath/loops_utils.h
@@ -0,0 +1,42 @@
+#ifndef _NPY_UMATH_LOOPS_UTILS_H_
+#define _NPY_UMATH_LOOPS_UTILS_H_
+
+#include "numpy/npy_common.h" // NPY_FINLINE
+/*
+ * nomemoverlap - returns true if two strided arrays have no overlapping
+ * region in memory (false if they partially overlap). Exactly matching
+ * extents count as non-overlapping, since element-wise in-place operation
+ * is safe. ip_size/op_size give the byte extents of the arrays and can be
+ * negative, indicating negative steps.
+ */
+NPY_FINLINE npy_bool
+nomemoverlap(char *ip, npy_intp ip_size, char *op, npy_intp op_size)
+{
+ char *ip_start, *ip_end, *op_start, *op_end;
+ if (ip_size < 0) {
+ ip_start = ip + ip_size;
+ ip_end = ip;
+ }
+ else {
+ ip_start = ip;
+ ip_end = ip + ip_size;
+ }
+ if (op_size < 0) {
+ op_start = op + op_size;
+ op_end = op;
+ }
+ else {
+ op_start = op;
+ op_end = op + op_size;
+ }
+ return (ip_start == op_start && op_end == ip_end) ||
+ (ip_start > op_end) || (op_start > ip_end);
+}
+
+// returns true if two strided arrays have an overlapping region in memory,
+// i.e. the inverse of `nomemoverlap()`, but takes the array length and step
+// sizes instead of the byte extents
+NPY_FINLINE npy_bool
+is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst_step, npy_intp len)
+{
+ return !(nomemoverlap((char*)src, src_step*len, (char*)dst, dst_step*len));
+}
+
+#endif // _NPY_UMATH_LOOPS_UTILS_H_
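
Two details of the predicate are easy to miss: identical extents count as non-overlapping (element-wise in-place operation is safe), and the strict inequalities treat merely adjacent ranges conservatively as overlapping. A standalone check of both (same logic as above, with plain C types in place of npy_intp/npy_bool):

#include <stdio.h>
#include <stddef.h>

static int nomemoverlap_demo(char *ip, ptrdiff_t ip_size, char *op, ptrdiff_t op_size)
{
    char *ip_start = ip_size < 0 ? ip + ip_size : ip;
    char *ip_end   = ip_size < 0 ? ip : ip + ip_size;
    char *op_start = op_size < 0 ? op + op_size : op;
    char *op_end   = op_size < 0 ? op : op + op_size;
    return (ip_start == op_start && op_end == ip_end) ||
           (ip_start > op_end) || (op_start > ip_end);
}

int main(void)
{
    float buf[16];
    char *a = (char *)buf, *b = (char *)(buf + 9);
    /* disjoint ranges: prints 1 (no overlap) */
    printf("%d\n", nomemoverlap_demo(a, 8 * sizeof(float), b, 7 * sizeof(float)));
    /* second range starts inside the first: prints 0 (overlap) */
    printf("%d\n", nomemoverlap_demo(a, 12 * sizeof(float), b, 7 * sizeof(float)));
    /* exact aliasing (in-place): prints 1 (treated as safe) */
    printf("%d\n", nomemoverlap_demo(a, 8 * sizeof(float), a, 8 * sizeof(float)));
    return 0;
}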
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 40bb76914..fb4c84c9e 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -29,6 +29,7 @@
#endif
#endif
#include "simd/simd.h"
+#include "loops_utils.h" // nomemoverlap
#include <assert.h>
#include <stdlib.h>
#include <float.h>
@@ -51,37 +52,6 @@
*/
#define MAX_STEP_SIZE 2097152
-/*
- * nomemoverlap - returns true if two strided arrays have an overlapping
- * region in memory. ip_size/op_size = size of the arrays which can be negative
- * indicating negative steps.
- */
-static NPY_INLINE npy_bool
-nomemoverlap(char *ip,
- npy_intp ip_size,
- char *op,
- npy_intp op_size)
-{
- char *ip_start, *ip_end, *op_start, *op_end;
- if (ip_size < 0) {
- ip_start = ip + ip_size;
- ip_end = ip;
- }
- else {
- ip_start = ip;
- ip_end = ip + ip_size;
- }
- if (op_size < 0) {
- op_start = op + op_size;
- op_end = op;
- }
- else {
- op_start = op;
- op_end = op + op_size;
- }
- return (ip_start > op_end) | (op_start > ip_end);
-}
-
#define IS_BINARY_STRIDE_ONE(esize, vsize) \
((steps[0] == esize) && \
(steps[1] == esize) && \
@@ -390,7 +360,7 @@ run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp c
*/
/**begin repeat2
- * #func = sqrt, absolute, square, reciprocal, rint, floor, ceil, trunc#
+ * #func = rint, floor, ceil, trunc#
*/
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -510,9 +480,9 @@ run_unary_avx512f_log_DOUBLE(char **args, npy_intp const *dimensions, npy_intp c
*/
/**begin repeat1
- * #func = sqrt, absolute, negative, minimum, maximum#
- * #check = IS_BLOCKABLE_UNARY*3, IS_BLOCKABLE_REDUCE*2 #
- * #name = unary*3, unary_reduce*2#
+ * #func = absolute, negative, minimum, maximum#
+ * #check = IS_BLOCKABLE_UNARY*2, IS_BLOCKABLE_REDUCE*2 #
+ * #name = unary*2, unary_reduce*2#
*/
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -1357,33 +1327,6 @@ sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy
}
/**end repeat1**/
-static void
-sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
-{
- /* align output to VECTOR_SIZE_BYTES bytes */
- LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
- op[i] = @scalarf@(ip[i]);
- }
- assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
- npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
- if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
- @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
- }
- }
- else {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
- @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
- }
- }
- LOOP_BLOCKED_END {
- op[i] = @scalarf@(ip[i]);
- }
-}
-
-
static NPY_INLINE
@type@ scalar_abs_@type@(@type@ v)
{
@@ -2458,9 +2401,8 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
*/
/**begin repeat1
- * #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
- * #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
- * #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ * #func = rint, ceil, floor, trunc#
+ * #vectorf = rint, ceil, floor, trunc#
*/
#if defined @CHK@
@@ -2475,10 +2417,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
npy_intp num_remaining_elements = array_size;
@vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
@mask@ load_mask = @isa@_get_full_load_mask_ps();
-#if @replace_0_with_1@
- @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask);
-#endif
-
/*
* Note: while generally indices are npy_intp, we ensure that our maximum index
* will fit in an int32 as a precondition for this function via
@@ -2495,20 +2433,10 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
if (num_remaining_elements < num_lanes) {
load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
num_lanes);
-#if @replace_0_with_1@
- inv_load_mask = @isa@_invert_mask_ps(load_mask);
-#endif
}
@vtype@ x;
if (stride == 1) {
x = @isa@_masked_load_ps(load_mask, ip);
-#if @replace_0_with_1@
- /*
- * Replace masked elements with 1.0f to avoid divide by zero fp
- * exception in reciprocal
- */
- x = @isa@_set_masked_lanes_ps(x, ones_f, inv_load_mask);
-#endif
}
else {
x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask);
@@ -2544,9 +2472,8 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
*/
/**begin repeat1
- * #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
- * #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
- * #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ * #func = rint, ceil, floor, trunc#
+ * #vectorf = rint, ceil, floor, trunc#
*/
#if defined @CHK@
@@ -2560,9 +2487,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double);
npy_intp num_remaining_elements = array_size;
@mask@ load_mask = @isa@_get_full_load_mask_pd();
-#if @replace_0_with_1@
- @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask);
-#endif
@vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
/*
@@ -2580,20 +2504,10 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
if (num_remaining_elements < num_lanes) {
load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements,
num_lanes);
-#if @replace_0_with_1@
- inv_load_mask = @isa@_invert_mask_pd(load_mask);
-#endif
}
@vtype@ x;
if (stride == 1) {
x = @isa@_masked_load_pd(load_mask, ip);
-#if @replace_0_with_1@
- /*
- * Replace masked elements with 1.0f to avoid divide by zero fp
- * exception in reciprocal
- */
- x = @isa@_set_masked_lanes_pd(x, ones_d, @castmask@(inv_load_mask));
-#endif
}
else {
x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask));
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 061d0e34f..13e8d5ede 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -1,6 +1,6 @@
# NOTE: Please avoid the use of numpy.testing since NPYV intrinsics
# may be involved in their functionality.
-import pytest
+import pytest, math
from numpy.core._simd import targets
class _Test_Utility:
@@ -80,6 +80,18 @@ class _Test_Utility:
cvt_intrin = "reinterpret_u{0}_{1}"
return getattr(self.npyv, cvt_intrin.format(sfx[1:], sfx))(vector)
+ def _pinfinity(self):
+ v = self.npyv.setall_u32(0x7f800000)
+ return self.npyv.reinterpret_f32_u32(v)[0]
+
+ def _ninfinity(self):
+ v = self.npyv.setall_u32(0xff800000)
+ return self.npyv.reinterpret_f32_u32(v)[0]
+
+ def _nan(self):
+ v = self.npyv.setall_u32(0x7fc00000)
+ return self.npyv.reinterpret_f32_u32(v)[0]
+
class _SIMD_INT(_Test_Utility):
"""
To test all integer vector types at once
@@ -150,6 +162,65 @@ class _SIMD_FP(_Test_Utility):
data_nfms = self.mul(data_fma, self.setall(-1))
assert nfms == data_nfms
+ def test_abs(self):
+ pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+ data = self._data()
+ vdata = self.load(self._data())
+
+        abs_cases = ((-0.0, 0.0), (ninf, pinf), (pinf, pinf), (nan, nan))
+ for case, desired in abs_cases:
+ data_abs = [desired]*self.nlanes
+ vabs = self.abs(self.setall(case))
+ assert vabs == pytest.approx(data_abs, nan_ok=True)
+
+ vabs = self.abs(self.mul(vdata, self.setall(-1)))
+ assert vabs == data
+
+ def test_sqrt(self):
+ pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+ data = self._data()
+ vdata = self.load(self._data())
+
+ sqrt_cases = ((-0.0, -0.0), (0.0, 0.0), (-1.0, nan), (ninf, nan), (pinf, pinf))
+ for case, desired in sqrt_cases:
+ data_sqrt = [desired]*self.nlanes
+ sqrt = self.sqrt(self.setall(case))
+ assert sqrt == pytest.approx(data_sqrt, nan_ok=True)
+
+ data_sqrt = self.load([math.sqrt(x) for x in data]) # load to truncate precision
+ sqrt = self.sqrt(vdata)
+ assert sqrt == data_sqrt
+
+ def test_square(self):
+ pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+ data = self._data()
+ vdata = self.load(self._data())
+ # square
+ square_cases = ((nan, nan), (pinf, pinf), (ninf, pinf))
+ for case, desired in square_cases:
+ data_square = [desired]*self.nlanes
+ square = self.square(self.setall(case))
+ assert square == pytest.approx(data_square, nan_ok=True)
+
+ data_square = [x*x for x in data]
+ square = self.square(vdata)
+ assert square == data_square
+
+ def test_reciprocal(self):
+ pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+ data = self._data()
+ vdata = self.load(self._data())
+
+ recip_cases = ((nan, nan), (pinf, 0.0), (ninf, -0.0), (0.0, pinf), (-0.0, ninf))
+ for case, desired in recip_cases:
+ data_recip = [desired]*self.nlanes
+ recip = self.recip(self.setall(case))
+ assert recip == pytest.approx(data_recip, nan_ok=True)
+
+ data_recip = self.load([1/x for x in data]) # load to truncate precision
+ recip = self.recip(vdata)
+ assert recip == data_recip
+
class _SIMD_ALL(_Test_Utility):
"""
To test all vector types at once
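
The _pinfinity/_ninfinity/_nan helpers above construct the special values directly from their IEEE-754 single-precision bit patterns (0x7f800000, 0xff800000, 0x7fc00000), keeping the tests independent of libm. The same round trip in C (standalone illustration):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Reinterpret a 32-bit pattern as a float, like reinterpret_f32_u32. */
static float from_bits(uint32_t bits)
{
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}

int main(void)
{
    printf("%f\n", from_bits(0x7f800000u));   /* inf */
    printf("%f\n", from_bits(0xff800000u));   /* -inf */
    printf("%f\n", from_bits(0x7fc00000u));   /* nan (quiet) */
    return 0;
}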