23 files changed, 702 insertions, 236 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index dc5c2577a..cb1147b93 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -352,14 +352,14 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.square'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'FDfd')]),
+          TD(ints+inexact, simd=[('avx2', ints), ('avx512f', 'FD')], dispatch=[('loops_unary_fp', 'fd')]),
           TD(O, f='Py_square'),
           ),
 'reciprocal':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.reciprocal'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]),
+          TD(ints+inexact, simd=[('avx2', ints)], dispatch=[('loops_unary_fp', 'fd')]),
           TD(O, f='Py_reciprocal'),
           ),
 # This is no longer used as numpy.ones_like, however it is
@@ -389,7 +389,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.absolute'),
           'PyUFunc_AbsoluteTypeResolver',
-          TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd')]),
           TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')),
           TD(O, f='PyNumber_Absolute'),
           ),
@@ -767,7 +767,7 @@ defdict = {
           docstrings.get('numpy.core.umath.sqrt'),
           None,
           TD('e', f='sqrt', astype={'e':'f'}),
-          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
           TD('fdg' + cmplx, f='sqrt'),
           TD(P, f='sqrt'),
           ),
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 560b82912..78d6d3120 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -1,15 +1,15 @@
 #ifndef _NPY_COMMON_H_
 #define _NPY_COMMON_H_
 
+/* need Python.h for npy_intp, npy_uintp */
+#include <Python.h>
+
 /* numpconfig.h is auto-generated */
 #include "numpyconfig.h"
 #ifdef HAVE_NPY_CONFIG_H
 #include <npy_config.h>
 #endif
 
-/* need Python.h for npy_intp, npy_uintp */
-#include <Python.h>
-
 /*
  * using static inline modifiers when defining npy_math functions
  * allows the compiler to make optimizations when possible
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index a07f49501..dbade058f 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -5,14 +5,12 @@
 extern "C" {
 #endif
 
+#include <numpy/npy_common.h>
+
 #include <math.h>
 #ifdef __SUNPRO_CC
 #include <sunmath.h>
 #endif
-#ifdef HAVE_NPY_CONFIG_H
-#include <npy_config.h>
-#endif
-#include <numpy/npy_common.h>
 
 /* By adding static inline specifiers to npy_math function definitions
    when appropriate, compiler is given the opportunity to optimize */
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 68aa0a851..6ada03f73 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -904,6 +904,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'simd.inc.src'),
             join('src', 'umath', 'loops.h.src'),
             join('src', 'umath', 'loops.c.src'),
+            join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
             join('src', 'umath', 'matmul.h.src'),
             join('src', 'umath', 'matmul.c.src'),
             join('src', 'umath', 'clip.h.src'),
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 3d7af2333..18c383871 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -16,6 +16,7 @@
  * #sfx  = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
  * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
  * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
  * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
  * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
@@ -356,6 +357,17 @@ SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
 SIMD_IMPL_INTRIN_1(sum_@sfx@, @sfx@, v@sfx@)
 #endif // sum_sup
 
+/***************************
+ * Math
+ ***************************/
+#if @fp_only@
+/**begin repeat1
+ * #intrin = sqrt, recip, abs, square#
+ */
+SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+#endif
+
 #endif // simd_sup
 /**end repeat**/
 /***************************
@@ -371,6 +383,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
  * #sfx  = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
  * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
  * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
  * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
  * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
@@ -494,6 +507,17 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
 SIMD_INTRIN_DEF(sum_@sfx@)
 #endif // sum_sup
 
+/***************************
+ * Math
+ ***************************/
+#if @fp_only@
+/**begin repeat1
+ * #intrin = sqrt, recip, abs, square#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif
+
 #endif // simd_sup
 /**end repeat**/
diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h
index 693a9857d..28dd00032 100644
--- a/numpy/core/src/common/npy_cpu_features.h
+++ b/numpy/core/src/common/npy_cpu_features.h
@@ -1,8 +1,8 @@
 #ifndef _NPY_CPU_FEATURES_H_
 #define _NPY_CPU_FEATURES_H_
 
-#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN
 #include <Python.h> // for PyObject
+#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
index 0641f2314..6f0d3c0d9 100644
--- a/numpy/core/src/common/simd/avx2/avx2.h
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -67,3 +67,4 @@ typedef struct { __m256d val[3]; } npyv_f64x3;
 #include "operators.h"
 #include "conversion.h"
 #include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h
new file mode 100644
index 000000000..b3eba6f5f
--- /dev/null
+++ b/numpy/core/src/common/simd/avx2/math.h
@@ -0,0 +1,40 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX2_MATH_H
+#define _NPY_SIMD_AVX2_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm256_sqrt_ps
+#define npyv_sqrt_f64 _mm256_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm256_div_ps(_mm256_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm256_div_pd(_mm256_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+    return _mm256_and_ps(
+        a, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))
+    );
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+    return _mm256_and_pd(
+        a, _mm256_castsi256_pd(npyv_setall_s64(0x7fffffffffffffffLL))
+    );
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm256_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm256_mul_pd(a, a); }
+
+#endif
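For readers unfamiliar with the bit-masking idiom used by npyv_abs_f32/npyv_abs_f64 above: ANDing with 0x7fffffff clears only the IEEE-754 sign bit, which is exactly what the vector _mm256_and_ps does per lane. A minimal scalar sketch of the same trick (illustrative only, not part of the patch):

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* Scalar equivalent of npyv_abs_f32: clear the IEEE-754 sign bit. */
    static float scalar_abs_f32(float a)
    {
        uint32_t bits;
        memcpy(&bits, &a, sizeof(bits)); /* bit-pun without aliasing UB */
        bits &= 0x7fffffffu;             /* drop the sign bit only */
        memcpy(&a, &bits, sizeof(a));
        return a;
    }

    int main(void)
    {
        /* prints 0.000000 3.500000; note -0.0 maps to +0.0 */
        printf("%f %f\n", scalar_abs_f32(-0.0f), scalar_abs_f32(-3.5f));
        return 0;
    }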
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
index b09d772f2..2de33765a 100644
--- a/numpy/core/src/common/simd/avx512/avx512.h
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -72,3 +72,4 @@ typedef struct { __m512d val[3]; } npyv_f64x3;
 #include "operators.h"
 #include "conversion.h"
 #include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h
new file mode 100644
index 000000000..1db710670
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/math.h
@@ -0,0 +1,49 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX512_MATH_H
+#define _NPY_SIMD_AVX512_MATH_H
+
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm512_sqrt_ps
+#define npyv_sqrt_f64 _mm512_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm512_div_ps(_mm512_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm512_div_pd(_mm512_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+#if 0 // def NPY_HAVE_AVX512DQ
+    return _mm512_range_ps(a, a, 8);
+#else
+    return npyv_and_f32(
+        a, _mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff))
+    );
+#endif
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+#if 0 // def NPY_HAVE_AVX512DQ
+    return _mm512_range_pd(a, a, 8);
+#else
+    return npyv_and_f64(
+        a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL))
+    );
+#endif
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm512_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm512_mul_pd(a, a); }
+
+#endif
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index bc14ffb75..87e00d5d1 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -63,14 +63,26 @@
 /***************************
  * Division
 ***************************/
-#ifdef __aarch64__
+#if NPY_SIMD_F64
     #define npyv_div_f32 vdivq_f32
 #else
-    NPY_FINLINE float32x4_t npyv_div_f32(float32x4_t a, float32x4_t b)
+    NPY_FINLINE npyv_f32 npyv_div_f32(npyv_f32 a, npyv_f32 b)
     {
-        float32x4_t recip = vrecpeq_f32(b);
-        recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
-        return vmulq_f32(a, recip);
+        // Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI
+        // estimate to 1/b
+        npyv_f32 recipe = vrecpeq_f32(b);
+        /**
+         * Newton-Raphson iteration:
+         *  x[n+1] = x[n] * (2 - d * x[n])
+         * converges to (1/d) if x0 is the result of VRECPE applied to d.
+         *
+         * NOTE: at least 3 iterations are needed to improve precision
+         */
+        recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+        recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+        recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+        // a/b = a*recip(b)
+        return vmulq_f32(a, recipe);
     }
 #endif
 #define npyv_div_f64 vdivq_f64
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
new file mode 100644
index 000000000..a2bbdf2a5
--- /dev/null
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -0,0 +1,86 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_NEON_MATH_H
+#define _NPY_SIMD_NEON_MATH_H
+
+/***************************
+ * Elementary
+ ***************************/
+// Absolute
+#define npyv_abs_f32 vabsq_f32
+#define npyv_abs_f64 vabsq_f64
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return vmulq_f32(a, a); }
+#if NPY_SIMD_F64
+    NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+    { return vmulq_f64(a, a); }
+#endif
+
+// Square root
+#if NPY_SIMD_F64
+    #define npyv_sqrt_f32 vsqrtq_f32
+    #define npyv_sqrt_f64 vsqrtq_f64
+#else
+    // Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI
+    NPY_FINLINE npyv_f32 npyv_sqrt_f32(npyv_f32 a)
+    {
+        const npyv_f32 zero = vdupq_n_f32(0.0f);
+        const npyv_u32 pinf = vdupq_n_u32(0x7f800000);
+        npyv_u32 is_zero = vceqq_f32(a, zero), is_inf = vceqq_u32(vreinterpretq_u32_f32(a), pinf);
+        // guard against floating-point division-by-zero error
+        npyv_f32 guard_byz = vbslq_f32(is_zero, vreinterpretq_f32_u32(pinf), a);
+        // estimate to (1/√a)
+        npyv_f32 rsqrte = vrsqrteq_f32(guard_byz);
+        /**
+         * Newton-Raphson iteration:
+         *  x[n+1] = x[n] * (3 - d * x[n] * x[n]) / 2
+         * converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
+         *
+         * NOTE: at least 3 iterations are needed to improve precision
+         */
+        rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+        rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+        rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+        // a * (1/√a)
+        npyv_f32 sqrt = vmulq_f32(a, rsqrte);
+        // - return zero if a is zero
+        // - return positive infinity if a is positive infinity
+        return vbslq_f32(vorrq_u32(is_zero, is_inf), a, sqrt);
+    }
+#endif // NPY_SIMD_F64
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{
+#if NPY_SIMD_F64
+    const npyv_f32 one = vdupq_n_f32(1.0f);
+    return npyv_div_f32(one, a);
+#else
+    npyv_f32 recipe = vrecpeq_f32(a);
+    /**
+     * Newton-Raphson iteration:
+     *  x[n+1] = x[n] * (2 - d * x[n])
+     * converges to (1/d) if x0 is the result of VRECPE applied to d.
+     *
+     * NOTE: at least 3 iterations are needed to improve precision
+     */
+    recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+    recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+    recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+    return recipe;
+#endif
+}
+#if NPY_SIMD_F64
+    NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+    {
+        const npyv_f64 one = vdupq_n_f64(1.0);
+        return npyv_div_f64(one, a);
+    }
+#endif // NPY_SIMD_F64
+
+#endif // _NPY_SIMD_NEON_MATH_H
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
index 280a34297..c8ddc92ad 100644
--- a/numpy/core/src/common/simd/neon/neon.h
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -72,3 +72,4 @@ typedef float64x2x3_t npyv_f64x3;
 #include "operators.h"
 #include "conversion.h"
 #include "arithmetic.h"
+#include "math.h"
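The Newton-Raphson refinement used in the NEON fallbacks above can be sanity-checked in scalar C. This sketch (illustrative only; the 0.1 starting guess stands in for VRECPE's rough estimate, and convergence requires 0 < x0 < 2/d) applies x[n+1] = x[n] * (2 - d * x[n]) and shows the quadratic convergence the comments rely on:

    #include <stdio.h>

    int main(void)
    {
        double d = 7.0;
        double x = 0.1;                /* crude initial estimate of 1/d */
        for (int i = 0; i < 5; ++i) {
            x = x * (2.0 - d * x);     /* Newton-Raphson step for 1/d */
            printf("iter %d: x = %.17g\n", i, x);
        }
        printf("true 1/d = %.17g\n", 1.0 / d);
        return 0;
    }

Each step roughly doubles the number of correct bits, which is why the code settles on three iterations to refine the ~8-bit hardware estimate to full single precision.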
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
new file mode 100644
index 000000000..b7203cd89
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -0,0 +1,40 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_MATH_H
+#define _NPY_SIMD_SSE_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm_sqrt_ps
+#define npyv_sqrt_f64 _mm_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm_div_ps(_mm_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm_div_pd(_mm_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+    return _mm_and_ps(
+        a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))
+    );
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+    return _mm_and_pd(
+        a, _mm_castsi128_pd(npyv_setall_s64(0x7fffffffffffffffLL))
+    );
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm_mul_pd(a, a); }
+
+#endif
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index 364b4baf1..132d3d347 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -64,3 +64,4 @@ typedef struct { __m128d val[3]; } npyv_f64x3;
 #include "operators.h"
 #include "conversion.h"
 #include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h
new file mode 100644
index 000000000..7c8610b19
--- /dev/null
+++ b/numpy/core/src/common/simd/vsx/math.h
@@ -0,0 +1,36 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_VSX_MATH_H
+#define _NPY_SIMD_VSX_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 vec_sqrt
+#define npyv_sqrt_f64 vec_sqrt
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{
+    const npyv_f32 one = npyv_setall_f32(1.0f);
+    return vec_div(one, a);
+}
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{
+    const npyv_f64 one = npyv_setall_f64(1.0);
+    return vec_div(one, a);
+}
+
+// Absolute
+#define npyv_abs_f32 vec_abs
+#define npyv_abs_f64 vec_abs
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return vec_mul(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return vec_mul(a, a); }
+
+#endif // _NPY_SIMD_VSX_MATH_H
diff --git a/numpy/core/src/common/simd/vsx/vsx.h b/numpy/core/src/common/simd/vsx/vsx.h
index 5525dc1e6..27dde98e7 100644
--- a/numpy/core/src/common/simd/vsx/vsx.h
+++ b/numpy/core/src/common/simd/vsx/vsx.h
@@ -62,3 +62,4 @@ typedef struct { npyv_f64 val[3]; } npyv_f64x3;
 #include "operators.h"
 #include "conversion.h"
 #include "arithmetic.h"
+#include "math.h"
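With the per-architecture math.h headers in place, any loop written against the npyv_* API compiles to SSE, AVX*, NEON, or VSX from a single source. A hypothetical contiguous kernel composed from the intrinsics added above (a sketch, not code from the patch; it assumes the in-tree simd/simd.h header and falls back to a scalar tail):

    #include "simd/simd.h" // NumPy's universal intrinsics (build-tree header)
    #include <math.h>

    // Hypothetical kernel: dst[i] = 1/sqrt(src[i]) for contiguous buffers.
    static void recip_sqrt_f32(const float *src, float *dst, npy_intp len)
    {
    #if NPY_SIMD
        const int vstep = npyv_nlanes_f32;
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_f32 v = npyv_load_f32(src);
            npyv_store_f32(dst, npyv_recip_f32(npyv_sqrt_f32(v)));
        }
    #endif
        for (; len > 0; --len) {
            *dst++ = 1.0f / sqrtf(*src++); // scalar tail (and non-SIMD fallback)
        }
    }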
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index ef3d5a21a..c9efdeb4e 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1491,26 +1491,6 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
  */
 
 /**begin repeat
- * Float types
- *  #type = npy_float, npy_double#
- *  #TYPE = FLOAT, DOUBLE#
- *  #scalarf = npy_sqrtf, npy_sqrt#
- */
-
-NPY_NO_EXPORT void
-@TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (!run_unary_simd_sqrt_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = @scalarf@(in1);
-        }
-    }
-}
-
-/**end repeat**/
-
-/**begin repeat
  * #func = rint, ceil, floor, trunc#
  * #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
  */
@@ -1579,53 +1559,6 @@ DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void
  * #typesub = f, #
  */
 
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_sqrt_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = npy_sqrt@typesub@(in1);
-        }
-    }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_absolute_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ tmp = in1 > 0 ? in1 : -in1;
-            /* add 0 to clear -0.0 */
-            *((@type@ *)op1) = tmp + 0;
-        }
-    }
-    npy_clear_floatstatus_barrier((char*)dimensions);
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_square_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = in1*in1;
-        }
-    }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_reciprocal_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = 1.0f/in1;
-        }
-    }
-}
-
 /**begin repeat2
  * #func = rint, ceil, floor, trunc#
  * #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
@@ -2048,33 +1981,6 @@ NPY_NO_EXPORT void
 }
 
 NPY_NO_EXPORT void
-@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    char * margs[] = {args[0], args[0], args[1]};
-    npy_intp msteps[] = {steps[0], steps[0], steps[1]};
-    if (!run_binary_simd_multiply_@TYPE@(margs, dimensions, msteps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *((@type@ *)op1) = in1*in1;
-        }
-    }
-}
-
-NPY_NO_EXPORT void
-@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    @type@ one = 1.@c@;
-    char * margs[] = {(char*)&one, args[0], args[1]};
-    npy_intp msteps[] = {0, steps[0], steps[1]};
-    if (!run_binary_simd_divide_@TYPE@(margs, dimensions, msteps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *((@type@ *)op1) = 1/in1;
-        }
-    }
-}
-
-NPY_NO_EXPORT void
 @TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
     OUTPUT_LOOP {
@@ -2092,20 +1998,6 @@ NPY_NO_EXPORT void
 }
 
 NPY_NO_EXPORT void
-@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (!run_unary_simd_absolute_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ tmp = in1 > 0 ? in1 : -in1;
-            /* add 0 to clear -0.0 */
-            *((@type@ *)op1) = tmp + 0;
-        }
-    }
-    npy_clear_floatstatus_barrier((char*)dimensions);
-}
-
-NPY_NO_EXPORT void
 @TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     if (!run_unary_simd_negative_@TYPE@(args, dimensions, steps)) {
@@ -2216,6 +2108,42 @@ NPY_NO_EXPORT void
 
 /*
  *****************************************************************************
+ **                         LONGDOUBLE LOOPS                                **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void
+LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {
+        const npy_longdouble in1 = *(npy_longdouble*)ip1;
+        *((npy_longdouble *)op1) = 1/in1;
+    }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_longdouble in1 = *(npy_longdouble *)ip1;
+        const npy_longdouble tmp = in1 > 0 ? in1 : -in1;
+        /* add 0 to clear -0.0 */
+        *((npy_longdouble *)op1) = tmp + 0;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {
+        const npy_longdouble in1 = *(npy_longdouble *)ip1;
+        *((npy_longdouble *)op1) = in1*in1;
+    }
+}
+
+/*
+ *****************************************************************************
  **                         HALF-FLOAT LOOPS                                **
  *****************************************************************************
  */
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 6b8b77e99..a0b68d168 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -6,6 +6,10 @@
 #ifndef _NPY_UMATH_LOOPS_H_
 #define _NPY_UMATH_LOOPS_H_
 
+#ifndef NPY_NO_EXPORT
+    #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
+#endif
+
 #define BOOL_invert BOOL_logical_not
 #define BOOL_add BOOL_logical_or
 #define BOOL_bitwise_and BOOL_logical_and
@@ -167,32 +171,29 @@ NPY_NO_EXPORT void
  **                             FLOAT LOOPS                                 **
  *****************************************************************************
  */
-
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary_fp.dispatch.h"
+#endif
 /**begin repeat
  * #TYPE = FLOAT, DOUBLE#
  */
-NPY_NO_EXPORT void
-@TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
 /**begin repeat1
- * #func = maximum, minimum#
+ * #kind = sqrt, absolute, square, reciprocal#
  */
-NPY_NO_EXPORT void
-@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
 /**end repeat1**/
+/**end repeat**/
 
-/**begin repeat1
- * #isa = avx512f, fma#
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
  */
-
-/**begin repeat2
- * #func = sqrt, absolute, square, reciprocal#
+/**begin repeat1
+ * #func = maximum, minimum#
  */
 NPY_NO_EXPORT void
-@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
-/**end repeat2**/
 /**end repeat1**/
 /**end repeat**/
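The NPY_CPU_DISPATCH_DECLARE lines above only declare the per-target symbols; a runtime check selects one of them when the ufunc tables are populated. As a rough sketch of the calling side (simplified and hand-written for illustration; the exact macro arguments follow npy_cpu_dispatch.h, and the real call sites are emitted by generate_umath.py, so treat the details as assumptions):

    // Illustrative only: resolving a dispatched symbol at runtime.
    // NPY_CPU_DISPATCH_CALL expands into a chain of NPY_CPU_HAVE(...) checks
    // that picks the highest-priority target compiled into the binary.
    #ifndef NPY_DISABLE_OPTIMIZATION
        #include "loops_unary_fp.dispatch.h"
    #endif

    static void
    call_float_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps)
    {
        NPY_CPU_DISPATCH_CALL(FLOAT_sqrt, (args, dimensions, steps, NULL));
    }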
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
new file mode 100644
index 000000000..3a1ea82f9
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -0,0 +1,219 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 vsx2 neon
+ **/
+/**
+ * Force the use of SSE only on x86, even if AVX2 or AVX512F are enabled
+ * through the baseline, since scatter (AVX512F) and gather are very costly
+ * for the non-contiguous memory accesses of the small operations
+ * this file covers, compared with SSE.
+*/
+#define NPY_SIMD_FORCE_128
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+/**********************************************************
+ ** Scalars
+ **********************************************************/
+#if !NPY_SIMD
+NPY_FINLINE float c_recip_f32(float a)
+{ return 1.0f / a; }
+NPY_FINLINE float c_abs_f32(float a)
+{
+    const float tmp = a > 0 ? a : -a;
+    /* add 0 to clear -0.0 */
+    return tmp + 0;
+}
+NPY_FINLINE float c_square_f32(float a)
+{ return a * a; }
+#endif // !NPY_SIMD
+
+#if !NPY_SIMD_F64
+NPY_FINLINE double c_recip_f64(double a)
+{ return 1.0 / a; }
+NPY_FINLINE double c_abs_f64(double a)
+{
+    const double tmp = a > 0 ? a : -a;
+    /* add 0 to clear -0.0 */
+    return tmp + 0;
+}
+NPY_FINLINE double c_square_f64(double a)
+{ return a * a; }
+#endif // !NPY_SIMD_F64
+/**
+ * MSVC (32-bit mode) requires an explicitly contiguous loop
+ * in order to use SSE; otherwise it uses a soft version of square root
+ * that doesn't raise a domain error.
+ */
+#if defined(_MSC_VER) && defined(_M_IX86) && !NPY_SIMD
+    #include <emmintrin.h>
+    NPY_FINLINE float c_sqrt_f32(float _a)
+    {
+        __m128 a = _mm_load_ss(&_a);
+        __m128 lower = _mm_sqrt_ss(a);
+        return _mm_cvtss_f32(lower);
+    }
+    NPY_FINLINE double c_sqrt_f64(double _a)
+    {
+        __m128d a = _mm_load_sd(&_a);
+        __m128d lower = _mm_sqrt_pd(a);
+        return _mm_cvtsd_f64(lower);
+    }
+#else
+    #define c_sqrt_f32 npy_sqrtf
+    #define c_sqrt_f64 npy_sqrt
+#endif
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/** Notes:
+ * - avoid the use of libmath to unify fp/domain errors
+ *   for both scalars and vectors among all compilers/architectures.
+ * - use the intrinsic npyv_load_till_* instead of npyv_load_tillz_*
+ *   to fill the remaining lanes with 1.0 to avoid the divide-by-zero fp
+ *   exception in reciprocal.
+ */
+#define CONTIG  0
+#define NCONTIG 1
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx  = f32, f64#
+ * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ */
+#if @VCHK@
+/**begin repeat1
+ * #kind     = sqrt, absolute, square, reciprocal#
+ * #intr     = sqrt, abs, square, recip#
+ * #repl_0w1 = 0, 0, 0, 1#
+ */
+/**begin repeat2
+ * #STYPE  = CONTIG, NCONTIG, CONTIG, NCONTIG#
+ * #DTYPE  = CONTIG, CONTIG, NCONTIG, NCONTIG#
+ * #unroll = 4, 4, 2, 2#
+ */
+static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_@sfx@ *src = _src;
+          npyv_lanetype_@sfx@ *dst = _dst;
+
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * @unroll@;
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        /**begin repeat3
+         * #N = 0, 1, 2, 3#
+         */
+        #if @unroll@ > @N@
+            #if @STYPE@ == CONTIG
+                npyv_@sfx@ v_src@N@ = npyv_load_@sfx@(src + vstep*@N@);
+            #else
+                npyv_@sfx@ v_src@N@ = npyv_loadn_@sfx@(src + ssrc*vstep*@N@, ssrc);
+            #endif
+            npyv_@sfx@ v_unary@N@ = npyv_@intr@_@sfx@(v_src@N@);
+        #endif
+        /**end repeat3**/
+        /**begin repeat3
+         * #N = 0, 1, 2, 3#
+         */
+        #if @unroll@ > @N@
+            #if @DTYPE@ == CONTIG
+                npyv_store_@sfx@(dst + vstep*@N@, v_unary@N@);
+            #else
+                npyv_storen_@sfx@(dst + sdst*vstep*@N@, sdst, v_unary@N@);
+            #endif
+        #endif
+        /**end repeat3**/
+    }
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if @STYPE@ == CONTIG
+        #if @repl_0w1@
+            npyv_@sfx@ v_src0 = npyv_load_till_@sfx@(src, len, 1);
+        #else
+            npyv_@sfx@ v_src0 = npyv_load_tillz_@sfx@(src, len);
+        #endif
+    #else
+        #if @repl_0w1@
+            npyv_@sfx@ v_src0 = npyv_loadn_till_@sfx@(src, ssrc, len, 1);
+        #else
+            npyv_@sfx@ v_src0 = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+        #endif
+    #endif
+        npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
+    #if @DTYPE@ == CONTIG
+        npyv_store_till_@sfx@(dst, len, v_unary0);
+    #else
+        npyv_storen_till_@sfx@(dst, sdst, len, v_unary0);
+    #endif
+    }
+    npyv_cleanup();
+}
+/**end repeat2**/
+/**end repeat1**/
+#endif // @VCHK@
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx  = f32, f64#
+ * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ */
+/**begin repeat1
+ * #kind  = sqrt, absolute, square, reciprocal#
+ * #intr  = sqrt, abs, square, recip#
+ * #clear = 0, 1, 0, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    const char *src = args[0]; char *dst = args[1];
+    const npy_intp src_step = steps[0];
+    const npy_intp dst_step = steps[1];
+    npy_intp len = dimensions[0];
+#if @VCHK@
+    const int lsize = sizeof(npyv_lanetype_@sfx@);
+    assert(src_step % lsize == 0 && dst_step % lsize == 0);
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+        goto no_unroll;
+    }
+    const npy_intp ssrc = src_step / lsize;
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_@sfx@(ssrc) || !npyv_storable_stride_@sfx@(sdst)) {
+        goto no_unroll;
+    }
+    if (ssrc == 1 && sdst == 1) {
+        simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) {
+        simd_@TYPE@_@kind@_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) {
+        simd_@TYPE@_@kind@_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else {
+        simd_@TYPE@_@kind@_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear;
+no_unroll:
+#endif // @VCHK@
+    for (; len > 0; --len, src += src_step, dst += dst_step) {
+    #if @VCHK@
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 0, dst, 0, 1);
+    #else
+        const npyv_lanetype_@sfx@ src0 = *(npyv_lanetype_@sfx@*)src;
+        *(npyv_lanetype_@sfx@*)dst = c_@intr@_@sfx@(src0);
+    #endif
+    }
+#if @VCHK@
+clear:;
+#endif
+#if @clear@
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat1**/
+/**end repeat**/
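Because the .src templating can be hard to read, here is roughly what one instantiation of the kernel above expands to (FLOAT/sqrt, contiguous source and destination, with the 4x unrolling elided); this is hand-expanded for illustration, so minor details may differ from the generated code:

    #if NPY_SIMD
    static void simd_FLOAT_sqrt_CONTIG_CONTIG
    (const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
    {
        const npyv_lanetype_f32 *src = _src;
        npyv_lanetype_f32 *dst = _dst;
        const int vstep = npyv_nlanes_f32;
        // full vectors
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_store_f32(dst, npyv_sqrt_f32(npyv_load_f32(src)));
        }
        // partial tail: load_tillz zero-fills the unused lanes
        if (len > 0) {
            npyv_f32 v = npyv_load_tillz_f32(src, len);
            npyv_store_till_f32(dst, len, npyv_sqrt_f32(v));
        }
        npyv_cleanup();
    }
    #endif

Note how the same tail path is reused by the inner functions above to process one element at a time, so the scalar fallback raises exactly the same fp/domain errors as the vector path.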
diff --git a/numpy/core/src/umath/loops_utils.h b/numpy/core/src/umath/loops_utils.h
new file mode 100644
index 000000000..f5540bdae
--- /dev/null
+++ b/numpy/core/src/umath/loops_utils.h
@@ -0,0 +1,42 @@
+#ifndef _NPY_UMATH_LOOPS_UTILS_H_
+#define _NPY_UMATH_LOOPS_UTILS_H_
+
+#include "numpy/npy_common.h" // NPY_FINLINE
+/*
+ * nomemoverlap - returns false if two strided arrays have an overlapping
+ * region in memory. ip_size/op_size = size of the arrays which can be negative
+ * indicating negative steps.
+ */
+NPY_FINLINE npy_bool
+nomemoverlap(char *ip, npy_intp ip_size, char *op, npy_intp op_size)
+{
+    char *ip_start, *ip_end, *op_start, *op_end;
+    if (ip_size < 0) {
+        ip_start = ip + ip_size;
+        ip_end   = ip;
+    }
+    else {
+        ip_start = ip;
+        ip_end   = ip + ip_size;
+    }
+    if (op_size < 0) {
+        op_start = op + op_size;
+        op_end   = op;
+    }
+    else {
+        op_start = op;
+        op_end   = op + op_size;
+    }
+    return (ip_start == op_start && op_end == ip_end) ||
+           (ip_start > op_end) || (op_start > ip_end);
+}
+
+// returns true if two strided arrays have an overlapping region in memory
+// same as `nomemoverlap()` but requires array length and step sizes
+NPY_FINLINE npy_bool
+is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst_step, npy_intp len)
+{
+    return !(nomemoverlap((char*)src, src_step*len, (char*)dst, dst_step*len));
+}
+
+#endif // _NPY_UMATH_LOOPS_UTILS_H_
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 40bb76914..fb4c84c9e 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -29,6 +29,7 @@
 #endif
 #endif
 #include "simd/simd.h"
+#include "loops_utils.h" // nomemoverlap
 #include <assert.h>
 #include <stdlib.h>
 #include <float.h>
@@ -51,37 +52,6 @@
  */
 #define MAX_STEP_SIZE 2097152
 
-/*
- * nomemoverlap - returns true if two strided arrays have an overlapping
- * region in memory. ip_size/op_size = size of the arrays which can be negative
- * indicating negative steps.
- */
-static NPY_INLINE npy_bool
-nomemoverlap(char *ip,
-             npy_intp ip_size,
-             char *op,
-             npy_intp op_size)
-{
-    char *ip_start, *ip_end, *op_start, *op_end;
-    if (ip_size < 0) {
-        ip_start = ip + ip_size;
-        ip_end = ip;
-    }
-    else {
-        ip_start = ip;
-        ip_end = ip + ip_size;
-    }
-    if (op_size < 0) {
-        op_start = op + op_size;
-        op_end = op;
-    }
-    else {
-        op_start = op;
-        op_end = op + op_size;
-    }
-    return (ip_start > op_end) | (op_start > ip_end);
-}
-
 #define IS_BINARY_STRIDE_ONE(esize, vsize) \
     ((steps[0] == esize) && \
      (steps[1] == esize) && \
@@ -390,7 +360,7 @@ run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp c
  */
 
 /**begin repeat2
- * #func = sqrt, absolute, square, reciprocal, rint, floor, ceil, trunc#
+ * #func = rint, floor, ceil, trunc#
 */
 
 #if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -510,9 +480,9 @@ run_unary_avx512f_log_DOUBLE(char **args, npy_intp const *dimensions, npy_intp c
 */
 
 /**begin repeat1
- * #func = sqrt, absolute, negative, minimum, maximum#
- * #check = IS_BLOCKABLE_UNARY*3, IS_BLOCKABLE_REDUCE*2 #
- * #name = unary*3, unary_reduce*2#
+ * #func = absolute, negative, minimum, maximum#
+ * #check = IS_BLOCKABLE_UNARY*2, IS_BLOCKABLE_REDUCE*2 #
+ * #name = unary*2, unary_reduce*2#
 */
 
 #if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -1357,33 +1327,6 @@ sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy
 }
 /**end repeat1**/
 
-static void
-sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
-{
-    /* align output to VECTOR_SIZE_BYTES bytes */
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
-        op[i] = @scalarf@(ip[i]);
-    }
-    assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
-           npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
-    if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
-            @vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
-            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
-        }
-    }
-    else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
-            @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
-            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
-        }
-    }
-    LOOP_BLOCKED_END {
-        op[i] = @scalarf@(ip[i]);
-    }
-}
-
-
 static NPY_INLINE
 @type@ scalar_abs_@type@(@type@ v)
 {
@@ -2458,9 +2401,8 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
 */
 
 /**begin repeat1
- * #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
- * #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
- * #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ * #func = rint, ceil, floor, trunc#
+ * #vectorf = rint, ceil, floor, trunc#
 */
 
 #if defined @CHK@
@@ -2475,10 +2417,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     npy_intp num_remaining_elements = array_size;
     @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
     @mask@ load_mask = @isa@_get_full_load_mask_ps();
-#if @replace_0_with_1@
-    @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask);
-#endif
-
     /*
      * Note: while generally indices are npy_intp, we ensure that our maximum index
      * will fit in an int32 as a precondition for this function via
@@ -2495,20 +2433,10 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
         if (num_remaining_elements < num_lanes) {
            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
                                                       num_lanes);
-#if @replace_0_with_1@
-            inv_load_mask = @isa@_invert_mask_ps(load_mask);
-#endif
        }
        @vtype@ x;
        if (stride == 1) {
            x = @isa@_masked_load_ps(load_mask, ip);
-#if @replace_0_with_1@
-            /*
-             * Replace masked elements with 1.0f to avoid divide by zero fp
-             * exception in reciprocal
-             */
-            x = @isa@_set_masked_lanes_ps(x, ones_f, inv_load_mask);
-#endif
        }
        else {
            x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask);
@@ -2544,9 +2472,8 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 */
 
 /**begin repeat1
- * #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
- * #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
- * #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ * #func = rint, ceil, floor, trunc#
+ * #vectorf = rint, ceil, floor, trunc#
 */
 
 #if defined @CHK@
@@ -2560,9 +2487,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double);
     npy_intp num_remaining_elements = array_size;
     @mask@ load_mask = @isa@_get_full_load_mask_pd();
-#if @replace_0_with_1@
-    @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask);
-#endif
     @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
 
     /*
@@ -2580,20 +2504,10 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
         if (num_remaining_elements < num_lanes) {
            load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements,
                                                       num_lanes);
-#if @replace_0_with_1@
-            inv_load_mask = @isa@_invert_mask_pd(load_mask);
-#endif
        }
        @vtype@ x;
        if (stride == 1) {
            x = @isa@_masked_load_pd(load_mask, ip);
-#if @replace_0_with_1@
-            /*
-             * Replace masked elements with 1.0f to avoid divide by zero fp
-             * exception in reciprocal
-             */
-            x = @isa@_set_masked_lanes_pd(x, ones_d, @castmask@(inv_load_mask));
-#endif
        }
        else {
            x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask));
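A quick illustration of the overlap predicate that moved into loops_utils.h: the helper treats the byte span of each strided array as an interval, and an exactly in-place operation (identical spans) deliberately counts as non-overlapping so in-place ufuncs can still take the SIMD path. A standalone sketch (it assumes compilation inside the NumPy source tree so that loops_utils.h is on the include path):

    #include <stdio.h>
    #include "loops_utils.h" // in-tree header added by this patch

    int main(void)
    {
        char buf[64];
        // identical spans (in-place ufunc): allowed, reported as no overlap
        printf("%d\n", (int)is_mem_overlap(buf, 4, buf, 4, 16));     // 0
        // destination shifted by one byte: spans intersect -> overlap
        printf("%d\n", (int)is_mem_overlap(buf, 4, buf + 1, 4, 16)); // 1
        return 0;
    }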
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 061d0e34f..13e8d5ede 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -1,6 +1,6 @@
 # NOTE: Please avoid the use of numpy.testing since NPYV intrinsics
 # may be involved in their functionality.
-import pytest
+import pytest, math
 
 from numpy.core._simd import targets
 
 class _Test_Utility:
@@ -80,6 +80,18 @@ class _Test_Utility:
         cvt_intrin = "reinterpret_u{0}_{1}"
         return getattr(self.npyv, cvt_intrin.format(sfx[1:], sfx))(vector)
 
+    def _pinfinity(self):
+        v = self.npyv.setall_u32(0x7f800000)
+        return self.npyv.reinterpret_f32_u32(v)[0]
+
+    def _ninfinity(self):
+        v = self.npyv.setall_u32(0xff800000)
+        return self.npyv.reinterpret_f32_u32(v)[0]
+
+    def _nan(self):
+        v = self.npyv.setall_u32(0x7fc00000)
+        return self.npyv.reinterpret_f32_u32(v)[0]
+
 class _SIMD_INT(_Test_Utility):
     """
     To test all integer vector types at once
@@ -150,6 +162,65 @@ class _SIMD_FP(_Test_Utility):
         data_nfms = self.mul(data_fma, self.setall(-1))
         assert nfms == data_nfms
 
+    def test_abs(self):
+        pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+        data = self._data()
+        vdata = self.load(self._data())
+
+        abs_cases = ((-0, 0), (ninf, pinf), (pinf, pinf), (nan, nan))
+        for case, desired in abs_cases:
+            data_abs = [desired]*self.nlanes
+            vabs = self.abs(self.setall(case))
+            assert vabs == pytest.approx(data_abs, nan_ok=True)
+
+        vabs = self.abs(self.mul(vdata, self.setall(-1)))
+        assert vabs == data
+
+    def test_sqrt(self):
+        pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+        data = self._data()
+        vdata = self.load(self._data())
+
+        sqrt_cases = ((-0.0, -0.0), (0.0, 0.0), (-1.0, nan), (ninf, nan), (pinf, pinf))
+        for case, desired in sqrt_cases:
+            data_sqrt = [desired]*self.nlanes
+            sqrt = self.sqrt(self.setall(case))
+            assert sqrt == pytest.approx(data_sqrt, nan_ok=True)
+
+        data_sqrt = self.load([math.sqrt(x) for x in data]) # load to truncate precision
+        sqrt = self.sqrt(vdata)
+        assert sqrt == data_sqrt
+
+    def test_square(self):
+        pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+        data = self._data()
+        vdata = self.load(self._data())
+        # square
+        square_cases = ((nan, nan), (pinf, pinf), (ninf, pinf))
+        for case, desired in square_cases:
+            data_square = [desired]*self.nlanes
+            square = self.square(self.setall(case))
+            assert square == pytest.approx(data_square, nan_ok=True)
+
+        data_square = [x*x for x in data]
+        square = self.square(vdata)
+        assert square == data_square
+
+    def test_reciprocal(self):
+        pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+        data = self._data()
+        vdata = self.load(self._data())
+
+        recip_cases = ((nan, nan), (pinf, 0.0), (ninf, -0.0), (0.0, pinf), (-0.0, ninf))
+        for case, desired in recip_cases:
+            data_recip = [desired]*self.nlanes
+            recip = self.recip(self.setall(case))
+            assert recip == pytest.approx(data_recip, nan_ok=True)
+
+        data_recip = self.load([1/x for x in data]) # load to truncate precision
+        recip = self.recip(vdata)
+        assert recip == data_recip
+
 class _SIMD_ALL(_Test_Utility):
     """
     To test all vector types at once
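The special-value tables in these tests mirror plain IEEE-754 scalar semantics, which is what lets the same expectations hold across every SIMD target. An independent scalar C cross-check of the same expectations (not part of the patch):

    #include <stdio.h>
    #include <math.h>

    int main(void)
    {
        printf("sqrt(-1)  = %f\n", sqrt(-1.0));     /* nan  */
        printf("sqrt(inf) = %f\n", sqrt(INFINITY)); /* inf  */
        printf("sqrt(-0)  = %f\n", sqrt(-0.0));     /* -0.0 */
        printf("1/0       = %f\n", 1.0 /  0.0);     /* inf  */
        printf("1/-0      = %f\n", 1.0 / -0.0);     /* -inf */
        printf("1/inf     = %f\n", 1.0 / INFINITY); /* 0.0  */
        return 0;
    }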
