-rw-r--r--  numpy/core/code_generators/generate_umath.py          6
-rw-r--r--  numpy/core/src/umath/loops.c.src                     56
-rw-r--r--  numpy/core/src/umath/loops.h.src                     22
-rw-r--r--  numpy/core/src/umath/loops_unary_fp.dispatch.c.src   23
-rw-r--r--  numpy/core/src/umath/simd.inc.src                   180
5 files changed, 21 insertions(+), 266 deletions(-)
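
This patch drops the hand-written FMA/AVX512F loops for rint, floor and trunc and routes their float/double inner loops through the loops_unary_fp dispatchable source, the path already used by ceil, sqrt, absolute, square and reciprocal. For orientation, a rough sketch of the runtime-dispatch pattern involved; the macros come from numpy's npy_cpu_dispatch.h, but call_float_rint is a made-up wrapper and the generated glue may differ in detail:

#ifndef NPY_DISABLE_OPTIMIZATION
    /* generated header; declares FLOAT_rint once per enabled target */
    #include "loops_unary_fp.dispatch.h"
#endif

/* Hypothetical wrapper: NPY_CPU_DISPATCH_CALL expands to a chain of runtime
 * CPU-feature tests that picks, e.g., FLOAT_rint_AVX512F, FLOAT_rint_AVX2,
 * or the baseline FLOAT_rint. */
static void
call_float_rint(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
    NPY_CPU_DISPATCH_CALL(FLOAT_rint, (args, dimensions, steps, NULL));
}
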
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 054150b28..b11504c03 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -844,7 +844,7 @@ defdict = {
           docstrings.get('numpy.core.umath.trunc'),
           None,
           TD('e', f='trunc', astype={'e': 'f'}),
-          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
           TD('fdg', f='trunc'),
           TD(O, f='npy_ObjectTrunc'),
           ),
@@ -860,7 +860,7 @@ defdict = {
           docstrings.get('numpy.core.umath.floor'),
           None,
           TD('e', f='floor', astype={'e': 'f'}),
-          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
           TD('fdg', f='floor'),
           TD(O, f='npy_ObjectFloor'),
           ),
@@ -869,7 +869,7 @@ defdict = {
           docstrings.get('numpy.core.umath.rint'),
           None,
           TD('e', f='rint', astype={'e': 'f'}),
-          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
           TD('fdg' + cmplx, f='rint'),
           TD(P, f='rint'),
           ),
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 5f054d0a9..7f084ac39 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1506,62 +1506,6 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
  */
 
 /**begin repeat
- * #func = rint, floor, trunc#
- * #scalarf = npy_rint, npy_floor, npy_trunc#
- */
-
-/**begin repeat1
-* #TYPE = FLOAT, DOUBLE#
-* #type = npy_float, npy_double#
-* #typesub = f, #
-*/
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    UNARY_LOOP {
-        const @type@ in1 = *(@type@ *)ip1;
-        *(@type@ *)op1 = @scalarf@@typesub@(in1);
-    }
-}
-
-
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #isa = avx512f, fma#
- * #ISA = AVX512F, FMA#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS#
- */
-
-/**begin repeat1
- * #TYPE = FLOAT, DOUBLE#
- * #type = npy_float, npy_double#
- * #typesub = f, #
- */
-
-/**begin repeat2
- * #func = rint, floor, trunc#
- * #scalarf = npy_rint, npy_floor, npy_trunc#
- */
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    if (!run_unary_@isa@_@func@_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = @scalarf@@typesub@(in1);
-        }
-    }
-}
-
-/**end repeat2**/
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
  * Float types
  *  #type = npy_float, npy_double, npy_longdouble#
  *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
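
For readers unfamiliar with the .c.src template syntax: each (TYPE, func) pair of the first deleted block above expanded to a plain strided scalar loop, and the _fma/_avx512f variants wrapped the same loop behind a run_unary_* gate. An illustrative expansion for FLOAT/trunc (not code from the patch; UNARY_LOOP is NumPy's strided unary-iteration macro):

NPY_NO_EXPORT NPY_GCC_OPT_3 void
FLOAT_trunc(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    /* ip1/op1 advance by the input/output strides taken from `steps` */
    UNARY_LOOP {
        const npy_float in1 = *(npy_float *)ip1;
        *(npy_float *)op1 = npy_truncf(in1);
    }
}
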
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 3eafbdf66..e5235b464 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -186,7 +186,7 @@ NPY_NO_EXPORT void
  * #TYPE = FLOAT, DOUBLE#
  */
 /**begin repeat1
- * #kind = ceil, sqrt, absolute, square, reciprocal#
+ * #kind = rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal#
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
@@ -274,26 +274,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
 /**end repeat**/
 
 /**begin repeat
- * #func = rint, floor, trunc#
- */
-
-/**begin repeat1
-* #TYPE = FLOAT, DOUBLE#
-*/
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-/**begin repeat2
- * #isa = avx512f, fma#
- */
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-/**end repeat2**/
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
  * Float types
  *  #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#
  *  #c = f, f, , l#
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
index 93761b98c..5817cf500 100644
--- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -70,6 +70,15 @@ NPY_FINLINE double c_square_f64(double a)
 #define c_ceil_f32 npy_ceilf
 #define c_ceil_f64 npy_ceil
 
+#define c_trunc_f32 npy_truncf
+#define c_trunc_f64 npy_trunc
+
+#define c_floor_f32 npy_floorf
+#define c_floor_f64 npy_floor
+
+#define c_rint_f32 npy_rintf
+#define c_rint_f64 npy_rint
+
 /********************************************************************************
  ** Defining the SIMD kernels
  ********************************************************************************/
@@ -139,10 +148,10 @@ NPY_FINLINE double c_square_f64(double a)
  */
 #if @VCHK@
 /**begin repeat1
- * #kind = ceil, sqrt, absolute, square, reciprocal#
- * #intr = ceil, sqrt, abs, square, recip#
- * #repl_0w1 = 0, 0, 0, 0, 1#
- * #RECIP_WORKAROUND = 0, 0, 0, 0, WORKAROUND_CLANG_RECIPROCAL_BUG#
+ * #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal#
+ * #intr = rint, floor, ceil, trunc, sqrt, abs, square, recip#
+ * #repl_0w1 = 0*7, 1#
+ * #RECIP_WORKAROUND = 0*7, WORKAROUND_CLANG_RECIPROCAL_BUG#
  */
 /**begin repeat2
  * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
@@ -250,9 +259,9 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
  * #VCHK = NPY_SIMD, NPY_SIMD_F64#
  */
 /**begin repeat1
- * #kind = ceil, sqrt, absolute, square, reciprocal#
- * #intr = ceil, sqrt, abs, square, recip#
- * #clear = 0, 0, 1, 0, 0#
+ * #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal#
+ * #intr = rint, floor, ceil, trunc, sqrt, abs, square, recip#
+ * #clear = 0, 0, 0, 0, 0, 1, 0, 0#
  */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
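
Adding rint, floor and trunc to the repeat lists above makes the template emit universal-intrinsic kernels for them exactly as it already does for ceil. A simplified sketch of the contiguous float32 rint kernel (strides and unrolling omitted; assumes an NPY_SIMD-enabled build where the npyv_* universal intrinsics are available):

#if NPY_SIMD
static void
simd_rint_f32_contig(const npyv_lanetype_f32 *src, npyv_lanetype_f32 *dst, npy_intp len)
{
    const int vstep = npyv_nlanes_f32;
    /* full vectors: one hardware round-to-nearest per block of lanes */
    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
        npyv_store_f32(dst, npyv_rint_f32(npyv_load_f32(src)));
    }
    /* scalar tail through the c_rint_f32 (npy_rintf) fallback defined above */
    for (; len > 0; --len, ++src, ++dst) {
        *dst = c_rint_f32(*src);
    }
}
#endif
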
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 8b833ee56..b477027b3 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -123,47 +123,6 @@ run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp c
 /**end repeat**/
 
 /**begin repeat
- * #ISA = FMA, AVX512F#
- * #isa = fma, avx512f#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
- * #REGISTER_SIZE = 32, 64#
- */
-
-/* prototypes */
-
-/**begin repeat1
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- */
-
-/**begin repeat2
- * #func = rint, floor, trunc#
- */
-
-#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_INLINE NPY_GCC_TARGET_@ISA@ void
-@ISA@_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n, const npy_intp stride);
-#endif
-
-static NPY_INLINE int
-run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
-    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), sizeof(@type@), @REGISTER_SIZE@)) {
-        @ISA@_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
-        return 1;
-    }
-    else
-        return 0;
-#endif
-    return 0;
-}
-
-/**end repeat2**/
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
  * Float types
  *  #type = npy_float, npy_double, npy_longdouble#
  *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
@@ -1119,144 +1078,6 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, co
 /**end repeat**/
 
 /**begin repeat
- * #ISA = FMA, AVX512F#
- * #isa = fma, avx512#
- * #vsize = 256, 512#
- * #BYTES = 32, 64#
- * #cvtps_epi32 = _mm256_cvtps_epi32, #
- * #mask = __m256, __mmask16#
- * #vsub = , _mask#
- * #vtype = __m256, __m512#
- * #cvtps_epi32 = _mm256_cvtps_epi32, #
- * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
- */
-
-/**begin repeat1
- * #func = rint, floor, trunc#
- * #vectorf = rint, floor, trunc#
- */
-
-#if defined @CHK@
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_@func@_FLOAT(npy_float* op,
-                   npy_float* ip,
-                   const npy_intp array_size,
-                   const npy_intp steps)
-{
-    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
-    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
-    npy_intp num_remaining_elements = array_size;
-    @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
-    @mask@ load_mask = @isa@_get_full_load_mask_ps();
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum index
-     * will fit in an int32 as a precondition for this function via
-     * IS_OUTPUT_BLOCKABLE_UNARY
-     */
-
-    npy_int32 indexarr[16];
-    for (npy_int32 ii = 0; ii < 16; ii++) {
-        indexarr[ii] = ii*stride;
-    }
-    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < num_lanes) {
-            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
-                                                       num_lanes);
-        }
-        @vtype@ x;
-        if (stride == 1) {
-            x = @isa@_masked_load_ps(load_mask, ip);
-        }
-        else {
-            x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask);
-        }
-        @vtype@ out = @isa@_@vectorf@_ps(x);
-        @masked_store@(op, @cvtps_epi32@(load_mask), out);
-
-        ip += num_lanes*stride;
-        op += num_lanes;
-        num_remaining_elements -= num_lanes;
-    }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #ISA = FMA, AVX512F#
- * #isa = fma, avx512#
- * #vsize = 256, 512#
- * #BYTES = 32, 64#
- * #cvtps_epi32 = _mm256_cvtps_epi32, #
- * #mask = __m256i, __mmask8#
- * #vsub = , _mask#
- * #vtype = __m256d, __m512d#
- * #vindextype = __m128i, __m256i#
- * #vindexsize = 128, 256#
- * #vindexload = _mm_loadu_si128, _mm256_loadu_si256#
- * #cvtps_epi32 = _mm256_cvtpd_epi32, #
- * #castmask = _mm256_castsi256_pd, #
- * #masked_store = _mm256_maskstore_pd, _mm512_mask_storeu_pd#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
- */
-
-/**begin repeat1
- * #func = rint, floor, trunc#
- * #vectorf = rint, floor, trunc#
- */
-
-#if defined @CHK@
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_@func@_DOUBLE(npy_double* op,
-                    npy_double* ip,
-                    const npy_intp array_size,
-                    const npy_intp steps)
-{
-    const npy_intp stride = steps/(npy_intp)sizeof(npy_double);
-    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double);
-    npy_intp num_remaining_elements = array_size;
-    @mask@ load_mask = @isa@_get_full_load_mask_pd();
-    @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
-
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum index
-     * will fit in an int32 as a precondition for this function via
-     * IS_OUTPUT_BLOCKABLE_UNARY
-     */
-    npy_int32 indexarr[8];
-    for (npy_int32 ii = 0; ii < 8; ii++) {
-        indexarr[ii] = ii*stride;
-    }
-    @vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]);
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < num_lanes) {
-            load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements,
-                                                       num_lanes);
-        }
-        @vtype@ x;
-        if (stride == 1) {
-            x = @isa@_masked_load_pd(load_mask, ip);
-        }
-        else {
-            x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask));
-        }
-        @vtype@ out = @isa@_@vectorf@_pd(x);
-        @masked_store@(op, load_mask, out);
-
-        ip += num_lanes*stride;
-        op += num_lanes;
-        num_remaining_elements -= num_lanes;
-    }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
  * #TYPE = CFLOAT, CDOUBLE#
  * #type = npy_float, npy_double#
  * #num_lanes = 16, 8#
@@ -1535,3 +1356,4 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
 #undef VECTOR_SIZE_BYTES
 #endif /* NPY_HAVE_SSE2_INTRINSICS */
 #endif
+
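
The ~180 lines deleted from simd.inc.src implemented, per ISA and per dtype, the tail handling the universal-SIMD framework now provides generically: full-width blocks plus a partial load/store mask for the last n % num_lanes elements, so the kernel never reads or writes past the end of the array. A standalone, hypothetical illustration of that idiom (contiguous float32, AVX2, compile with -mavx2; the function name is mine, not NumPy's):

#include <immintrin.h>
#include <stddef.h>

void
avx2_floor_f32(float *dst, const float *src, size_t n)
{
    const __m256i lane_ids = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    for (size_t i = 0; i < n; i += 8) {
        size_t remaining = n - i;
        if (remaining >= 8) {
            /* full 8-lane block: plain unaligned load/store */
            _mm256_storeu_ps(dst + i, _mm256_floor_ps(_mm256_loadu_ps(src + i)));
        }
        else {
            /* lanes with id < remaining get an all-ones mask (sign bit set),
             * which maskload/maskstore interpret as "enabled" */
            __m256i mask = _mm256_cmpgt_epi32(
                _mm256_set1_epi32((int)remaining), lane_ids);
            __m256 x = _mm256_floor_ps(_mm256_maskload_ps(src + i, mask));
            _mm256_maskstore_ps(dst + i, mask, x);
        }
    }
}
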
