summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- numpy/core/code_generators/generate_umath.py | 6
-rw-r--r-- numpy/core/src/umath/loops.c.src | 56
-rw-r--r-- numpy/core/src/umath/loops.h.src | 22
-rw-r--r-- numpy/core/src/umath/loops_unary_fp.dispatch.c.src | 23
-rw-r--r-- numpy/core/src/umath/simd.inc.src | 180
5 files changed, 21 insertions, 266 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 054150b28..b11504c03 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -844,7 +844,7 @@ defdict = {
docstrings.get('numpy.core.umath.trunc'),
None,
TD('e', f='trunc', astype={'e': 'f'}),
- TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+ TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
TD('fdg', f='trunc'),
TD(O, f='npy_ObjectTrunc'),
),
@@ -860,7 +860,7 @@ defdict = {
docstrings.get('numpy.core.umath.floor'),
None,
TD('e', f='floor', astype={'e': 'f'}),
- TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+ TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
TD('fdg', f='floor'),
TD(O, f='npy_ObjectFloor'),
),
@@ -869,7 +869,7 @@ defdict = {
docstrings.get('numpy.core.umath.rint'),
None,
TD('e', f='rint', astype={'e': 'f'}),
- TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+ TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
TD('fdg' + cmplx, f='rint'),
TD(P, f='rint'),
),
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 5f054d0a9..7f084ac39 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1506,62 +1506,6 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
*/
/**begin repeat
- * #func = rint, floor, trunc#
- * #scalarf = npy_rint, npy_floor, npy_trunc#
- */
-
-/**begin repeat1
-* #TYPE = FLOAT, DOUBLE#
-* #type = npy_float, npy_double#
-* #typesub = f, #
-*/
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = @scalarf@@typesub@(in1);
- }
-}
-
-
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #isa = avx512f, fma#
- * #ISA = AVX512F, FMA#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS#
- */
-
-/**begin repeat1
- * #TYPE = FLOAT, DOUBLE#
- * #type = npy_float, npy_double#
- * #typesub = f, #
- */
-
-/**begin repeat2
- * #func = rint, floor, trunc#
- * #scalarf = npy_rint, npy_floor, npy_trunc#
- */
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- if (!run_unary_@isa@_@func@_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = @scalarf@@typesub@(in1);
- }
- }
-}
-
-/**end repeat2**/
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
* Float types
* #type = npy_float, npy_double, npy_longdouble#
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 3eafbdf66..e5235b464 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -186,7 +186,7 @@ NPY_NO_EXPORT void
* #TYPE = FLOAT, DOUBLE#
*/
/**begin repeat1
- * #kind = ceil, sqrt, absolute, square, reciprocal#
+ * #kind = rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
@@ -274,26 +274,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
/**end repeat**/
/**begin repeat
- * #func = rint, floor, trunc#
- */
-
-/**begin repeat1
-* #TYPE = FLOAT, DOUBLE#
-*/
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-/**begin repeat2
- * #isa = avx512f, fma#
- */
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-/**end repeat2**/
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
* Float types
* #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#
* #c = f, f, , l#
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
index 93761b98c..5817cf500 100644
--- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -70,6 +70,15 @@ NPY_FINLINE double c_square_f64(double a)
#define c_ceil_f32 npy_ceilf
#define c_ceil_f64 npy_ceil
+#define c_trunc_f32 npy_truncf
+#define c_trunc_f64 npy_trunc
+
+#define c_floor_f32 npy_floorf
+#define c_floor_f64 npy_floor
+
+#define c_rint_f32 npy_rintf
+#define c_rint_f64 npy_rint
+
/********************************************************************************
** Defining the SIMD kernels
********************************************************************************/
@@ -139,10 +148,10 @@ NPY_FINLINE double c_square_f64(double a)
*/
#if @VCHK@
/**begin repeat1
- * #kind = ceil, sqrt, absolute, square, reciprocal#
- * #intr = ceil, sqrt, abs, square, recip#
- * #repl_0w1 = 0, 0, 0, 0, 1#
- * #RECIP_WORKAROUND = 0, 0, 0, 0, WORKAROUND_CLANG_RECIPROCAL_BUG#
+ * #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal#
+ * #intr = rint, floor, ceil, trunc, sqrt, abs, square, recip#
+ * #repl_0w1 = 0*7, 1#
+ * #RECIP_WORKAROUND = 0*7, WORKAROUND_CLANG_RECIPROCAL_BUG#
*/
/**begin repeat2
* #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
@@ -250,9 +259,9 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
* #VCHK = NPY_SIMD, NPY_SIMD_F64#
*/
/**begin repeat1
- * #kind = ceil, sqrt, absolute, square, reciprocal#
- * #intr = ceil, sqrt, abs, square, recip#
- * #clear = 0, 0, 1, 0, 0#
+ * #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal#
+ * #intr = rint, floor, ceil, trunc, sqrt, abs, square, recip#
+ * #clear = 0, 0, 0, 0, 0, 1, 0, 0#
*/
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 8b833ee56..b477027b3 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -123,47 +123,6 @@ run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp c
/**end repeat**/
/**begin repeat
- * #ISA = FMA, AVX512F#
- * #isa = fma, avx512f#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
- * #REGISTER_SIZE = 32, 64#
- */
-
-/* prototypes */
-
-/**begin repeat1
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- */
-
-/**begin repeat2
- * #func = rint, floor, trunc#
- */
-
-#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_INLINE NPY_GCC_TARGET_@ISA@ void
-@ISA@_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n, const npy_intp stride);
-#endif
-
-static NPY_INLINE int
-run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
- if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), sizeof(@type@), @REGISTER_SIZE@)) {
- @ISA@_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
- return 1;
- }
- else
- return 0;
-#endif
- return 0;
-}
-
-/**end repeat2**/
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
* Float types
* #type = npy_float, npy_double, npy_longdouble#
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
@@ -1119,144 +1078,6 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, co
/**end repeat**/
/**begin repeat
- * #ISA = FMA, AVX512F#
- * #isa = fma, avx512#
- * #vsize = 256, 512#
- * #BYTES = 32, 64#
- * #cvtps_epi32 = _mm256_cvtps_epi32, #
- * #mask = __m256, __mmask16#
- * #vsub = , _mask#
- * #vtype = __m256, __m512#
- * #cvtps_epi32 = _mm256_cvtps_epi32, #
- * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
- */
-
-/**begin repeat1
- * #func = rint, floor, trunc#
- * #vectorf = rint, floor, trunc#
- */
-
-#if defined @CHK@
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_@func@_FLOAT(npy_float* op,
- npy_float* ip,
- const npy_intp array_size,
- const npy_intp steps)
-{
- const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
- const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
- npy_intp num_remaining_elements = array_size;
- @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
- @mask@ load_mask = @isa@_get_full_load_mask_ps();
- /*
- * Note: while generally indices are npy_intp, we ensure that our maximum index
- * will fit in an int32 as a precondition for this function via
- * IS_OUTPUT_BLOCKABLE_UNARY
- */
-
- npy_int32 indexarr[16];
- for (npy_int32 ii = 0; ii < 16; ii++) {
- indexarr[ii] = ii*stride;
- }
- @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < num_lanes) {
- load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
- num_lanes);
- }
- @vtype@ x;
- if (stride == 1) {
- x = @isa@_masked_load_ps(load_mask, ip);
- }
- else {
- x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask);
- }
- @vtype@ out = @isa@_@vectorf@_ps(x);
- @masked_store@(op, @cvtps_epi32@(load_mask), out);
-
- ip += num_lanes*stride;
- op += num_lanes;
- num_remaining_elements -= num_lanes;
- }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #ISA = FMA, AVX512F#
- * #isa = fma, avx512#
- * #vsize = 256, 512#
- * #BYTES = 32, 64#
- * #cvtps_epi32 = _mm256_cvtps_epi32, #
- * #mask = __m256i, __mmask8#
- * #vsub = , _mask#
- * #vtype = __m256d, __m512d#
- * #vindextype = __m128i, __m256i#
- * #vindexsize = 128, 256#
- * #vindexload = _mm_loadu_si128, _mm256_loadu_si256#
- * #cvtps_epi32 = _mm256_cvtpd_epi32, #
- * #castmask = _mm256_castsi256_pd, #
- * #masked_store = _mm256_maskstore_pd, _mm512_mask_storeu_pd#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
- */
-
-/**begin repeat1
- * #func = rint, floor, trunc#
- * #vectorf = rint, floor, trunc#
- */
-
-#if defined @CHK@
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_@func@_DOUBLE(npy_double* op,
- npy_double* ip,
- const npy_intp array_size,
- const npy_intp steps)
-{
- const npy_intp stride = steps/(npy_intp)sizeof(npy_double);
- const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double);
- npy_intp num_remaining_elements = array_size;
- @mask@ load_mask = @isa@_get_full_load_mask_pd();
- @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
-
- /*
- * Note: while generally indices are npy_intp, we ensure that our maximum index
- * will fit in an int32 as a precondition for this function via
- * IS_OUTPUT_BLOCKABLE_UNARY
- */
- npy_int32 indexarr[8];
- for (npy_int32 ii = 0; ii < 8; ii++) {
- indexarr[ii] = ii*stride;
- }
- @vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]);
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < num_lanes) {
- load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements,
- num_lanes);
- }
- @vtype@ x;
- if (stride == 1) {
- x = @isa@_masked_load_pd(load_mask, ip);
- }
- else {
- x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask));
- }
- @vtype@ out = @isa@_@vectorf@_pd(x);
- @masked_store@(op, load_mask, out);
-
- ip += num_lanes*stride;
- op += num_lanes;
- num_remaining_elements -= num_lanes;
- }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
* #TYPE = CFLOAT, CDOUBLE#
* #type = npy_float, npy_double#
* #num_lanes = 16, 8#
@@ -1535,3 +1356,4 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
#undef VECTOR_SIZE_BYTES
#endif /* NPY_HAVE_SSE2_INTRINSICS */
#endif
+