-rw-r--r--  numpy/core/code_generators/generate_umath.py          6
-rw-r--r--  numpy/core/src/umath/loops.c.src                     56
-rw-r--r--  numpy/core/src/umath/loops.h.src                     22
-rw-r--r--  numpy/core/src/umath/loops_unary_fp.dispatch.c.src   23
-rw-r--r--  numpy/core/src/umath/simd.inc.src                   180
5 files changed, 21 insertions(+), 266 deletions(-)
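
This patch drops the hand-written FMA/AVX512F loops for rint, floor and trunc and routes their float/double inner loops through the loops_unary_fp dispatchable source, the path already used by ceil, sqrt, absolute, square and reciprocal. For orientation, a rough sketch of the runtime-dispatch pattern involved; the macros come from numpy's npy_cpu_dispatch.h, but call_float_rint is a made-up wrapper and the generated glue may differ in detail:

#ifndef NPY_DISABLE_OPTIMIZATION
    /* generated header; declares FLOAT_rint once per enabled target */
    #include "loops_unary_fp.dispatch.h"
#endif

/* Hypothetical wrapper: NPY_CPU_DISPATCH_CALL expands to a chain of runtime
 * CPU-feature tests that picks, e.g., FLOAT_rint_AVX512F, FLOAT_rint_AVX2,
 * or the baseline FLOAT_rint. */
static void
call_float_rint(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
    NPY_CPU_DISPATCH_CALL(FLOAT_rint, (args, dimensions, steps, NULL));
}
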
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 054150b28..b11504c03 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -844,7 +844,7 @@ defdict = {
           docstrings.get('numpy.core.umath.trunc'),
           None,
           TD('e', f='trunc', astype={'e': 'f'}),
-          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
           TD('fdg', f='trunc'),
           TD(O, f='npy_ObjectTrunc'),
           ),
@@ -860,7 +860,7 @@ defdict = {
           docstrings.get('numpy.core.umath.floor'),
           None,
           TD('e', f='floor', astype={'e': 'f'}),
-          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
           TD('fdg', f='floor'),
           TD(O, f='npy_ObjectFloor'),
           ),
@@ -869,7 +869,7 @@ defdict = {
           docstrings.get('numpy.core.umath.rint'),
           None,
           TD('e', f='rint', astype={'e': 'f'}),
-          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
           TD('fdg' + cmplx, f='rint'),
           TD(P, f='rint'),
           ),
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 5f054d0a9..7f084ac39 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1506,62 +1506,6 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
  */
 
 /**begin repeat
- * #func = rint, floor, trunc#
- * #scalarf = npy_rint, npy_floor, npy_trunc#
- */
-
-/**begin repeat1
-* #TYPE = FLOAT, DOUBLE#
-* #type = npy_float, npy_double#
-* #typesub = f, #
-*/
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    UNARY_LOOP {
-        const @type@ in1 = *(@type@ *)ip1;
-        *(@type@ *)op1 = @scalarf@@typesub@(in1);
-    }
-}
-
-
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #isa = avx512f, fma#
- * #ISA = AVX512F, FMA#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS#
- */
-
-/**begin repeat1
- * #TYPE = FLOAT, DOUBLE#
- * #type = npy_float, npy_double#
- * #typesub = f, #
- */
-
-/**begin repeat2
- * #func = rint, floor, trunc#
- * #scalarf = npy_rint, npy_floor, npy_trunc#
- */
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    if (!run_unary_@isa@_@func@_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = @scalarf@@typesub@(in1);
-        }
-    }
-}
-
-/**end repeat2**/
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
  * Float types
  *  #type = npy_float, npy_double, npy_longdouble#
  *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
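
For readers unfamiliar with the .c.src template syntax: each (TYPE, func) pair of the first deleted block above expanded to a plain strided scalar loop, and the _fma/_avx512f variants wrapped the same loop behind a run_unary_* gate. An illustrative expansion for FLOAT/trunc (not code from the patch; UNARY_LOOP is NumPy's strided unary-iteration macro):

NPY_NO_EXPORT NPY_GCC_OPT_3 void
FLOAT_trunc(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
    /* ip1/op1 advance by the input/output strides taken from `steps` */
    UNARY_LOOP {
        const npy_float in1 = *(npy_float *)ip1;
        *(npy_float *)op1 = npy_truncf(in1);
    }
}
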
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 3eafbdf66..e5235b464 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -186,7 +186,7 @@ NPY_NO_EXPORT void
  * #TYPE = FLOAT, DOUBLE#
  */
 /**begin repeat1
- * #kind = ceil, sqrt, absolute, square, reciprocal#
+ * #kind = rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal#
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
@@ -274,26 +274,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
 /**end repeat**/
 
 /**begin repeat
- * #func = rint, floor, trunc#
- */
-
-/**begin repeat1
-* #TYPE = FLOAT, DOUBLE#
-*/
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-/**begin repeat2
- * #isa = avx512f, fma#
- */
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-/**end repeat2**/
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
  * Float types
  *  #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#
  *  #c = f, f, , l#
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
index 93761b98c..5817cf500 100644
--- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -70,6 +70,15 @@ NPY_FINLINE double c_square_f64(double a)
 #define c_ceil_f32 npy_ceilf
 #define c_ceil_f64 npy_ceil
 
+#define c_trunc_f32 npy_truncf
+#define c_trunc_f64 npy_trunc
+
+#define c_floor_f32 npy_floorf
+#define c_floor_f64 npy_floor
+
+#define c_rint_f32 npy_rintf
+#define c_rint_f64 npy_rint
+
 /********************************************************************************
  ** Defining the SIMD kernels
  ********************************************************************************/
@@ -139,10 +148,10 @@ NPY_FINLINE double c_square_f64(double a)
  */
 #if @VCHK@
 /**begin repeat1
- * #kind = ceil, sqrt, absolute, square, reciprocal#
- * #intr = ceil, sqrt, abs, square, recip#
- * #repl_0w1 = 0, 0, 0, 0, 1#
- * #RECIP_WORKAROUND = 0, 0, 0, 0, WORKAROUND_CLANG_RECIPROCAL_BUG#
+ * #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal#
+ * #intr = rint, floor, ceil, trunc, sqrt, abs, square, recip#
+ * #repl_0w1 = 0*7, 1#
+ * #RECIP_WORKAROUND = 0*7, WORKAROUND_CLANG_RECIPROCAL_BUG#
  */
 /**begin repeat2
  * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
@@ -250,9 +259,9 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
  * #VCHK = NPY_SIMD, NPY_SIMD_F64#
  */
 /**begin repeat1
- * #kind = ceil, sqrt, absolute, square, reciprocal#
- * #intr = ceil, sqrt, abs, square, recip#
- * #clear = 0, 0, 1, 0, 0#
+ * #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal#
+ * #intr = rint, floor, ceil, trunc, sqrt, abs, square, recip#
+ * #clear = 0, 0, 0, 0, 0, 1, 0, 0#
  */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
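
Adding rint, floor and trunc to the repeat lists above makes the template emit universal-intrinsic kernels for them exactly as it already does for ceil. A simplified sketch of the contiguous float32 rint kernel (strides and unrolling omitted; assumes an NPY_SIMD-enabled build where the npyv_* universal intrinsics are available):

#if NPY_SIMD
static void
simd_rint_f32_contig(const npyv_lanetype_f32 *src, npyv_lanetype_f32 *dst, npy_intp len)
{
    const int vstep = npyv_nlanes_f32;
    /* full vectors: one hardware round-to-nearest per block of lanes */
    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
        npyv_store_f32(dst, npyv_rint_f32(npyv_load_f32(src)));
    }
    /* scalar tail through the c_rint_f32 (npy_rintf) fallback defined above */
    for (; len > 0; --len, ++src, ++dst) {
        *dst = c_rint_f32(*src);
    }
}
#endif
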
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 8b833ee56..b477027b3 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -123,47 +123,6 @@ run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp c
 /**end repeat**/
 
 /**begin repeat
- * #ISA = FMA, AVX512F#
- * #isa = fma, avx512f#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
- * #REGISTER_SIZE = 32, 64#
- */
-
-/* prototypes */
-
-/**begin repeat1
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- */
-
-/**begin repeat2
- * #func = rint, floor, trunc#
- */
-
-#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_INLINE NPY_GCC_TARGET_@ISA@ void
-@ISA@_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n, const npy_intp stride);
-#endif
-
-static NPY_INLINE int
-run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
-    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), sizeof(@type@), @REGISTER_SIZE@)) {
-        @ISA@_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
-        return 1;
-    }
-    else
-        return 0;
-#endif
-    return 0;
-}
-
-/**end repeat2**/
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
  * Float types
  *  #type = npy_float, npy_double, npy_longdouble#
  *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
@@ -1119,144 +1078,6 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, co
 /**end repeat**/
 
 /**begin repeat
- * #ISA = FMA, AVX512F#
- * #isa = fma, avx512#
- * #vsize = 256, 512#
- * #BYTES = 32, 64#
- * #cvtps_epi32 = _mm256_cvtps_epi32, #
- * #mask = __m256, __mmask16#
- * #vsub = , _mask#
- * #vtype = __m256, __m512#
- * #cvtps_epi32 = _mm256_cvtps_epi32, #
- * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
- */
-
-/**begin repeat1
- * #func = rint, floor, trunc#
- * #vectorf = rint, floor, trunc#
- */
-
-#if defined @CHK@
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_@func@_FLOAT(npy_float* op,
-                   npy_float* ip,
-                   const npy_intp array_size,
-                   const npy_intp steps)
-{
-    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
-    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
-    npy_intp num_remaining_elements = array_size;
-    @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
-    @mask@ load_mask = @isa@_get_full_load_mask_ps();
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum index
-     * will fit in an int32 as a precondition for this function via
-     * IS_OUTPUT_BLOCKABLE_UNARY
-     */
-
-    npy_int32 indexarr[16];
-    for (npy_int32 ii = 0; ii < 16; ii++) {
-        indexarr[ii] = ii*stride;
-    }
-    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < num_lanes) {
-            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
-                                                       num_lanes);
-        }
-        @vtype@ x;
-        if (stride == 1) {
-            x = @isa@_masked_load_ps(load_mask, ip);
-        }
-        else {
-            x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask);
-        }
-        @vtype@ out = @isa@_@vectorf@_ps(x);
-        @masked_store@(op, @cvtps_epi32@(load_mask), out);
-
-        ip += num_lanes*stride;
-        op += num_lanes;
-        num_remaining_elements -= num_lanes;
-    }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #ISA = FMA, AVX512F#
- * #isa = fma, avx512#
- * #vsize = 256, 512#
- * #BYTES = 32, 64#
- * #cvtps_epi32 = _mm256_cvtps_epi32, #
- * #mask = __m256i, __mmask8#
- * #vsub = , _mask#
- * #vtype = __m256d, __m512d#
- * #vindextype = __m128i, __m256i#
- * #vindexsize = 128, 256#
- * #vindexload = _mm_loadu_si128, _mm256_loadu_si256#
- * #cvtps_epi32 = _mm256_cvtpd_epi32, #
- * #castmask = _mm256_castsi256_pd, #
- * #masked_store = _mm256_maskstore_pd, _mm512_mask_storeu_pd#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
- */
-
-/**begin repeat1
- * #func = rint, floor, trunc#
- * #vectorf = rint, floor, trunc#
- */
-
-#if defined @CHK@
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_@func@_DOUBLE(npy_double* op,
-                    npy_double* ip,
-                    const npy_intp array_size,
-                    const npy_intp steps)
-{
-    const npy_intp stride = steps/(npy_intp)sizeof(npy_double);
-    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double);
-    npy_intp num_remaining_elements = array_size;
-    @mask@ load_mask = @isa@_get_full_load_mask_pd();
-    @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
-
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum index
-     * will fit in an int32 as a precondition for this function via
-     * IS_OUTPUT_BLOCKABLE_UNARY
-     */
-    npy_int32 indexarr[8];
-    for (npy_int32 ii = 0; ii < 8; ii++) {
-        indexarr[ii] = ii*stride;
-    }
-    @vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]);
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < num_lanes) {
-            load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements,
-                                                       num_lanes);
-        }
-        @vtype@ x;
-        if (stride == 1) {
-            x = @isa@_masked_load_pd(load_mask, ip);
-        }
-        else {
-            x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask));
-        }
-        @vtype@ out = @isa@_@vectorf@_pd(x);
-        @masked_store@(op, load_mask, out);
-
-        ip += num_lanes*stride;
-        op += num_lanes;
-        num_remaining_elements -= num_lanes;
-    }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
  * #TYPE = CFLOAT, CDOUBLE#
  * #type = npy_float, npy_double#
  * #num_lanes = 16, 8#
@@ -1535,3 +1356,4 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
 #undef VECTOR_SIZE_BYTES
 #endif /* NPY_HAVE_SSE2_INTRINSICS */
 #endif
+
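
The ~180 lines deleted from simd.inc.src implemented, per ISA and per dtype, the tail handling the universal-SIMD framework now provides generically: full-width blocks plus a partial load/store mask for the last n % num_lanes elements, so the kernel never reads or writes past the end of the array. A standalone, hypothetical illustration of that idiom (contiguous float32, AVX2, compile with -mavx2; the function name is mine, not NumPy's):

#include <immintrin.h>
#include <stddef.h>

void
avx2_floor_f32(float *dst, const float *src, size_t n)
{
    const __m256i lane_ids = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    for (size_t i = 0; i < n; i += 8) {
        size_t remaining = n - i;
        if (remaining >= 8) {
            /* full 8-lane block: plain unaligned load/store */
            _mm256_storeu_ps(dst + i, _mm256_floor_ps(_mm256_loadu_ps(src + i)));
        }
        else {
            /* lanes with id < remaining get an all-ones mask (sign bit set),
             * which maskload/maskstore interpret as "enabled" */
            __m256i mask = _mm256_cmpgt_epi32(
                _mm256_set1_epi32((int)remaining), lane_ids);
            __m256 x = _mm256_floor_ps(_mm256_maskload_ps(src + i, mask));
            _mm256_maskstore_ps(dst + i, mask, x);
        }
    }
}
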
