2 files changed, 49 insertions, 21 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index bc7e075cb..f29b15477 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1651,22 +1651,21 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
 NPY_NO_EXPORT NPY_GCC_OPT_3 void
 FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
 {
-#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
     char str[] = "@func@";
-    @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], str);
+    /*
+     * Hand the whole array to the vectorized kernel when the dispatcher
+     * accepts it; otherwise process the input one element at a time.
+     */
+    if (!run_unary_@isa@_sincos_FLOAT(args, dimensions, steps, str)) {
+        UNARY_LOOP {
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+            @ISA@_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], str);
 #else
-    /*
-     * This is the path it would take if ISA was runtime detected, but not
-     * compiled for. It fixes the error on clang6.0 which fails to compile
-     * AVX512F version. Not sure if I like this idea, if during runtime it
-     * detects AXV512F, it will end up running the scalar version instead
-     * of AVX2.
-     */
-    UNARY_LOOP {
-	const npy_float in1 = *(npy_float *)ip1;
-	*(npy_float *)op1 = @scalarf@(in1);
-    }
+            const npy_float in1 = *(npy_float *)ip1;
+            *(npy_float *)op1 = @scalarf@(in1);
 #endif
+        }
+    }
 }
 
 /**end repeat1**/
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 07a3c19a4..6da75d724 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -164,9 +164,23 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
 
 #if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
 static NPY_INLINE void
-@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, char*);
+@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, char*);
 #endif
 
+/* Runs the vectorized sin/cos kernel over the whole array when it is compiled
+ * in and IS_OUTPUT_BLOCKABLE_UNARY holds. Returns 1 if it ran, 0 otherwise. */
+static NPY_INLINE int
+run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps, char* mychar)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
+        @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], mychar);
+        return 1;
+    }
+#endif
+    return 0;
+}
+
 /**end repeat**/
 
 
@@ -1473,9 +1487,13 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
 
 #if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
 static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 @ISA@_sincos_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size,
-                                                   char* operation)
+                   const npy_intp steps, char* operation)
 {
+    /*
+     * Cast keeps the division signed so negative byte steps stay negative.
+     */
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
     const npy_int num_lanes = @BYTES@/sizeof(npy_float);
     npy_int compute_cos = 1;
     npy_float large_number = 71476.0625f;
@@ -1508,13 +1526,26 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
     @mask@ load_mask = @isa@_get_full_load_mask();
     npy_intp num_remaining_elements = array_size;
+    npy_int indexarr[16]; /* per-lane gather offsets: multiples of stride */
+    for (npy_int ii = 0; ii < 16; ii++) {
+        indexarr[ii] = ii*stride; /* NOTE(review): narrows to npy_int */
+    }
+    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
 
     while (num_remaining_elements > 0) {
 
-        if (num_remaining_elements < num_lanes)
+        if (num_remaining_elements < num_lanes) { /* tail: partial mask */
             load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
                                                          num_lanes);
-        @vtype@ x  = @isa@_masked_load(load_mask, ip);
+        }
+
+        @vtype@ x;
+        if (stride == 1) { /* unit stride: plain masked load */
+            x = @isa@_masked_load(load_mask, ip);
+        }
+        else { /* any other stride: masked gather through vindex */
+            x = @isa@_masked_gather(zero_f, ip, vindex, load_mask);
+        }
 
         /*
          * For elements outside of this range, Cody-Waite's range reduction
@@ -1565,19 +1596,21 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
         /* process elements using glibc for large elements */
         if (compute_cos) {
             for (int ii = 0; iglibc_mask != 0; ii++) {
-                if (iglibc_mask & 0x01)
-                    op[ii] = npy_cosf(ip[ii]);
+                if (iglibc_mask & 0x01) {
+                    op[ii] = npy_cosf(ip[ii*stride]); /* input is strided */
+                }
                 iglibc_mask  = iglibc_mask >> 1;
             }
         }
         else {
             for (int ii = 0; iglibc_mask != 0; ii++) {
-                if (iglibc_mask & 0x01)
-                    op[ii] = npy_sinf(ip[ii]);
+                if (iglibc_mask & 0x01) {
+                    op[ii] = npy_sinf(ip[ii*stride]); /* input is strided */
+                }
                 iglibc_mask  = iglibc_mask >> 1;
             }
         }
-        ip += num_lanes;
+        ip += num_lanes*stride;
         op += num_lanes;
         num_remaining_elements -= num_lanes;
     }