summary | refs | log | tree | commit | diff
path: root/numpy
diff options
context:
space:
mode:
Diffstat (limited to 'numpy')
-rw-r--r-- numpy/core/src/umath/loops.c.src 21
-rw-r--r-- numpy/core/src/umath/simd.inc.src 49
2 files changed, 49 insertions, 21 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index bc7e075cb..f29b15477 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1651,22 +1651,17 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
NPY_NO_EXPORT NPY_GCC_OPT_3 void
FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
-#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
char str[] = "@func@";
- @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], str);
+ if (!run_unary_@isa@_sincos_FLOAT(args, dimensions, steps, str)) {
+ UNARY_LOOP {
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+ @ISA@_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], str);
#else
- /*
- * This is the path it would take if ISA was runtime detected, but not
- * compiled for. It fixes the error on clang6.0 which fails to compile
- * AVX512F version. Not sure if I like this idea, if during runtime it
- * detects AXV512F, it will end up running the scalar version instead
- * of AVX2.
- */
- UNARY_LOOP {
- const npy_float in1 = *(npy_float *)ip1;
- *(npy_float *)op1 = @scalarf@(in1);
- }
+ const npy_float in1 = *(npy_float *)ip1;
+ *(npy_float *)op1 = @scalarf@(in1);
#endif
+ }
+ }
}
/**end repeat1**/
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 07a3c19a4..6da75d724 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -164,9 +164,23 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
-@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, char*);
+@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, char*);
#endif
+static NPY_INLINE int
+run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps, char* mychar)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+ if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
+ @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], mychar);
+ return 1;
+ }
+ else
+ return 0;
+#endif
+ return 0;
+}
+
/**end repeat**/
@@ -1473,9 +1487,13 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_sincos_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size,
- char* operation)
+@ISA@_sincos_FLOAT(npy_float * op,
+ npy_float * ip,
+ const npy_intp array_size,
+ const npy_intp steps,
+ char* operation)
{
+ const npy_intp stride = steps/sizeof(npy_float);
const npy_int num_lanes = @BYTES@/sizeof(npy_float);
npy_int compute_cos = 1;
npy_float large_number = 71476.0625f;
@@ -1508,13 +1526,26 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
@mask@ load_mask = @isa@_get_full_load_mask();
npy_intp num_remaining_elements = array_size;
+ npy_int indexarr[16];
+ for (npy_int ii = 0; ii < 16; ii++) {
+ indexarr[ii] = ii*stride;
+ }
+ @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
while (num_remaining_elements > 0) {
- if (num_remaining_elements < num_lanes)
+ if (num_remaining_elements < num_lanes) {
load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
num_lanes);
- @vtype@ x = @isa@_masked_load(load_mask, ip);
+ }
+
+ @vtype@ x;
+ if (stride == 1) {
+ x = @isa@_masked_load(load_mask, ip);
+ }
+ else {
+ x = @isa@_masked_gather(zero_f, ip, vindex, load_mask);
+ }
/*
* For elements outside of this range, Cody-Waite's range reduction
@@ -1565,19 +1596,21 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
/* process elements using glibc for large elements */
if (compute_cos) {
for (int ii = 0; iglibc_mask != 0; ii++) {
- if (iglibc_mask & 0x01)
+ if (iglibc_mask & 0x01) {
op[ii] = npy_cosf(ip[ii]);
+ }
iglibc_mask = iglibc_mask >> 1;
}
}
else {
for (int ii = 0; iglibc_mask != 0; ii++) {
- if (iglibc_mask & 0x01)
+ if (iglibc_mask & 0x01) {
op[ii] = npy_sinf(ip[ii]);
+ }
iglibc_mask = iglibc_mask >> 1;
}
}
- ip += num_lanes;
+ ip += num_lanes*stride;
op += num_lanes;
num_remaining_elements -= num_lanes;
}