summary refs log tree commit diff
path: root/numpy/core
diff options
context:
space:
mode:
Diffstat (limited to 'numpy/core')
-rw-r--r-- numpy/core/src/umath/fast_loop_macros.h 122
-rw-r--r-- numpy/core/src/umath/loops.c.src 4
2 files changed, 67 insertions, 59 deletions
diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index e3cfa1f72..bab126b45 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -74,10 +74,12 @@
#define IS_BINARY_CONT(tin, tout) (steps[0] == sizeof(tin) && \
steps[1] == sizeof(tin) && \
steps[2] == sizeof(tout))
+
/* binary loop input and output contiguous with first scalar */
#define IS_BINARY_CONT_S1(tin, tout) (steps[0] == 0 && \
steps[1] == sizeof(tin) && \
steps[2] == sizeof(tout))
+
/* binary loop input and output contiguous with second scalar */
#define IS_BINARY_CONT_S2(tin, tout) (steps[0] == sizeof(tin) && \
steps[1] == 0 && \
@@ -86,61 +88,63 @@
/*
* loop with contiguous specialization
- * op should be the code storing the result in `tout * out`
+ * val should be the value to be stored in `tout *out`
* combine with NPY_GCC_OPT_3 to allow autovectorization
* should only be used where its worthwhile to avoid code bloat
*/
-#define BASE_OUTPUT_LOOP(tout, op) \
+#define BASE_OUTPUT_LOOP(tout, val) \
OUTPUT_LOOP { \
- tout * out = (tout *)op1; \
- op; \
+ tout *out = (tout *)op1; \
+ *out = val; \
}
-#define OUTPUT_LOOP_FAST(tout, op) \
+
+#define OUTPUT_LOOP_FAST(tout, val) \
do { \
- /* condition allows compiler to optimize the generic macro */ \
- if (IS_OUTPUT_CONT(tout)) { \
- BASE_OUTPUT_LOOP(tout, op) \
- } \
- else { \
- BASE_OUTPUT_LOOP(tout, op) \
- } \
+ /* condition allows compiler to optimize the generic macro */ \
+ if (IS_OUTPUT_CONT(tout)) { \
+ BASE_OUTPUT_LOOP(tout, val) \
+ } \
+ else { \
+ BASE_OUTPUT_LOOP(tout, val) \
+ } \
} \
while (0)
/*
* loop with contiguous specialization
* op should be the code working on `tin in` and
- * storing the result in `tout * out`
+ * storing the result in `tout *out`
* combine with NPY_GCC_OPT_3 to allow autovectorization
* should only be used where its worthwhile to avoid code bloat
*/
#define BASE_UNARY_LOOP(tin, tout, op) \
UNARY_LOOP { \
const tin in = *(tin *)ip1; \
- tout * out = (tout *)op1; \
+ tout *out = (tout *)op1; \
op; \
}
-#define UNARY_LOOP_FAST(tin, tout, op) \
+
+#define UNARY_LOOP_FAST(tin, tout, op) \
do { \
- /* condition allows compiler to optimize the generic macro */ \
- if (IS_UNARY_CONT(tin, tout)) { \
- if (args[0] == args[1]) { \
- BASE_UNARY_LOOP(tin, tout, op) \
+ /* condition allows compiler to optimize the generic macro */ \
+ if (IS_UNARY_CONT(tin, tout)) { \
+ if (args[0] == args[1]) { \
+ BASE_UNARY_LOOP(tin, tout, op) \
+ } \
+ else { \
+ BASE_UNARY_LOOP(tin, tout, op) \
+ } \
} \
else { \
BASE_UNARY_LOOP(tin, tout, op) \
} \
} \
- else { \
- BASE_UNARY_LOOP(tin, tout, op) \
- } \
- } \
while (0)
/*
* loop with contiguous specialization
* op should be the code working on `tin in1`, `tin in2` and
- * storing the result in `tout * out`
+ * storing the result in `tout *out`
* combine with NPY_GCC_OPT_3 to allow autovectorization
* should only be used where its worthwhile to avoid code bloat
*/
@@ -148,9 +152,10 @@
BINARY_LOOP { \
const tin in1 = *(tin *)ip1; \
const tin in2 = *(tin *)ip2; \
- tout * out = (tout *)op1; \
+ tout *out = (tout *)op1; \
op; \
}
+
/*
* unfortunately gcc 6/7 regressed and we need to give it additional hints to
* vectorize inplace operations (PR80198)
@@ -171,59 +176,62 @@
for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \
const tin in1 = *(tin *)ip1; \
const tin in2 = *(tin *)ip2; \
- tout * out = (tout *)op1; \
+ tout *out = (tout *)op1; \
op; \
}
+
#define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
const tin cin = *(tin *)cinp; \
BINARY_LOOP { \
const tin vin = *(tin *)vinp; \
- tout * out = (tout *)op1; \
+ tout *out = (tout *)op1; \
op; \
}
+
/* PR80198 again, scalar works without the pragma */
#define BASE_BINARY_LOOP_S_INP(tin, tout, cin, cinp, vin, vinp, op) \
const tin cin = *(tin *)cinp; \
BINARY_LOOP { \
const tin vin = *(tin *)vinp; \
- tout * out = (tout *)vinp; \
+ tout *out = (tout *)vinp; \
op; \
}
-#define BINARY_LOOP_FAST(tin, tout, op) \
+
+#define BINARY_LOOP_FAST(tin, tout, op) \
do { \
- /* condition allows compiler to optimize the generic macro */ \
- if (IS_BINARY_CONT(tin, tout)) { \
- if (abs_ptrdiff(args[2], args[0]) == 0 && \
- abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
- BASE_BINARY_LOOP_INP(tin, tout, op) \
+ /* condition allows compiler to optimize the generic macro */ \
+ if (IS_BINARY_CONT(tin, tout)) { \
+ if (abs_ptrdiff(args[2], args[0]) == 0 && \
+ abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
+ BASE_BINARY_LOOP_INP(tin, tout, op) \
+ } \
+ else if (abs_ptrdiff(args[2], args[1]) == 0 && \
+ abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
+ BASE_BINARY_LOOP_INP(tin, tout, op) \
+ } \
+ else { \
+ BASE_BINARY_LOOP(tin, tout, op) \
+ } \
} \
- else if (abs_ptrdiff(args[2], args[1]) == 0 && \
- abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
- BASE_BINARY_LOOP_INP(tin, tout, op) \
+ else if (IS_BINARY_CONT_S1(tin, tout)) { \
+ if (abs_ptrdiff(args[2], args[1]) == 0) { \
+ BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \
+ } \
+ else { \
+ BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
+ } \
} \
- else { \
- BASE_BINARY_LOOP(tin, tout, op) \
- } \
- } \
- else if (IS_BINARY_CONT_S1(tin, tout)) { \
- if (abs_ptrdiff(args[2], args[1]) == 0) { \
- BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \
+ else if (IS_BINARY_CONT_S2(tin, tout)) { \
+ if (abs_ptrdiff(args[2], args[0]) == 0) { \
+ BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \
+ } \
+ else { \
+ BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
+ }\
} \
else { \
- BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
- } \
- } \
- else if (IS_BINARY_CONT_S2(tin, tout)) { \
- if (abs_ptrdiff(args[2], args[0]) == 0) { \
- BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \
+ BASE_BINARY_LOOP(tin, tout, op) \
} \
- else { \
- BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
- }\
- } \
- else { \
- BASE_BINARY_LOOP(tin, tout, op) \
- } \
} \
while (0)
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 1e4ab350b..b7e28537a 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -652,7 +652,7 @@ BOOL__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UN
NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
- OUTPUT_LOOP_FAST(npy_bool, *out = @val@);
+ OUTPUT_LOOP_FAST(npy_bool, @val@);
}
/**end repeat**/
@@ -896,7 +896,7 @@ NPY_NO_EXPORT void
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
- OUTPUT_LOOP_FAST(npy_bool, *out = @val@);
+ OUTPUT_LOOP_FAST(npy_bool, @val@);
}
/**end repeat1**/