2 files changed, 67 insertions, 59 deletions
diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index e3cfa1f72..bab126b45 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -74,10 +74,12 @@
 #define IS_BINARY_CONT(tin, tout) (steps[0] == sizeof(tin) && \
                                    steps[1] == sizeof(tin) && \
                                    steps[2] == sizeof(tout))
+
 /* binary loop input and output contiguous with first scalar */
 #define IS_BINARY_CONT_S1(tin, tout) (steps[0] == 0 && \
                                    steps[1] == sizeof(tin) && \
                                    steps[2] == sizeof(tout))
+
 /* binary loop input and output contiguous with second scalar */
 #define IS_BINARY_CONT_S2(tin, tout) (steps[0] == sizeof(tin) && \
                                    steps[1] == 0 && \
@@ -86,61 +88,63 @@
 
 /*
  * loop with contiguous specialization
- * op should be the code storing the result in `tout * out`
+ * val should be the value to be stored in `*out` (declared `tout *out`)
  * combine with NPY_GCC_OPT_3 to allow autovectorization
  * should only be used where its worthwhile to avoid code bloat
  */
-#define BASE_OUTPUT_LOOP(tout, op) \
+#define BASE_OUTPUT_LOOP(tout, val) \
     OUTPUT_LOOP { \
-        tout * out = (tout *)op1; \
-        op; \
+        tout *out = (tout *)op1; \
+        *out = val; \
     }
-#define OUTPUT_LOOP_FAST(tout, op) \
+
+#define OUTPUT_LOOP_FAST(tout, val) \
     do { \
-    /* condition allows compiler to optimize the generic macro */ \
-    if (IS_OUTPUT_CONT(tout)) { \
-        BASE_OUTPUT_LOOP(tout, op) \
-    } \
-    else { \
-        BASE_OUTPUT_LOOP(tout, op) \
-    } \
+        /* condition allows compiler to optimize the generic macro */ \
+        if (IS_OUTPUT_CONT(tout)) { \
+            BASE_OUTPUT_LOOP(tout, val) \
+        } \
+        else { \
+            BASE_OUTPUT_LOOP(tout, val) \
+        } \
     } \
     while (0)
 
 /*
  * loop with contiguous specialization
  * op should be the code working on `tin in` and
- * storing the result in `tout * out`
+ * storing the result in `tout *out`
  * combine with NPY_GCC_OPT_3 to allow autovectorization
  * should only be used where its worthwhile to avoid code bloat
  */
 #define BASE_UNARY_LOOP(tin, tout, op) \
     UNARY_LOOP { \
         const tin in = *(tin *)ip1; \
-        tout * out = (tout *)op1; \
+        tout *out = (tout *)op1; \
         op; \
     }
-#define UNARY_LOOP_FAST(tin, tout, op) \
+
+#define UNARY_LOOP_FAST(tin, tout, op) \
     do { \
-    /* condition allows compiler to optimize the generic macro */ \
-    if (IS_UNARY_CONT(tin, tout)) { \
-        if (args[0] == args[1]) { \
-            BASE_UNARY_LOOP(tin, tout, op) \
+        /* condition allows compiler to optimize the generic macro */ \
+        if (IS_UNARY_CONT(tin, tout)) { \
+            if (args[0] == args[1]) { \
+                BASE_UNARY_LOOP(tin, tout, op) \
+            } \
+            else { \
+                BASE_UNARY_LOOP(tin, tout, op) \
+            } \
         } \
         else { \
             BASE_UNARY_LOOP(tin, tout, op) \
         } \
     } \
-    else { \
-        BASE_UNARY_LOOP(tin, tout, op) \
-    } \
-    } \
     while (0)
 
 /*
  * loop with contiguous specialization
  * op should be the code working on `tin in1`, `tin in2` and
- * storing the result in `tout * out`
+ * storing the result in `tout *out`
  * combine with NPY_GCC_OPT_3 to allow autovectorization
  * should only be used where its worthwhile to avoid code bloat
  */
@@ -148,9 +152,10 @@
     BINARY_LOOP { \
         const tin in1 = *(tin *)ip1; \
         const tin in2 = *(tin *)ip2; \
-        tout * out = (tout *)op1; \
+        tout *out = (tout *)op1; \
         op; \
     }
+
 /*
  * unfortunately gcc 6/7 regressed and we need to give it additional hints to
  * vectorize inplace operations (PR80198)
@@ -171,59 +176,62 @@
     for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \
         const tin in1 = *(tin *)ip1; \
         const tin in2 = *(tin *)ip2; \
-        tout * out = (tout *)op1; \
+        tout *out = (tout *)op1; \
         op; \
     }
+
 #define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
     const tin cin = *(tin *)cinp; \
     BINARY_LOOP { \
         const tin vin = *(tin *)vinp; \
-        tout * out = (tout *)op1; \
+        tout *out = (tout *)op1; \
         op; \
     }
+
 /* PR80198 again, scalar works without the pragma */
 #define BASE_BINARY_LOOP_S_INP(tin, tout, cin, cinp, vin, vinp, op) \
     const tin cin = *(tin *)cinp; \
     BINARY_LOOP { \
         const tin vin = *(tin *)vinp; \
-        tout * out = (tout *)vinp; \
+        tout *out = (tout *)vinp; \
         op; \
     }
-#define BINARY_LOOP_FAST(tin, tout, op) \
+
+#define BINARY_LOOP_FAST(tin, tout, op) \
     do { \
-    /* condition allows compiler to optimize the generic macro */ \
-    if (IS_BINARY_CONT(tin, tout)) { \
-        if (abs_ptrdiff(args[2], args[0]) == 0 && \
-                abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
-            BASE_BINARY_LOOP_INP(tin, tout, op) \
+        /* condition allows compiler to optimize the generic macro */ \
+        if (IS_BINARY_CONT(tin, tout)) { \
+            if (abs_ptrdiff(args[2], args[0]) == 0 && \
+                    abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
+                BASE_BINARY_LOOP_INP(tin, tout, op) \
+            } \
+            else if (abs_ptrdiff(args[2], args[1]) == 0 && \
+                         abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
+                BASE_BINARY_LOOP_INP(tin, tout, op) \
+            } \
+            else { \
+                BASE_BINARY_LOOP(tin, tout, op) \
+            } \
         } \
-        else if (abs_ptrdiff(args[2], args[1]) == 0 && \
-                     abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
-            BASE_BINARY_LOOP_INP(tin, tout, op) \
+        else if (IS_BINARY_CONT_S1(tin, tout)) { \
+            if (abs_ptrdiff(args[2], args[1]) == 0) { \
+                BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \
+            } \
+            else { \
+                BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
+            } \
         } \
-        else { \
-            BASE_BINARY_LOOP(tin, tout, op) \
-        } \
-    } \
-    else if (IS_BINARY_CONT_S1(tin, tout)) { \
-        if (abs_ptrdiff(args[2], args[1]) == 0) { \
-            BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \
+        else if (IS_BINARY_CONT_S2(tin, tout)) { \
+            if (abs_ptrdiff(args[2], args[0]) == 0) { \
+                BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \
+            } \
+            else { \
+                BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
+            } \
         } \
         else { \
-            BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
-        } \
-    } \
-    else if (IS_BINARY_CONT_S2(tin, tout)) { \
-        if (abs_ptrdiff(args[2], args[0]) == 0) { \
-            BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \
+            BASE_BINARY_LOOP(tin, tout, op) \
         } \
-        else { \
-            BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
-        }\
-    } \
-    else { \
-        BASE_BINARY_LOOP(tin, tout, op) \
-    } \
     } \
     while (0)
 
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 1e4ab350b..b7e28537a 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -652,7 +652,7 @@ BOOL__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UN
 NPY_NO_EXPORT void
 BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
-    OUTPUT_LOOP_FAST(npy_bool, *out = @val@);
+    OUTPUT_LOOP_FAST(npy_bool, @val@);
 }
 
 /**end repeat**/
@@ -896,7 +896,7 @@ NPY_NO_EXPORT void
 NPY_NO_EXPORT void
 @TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
-    OUTPUT_LOOP_FAST(npy_bool, *out = @val@);
+    OUTPUT_LOOP_FAST(npy_bool, @val@);
 }
 /**end repeat1**/