MAINT: make the simd function signature more specific

author: Julian Taylor <jtaylor.debian@googlemail.com> 2013-06-07 19:44:41 +0200
committer: Julian Taylor <jtaylor.debian@googlemail.com> 2013-06-08 20:44:05 +0200
commit: 564aa53e343e81de6864f43cb5f89932fdc1c718 (patch)
tree: 8b4c5faf9b426bf3ea691ddbf082e47b7b21b2d1
parent: 987dc32499e2bcfe160c61a5c5984e05552c7038 (diff)
download: numpy-564aa53e343e81de6864f43cb5f89932fdc1c718.tar.gz
1 file changed, 28 insertions, 29 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index e15909029..cd87db082 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -67,12 +67,9 @@
     npy_intp i;\
     for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)
 
-/* align output to alignment
- * store alignment is usually more important than load alignment */
-#define UNARY_LOOP_BLOCK_ALIGN_OUT(type, alignment)\
-    type *ip1 = (type *)args[0], *op1 = (type *)args[1];\
-    npy_intp n = dimensions[0];\
-    npy_intp i, peel = npy_aligned_block_offset(op1, sizeof(type),\
+/* align var to alignment */
+#define UNARY_LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
+    npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
                                                 alignment, n);\
     for(i = 0; i < peel; i++)
 
@@ -1314,32 +1311,33 @@ TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *
 #define NPY_HAVE_SIMD_@TYPE@
 
 static void
-sse2_sqrt_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+sse2_sqrt_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
 {
-    UNARY_LOOP_BLOCK_ALIGN_OUT(@type@, 16) {
-        op1[i] = @scalarf@(ip1[i]);
+    /* align output to 16 bytes */
+    UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
+        op[i] = @scalarf@(ip[i]);
     }
-    assert(npy_is_aligned(&op1[i], 16));
-    if (npy_is_aligned(&ip1[i], 16)) {
+    assert(npy_is_aligned(&op[i], 16));
+    if (npy_is_aligned(&ip[i], 16)) {
         UNARY_LOOP_BLOCKED(@type@, 16) {
-            @vtype@ d = _mm_load_@vsuf@(&ip1[i]);
-            _mm_store_@vsuf@(&op1[i], _mm_sqrt_@vsuf@(d));
+            @vtype@ d = _mm_load_@vsuf@(&ip[i]);
+            _mm_store_@vsuf@(&op[i], _mm_sqrt_@vsuf@(d));
         }
     }
     else {
         UNARY_LOOP_BLOCKED(@type@, 16) {
-            @vtype@ d = _mm_loadu_@vsuf@(&ip1[i]);
-            _mm_store_@vsuf@(&op1[i], _mm_sqrt_@vsuf@(d));
+            @vtype@ d = _mm_loadu_@vsuf@(&ip[i]);
+            _mm_store_@vsuf@(&op[i], _mm_sqrt_@vsuf@(d));
         }
     }
     UNARY_LOOP_BLOCKED_END {
-        op1[i] = @scalarf@(ip1[i]);
+        op[i] = @scalarf@(ip[i]);
     }
 }
 
 
 static void
-sse2_absolute_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+sse2_absolute_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
 {
     /*
      * get 0x7FFFFFFF mask (everything but signbit set)
@@ -1348,27 +1346,28 @@ sse2_absolute_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
      */
     const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
 
-    UNARY_LOOP_BLOCK_ALIGN_OUT(@type@, 16) {
-        const @type@ tmp = ip1[i] > 0 ? ip1[i]: -ip1[i];
+    /* align output to 16 bytes */
+    UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
+        const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
         /* add 0 to clear -0.0 */
-        op1[i] = tmp + 0;
+        op[i] = tmp + 0;
     }
-    if (npy_is_aligned(&ip1[i], 16)) {
+    if (npy_is_aligned(&ip[i], 16)) {
         UNARY_LOOP_BLOCKED(@type@, 16) {
-            @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
-            @vpre@_store_@vsuf@(&op1[i], @vpre@_andnot_@vsuf@(mask, a));
+            @vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
+            @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
         }
     }
     else {
         UNARY_LOOP_BLOCKED(@type@, 16) {
-            @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
-            @vpre@_store_@vsuf@(&op1[i], @vpre@_andnot_@vsuf@(mask, a));
+            @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
+            @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
         }
     }
     UNARY_LOOP_BLOCKED_END {
-        const @type@ tmp = ip1[i] > 0 ? ip1[i]: -ip1[i];
+        const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
         /* add 0 to clear -0.0 */
-        op1[i] = tmp + 0;
+        op[i] = tmp + 0;
     }
 }
 
@@ -1380,7 +1379,7 @@ NPY_NO_EXPORT void
 {
 #ifdef HAVE_EMMINTRIN_H
     if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
-        sse2_sqrt_@TYPE@(args, dimensions, steps);
+        sse2_sqrt_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
     }
     else
 #endif
@@ -1621,7 +1620,7 @@ NPY_NO_EXPORT void
 {
 #if defined HAVE_EMMINTRIN_H && defined NPY_HAVE_SIMD_@TYPE@
     if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
-        sse2_absolute_@TYPE@(args, dimensions, steps);
+        sse2_absolute_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
     }
     else
 #endif
author	Julian Taylor <jtaylor.debian@googlemail.com>	2013-06-07 19:44:41 +0200
committer	Julian Taylor <jtaylor.debian@googlemail.com>	2013-06-08 20:44:05 +0200
commit	564aa53e343e81de6864f43cb5f89932fdc1c718 (patch)
tree	8b4c5faf9b426bf3ea691ddbf082e47b7b21b2d1
parent	987dc32499e2bcfe160c61a5c5984e05552c7038 (diff)
download	numpy-564aa53e343e81de6864f43cb5f89932fdc1c718.tar.gz