summary | refs | log | tree | commit | diff
path: root/numpy
diff options
context:
space:
mode:
author: Julian Taylor <jtaylor.debian@googlemail.com> 2013-06-07 19:44:41 +0200
committer: Julian Taylor <jtaylor.debian@googlemail.com> 2013-06-08 20:44:05 +0200
commit: 564aa53e343e81de6864f43cb5f89932fdc1c718 (patch)
tree: 8b4c5faf9b426bf3ea691ddbf082e47b7b21b2d1 /numpy
parent: 987dc32499e2bcfe160c61a5c5984e05552c7038 (diff)
download: numpy-564aa53e343e81de6864f43cb5f89932fdc1c718.tar.gz
MAINT: make the simd function signature more specific
Diffstat (limited to 'numpy')
-rw-r--r-- numpy/core/src/umath/loops.c.src | 57
1 file changed, 28 insertions, 29 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index e15909029..cd87db082 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -67,12 +67,9 @@
npy_intp i;\
for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)
-/* align output to alignment
- * store alignment is usually more important than load alignment */
-#define UNARY_LOOP_BLOCK_ALIGN_OUT(type, alignment)\
- type *ip1 = (type *)args[0], *op1 = (type *)args[1];\
- npy_intp n = dimensions[0];\
- npy_intp i, peel = npy_aligned_block_offset(op1, sizeof(type),\
+/* align var to alignment */
+#define UNARY_LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
+ npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
alignment, n);\
for(i = 0; i < peel; i++)
@@ -1314,32 +1311,33 @@ TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *
#define NPY_HAVE_SIMD_@TYPE@
static void
-sse2_sqrt_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+sse2_sqrt_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
{
- UNARY_LOOP_BLOCK_ALIGN_OUT(@type@, 16) {
- op1[i] = @scalarf@(ip1[i]);
+ /* align output to 16 bytes */
+ UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
+ op[i] = @scalarf@(ip[i]);
}
- assert(npy_is_aligned(&op1[i], 16));
- if (npy_is_aligned(&ip1[i], 16)) {
+ assert(npy_is_aligned(&op[i], 16));
+ if (npy_is_aligned(&ip[i], 16)) {
UNARY_LOOP_BLOCKED(@type@, 16) {
- @vtype@ d = _mm_load_@vsuf@(&ip1[i]);
- _mm_store_@vsuf@(&op1[i], _mm_sqrt_@vsuf@(d));
+ @vtype@ d = _mm_load_@vsuf@(&ip[i]);
+ _mm_store_@vsuf@(&op[i], _mm_sqrt_@vsuf@(d));
}
}
else {
UNARY_LOOP_BLOCKED(@type@, 16) {
- @vtype@ d = _mm_loadu_@vsuf@(&ip1[i]);
- _mm_store_@vsuf@(&op1[i], _mm_sqrt_@vsuf@(d));
+ @vtype@ d = _mm_loadu_@vsuf@(&ip[i]);
+ _mm_store_@vsuf@(&op[i], _mm_sqrt_@vsuf@(d));
}
}
UNARY_LOOP_BLOCKED_END {
- op1[i] = @scalarf@(ip1[i]);
+ op[i] = @scalarf@(ip[i]);
}
}
static void
-sse2_absolute_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+sse2_absolute_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
{
/*
* get 0x7FFFFFFF mask (everything but signbit set)
@@ -1348,27 +1346,28 @@ sse2_absolute_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
*/
const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
- UNARY_LOOP_BLOCK_ALIGN_OUT(@type@, 16) {
- const @type@ tmp = ip1[i] > 0 ? ip1[i]: -ip1[i];
+ /* align output to 16 bytes */
+ UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
+ const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
/* add 0 to clear -0.0 */
- op1[i] = tmp + 0;
+ op[i] = tmp + 0;
}
- if (npy_is_aligned(&ip1[i], 16)) {
+ if (npy_is_aligned(&ip[i], 16)) {
UNARY_LOOP_BLOCKED(@type@, 16) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vpre@_store_@vsuf@(&op1[i], @vpre@_andnot_@vsuf@(mask, a));
+ @vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
+ @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
}
}
else {
UNARY_LOOP_BLOCKED(@type@, 16) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vpre@_store_@vsuf@(&op1[i], @vpre@_andnot_@vsuf@(mask, a));
+ @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
+ @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
}
}
UNARY_LOOP_BLOCKED_END {
- const @type@ tmp = ip1[i] > 0 ? ip1[i]: -ip1[i];
+ const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
/* add 0 to clear -0.0 */
- op1[i] = tmp + 0;
+ op[i] = tmp + 0;
}
}
@@ -1380,7 +1379,7 @@ NPY_NO_EXPORT void
{
#ifdef HAVE_EMMINTRIN_H
if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
- sse2_sqrt_@TYPE@(args, dimensions, steps);
+ sse2_sqrt_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
}
else
#endif
@@ -1621,7 +1620,7 @@ NPY_NO_EXPORT void
{
#if defined HAVE_EMMINTRIN_H && defined NPY_HAVE_SIMD_@TYPE@
if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
- sse2_absolute_@TYPE@(args, dimensions, steps);
+ sse2_absolute_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
}
else
#endif