diff options
author | Julian Taylor <jtaylor.debian@googlemail.com> | 2013-06-07 19:44:41 +0200 |
---|---|---|
committer | Julian Taylor <jtaylor.debian@googlemail.com> | 2013-06-08 20:44:05 +0200 |
commit | 564aa53e343e81de6864f43cb5f89932fdc1c718 (patch) | |
tree | 8b4c5faf9b426bf3ea691ddbf082e47b7b21b2d1 | |
parent | 987dc32499e2bcfe160c61a5c5984e05552c7038 (diff) | |
download | numpy-564aa53e343e81de6864f43cb5f89932fdc1c718.tar.gz |
MAINT: make the simd function signature more specific
-rw-r--r-- | numpy/core/src/umath/loops.c.src | 57 |
1 file changed, 28 insertions, 29 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index e15909029..cd87db082 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -67,12 +67,9 @@ npy_intp i;\ for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2) -/* align output to alignment - * store alignment is usually more important than load alignment */ -#define UNARY_LOOP_BLOCK_ALIGN_OUT(type, alignment)\ - type *ip1 = (type *)args[0], *op1 = (type *)args[1];\ - npy_intp n = dimensions[0];\ - npy_intp i, peel = npy_aligned_block_offset(op1, sizeof(type),\ +/* align var to alignment */ +#define UNARY_LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\ + npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\ alignment, n);\ for(i = 0; i < peel; i++) @@ -1314,32 +1311,33 @@ TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void * #define NPY_HAVE_SIMD_@TYPE@ static void -sse2_sqrt_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps) +sse2_sqrt_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n) { - UNARY_LOOP_BLOCK_ALIGN_OUT(@type@, 16) { - op1[i] = @scalarf@(ip1[i]); + /* align output to 16 bytes */ + UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) { + op[i] = @scalarf@(ip[i]); } - assert(npy_is_aligned(&op1[i], 16)); - if (npy_is_aligned(&ip1[i], 16)) { + assert(npy_is_aligned(&op[i], 16)); + if (npy_is_aligned(&ip[i], 16)) { UNARY_LOOP_BLOCKED(@type@, 16) { - @vtype@ d = _mm_load_@vsuf@(&ip1[i]); - _mm_store_@vsuf@(&op1[i], _mm_sqrt_@vsuf@(d)); + @vtype@ d = _mm_load_@vsuf@(&ip[i]); + _mm_store_@vsuf@(&op[i], _mm_sqrt_@vsuf@(d)); } } else { UNARY_LOOP_BLOCKED(@type@, 16) { - @vtype@ d = _mm_loadu_@vsuf@(&ip1[i]); - _mm_store_@vsuf@(&op1[i], _mm_sqrt_@vsuf@(d)); + @vtype@ d = _mm_loadu_@vsuf@(&ip[i]); + _mm_store_@vsuf@(&op[i], _mm_sqrt_@vsuf@(d)); } } UNARY_LOOP_BLOCKED_END { - op1[i] = @scalarf@(ip1[i]); + op[i] = @scalarf@(ip[i]); } } static void -sse2_absolute_@TYPE@(char **args, npy_intp *dimensions, 
npy_intp *steps) +sse2_absolute_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n) { /* * get 0x7FFFFFFF mask (everything but signbit set) @@ -1348,27 +1346,28 @@ sse2_absolute_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps) */ const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@); - UNARY_LOOP_BLOCK_ALIGN_OUT(@type@, 16) { - const @type@ tmp = ip1[i] > 0 ? ip1[i]: -ip1[i]; + /* align output to 16 bytes */ + UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) { + const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i]; /* add 0 to clear -0.0 */ - op1[i] = tmp + 0; + op[i] = tmp + 0; } - if (npy_is_aligned(&ip1[i], 16)) { + if (npy_is_aligned(&ip[i], 16)) { UNARY_LOOP_BLOCKED(@type@, 16) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vpre@_store_@vsuf@(&op1[i], @vpre@_andnot_@vsuf@(mask, a)); + @vtype@ a = @vpre@_load_@vsuf@(&ip[i]); + @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a)); } } else { UNARY_LOOP_BLOCKED(@type@, 16) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vpre@_store_@vsuf@(&op1[i], @vpre@_andnot_@vsuf@(mask, a)); + @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]); + @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a)); } } UNARY_LOOP_BLOCKED_END { - const @type@ tmp = ip1[i] > 0 ? ip1[i]: -ip1[i]; + const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i]; /* add 0 to clear -0.0 */ - op1[i] = tmp + 0; + op[i] = tmp + 0; } } @@ -1380,7 +1379,7 @@ NPY_NO_EXPORT void { #ifdef HAVE_EMMINTRIN_H if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) { - sse2_sqrt_@TYPE@(args, dimensions, steps); + sse2_sqrt_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]); } else #endif @@ -1621,7 +1620,7 @@ NPY_NO_EXPORT void { #if defined HAVE_EMMINTRIN_H && defined NPY_HAVE_SIMD_@TYPE@ if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) { - sse2_absolute_@TYPE@(args, dimensions, steps); + sse2_absolute_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]); } else #endif |