summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--  numpy/core/src/umath/loops.c.src  55
-rw-r--r--  numpy/core/tests/test_umath.py  26
2 files changed, 76 insertions, 5 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index de78904bb..e15909029 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1302,8 +1302,11 @@ TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *
* #type = npy_float, npy_double#
* #TYPE = FLOAT, DOUBLE#
* #scalarf = npy_sqrtf, npy_sqrt#
+ * #c = f, #
* #vtype = __m128, __m128d#
+ * #vpre = _mm, _mm#
* #vsuf = ps, pd#
+* #vtype = __m128, __m128d#
*/
#ifdef HAVE_EMMINTRIN_H
@@ -1335,6 +1338,40 @@ sse2_sqrt_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
}
+static void
+sse2_absolute_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+ /*
+ * get 0x7FFFFFFF mask (everything but signbit set)
+ * float & ~mask will remove the sign
+ * this is equivalent to how the compiler implements fabs on amd64
+ */
+ const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
+
+ UNARY_LOOP_BLOCK_ALIGN_OUT(@type@, 16) {
+ const @type@ tmp = ip1[i] > 0 ? ip1[i]: -ip1[i];
+ /* add 0 to clear -0.0 */
+ op1[i] = tmp + 0;
+ }
+ if (npy_is_aligned(&ip1[i], 16)) {
+ UNARY_LOOP_BLOCKED(@type@, 16) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+ @vpre@_store_@vsuf@(&op1[i], @vpre@_andnot_@vsuf@(mask, a));
+ }
+ }
+ else {
+ UNARY_LOOP_BLOCKED(@type@, 16) {
+ @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
+ @vpre@_store_@vsuf@(&op1[i], @vpre@_andnot_@vsuf@(mask, a));
+ }
+ }
+ UNARY_LOOP_BLOCKED_END {
+ const @type@ tmp = ip1[i] > 0 ? ip1[i]: -ip1[i];
+ /* add 0 to clear -0.0 */
+ op1[i] = tmp + 0;
+ }
+}
+
#endif
@@ -1582,11 +1619,19 @@ NPY_NO_EXPORT void
NPY_NO_EXPORT void
@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- const @type@ tmp = in1 > 0 ? in1 : -in1;
- /* add 0 to clear -0.0 */
- *((@type@ *)op1) = tmp + 0;
+#if defined HAVE_EMMINTRIN_H && defined NPY_HAVE_SIMD_@TYPE@
+ if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
+ sse2_absolute_@TYPE@(args, dimensions, steps);
+ }
+ else
+#endif
+ {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ tmp = in1 > 0 ? in1 : -in1;
+ /* add 0 to clear -0.0 */
+ *((@type@ *)op1) = tmp + 0;
+ }
}
}
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 586b64636..6bbb15e6b 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -687,6 +687,32 @@ class TestSign(TestCase):
np.seterr(**olderr)
+class TestAbsolute(TestCase):
+ def test_abs_blocked(self):
+ "simd tests on abs"
+ for dt in [np.float32, np.float64]:
+ for out, inp, msg in gen_alignment_data(dtype=dt, type='unary',
+ max_size=17):
+ tgt = [ncu.absolute(i) for i in inp]
+ np.absolute(inp, out=out)
+ assert_equal(out, tgt, err_msg=msg)
+ self.assertTrue((out >= 0).all())
+
+ prev = np.geterr()
+ # will throw invalid flag depending on compiler optimizations
+ np.seterr(invalid='ignore')
+ for v in [np.nan, -np.inf, np.inf]:
+ for i in range(inp.size):
+ d = np.arange(inp.size, dtype=dt)
+ inp[:] = -d
+ inp[i] = v
+ d[i] = -v if v == -np.inf else v
+ assert_array_equal(np.abs(inp), d, err_msg=msg)
+ np.abs(inp, out=out)
+ assert_array_equal(out, d, err_msg=msg)
+ np.seterr(invalid=prev['invalid'])
+
+
class TestSpecialMethods(TestCase):
def test_wrap(self):
class with_wrap(object):