-rw-r--r--   numpy/core/src/umath/loops.c.src   55
-rw-r--r--   numpy/core/tests/test_umath.py     26
2 files changed, 76 insertions(+), 5 deletions(-)
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index de78904bb..e15909029 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1302,8 +1302,11 @@ TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *
  * #type = npy_float, npy_double#
  * #TYPE = FLOAT, DOUBLE#
  * #scalarf = npy_sqrtf, npy_sqrt#
+ * #c = f, #
  * #vtype = __m128, __m128d#
+ * #vpre = _mm, _mm#
  * #vsuf = ps, pd#
+ * #vtype = __m128, __m128d#
  */
 
 #ifdef HAVE_EMMINTRIN_H
@@ -1335,6 +1338,40 @@ sse2_sqrt_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
 }
 
+
+static void
+sse2_absolute_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+    /*
+     * get 0x7FFFFFFF mask (everything but signbit set)
+     * float & ~mask will remove the sign
+     * this is equivalent to how the compiler implements fabs on amd64
+     */
+    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
+
+    UNARY_LOOP_BLOCK_ALIGN_OUT(@type@, 16) {
+        const @type@ tmp = ip1[i] > 0 ? ip1[i]: -ip1[i];
+        /* add 0 to clear -0.0 */
+        op1[i] = tmp + 0;
+    }
+    if (npy_is_aligned(&ip1[i], 16)) {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+            @vpre@_store_@vsuf@(&op1[i], @vpre@_andnot_@vsuf@(mask, a));
+        }
+    }
+    else {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
+            @vpre@_store_@vsuf@(&op1[i], @vpre@_andnot_@vsuf@(mask, a));
+        }
+    }
+    UNARY_LOOP_BLOCKED_END {
+        const @type@ tmp = ip1[i] > 0 ? ip1[i]: -ip1[i];
+        /* add 0 to clear -0.0 */
+        op1[i] = tmp + 0;
+    }
+}
+
 #endif
@@ -1582,11 +1619,19 @@ NPY_NO_EXPORT void
 NPY_NO_EXPORT void
 @TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
-    UNARY_LOOP {
-        const @type@ in1 = *(@type@ *)ip1;
-        const @type@ tmp = in1 > 0 ? in1 : -in1;
-        /* add 0 to clear -0.0 */
-        *((@type@ *)op1) = tmp + 0;
+#if defined HAVE_EMMINTRIN_H && defined NPY_HAVE_SIMD_@TYPE@
+    if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
+        sse2_absolute_@TYPE@(args, dimensions, steps);
+    }
+    else
+#endif
+    {
+        UNARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            const @type@ tmp = in1 > 0 ? in1 : -in1;
+            /* add 0 to clear -0.0 */
+            *((@type@ *)op1) = tmp + 0;
+        }
     }
 }
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 586b64636..6bbb15e6b 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -687,6 +687,32 @@ class TestSign(TestCase):
         np.seterr(**olderr)
 
 
+class TestAbsolute(TestCase):
+    def test_abs_blocked(self):
+        "simd tests on abs"
+        for dt in [np.float32, np.float64]:
+            for out, inp, msg in gen_alignment_data(dtype=dt, type='unary',
+                                                    max_size=17):
+                tgt = [ncu.absolute(i) for i in inp]
+                np.absolute(inp, out=out)
+                assert_equal(out, tgt, err_msg=msg)
+                self.assertTrue((out >= 0).all())
+
+                prev = np.geterr()
+                # will throw invalid flag depending on compiler optimizations
+                np.seterr(invalid='ignore')
+                for v in [np.nan, -np.inf, np.inf]:
+                    for i in range(inp.size):
+                        d = np.arange(inp.size, dtype=dt)
+                        inp[:] = -d
+                        inp[i] = v
+                        d[i] = -v if v == -np.inf else v
+                        assert_array_equal(np.abs(inp), d, err_msg=msg)
+                        np.abs(inp, out=out)
+                        assert_array_equal(out, d, err_msg=msg)
+                np.seterr(invalid=prev['invalid'])
+
+
 class TestSpecialMethods(TestCase):
     def test_wrap(self):
         class with_wrap(object):
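
The core of the new SSE2 loop is the sign-bit trick used to build mask: -0.0 has only the IEEE sign bit set, and andnot(mask, x) computes (~mask) & x, which clears that bit in every lane and yields fabs(x) without a branch. The following standalone sketch is not part of the commit; it expands the templated code by hand for the float case (@vtype@ = __m128, @vpre@ = _mm, @vsuf@ = ps, @c@ = f), uses unaligned loads/stores for simplicity, and the file name and compile command are only illustrative.

/*
 * Standalone sketch (not part of the commit above): the sign-bit trick
 * behind sse2_absolute_@TYPE@, written out for the float case.
 * Build with something like: gcc -O2 -msse2 abs_sketch.c
 */
#include <stdio.h>
#include <emmintrin.h>   /* SSE2, the header guarded by HAVE_EMMINTRIN_H */

int main(void)
{
    float in[4]  = { -1.5f, -0.0f, 3.25f, -2.0f };
    float out[4];
    int i;

    /* -0.0f has only the sign bit set (0x80000000) */
    const __m128 mask = _mm_set1_ps(-0.0f);

    /*
     * andnot(mask, x) == (~mask) & x: each lane keeps its magnitude,
     * the sign bit is cleared, so -0.0f also becomes +0.0f
     */
    __m128 x = _mm_loadu_ps(in);
    _mm_storeu_ps(out, _mm_andnot_ps(mask, x));

    for (i = 0; i < 4; i++) {
        printf("%g -> %g\n", in[i], out[i]);
    }
    return 0;
}

In the committed loop, only 16-byte-aligned blocks take this vector path; the unaligned head and the tail are handled by the scalar tmp + 0 fallback visible in UNARY_LOOP_BLOCK_ALIGN_OUT and UNARY_LOOP_BLOCKED_END.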