-rw-r--r--   numpy/core/src/umath/loops.c.src   55
-rw-r--r--   numpy/core/tests/test_umath.py     26
2 files changed, 76 insertions(+), 5 deletions(-)
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index de78904bb..e15909029 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1302,8 +1302,11 @@ TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *
  * #type = npy_float, npy_double#
  * #TYPE = FLOAT, DOUBLE#
  * #scalarf = npy_sqrtf, npy_sqrt#
+ * #c = f, #
  * #vtype = __m128, __m128d#
+ * #vpre = _mm, _mm#
  * #vsuf = ps, pd#
+ * #vtype = __m128, __m128d#
  */
 
 #ifdef HAVE_EMMINTRIN_H
@@ -1335,6 +1338,40 @@ sse2_sqrt_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
 }
 
+
+static void
+sse2_absolute_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+    /*
+     * get 0x7FFFFFFF mask (everything but signbit set)
+     * float & ~mask will remove the sign
+     * this is equivalent to how the compiler implements fabs on amd64
+     */
+    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
+
+    UNARY_LOOP_BLOCK_ALIGN_OUT(@type@, 16) {
+        const @type@ tmp = ip1[i] > 0 ? ip1[i]: -ip1[i];
+        /* add 0 to clear -0.0 */
+        op1[i] = tmp + 0;
+    }
+    if (npy_is_aligned(&ip1[i], 16)) {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+            @vpre@_store_@vsuf@(&op1[i], @vpre@_andnot_@vsuf@(mask, a));
+        }
+    }
+    else {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
+            @vpre@_store_@vsuf@(&op1[i], @vpre@_andnot_@vsuf@(mask, a));
+        }
+    }
+    UNARY_LOOP_BLOCKED_END {
+        const @type@ tmp = ip1[i] > 0 ? ip1[i]: -ip1[i];
+        /* add 0 to clear -0.0 */
+        op1[i] = tmp + 0;
+    }
+}
+
 #endif
@@ -1582,11 +1619,19 @@ NPY_NO_EXPORT void
 NPY_NO_EXPORT void
 @TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
-    UNARY_LOOP {
-        const @type@ in1 = *(@type@ *)ip1;
-        const @type@ tmp = in1 > 0 ? in1 : -in1;
-        /* add 0 to clear -0.0 */
-        *((@type@ *)op1) = tmp + 0;
+#if defined HAVE_EMMINTRIN_H && defined NPY_HAVE_SIMD_@TYPE@
+    if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
+        sse2_absolute_@TYPE@(args, dimensions, steps);
+    }
+    else
+#endif
+    {
+        UNARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            const @type@ tmp = in1 > 0 ? in1 : -in1;
+            /* add 0 to clear -0.0 */
+            *((@type@ *)op1) = tmp + 0;
+        }
     }
 }
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 586b64636..6bbb15e6b 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -687,6 +687,32 @@ class TestSign(TestCase):
         np.seterr(**olderr)
 
 
+class TestAbsolute(TestCase):
+    def test_abs_blocked(self):
+        "simd tests on abs"
+        for dt in [np.float32, np.float64]:
+            for out, inp, msg in gen_alignment_data(dtype=dt, type='unary',
+                                                    max_size=17):
+                tgt = [ncu.absolute(i) for i in inp]
+                np.absolute(inp, out=out)
+                assert_equal(out, tgt, err_msg=msg)
+                self.assertTrue((out >= 0).all())
+
+                prev = np.geterr()
+                # will throw invalid flag depending on compiler optimizations
+                np.seterr(invalid='ignore')
+                for v in [np.nan, -np.inf, np.inf]:
+                    for i in range(inp.size):
+                        d = np.arange(inp.size, dtype=dt)
+                        inp[:] = -d
+                        inp[i] = v
+                        d[i] = -v if v == -np.inf else v
+                        assert_array_equal(np.abs(inp), d, err_msg=msg)
+                        np.abs(inp, out=out)
+                        assert_array_equal(out, d, err_msg=msg)
+                np.seterr(invalid=prev['invalid'])
+
+
 class TestSpecialMethods(TestCase):
     def test_wrap(self):
         class with_wrap(object):
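
The core of the new SSE2 loop is the sign-bit trick used to build mask: -0.0 has only the IEEE sign bit set, and andnot(mask, x) computes (~mask) & x, which clears that bit in every lane and yields fabs(x) without a branch. The following standalone sketch is not part of the commit; it expands the templated code by hand for the float case (@vtype@ = __m128, @vpre@ = _mm, @vsuf@ = ps, @c@ = f), uses unaligned loads/stores for simplicity, and the file name and compile command are only illustrative.

/*
 * Standalone sketch (not part of the commit above): the sign-bit trick
 * behind sse2_absolute_@TYPE@, written out for the float case.
 * Build with something like: gcc -O2 -msse2 abs_sketch.c
 */
#include <stdio.h>
#include <emmintrin.h>   /* SSE2, the header guarded by HAVE_EMMINTRIN_H */

int main(void)
{
    float in[4]  = { -1.5f, -0.0f, 3.25f, -2.0f };
    float out[4];
    int i;

    /* -0.0f has only the sign bit set (0x80000000) */
    const __m128 mask = _mm_set1_ps(-0.0f);

    /*
     * andnot(mask, x) == (~mask) & x: each lane keeps its magnitude,
     * the sign bit is cleared, so -0.0f also becomes +0.0f
     */
    __m128 x = _mm_loadu_ps(in);
    _mm_storeu_ps(out, _mm_andnot_ps(mask, x));

    for (i = 0; i < 4; i++) {
        printf("%g -> %g\n", in[i], out[i]);
    }
    return 0;
}

In the committed loop, only 16-byte-aligned blocks take this vector path; the unaligned head and the tail are handled by the scalar tmp + 0 fallback visible in UNARY_LOOP_BLOCK_ALIGN_OUT and UNARY_LOOP_BLOCKED_END.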