author    | Julian Taylor <jtaylor.debian@googlemail.com> | 2016-01-08 22:31:10 +0100
committer | Julian Taylor <jtaylor.debian@googlemail.com> | 2016-01-10 15:41:54 +0100
commit    | c776f398fc72ae392e664b0aac48822522595eda (patch)
tree      | cd4243f387e16b31304fad22e2bb42c32b2d1af8
parent    | dce2b4e1beb3a68bd66ddc9b793cca2c8d493de0 (diff)
download  | numpy-c776f398fc72ae392e664b0aac48822522595eda.tar.gz
ENH: vectorize isinf, isfinite and signbit
isfinite is especially valuable, as it's needed to verify that inputs are
suitable for LAPACK.
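The new SSE2 kernels in this change reduce all four classifications to ordinary comparisons and bit tests. As a hedged, standalone sketch (not part of the commit; compile with -lm), the scalar identities they exploit can be checked against the C99 classification macros:

```c
/* Standalone sketch: the scalar identities behind the SSE2 kernels,
 * verified against the C99 classification macros. */
#include <assert.h>
#include <float.h>
#include <math.h>
#include <stddef.h>
#include <stdio.h>

int main(void)
{
    const double vals[] = { 0.0, -0.0, 1.5, -1.5, DBL_MAX, -DBL_MAX,
                            INFINITY, -INFINITY, NAN };
    for (size_t i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) {
        const double x = vals[i];
        /* isnan: NaN is the only value that compares unequal to itself */
        assert((isnan(x) != 0) == (x != x));
        /* isfinite: |x| <= DBL_MAX; false for NaN because ordered
         * comparisons against NaN are false */
        assert((isfinite(x) != 0) == (fabs(x) <= DBL_MAX));
        /* isinf: |x| is strictly greater than the largest finite value */
        assert((isinf(x) != 0) == (fabs(x) > DBL_MAX));
    }
    puts("all identities hold");
    return 0;
}
```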
-rw-r--r-- | numpy/core/src/umath/loops.c.src  |   5
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 166
-rw-r--r-- | numpy/core/tests/test_numeric.py  |  33
3 files changed, 169 insertions(+), 35 deletions(-)
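The first hunk below (loops.c.src) stops routing isnan through the binary not_equal SIMD path via duplicated arguments and instead dispatches every classification ufunc to a dedicated helper, falling back to a scalar loop when the fast path declines. A minimal sketch of that dispatch pattern, with hypothetical de-templated names (double_isnan and run_isnan_simd_double stand in for the generated @TYPE@_@kind@ and run_@kind@_simd_@TYPE@; the dimensions array is simplified to a scalar count):

```c
/* Illustrative sketch only, not the actual NumPy source. */
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef unsigned char npy_bool_t;  /* stand-in for npy_bool */

/* Returns 1 if the SIMD kernel handled the whole array, 0 to request the
 * scalar fallback.  The fast path requires contiguous, aligned input and
 * contiguous boolean output, like the diff's run_@kind@_simd_@TYPE@. */
static int
run_isnan_simd_double(char **args, ptrdiff_t n, ptrdiff_t *steps)
{
    (void)n;  /* a real kernel would consume the length */
    if (steps[0] == (ptrdiff_t)sizeof(double) && steps[1] == 1 &&
        (uintptr_t)args[0] % sizeof(double) == 0) {
        /* a vectorized kernel such as sse2_isnan_DOUBLE would run here;
         * report failure so the scalar loop below does the work */
        return 0;
    }
    return 0;
}

static void
double_isnan(char **args, ptrdiff_t n, ptrdiff_t *steps)
{
    if (!run_isnan_simd_double(args, n, steps)) {
        char *ip1 = args[0], *op1 = args[1];
        for (ptrdiff_t i = 0; i < n; i++, ip1 += steps[0], op1 += steps[1]) {
            const double in1 = *(const double *)ip1;
            *(npy_bool_t *)op1 = isnan(in1) != 0;
        }
    }
}

int main(void)
{
    double in[3] = { 1.0, NAN, 3.0 };
    npy_bool_t out[3];
    ptrdiff_t steps[2] = { sizeof(double), 1 };
    char *args[2] = { (char *)in, (char *)out };
    double_isnan(args, 3, steps);
    printf("%d %d %d\n", out[0], out[1], out[2]);  /* expected: 0 1 0 */
    return 0;
}
```

Returning 0 from the helper (as when no SIMD support is compiled in) leaves all the work to the scalar loop, so results are identical either way.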
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index aff6180c7..fc9ffec94 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1558,14 +1558,11 @@ NPY_NO_EXPORT void
 /**begin repeat1
  * #kind = isnan, isinf, isfinite, signbit#
  * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit#
- * #isnan = 1, 0*3#
  **/
 NPY_NO_EXPORT void
 @TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
-    char * margs[] = {args[0], args[0], args[1]};
-    npy_intp msteps[] = {steps[0], steps[0], steps[1]};
-    if (!@isnan@ || !run_binary_simd_not_equal_@TYPE@(margs, dimensions, msteps)) {
+    if (!run_@kind@_simd_@TYPE@(args, dimensions, steps)) {
         UNARY_LOOP {
             const @type@ in1 = *(@type@ *)ip1;
             *((npy_bool *)op1) = @func@(in1) != 0;
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 84695f5d6..21ff97784 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -25,6 +25,7 @@
 #endif
 #include <assert.h>
 #include <stdlib.h>
+#include <float.h>
 #include <string.h> /* for memcpy */
 
 /* Figure out the right abs function for pointer addresses */
@@ -259,6 +260,32 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps
 
 /**end repeat1**/
 
+/**begin repeat1
+ * #kind = isnan, isfinite, isinf, signbit#
+ */
+
+#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
+
+static void
+sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n);
+
+#endif
+
+static NPY_INLINE int
+run_@kind@_simd_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
+    if (steps[0] == sizeof(@type@) && steps[1] == 1 &&
+        npy_is_aligned(args[0], sizeof(@type@))) {
+        sse2_@kind@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+
+/**end repeat1**/
+
 /**end repeat**/
 
 /*
@@ -528,11 +555,104 @@ sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
 #endif
 }
 
+static void
+sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
+{
+    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
+        op[i] = npy_signbit(ip1[i]);
+    }
+    LOOP_BLOCKED(@type@, 16) {
+        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+        int r = @vpre@_movemask_@vsuf@(a);
+        if (sizeof(@type@) == 8) {
+            op[i] = r & 1;
+            op[i + 1] = (r >> 1);
+        }
+        else {
+            op[i] = r & 1;
+            op[i + 1] = (r >> 1) & 1;
+            op[i + 2] = (r >> 2) & 1;
+            op[i + 3] = (r >> 3);
+        }
+    }
+    LOOP_BLOCKED_END {
+        op[i] = npy_signbit(ip1[i]);
+    }
+}
+
+/**begin repeat1
+ * #kind = isnan, isfinite, isinf#
+ * #var = 0, 1, 2#
+ */
+
+static void
+sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
+{
+#if @var@ != 0 /* isinf/isfinite */
+    /* signbit mask 0x7FFFFFFF after andnot */
+    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
+    const @vtype@ ones = @vpre@_cmpeq_@vsuf@(@vpre@_setzero_@vsuf@(),
+                                             @vpre@_setzero_@vsuf@());
+#if @double@
+    const @vtype@ fltmax = @vpre@_set1_@vsuf@(DBL_MAX);
+#else
+    const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX);
+#endif
+#endif
+    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
+        op[i] = npy_@kind@(ip1[i]);
+    }
+    LOOP_BLOCKED(@type@, 64) {
+        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
+        @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
+        @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
+        @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
+        @vtype@ r1, r2, r3, r4;
+#if @var@ != 0 /* isinf/isfinite */
+        /* fabs via masking of sign bit */
+        r1 = @vpre@_andnot_@vsuf@(mask, a);
+        r2 = @vpre@_andnot_@vsuf@(mask, b);
+        r3 = @vpre@_andnot_@vsuf@(mask, c);
+        r4 = @vpre@_andnot_@vsuf@(mask, d);
+#if @var@ == 1 /* isfinite */
+        /* negative compare against max float, nan is always true */
+        r1 = @vpre@_cmpnle_@vsuf@(r1, fltmax);
+        r2 = @vpre@_cmpnle_@vsuf@(r2, fltmax);
+        r3 = @vpre@_cmpnle_@vsuf@(r3, fltmax);
+        r4 = @vpre@_cmpnle_@vsuf@(r4, fltmax);
+#else /* isinf */
+        r1 = @vpre@_cmpnlt_@vsuf@(fltmax, r1);
+        r2 = @vpre@_cmpnlt_@vsuf@(fltmax, r2);
+        r3 = @vpre@_cmpnlt_@vsuf@(fltmax, r3);
+        r4 = @vpre@_cmpnlt_@vsuf@(fltmax, r4);
+#endif
+        /* flip results to what we want (andnot as there is no sse not) */
+        r1 = @vpre@_andnot_@vsuf@(r1, ones);
+        r2 = @vpre@_andnot_@vsuf@(r2, ones);
+        r3 = @vpre@_andnot_@vsuf@(r3, ones);
+        r4 = @vpre@_andnot_@vsuf@(r4, ones);
+#endif
+#if @var@ == 0 /* isnan */
+        r1 = @vpre@_cmpneq_@vsuf@(a, a);
+        r2 = @vpre@_cmpneq_@vsuf@(b, b);
+        r3 = @vpre@_cmpneq_@vsuf@(c, c);
+        r4 = @vpre@_cmpneq_@vsuf@(d, d);
+#endif
+        sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
+    }
+    LOOP_BLOCKED_END {
+        op[i] = npy_@kind@(ip1[i]);
+    }
+    /* silence exceptions from comparisons */
+    npy_clear_floatstatus();
+}
+
+/**end repeat1**/
+
 /**begin repeat1
  * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
  * #OP = ==, !=, <, <=, >, >=#
  * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge#
- * #neq = 0, 1, 0*4#
  */
 
 /* sets invalid fpu flag on QNaN for consistency with packed compare */
@@ -554,36 +674,20 @@ sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
     LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
         op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
     }
-    /* isnan special unary case */
-    if (@neq@ && ip1 == ip2) {
-        LOOP_BLOCKED(@type@, 64) {
-            @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
-            @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
-            @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
-            @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
-            @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a, a);
-            @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b, b);
-            @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c, c);
-            @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d, d);
-            sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
-        }
-    }
-    else {
-        LOOP_BLOCKED(@type@, 64) {
-            @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
-            @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
-            @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
-            @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
-            @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]);
-            @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]);
-            @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]);
-            @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]);
-            @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2);
-            @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2);
-            @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2);
-            @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2);
-            sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
-        }
+    LOOP_BLOCKED(@type@, 64) {
+        @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
+        @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
+        @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
+        @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
+        @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]);
+        @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]);
+        @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]);
+        @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]);
+        @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2);
+        @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2);
+        @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2);
+        @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2);
+        sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
     }
     LOOP_BLOCKED_END {
         op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index d63118080..5602530dd 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -234,6 +234,31 @@ class TestBoolCmp(TestCase):
         self.nf[self.ef] = np.nan
         self.nd[self.ed] = np.nan
 
+        self.inff = self.f.copy()
+        self.infd = self.d.copy()
+        self.inff[::3][self.ef[::3]] = np.inf
+        self.infd[::3][self.ed[::3]] = np.inf
+        self.inff[1::3][self.ef[1::3]] = -np.inf
+        self.infd[1::3][self.ed[1::3]] = -np.inf
+        self.inff[2::3][self.ef[2::3]] = np.nan
+        self.infd[2::3][self.ed[2::3]] = np.nan
+        self.efnonan = self.ef.copy()
+        self.efnonan[2::3] = False
+        self.ednonan = self.ed.copy()
+        self.ednonan[2::3] = False
+
+        self.signf = self.f.copy()
+        self.signd = self.d.copy()
+        self.signf[self.ef] *= -1.
+        self.signd[self.ed] *= -1.
+        self.signf[1::6][self.ef[1::6]] = -np.inf
+        self.signd[1::6][self.ed[1::6]] = -np.inf
+        self.signf[3::6][self.ef[3::6]] = -np.nan
+        self.signd[3::6][self.ed[3::6]] = -np.nan
+        self.signf[4::6][self.ef[4::6]] = -0.
+        self.signd[4::6][self.ed[4::6]] = -0.
+
+
     def test_float(self):
         # offset for alignment test
         for i in range(4):
@@ -255,6 +280,10 @@ class TestBoolCmp(TestCase):
             # isnan on amd64 takes the same codepath
             assert_array_equal(np.isnan(self.nf[i:]), self.ef[i:])
+            assert_array_equal(np.isfinite(self.nf[i:]), ~self.ef[i:])
+            assert_array_equal(np.isfinite(self.inff[i:]), ~self.ef[i:])
+            assert_array_equal(np.isinf(self.inff[i:]), self.efnonan[i:])
+            assert_array_equal(np.signbit(self.signf[i:]), self.ef[i:])
 
     def test_double(self):
         # offset for alignment test
@@ -277,6 +306,10 @@ class TestBoolCmp(TestCase):
             # isnan on amd64 takes the same codepath
             assert_array_equal(np.isnan(self.nd[i:]), self.ed[i:])
+            assert_array_equal(np.isfinite(self.nd[i:]), ~self.ed[i:])
+            assert_array_equal(np.isfinite(self.infd[i:]), ~self.ed[i:])
+            assert_array_equal(np.isinf(self.infd[i:]), self.ednonan[i:])
+            assert_array_equal(np.signbit(self.signd[i:]), self.ed[i:])
 
 
 class TestSeterr(TestCase):
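To make the SSE2 idioms in simd.inc.src concrete outside the template machinery, here is a hedged, self-contained double-precision demonstration (it mirrors, but is not, the NumPy source): fabs via andnot with a -0.0 mask, isnan via an unordered self-comparison, isfinite and isinf via negated comparisons against DBL_MAX flipped with andnot (SSE2 has no vector NOT), and signbit via movemask over the lanes' top bits:

```c
#include <emmintrin.h>  /* SSE2 intrinsics */
#include <float.h>
#include <math.h>
#include <stdio.h>

static void classify2(const double in[2])
{
    const __m128d a      = _mm_loadu_pd(in);
    const __m128d mask   = _mm_set1_pd(-0.0);        /* only the sign bit set */
    const __m128d zero   = _mm_setzero_pd();
    const __m128d ones   = _mm_cmpeq_pd(zero, zero); /* all bits set */
    const __m128d fltmax = _mm_set1_pd(DBL_MAX);

    /* fabs via andnot: (~mask) & a clears the sign bit */
    const __m128d abs_a = _mm_andnot_pd(mask, a);

    /* isnan: NaN is the only value that compares unequal to itself */
    const int nan_mask = _mm_movemask_pd(_mm_cmpneq_pd(a, a));

    /* isfinite: cmpnle gives !(|a| <= DBL_MAX), true for inf and (being
     * unordered) NaN; flip with andnot to get the finite lanes */
    __m128d r = _mm_cmpnle_pd(abs_a, fltmax);
    const int finite_mask = _mm_movemask_pd(_mm_andnot_pd(r, ones));

    /* isinf: cmpnlt gives !(DBL_MAX < |a|), false only for infinity;
     * flipping leaves exactly the inf lanes set */
    r = _mm_cmpnlt_pd(fltmax, abs_a);
    const int inf_mask = _mm_movemask_pd(_mm_andnot_pd(r, ones));

    /* signbit: movemask packs each lane's top (sign) bit into an int */
    const int sign_mask = _mm_movemask_pd(a);

    for (int lane = 0; lane < 2; lane++) {
        printf("x=%-5g isnan=%d isfinite=%d isinf=%d signbit=%d\n",
               in[lane],
               (nan_mask >> lane) & 1, (finite_mask >> lane) & 1,
               (inf_mask >> lane) & 1, (sign_mask >> lane) & 1);
    }
}

int main(void)
{
    const double v1[2] = { -0.0, 1.5 };
    const double v2[2] = { -INFINITY, NAN };
    classify2(v1);
    classify2(v2);
    return 0;
}
```

Note that the unordered cmpnle/cmpnlt comparisons raise the invalid FPU flag on NaN input, which is why the kernels in the diff end with npy_clear_floatstatus().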