author     Julian Taylor <jtaylor.debian@googlemail.com>    2016-01-08 22:31:10 +0100
committer  Julian Taylor <jtaylor.debian@googlemail.com>    2016-01-10 15:41:54 +0100
commit     c776f398fc72ae392e664b0aac48822522595eda (patch)
tree       cd4243f387e16b31304fad22e2bb42c32b2d1af8
parent     dce2b4e1beb3a68bd66ddc9b793cca2c8d493de0 (diff)
download   numpy-c776f398fc72ae392e664b0aac48822522595eda.tar.gz
ENH: vectorize isinf, isfinite and signbit
isfinite is especially valuable, as it's needed to verify that inputs are suitable for LAPACK.
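
A minimal sketch of the kind of guard this speeds up, in the spirit of the
finiteness check numpy.linalg runs before calling into LAPACK (the helper
name check_finite is made up for illustration):

    import numpy as np

    def check_finite(a):
        # inf/nan inputs can make LAPACK routines fail or return garbage,
        # so reject them up front; np.isfinite(a).all() is the hot check
        # that this patch vectorizes.
        if not np.isfinite(a).all():
            raise ValueError("array must not contain infs or NaNs")

    check_finite(np.ones(4))                 # passes
    # check_finite(np.array([1., np.nan]))   # would raise ValueError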
-rw-r--r--  numpy/core/src/umath/loops.c.src      5
-rw-r--r--  numpy/core/src/umath/simd.inc.src   166
-rw-r--r--  numpy/core/tests/test_numeric.py     33
3 files changed, 169 insertions, 35 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index aff6180c7..fc9ffec94 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1558,14 +1558,11 @@ NPY_NO_EXPORT void
/**begin repeat1
* #kind = isnan, isinf, isfinite, signbit#
* #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit#
- * #isnan = 1, 0*3#
**/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
- char * margs[] = {args[0], args[0], args[1]};
- npy_intp msteps[] = {steps[0], steps[0], steps[1]};
- if (!@isnan@ || !run_binary_simd_not_equal_@TYPE@(margs, dimensions, msteps)) {
+ if (!run_@kind@_simd_@TYPE@(args, dimensions, steps)) {
UNARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
*((npy_bool *)op1) = @func@(in1) != 0;
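
The hunk above replaces the old special case (routing isnan through the
binary not_equal kernel via duplicated args/steps) with a per-kind
dispatcher: run_@kind@_simd_@TYPE@ returns 1 when it handled the whole
loop, and 0 when the layout disqualifies the fast path, in which case the
scalar UNARY_LOOP runs. A rough Python analogue of that contract (all
names hypothetical; the real qualification test is on strides and
alignment, as the next file shows):

    import math

    def run_isnan_simd(inp, out):
        # Hypothetical stand-in for the C fast path: only claim the loop
        # when the layout qualifies (here: length divisible by 4; the C
        # code instead checks unit strides and pointer alignment).
        if len(inp) % 4 != 0:
            return False
        out[:] = [math.isnan(x) for x in inp]
        return True

    def isnan_loop(inp, out):
        # Mirrors the C dispatch: vector kernel first, scalar fallback.
        if not run_isnan_simd(inp, out):
            for i, x in enumerate(inp):
                out[i] = math.isnan(x)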
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 84695f5d6..21ff97784 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -25,6 +25,7 @@
#endif
#include <assert.h>
#include <stdlib.h>
+#include <float.h>
#include <string.h> /* for memcpy */
/* Figure out the right abs function for pointer addresses */
@@ -259,6 +260,32 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps
/**end repeat1**/
+/**begin repeat1
+ * #kind = isnan, isfinite, isinf, signbit#
+ */
+
+#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
+
+static void
+sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n);
+
+#endif
+
+static NPY_INLINE int
+run_@kind@_simd_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
+ if (steps[0] == sizeof(@type@) && steps[1] == 1 &&
+ npy_is_aligned(args[0], sizeof(@type@))) {
+ sse2_@kind@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0]);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/**end repeat1**/
+
/**end repeat**/
/*
@@ -528,11 +555,104 @@ sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
#endif
}
+static void
+sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
+{
+ LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
+ op[i] = npy_signbit(ip1[i]);
+ }
+ LOOP_BLOCKED(@type@, 16) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
+ int r = @vpre@_movemask_@vsuf@(a);
+ if (sizeof(@type@) == 8) {
+ op[i] = r & 1;
+ op[i + 1] = (r >> 1);
+ }
+ else {
+ op[i] = r & 1;
+ op[i + 1] = (r >> 1) & 1;
+ op[i + 2] = (r >> 2) & 1;
+ op[i + 3] = (r >> 3);
+ }
+ }
+ LOOP_BLOCKED_END {
+ op[i] = npy_signbit(ip1[i]);
+ }
+}
+
+/**begin repeat1
+ * #kind = isnan, isfinite, isinf#
+ * #var = 0, 1, 2#
+ */
+
+static void
+sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
+{
+#if @var@ != 0 /* isinf/isfinite */
+ /* signbit mask 0x7FFFFFFF after andnot */
+ const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
+ const @vtype@ ones = @vpre@_cmpeq_@vsuf@(@vpre@_setzero_@vsuf@(),
+ @vpre@_setzero_@vsuf@());
+#if @double@
+ const @vtype@ fltmax = @vpre@_set1_@vsuf@(DBL_MAX);
+#else
+ const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX);
+#endif
+#endif
+ LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
+ op[i] = npy_@kind@(ip1[i]);
+ }
+ LOOP_BLOCKED(@type@, 64) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
+ @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
+ @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
+ @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
+ @vtype@ r1, r2, r3, r4;
+#if @var@ != 0 /* isinf/isfinite */
+ /* fabs via masking of sign bit */
+ r1 = @vpre@_andnot_@vsuf@(mask, a);
+ r2 = @vpre@_andnot_@vsuf@(mask, b);
+ r3 = @vpre@_andnot_@vsuf@(mask, c);
+ r4 = @vpre@_andnot_@vsuf@(mask, d);
+#if @var@ == 1 /* isfinite */
+ /* negative compare against max float, nan is always true */
+ r1 = @vpre@_cmpnle_@vsuf@(r1, fltmax);
+ r2 = @vpre@_cmpnle_@vsuf@(r2, fltmax);
+ r3 = @vpre@_cmpnle_@vsuf@(r3, fltmax);
+ r4 = @vpre@_cmpnle_@vsuf@(r4, fltmax);
+#else /* isinf */
+ r1 = @vpre@_cmpnlt_@vsuf@(fltmax, r1);
+ r2 = @vpre@_cmpnlt_@vsuf@(fltmax, r2);
+ r3 = @vpre@_cmpnlt_@vsuf@(fltmax, r3);
+ r4 = @vpre@_cmpnlt_@vsuf@(fltmax, r4);
+#endif
+ /* flip results to what we want (andnot as there is no sse not) */
+ r1 = @vpre@_andnot_@vsuf@(r1, ones);
+ r2 = @vpre@_andnot_@vsuf@(r2, ones);
+ r3 = @vpre@_andnot_@vsuf@(r3, ones);
+ r4 = @vpre@_andnot_@vsuf@(r4, ones);
+#endif
+#if @var@ == 0 /* isnan */
+ r1 = @vpre@_cmpneq_@vsuf@(a, a);
+ r2 = @vpre@_cmpneq_@vsuf@(b, b);
+ r3 = @vpre@_cmpneq_@vsuf@(c, c);
+ r4 = @vpre@_cmpneq_@vsuf@(d, d);
+#endif
+ sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = npy_@kind@(ip1[i]);
+ }
+ /* silence exceptions from comparisons */
+ npy_clear_floatstatus();
+}
+
+/**end repeat1**/
+
/**begin repeat1
* #kind = equal, not_equal, less, less_equal, greater, greater_equal#
* #OP = ==, !=, <, <=, >, >=#
* #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge#
- * #neq = 0, 1, 0*4#
*/
/* sets invalid fpu flag on QNaN for consistency with packed compare */
@@ -554,36 +674,20 @@ sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
}
- /* isnan special unary case */
- if (@neq@ && ip1 == ip2) {
- LOOP_BLOCKED(@type@, 64) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
- @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
- @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
- @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a, a);
- @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b, b);
- @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c, c);
- @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d, d);
- sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
- }
- }
- else {
- LOOP_BLOCKED(@type@, 64) {
- @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
- @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
- @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
- @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
- @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]);
- @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]);
- @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]);
- @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]);
- @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2);
- @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2);
- @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2);
- @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2);
- sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
- }
+ LOOP_BLOCKED(@type@, 64) {
+ @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
+ @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
+ @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
+ @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
+ @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]);
+ @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]);
+ @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]);
+ @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]);
+ @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2);
+ @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2);
+ @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2);
+ @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2);
+ sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
}
LOOP_BLOCKED_END {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
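
Unpacking the vector tricks in the new kernels above, in scalar form:
andnot with -0.0 clears the IEEE sign bit (a branch-free fabs),
cmpnle(|x|, fltmax) is true exactly for inf and nan because the n-variants
of the SSE compares are true on unordered operands, cmpnlt(fltmax, |x|) is
false only for inf, and movemask gathers the raw sign bits (so signbit is
true for -0.0 and negative nans). A scalar Python sketch of the same
logic, assuming double precision:

    import struct
    import sys

    DBL_MAX = sys.float_info.max          # the fltmax constant for doubles

    def signbit(x):
        # movemask reads the raw IEEE sign bit, so -0.0 counts as negative;
        # reproduce that by inspecting bit 63 of the representation.
        bits = struct.unpack("<Q", struct.pack("<d", x))[0]
        return (bits >> 63) & 1

    def isfinite(x):
        # cmpnle(|x|, DBL_MAX) == not (|x| <= DBL_MAX) is true for inf and
        # for nan; after the andnot flip it is just an ordered compare.
        # abs() stands in for the andnot(-0.0, x) sign-bit masking.
        return abs(x) <= DBL_MAX

    def isinf(x):
        # cmpnlt(DBL_MAX, |x|) == not (DBL_MAX < |x|) is true for everything
        # except +-inf; flipped, only the infinities remain (nan stays false).
        return DBL_MAX < abs(x)

    for v in (1.0, -0.0, float("inf"), float("-inf"), float("nan")):
        print(v, signbit(v), isfinite(v), isinf(v))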
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index d63118080..5602530dd 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -234,6 +234,31 @@ class TestBoolCmp(TestCase):
self.nf[self.ef] = np.nan
self.nd[self.ed] = np.nan
+ self.inff = self.f.copy()
+ self.infd = self.d.copy()
+ self.inff[::3][self.ef[::3]] = np.inf
+ self.infd[::3][self.ed[::3]] = np.inf
+ self.inff[1::3][self.ef[1::3]] = -np.inf
+ self.infd[1::3][self.ed[1::3]] = -np.inf
+ self.inff[2::3][self.ef[2::3]] = np.nan
+ self.infd[2::3][self.ed[2::3]] = np.nan
+ self.efnonan = self.ef.copy()
+ self.efnonan[2::3] = False
+ self.ednonan = self.ed.copy()
+ self.ednonan[2::3] = False
+
+ self.signf = self.f.copy()
+ self.signd = self.d.copy()
+ self.signf[self.ef] *= -1.
+ self.signd[self.ed] *= -1.
+ self.signf[1::6][self.ef[1::6]] = -np.inf
+ self.signd[1::6][self.ed[1::6]] = -np.inf
+ self.signf[3::6][self.ef[3::6]] = -np.nan
+ self.signd[3::6][self.ed[3::6]] = -np.nan
+ self.signf[4::6][self.ef[4::6]] = -0.
+ self.signd[4::6][self.ed[4::6]] = -0.
+
+
def test_float(self):
# offset for alignment test
for i in range(4):
@@ -255,6 +280,10 @@ class TestBoolCmp(TestCase):
# isnan on amd64 takes the same codepath
assert_array_equal(np.isnan(self.nf[i:]), self.ef[i:])
+ assert_array_equal(np.isfinite(self.nf[i:]), ~self.ef[i:])
+ assert_array_equal(np.isfinite(self.inff[i:]), ~self.ef[i:])
+ assert_array_equal(np.isinf(self.inff[i:]), self.efnonan[i:])
+ assert_array_equal(np.signbit(self.signf[i:]), self.ef[i:])
def test_double(self):
# offset for alignment test
@@ -277,6 +306,10 @@ class TestBoolCmp(TestCase):
# isnan on amd64 takes the same codepath
assert_array_equal(np.isnan(self.nd[i:]), self.ed[i:])
+ assert_array_equal(np.isfinite(self.nd[i:]), ~self.ed[i:])
+ assert_array_equal(np.isfinite(self.infd[i:]), ~self.ed[i:])
+ assert_array_equal(np.isinf(self.infd[i:]), self.ednonan[i:])
+ assert_array_equal(np.signbit(self.signd[i:]), self.ed[i:])
class TestSeterr(TestCase):
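
The [i:] slicing in the test loops above shifts the start address of the
data relative to the 16-byte SIMD blocks, so the scalar peel loop, the
blocked vector body, and the remainder loop are all exercised. A condensed
standalone illustration of the same idea (not the actual test):

    import numpy as np

    f = np.zeros(1024, dtype=np.float32)
    f[2::3] = np.nan
    for i in range(4):
        # each offset changes the alignment of the slice; with only nans
        # and zeros present, isnan must be the exact complement of isfinite
        np.testing.assert_array_equal(np.isnan(f[i:]), ~np.isfinite(f[i:]))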