author    Julian Taylor <juliantaylor108@gmail.com>  2019-05-19 18:45:51 +0200
committer GitHub <noreply@github.com>  2019-05-19 18:45:51 +0200
commit    433d8b2493c716b6617e1fb8b055bc702f17ddb7 (patch)
tree      ee810f5e48ecff7cac0efe30c43fc0fc88048d90
parent    5e46e3d05cbfb162e0decda6b306f4f62170b6a1 (diff)
parent    59b2a1d08592b11b82b82b6c53aead689648262e (diff)
Merge pull request #13581 from r-devulap/gather-for-avxsimd
ENH: AVX support for exp/log for strided float32 arrays
-rw-r--r--  numpy/core/src/umath/loops.c.src  |   7
-rw-r--r--  numpy/core/src/umath/simd.inc.src | 100
-rw-r--r--  numpy/core/tests/test_umath.py    |  27
3 files changed, 111 insertions(+), 23 deletions(-)
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index a2649ed93..f84d74efe 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1626,12 +1626,11 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
/*
* We use the AVX function to compute exp/log for scalar elements as well.
* This is needed to ensure the output of strided and non-strided
- * cases match. But this worsens the performance of strided arrays.
- * There is plan to fix this in a subsequent patch by using gather
- * instructions for strided arrays in the AVX function.
+ * cases match. The SIMD code handles strided input, but not
+ * strided output.
*/
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
- @ISA@_@func@_FLOAT((npy_float *)op1, (npy_float *)ip1, 1);
+ @ISA@_@func@_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0]);
#else
const npy_float in1 = *(npy_float *)ip1;
*(npy_float *)op1 = @scalarf@(in1);
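
The loops.c.src hunk above keeps routing scalar (non-blockable) elements through the same AVX kernel, now passing the input step alongside n = 1, so strided and contiguous invocations share one code path and produce identical results. A minimal plain-C sketch of that dispatch idea (vector_exp_f32 and exp_loop_f32 are hypothetical stand-ins for the templated @ISA@_exp_FLOAT kernel and its outer loop, not NumPy code):

    #include <math.h>
    #include <stddef.h>

    /* Hypothetical stand-in for the templated @ISA@_exp_FLOAT kernel:
     * the output is contiguous, the input is read with a byte step. */
    static void
    vector_exp_f32(float *op, const float *ip, ptrdiff_t n, ptrdiff_t steps)
    {
        const ptrdiff_t stride = steps / (ptrdiff_t)sizeof(float);
        for (ptrdiff_t i = 0; i < n; i++) {
            op[i] = expf(ip[i * stride]);
        }
    }

    /* Outer-loop dispatch mirroring the idea in loops.c.src: a contiguous
     * output gets one vector call whatever the input stride is, while a
     * strided output falls back to per-element calls into the same kernel,
     * so both paths stay bit-identical. */
    static void
    exp_loop_f32(char **args, const ptrdiff_t *dimensions, const ptrdiff_t *steps)
    {
        char *ip = args[0], *op = args[1];
        const ptrdiff_t n = dimensions[0];

        if (steps[1] == (ptrdiff_t)sizeof(float)) {
            vector_exp_f32((float *)op, (const float *)ip, n, steps[0]);
            return;
        }
        for (ptrdiff_t i = 0; i < n; i++, ip += steps[0], op += steps[1]) {
            vector_exp_f32((float *)op, (const float *)ip, 1, steps[0]);
        }
    }
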
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 1c6ac4426..6ddd6fcc6 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -51,6 +51,15 @@ abs_ptrdiff(char *a, char *b)
((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
((abs_ptrdiff(args[1], args[0]) == 0))))
+/*
+ * The output must be contiguous; the input may be strided.
+ */
+#define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \
+ (steps[1] == (esize) && \
+ (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
+ ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
+ ((abs_ptrdiff(args[1], args[0]) == 0))))
+
#define IS_BLOCKABLE_REDUCE(esize, vsize) \
(steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
npy_is_aligned(args[1], (esize)) && \
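
IS_OUTPUT_BLOCKABLE_UNARY relaxes IS_BLOCKABLE_UNARY: it no longer requires a contiguous input, only a contiguous output (steps[1] == esize), aligned base pointers, and input/output that are either identical or at least one vector width apart. Spelled out as a plain C predicate (a readability sketch under those assumptions, not the macro itself):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Plain-C spelling of what IS_OUTPUT_BLOCKABLE_UNARY checks:
     * esize = element size in bytes, vsize = vector width in bytes. */
    static bool
    output_blockable_unary(char *in, char *out, ptrdiff_t out_step,
                           size_t esize, size_t vsize)
    {
        /* |out - in|, as the abs_ptrdiff() helper computes it */
        size_t dist = (size_t)(out > in ? out - in : in - out);
        bool aligned = ((uintptr_t)in % esize == 0) &&
                       ((uintptr_t)out % esize == 0);

        /* output must be contiguous; the input step is deliberately
         * not constrained, since the kernel can gather strided input */
        return (size_t)out_step == esize && aligned &&
               (dist >= vsize || dist == 0);
    }
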
@@ -134,15 +143,15 @@ abs_ptrdiff(char *a, char *b)
#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
-@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n);
+@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp stride);
#endif
static NPY_INLINE int
run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
- if (IS_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
- @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]);
+ if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
+ @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]);
return 1;
}
else
@@ -1127,15 +1136,24 @@ avx2_get_full_load_mask(void)
}
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
+avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
{
float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
- float* addr = maskint + total_elem - num_elem;
+ float* addr = maskint + total_elem - num_lanes;
return _mm256_loadu_ps(addr);
}
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_masked_gather(__m256 src,
+ npy_float* addr,
+ __m256i vindex,
+ __m256 mask)
+{
+ return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_masked_load(__m256 mask, npy_float* addr)
{
return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
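
avx2_masked_gather wraps _mm256_mask_i32gather_ps, which loads a float from addr + 4*vindex[i] for every lane whose mask sign bit is set and keeps src elsewhere. A self-contained sketch of a stride-3 gather (compile with AVX2 enabled, e.g. -mavx2; buffer contents and stride are illustrative only):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void)
    {
        float data[32];
        for (int i = 0; i < 32; i++) {
            data[i] = (float)i;
        }

        const int stride = 3;            /* in elements, not bytes */
        int idx[8];
        for (int i = 0; i < 8; i++) {
            idx[i] = i * stride;         /* 0, 3, 6, ..., 21 */
        }

        __m256i vindex = _mm256_loadu_si256((const __m256i *)idx);
        __m256  mask   = _mm256_castsi256_ps(_mm256_set1_epi32(-1)); /* all lanes active */
        __m256  src    = _mm256_setzero_ps();

        /* reads data[0], data[3], ..., data[21]; scale 4 = sizeof(float) */
        __m256 x = _mm256_mask_i32gather_ps(src, data, vindex, mask, 4);

        float out[8];
        _mm256_storeu_ps(out, x);
        for (int i = 0; i < 8; i++) {
            printf("%g ", out[i]);       /* 0 3 6 9 12 15 18 21 */
        }
        printf("\n");
        return 0;
    }
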
@@ -1221,6 +1239,15 @@ avx512_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
}
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+avx512_masked_gather(__m512 src,
+ npy_float* addr,
+ __m512i vindex,
+ __mmask16 kmask)
+{
+ return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_masked_load(__mmask16 mask, npy_float* addr)
{
return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
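
The AVX-512 wrapper is the same operation, but the predicate is a dedicated __mmask16 register, so a partial final iteration reduces to mask arithmetic. A short sketch of gathering up to 16 strided floats that way (assumes -mavx512f; it mirrors the idea of avx512_masked_gather plus a partial load mask, not the exact NumPy helpers):

    #include <immintrin.h>

    /* Gather up to 16 floats spaced `stride` elements apart; inactive tail
     * lanes stay 0.0f.  A sketch of avx512_masked_gather combined with a
     * partial load mask, not the NumPy helpers themselves. */
    static __m512
    gather_strided_f32(const float *addr, int stride, int nelem)
    {
        int idx[16];
        for (int i = 0; i < 16; i++) {
            idx[i] = i * stride;
        }
        __m512i vindex = _mm512_loadu_si512((const void *)idx);
        __mmask16 k = (__mmask16)((1u << nelem) - 1);   /* low nelem lanes on */
        return _mm512_mask_i32gather_ps(_mm512_setzero_ps(), k, vindex, addr, 4);
    }
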
@@ -1310,11 +1337,19 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_exp_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size)
+@ISA@_exp_FLOAT(npy_float * op,
+ npy_float * ip,
+ const npy_intp array_size,
+ const npy_intp steps)
{
+ const npy_intp stride = steps/sizeof(npy_float);
const npy_int num_lanes = @BYTES@/sizeof(npy_float);
npy_float xmax = 88.72283935546875f;
npy_float xmin = -87.3365478515625f;
+ npy_int indexarr[16];
+ for (npy_int ii = 0; ii < 16; ii++) {
+ indexarr[ii] = ii*stride;
+ }
/* Load up frequently used constants */
@vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf);
@@ -1333,6 +1368,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF);
@vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
@vtype@ poly, num_poly, denom_poly, quadrant;
+ @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
@vtype@i exponent;
@mask@ xmax_mask, xmin_mask, nan_mask, inf_mask;
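
Two things are precomputed before the loop: the gather index vector (indexarr[ii] = ii*stride, of which AVX2 consumes the first 8 entries and AVX-512 all 16), and the clamp bounds xmax/xmin, which appear to be logf(FLT_MAX) and logf(FLT_MIN) rounded to float; that is my reading of the constants, not something stated in the patch. A small worked example of both:

    #include <float.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* 1) the gather index vector for a byte step of 12 (stride 3) */
        const int stride = 12 / (int)sizeof(float);   /* = 3 elements */
        int indexarr[16];
        for (int i = 0; i < 16; i++) {
            indexarr[i] = i * stride;                 /* 0, 3, 6, ..., 45 */
        }
        printf("last gather index: %d\n", indexarr[15]);

        /* 2) the clamp bounds: inputs above/below these saturate the
         *    float32 result to inf / 0 before the polynomial runs */
        printf("log(FLT_MAX) = %.10f\n", log((double)FLT_MAX));  /* ~  88.7228391 */
        printf("log(FLT_MIN) = %.10f\n", log((double)FLT_MIN));  /* ~ -87.3365448 */
        return 0;
    }
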
@@ -1342,10 +1378,18 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
while (num_remaining_elements > 0) {
- if (num_remaining_elements < num_lanes)
+ if (num_remaining_elements < num_lanes) {
load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
- num_lanes);
- @vtype@ x = @isa@_masked_load(load_mask, ip);
+ num_lanes);
+ }
+
+ @vtype@ x;
+ if (stride == 1) {
+ x = @isa@_masked_load(load_mask, ip);
+ }
+ else {
+ x = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
+ }
xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ);
xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ);
@@ -1396,13 +1440,14 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@masked_store@(op, @cvtps_epi32@(load_mask), poly);
- ip += num_lanes;
+ ip += num_lanes*stride;
op += num_lanes;
num_remaining_elements -= num_lanes;
}
- if (@mask_to_int@(overflow_mask))
+ if (@mask_to_int@(overflow_mask)) {
npy_set_floatstatus_overflow();
+ }
}
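
With the stride in hand, the loop body changes in three ways: the final partial iteration is handled purely by the load mask (no scalar tail), the input pointer advances by num_lanes*stride elements while the output advances by num_lanes, and overflow is raised once after the loop from an accumulated mask. A scalar model of that blocked structure (plain C, no intrinsics; expf and feraiseexcept stand in for the vector polynomial and npy_set_floatstatus_overflow):

    #include <fenv.h>
    #include <math.h>
    #include <stdbool.h>
    #include <stddef.h>

    /* Scalar model of the blocked loop in @ISA@_exp_FLOAT: each pass covers
     * up to num_lanes elements, the last pass is "masked" by iterating fewer
     * lanes, the input walks in strides while the output stays contiguous,
     * and the FP status is raised once at the end from an accumulated flag. */
    static void
    blocked_exp_f32(float *op, const float *ip, ptrdiff_t n, ptrdiff_t stride)
    {
        const ptrdiff_t num_lanes = 8;             /* 8 for AVX2, 16 for AVX-512F */
        const float xmax = 88.72283935546875f;
        bool overflow = false;

        while (n > 0) {
            ptrdiff_t active = n < num_lanes ? n : num_lanes;   /* partial mask */
            for (ptrdiff_t i = 0; i < active; i++) {
                float x = ip[i * stride];          /* gather: strided read  */
                overflow = overflow || (x >= xmax && x != INFINITY);
                op[i] = expf(x);                   /* contiguous write      */
            }
            ip += num_lanes * stride;
            op += num_lanes;
            n  -= num_lanes;
        }
        if (overflow) {
            feraiseexcept(FE_OVERFLOW);   /* stand-in for npy_set_floatstatus_overflow() */
        }
    }
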
/*
@@ -1420,9 +1465,17 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
*/
static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_log_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size)
+@ISA@_log_FLOAT(npy_float * op,
+ npy_float * ip,
+ const npy_intp array_size,
+ const npy_intp steps)
{
+ const npy_intp stride = steps/sizeof(npy_float);
const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+ npy_int indexarr[16];
+ for (npy_int ii = 0; ii < 16; ii++) {
+ indexarr[ii] = ii*stride;
+ }
/* Load up frequently used constants */
@vtype@ log_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_LOGf);
@@ -1443,6 +1496,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF);
@vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
@vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
+ @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)indexarr);
@vtype@ poly, num_poly, denom_poly, exponent;
@mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask;
@@ -1453,10 +1507,18 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
while (num_remaining_elements > 0) {
- if (num_remaining_elements < num_lanes)
+ if (num_remaining_elements < num_lanes) {
load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
- num_lanes);
- @vtype@ x_in = @isa@_masked_load(load_mask, ip);
+ num_lanes);
+ }
+
+ @vtype@ x_in;
+ if (stride == 1) {
+ x_in = @isa@_masked_load(load_mask, ip);
+ }
+ else {
+ x_in = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
+ }
negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ);
zero_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_EQ_OQ);
@@ -1506,15 +1568,17 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@masked_store@(op, @cvtps_epi32@(load_mask), poly);
- ip += num_lanes;
+ ip += num_lanes*stride;
op += num_lanes;
num_remaining_elements -= num_lanes;
}
- if (@mask_to_int@(invalid_mask))
+ if (@mask_to_int@(invalid_mask)) {
npy_set_floatstatus_invalid();
- if (@mask_to_int@(divide_by_zero_mask))
+ }
+ if (@mask_to_int@(divide_by_zero_mask)) {
npy_set_floatstatus_divbyzero();
+ }
}
#endif
/**end repeat**/
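
@ISA@_log_FLOAT receives the identical treatment (index vector, stride == 1 load versus gather, ip += num_lanes*stride), plus braces around the deferred floatstatus calls. The log-specific part is the special-value handling: negative inputs yield NaN and the invalid flag, zeros yield -inf and the divide-by-zero flag, with both flags raised once after the loop. A scalar model of that behaviour (a sketch; feraiseexcept stands in for the npy_set_floatstatus_* calls):

    #include <fenv.h>
    #include <math.h>
    #include <stdbool.h>
    #include <stddef.h>

    /* Scalar model of the special-value handling in @ISA@_log_FLOAT:
     * negative inputs give NaN plus the invalid flag, zeros give -inf plus
     * the divide-by-zero flag, and both flags are raised once after the
     * loop rather than once per element. */
    static void
    log_specials_f32(float *op, const float *ip, ptrdiff_t n, ptrdiff_t stride)
    {
        bool invalid = false, divide_by_zero = false;

        for (ptrdiff_t i = 0; i < n; i++) {
            float x = ip[i * stride];
            if (x < 0.0f) {
                op[i] = NAN;
                invalid = true;
            }
            else if (x == 0.0f) {
                op[i] = -INFINITY;
                divide_by_zero = true;
            }
            else {
                op[i] = logf(x);   /* NaN inputs fall through and stay NaN */
            }
        }
        if (invalid) {
            feraiseexcept(FE_INVALID);      /* npy_set_floatstatus_invalid()   */
        }
        if (divide_by_zero) {
            feraiseexcept(FE_DIVBYZERO);    /* npy_set_floatstatus_divbyzero() */
        }
    }
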
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 2c963b5eb..cd2034d9c 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -13,7 +13,7 @@ import numpy as np
from numpy.testing import (
assert_, assert_equal, assert_raises, assert_raises_regex,
assert_array_equal, assert_almost_equal, assert_array_almost_equal,
- assert_allclose, assert_no_warnings, suppress_warnings,
+ assert_array_max_ulp, assert_allclose, assert_no_warnings, suppress_warnings,
_gen_alignment_data
)
@@ -678,6 +678,31 @@ class TestSpecialFloats(object):
assert_raises(FloatingPointError, np.log, np.float32(-np.inf))
assert_raises(FloatingPointError, np.log, np.float32(-1.0))
+class TestExpLogFloat32(object):
+ def test_exp_float32(self):
+ np.random.seed(42)
+ x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000))
+ x_f64 = np.float64(x_f32)
+ assert_array_max_ulp(np.exp(x_f32), np.float32(np.exp(x_f64)), maxulp=2.6)
+
+ def test_log_float32(self):
+ np.random.seed(42)
+ x_f32 = np.float32(np.random.uniform(low=0.0,high=1000,size=1000000))
+ x_f64 = np.float64(x_f32)
+ assert_array_max_ulp(np.log(x_f32), np.float32(np.log(x_f64)), maxulp=3.9)
+
+ def test_strided_exp_log_float32(self):
+ np.random.seed(42)
+ strides = np.random.randint(low=-100, high=100, size=100)
+ sizes = np.random.randint(low=1, high=2000, size=100)
+ for ii in sizes:
+ x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii))
+ exp_true = np.exp(x_f32)
+ log_true = np.log(x_f32)
+ for jj in strides:
+ assert_equal(np.exp(x_f32[::jj]), exp_true[::jj])
+ assert_equal(np.log(x_f32[::jj]), log_true[::jj])
+
class TestLogAddExp(_FilterInvalids):
def test_logaddexp_values(self):
x = [1, 2, 3, 4, 5]
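
The new tests bound the float32 SIMD kernels against a float64 reference in units in the last place (maxulp=2.6 for exp, 3.9 for log) and check that strided views reproduce the contiguous results exactly. For reference, the ULP distance that assert_array_max_ulp bounds can be modelled by mapping the IEEE-754 bit patterns onto an ordered integer scale; a hedged C sketch of that measure (not NumPy's implementation):

    #include <stdint.h>
    #include <string.h>

    /* Distance in units in the last place between two finite floats,
     * obtained by mapping the IEEE-754 bit patterns onto a monotonically
     * ordered integer scale.  A sketch of the quantity assert_array_max_ulp
     * bounds, not NumPy's own implementation. */
    static int64_t
    ulp_distance_f32(float a, float b)
    {
        int32_t ia, ib;
        memcpy(&ia, &a, sizeof(ia));
        memcpy(&ib, &b, sizeof(ib));

        /* fold negative floats so integer order matches float order */
        if (ia < 0) ia = INT32_MIN - ia;
        if (ib < 0) ib = INT32_MIN - ib;

        int64_t d = (int64_t)ia - (int64_t)ib;
        return d < 0 ? -d : d;
    }
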