author    Julian Taylor <juliantaylor108@gmail.com>  2019-05-19 18:45:51 +0200
committer GitHub <noreply@github.com>  2019-05-19 18:45:51 +0200
commit    433d8b2493c716b6617e1fb8b055bc702f17ddb7 (patch)
tree      ee810f5e48ecff7cac0efe30c43fc0fc88048d90
parent    5e46e3d05cbfb162e0decda6b306f4f62170b6a1 (diff)
parent    59b2a1d08592b11b82b82b6c53aead689648262e (diff)
Merge pull request #13581 from r-devulap/gather-for-avxsimd
ENH: AVX support for exp/log for strided float32 arrays
-rw-r--r--  numpy/core/src/umath/loops.c.src  |   7
-rw-r--r--  numpy/core/src/umath/simd.inc.src | 100
-rw-r--r--  numpy/core/tests/test_umath.py    |  27
3 files changed, 111 insertions(+), 23 deletions(-)
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index a2649ed93..f84d74efe 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1626,12 +1626,11 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
/*
* We use the AVX function to compute exp/log for scalar elements as well.
* This is needed to ensure the output of strided and non-strided
- * cases match. But this worsens the performance of strided arrays.
- * There is plan to fix this in a subsequent patch by using gather
- * instructions for strided arrays in the AVX function.
+ * cases match. The SIMD code handles strided input, but not
+ * strided output.
*/
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
- @ISA@_@func@_FLOAT((npy_float *)op1, (npy_float *)ip1, 1);
+ @ISA@_@func@_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0]);
#else
const npy_float in1 = *(npy_float *)ip1;
*(npy_float *)op1 = @scalarf@(in1);
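
The loops.c.src hunk above keeps routing scalar (non-blockable) elements through the same AVX kernel, now passing the input step alongside n = 1, so strided and contiguous invocations share one code path and produce identical results. A minimal plain-C sketch of that dispatch idea (vector_exp_f32 and exp_loop_f32 are hypothetical stand-ins for the templated @ISA@_exp_FLOAT kernel and its outer loop, not NumPy code):

    #include <math.h>
    #include <stddef.h>

    /* Hypothetical stand-in for the templated @ISA@_exp_FLOAT kernel:
     * the output is contiguous, the input is read with a byte step. */
    static void
    vector_exp_f32(float *op, const float *ip, ptrdiff_t n, ptrdiff_t steps)
    {
        const ptrdiff_t stride = steps / (ptrdiff_t)sizeof(float);
        for (ptrdiff_t i = 0; i < n; i++) {
            op[i] = expf(ip[i * stride]);
        }
    }

    /* Outer-loop dispatch mirroring the idea in loops.c.src: a contiguous
     * output gets one vector call whatever the input stride is, while a
     * strided output falls back to per-element calls into the same kernel,
     * so both paths stay bit-identical. */
    static void
    exp_loop_f32(char **args, const ptrdiff_t *dimensions, const ptrdiff_t *steps)
    {
        char *ip = args[0], *op = args[1];
        const ptrdiff_t n = dimensions[0];

        if (steps[1] == (ptrdiff_t)sizeof(float)) {
            vector_exp_f32((float *)op, (const float *)ip, n, steps[0]);
            return;
        }
        for (ptrdiff_t i = 0; i < n; i++, ip += steps[0], op += steps[1]) {
            vector_exp_f32((float *)op, (const float *)ip, 1, steps[0]);
        }
    }
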
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 1c6ac4426..6ddd6fcc6 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -51,6 +51,15 @@ abs_ptrdiff(char *a, char *b)
((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
((abs_ptrdiff(args[1], args[0]) == 0))))
+/*
+ * The output must be contiguous; the input may be strided.
+ */
+#define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \
+ (steps[1] == (esize) && \
+ (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
+ ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
+ ((abs_ptrdiff(args[1], args[0]) == 0))))
+
#define IS_BLOCKABLE_REDUCE(esize, vsize) \
(steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
npy_is_aligned(args[1], (esize)) && \
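
IS_OUTPUT_BLOCKABLE_UNARY relaxes IS_BLOCKABLE_UNARY: it no longer requires a contiguous input, only a contiguous output (steps[1] == esize), aligned base pointers, and input/output that are either identical or at least one vector width apart. Spelled out as a plain C predicate (a readability sketch under those assumptions, not the macro itself):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Plain-C spelling of what IS_OUTPUT_BLOCKABLE_UNARY checks:
     * esize = element size in bytes, vsize = vector width in bytes. */
    static bool
    output_blockable_unary(char *in, char *out, ptrdiff_t out_step,
                           size_t esize, size_t vsize)
    {
        /* |out - in|, as the abs_ptrdiff() helper computes it */
        size_t dist = (size_t)(out > in ? out - in : in - out);
        bool aligned = ((uintptr_t)in % esize == 0) &&
                       ((uintptr_t)out % esize == 0);

        /* output must be contiguous; the input step is deliberately
         * not constrained, since the kernel can gather strided input */
        return (size_t)out_step == esize && aligned &&
               (dist >= vsize || dist == 0);
    }
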
@@ -134,15 +143,15 @@ abs_ptrdiff(char *a, char *b)
#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
-@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n);
+@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp stride);
#endif
static NPY_INLINE int
run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
- if (IS_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
- @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]);
+ if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
+ @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]);
return 1;
}
else
@@ -1127,15 +1136,24 @@ avx2_get_full_load_mask(void)
}
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
+avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
{
float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
- float* addr = maskint + total_elem - num_elem;
+ float* addr = maskint + total_elem - num_lanes;
return _mm256_loadu_ps(addr);
}
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_masked_gather(__m256 src,
+ npy_float* addr,
+ __m256i vindex,
+ __m256 mask)
+{
+ return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
avx2_masked_load(__m256 mask, npy_float* addr)
{
return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
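
avx2_masked_gather wraps _mm256_mask_i32gather_ps, which loads a float from addr + 4*vindex[i] for every lane whose mask sign bit is set and keeps src elsewhere. A self-contained sketch of a stride-3 gather (compile with AVX2 enabled, e.g. -mavx2; buffer contents and stride are illustrative only):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void)
    {
        float data[32];
        for (int i = 0; i < 32; i++) {
            data[i] = (float)i;
        }

        const int stride = 3;            /* in elements, not bytes */
        int idx[8];
        for (int i = 0; i < 8; i++) {
            idx[i] = i * stride;         /* 0, 3, 6, ..., 21 */
        }

        __m256i vindex = _mm256_loadu_si256((const __m256i *)idx);
        __m256  mask   = _mm256_castsi256_ps(_mm256_set1_epi32(-1)); /* all lanes active */
        __m256  src    = _mm256_setzero_ps();

        /* reads data[0], data[3], ..., data[21]; scale 4 = sizeof(float) */
        __m256 x = _mm256_mask_i32gather_ps(src, data, vindex, mask, 4);

        float out[8];
        _mm256_storeu_ps(out, x);
        for (int i = 0; i < 8; i++) {
            printf("%g ", out[i]);       /* 0 3 6 9 12 15 18 21 */
        }
        printf("\n");
        return 0;
    }
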
@@ -1221,6 +1239,15 @@ avx512_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
}
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+avx512_masked_gather(__m512 src,
+ npy_float* addr,
+ __m512i vindex,
+ __mmask16 kmask)
+{
+ return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_masked_load(__mmask16 mask, npy_float* addr)
{
return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
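
The AVX-512 wrapper is the same operation, but the predicate is a dedicated __mmask16 register, so a partial final iteration reduces to mask arithmetic. A short sketch of gathering up to 16 strided floats that way (assumes -mavx512f; it mirrors the idea of avx512_masked_gather plus a partial load mask, not the exact NumPy helpers):

    #include <immintrin.h>

    /* Gather up to 16 floats spaced `stride` elements apart; inactive tail
     * lanes stay 0.0f.  A sketch of avx512_masked_gather combined with a
     * partial load mask, not the NumPy helpers themselves. */
    static __m512
    gather_strided_f32(const float *addr, int stride, int nelem)
    {
        int idx[16];
        for (int i = 0; i < 16; i++) {
            idx[i] = i * stride;
        }
        __m512i vindex = _mm512_loadu_si512((const void *)idx);
        __mmask16 k = (__mmask16)((1u << nelem) - 1);   /* low nelem lanes on */
        return _mm512_mask_i32gather_ps(_mm512_setzero_ps(), k, vindex, addr, 4);
    }
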
@@ -1310,11 +1337,19 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_exp_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size)
+@ISA@_exp_FLOAT(npy_float * op,
+ npy_float * ip,
+ const npy_intp array_size,
+ const npy_intp steps)
{
+ const npy_intp stride = steps/sizeof(npy_float);
const npy_int num_lanes = @BYTES@/sizeof(npy_float);
npy_float xmax = 88.72283935546875f;
npy_float xmin = -87.3365478515625f;
+ npy_int indexarr[16];
+ for (npy_int ii = 0; ii < 16; ii++) {
+ indexarr[ii] = ii*stride;
+ }
/* Load up frequently used constants */
@vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf);
@@ -1333,6 +1368,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF);
@vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
@vtype@ poly, num_poly, denom_poly, quadrant;
+ @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
@vtype@i exponent;
@mask@ xmax_mask, xmin_mask, nan_mask, inf_mask;
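
Two things are precomputed before the loop: the gather index vector (indexarr[ii] = ii*stride, of which AVX2 consumes the first 8 entries and AVX-512 all 16), and the clamp bounds xmax/xmin, which appear to be logf(FLT_MAX) and logf(FLT_MIN) rounded to float; that is my reading of the constants, not something stated in the patch. A small worked example of both:

    #include <float.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* 1) the gather index vector for a byte step of 12 (stride 3) */
        const int stride = 12 / (int)sizeof(float);   /* = 3 elements */
        int indexarr[16];
        for (int i = 0; i < 16; i++) {
            indexarr[i] = i * stride;                 /* 0, 3, 6, ..., 45 */
        }
        printf("last gather index: %d\n", indexarr[15]);

        /* 2) the clamp bounds: inputs above/below these saturate the
         *    float32 result to inf / 0 before the polynomial runs */
        printf("log(FLT_MAX) = %.10f\n", log((double)FLT_MAX));  /* ~  88.7228391 */
        printf("log(FLT_MIN) = %.10f\n", log((double)FLT_MIN));  /* ~ -87.3365448 */
        return 0;
    }
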
@@ -1342,10 +1378,18 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
while (num_remaining_elements > 0) {
- if (num_remaining_elements < num_lanes)
+ if (num_remaining_elements < num_lanes) {
load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
- num_lanes);
- @vtype@ x = @isa@_masked_load(load_mask, ip);
+ num_lanes);
+ }
+
+ @vtype@ x;
+ if (stride == 1) {
+ x = @isa@_masked_load(load_mask, ip);
+ }
+ else {
+ x = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
+ }
xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ);
xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ);
@@ -1396,13 +1440,14 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@masked_store@(op, @cvtps_epi32@(load_mask), poly);
- ip += num_lanes;
+ ip += num_lanes*stride;
op += num_lanes;
num_remaining_elements -= num_lanes;
}
- if (@mask_to_int@(overflow_mask))
+ if (@mask_to_int@(overflow_mask)) {
npy_set_floatstatus_overflow();
+ }
}
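
With the stride in hand, the loop body changes in three ways: the final partial iteration is handled purely by the load mask (no scalar tail), the input pointer advances by num_lanes*stride elements while the output advances by num_lanes, and overflow is raised once after the loop from an accumulated mask. A scalar model of that blocked structure (plain C, no intrinsics; expf and feraiseexcept stand in for the vector polynomial and npy_set_floatstatus_overflow):

    #include <fenv.h>
    #include <math.h>
    #include <stdbool.h>
    #include <stddef.h>

    /* Scalar model of the blocked loop in @ISA@_exp_FLOAT: each pass covers
     * up to num_lanes elements, the last pass is "masked" by iterating fewer
     * lanes, the input walks in strides while the output stays contiguous,
     * and the FP status is raised once at the end from an accumulated flag. */
    static void
    blocked_exp_f32(float *op, const float *ip, ptrdiff_t n, ptrdiff_t stride)
    {
        const ptrdiff_t num_lanes = 8;             /* 8 for AVX2, 16 for AVX-512F */
        const float xmax = 88.72283935546875f;
        bool overflow = false;

        while (n > 0) {
            ptrdiff_t active = n < num_lanes ? n : num_lanes;   /* partial mask */
            for (ptrdiff_t i = 0; i < active; i++) {
                float x = ip[i * stride];          /* gather: strided read  */
                overflow = overflow || (x >= xmax && x != INFINITY);
                op[i] = expf(x);                   /* contiguous write      */
            }
            ip += num_lanes * stride;
            op += num_lanes;
            n  -= num_lanes;
        }
        if (overflow) {
            feraiseexcept(FE_OVERFLOW);   /* stand-in for npy_set_floatstatus_overflow() */
        }
    }
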
/*
@@ -1420,9 +1465,17 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
*/
static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_log_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size)
+@ISA@_log_FLOAT(npy_float * op,
+ npy_float * ip,
+ const npy_intp array_size,
+ const npy_intp steps)
{
+ const npy_intp stride = steps/sizeof(npy_float);
const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+ npy_int indexarr[16];
+ for (npy_int ii = 0; ii < 16; ii++) {
+ indexarr[ii] = ii*stride;
+ }
/* Load up frequently used constants */
@vtype@ log_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_LOGf);
@@ -1443,6 +1496,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF);
@vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
@vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
+ @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)indexarr);
@vtype@ poly, num_poly, denom_poly, exponent;
@mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask;
@@ -1453,10 +1507,18 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
while (num_remaining_elements > 0) {
- if (num_remaining_elements < num_lanes)
+ if (num_remaining_elements < num_lanes) {
load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
- num_lanes);
- @vtype@ x_in = @isa@_masked_load(load_mask, ip);
+ num_lanes);
+ }
+
+ @vtype@ x_in;
+ if (stride == 1) {
+ x_in = @isa@_masked_load(load_mask, ip);
+ }
+ else {
+ x_in = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
+ }
negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ);
zero_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_EQ_OQ);
@@ -1506,15 +1568,17 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@masked_store@(op, @cvtps_epi32@(load_mask), poly);
- ip += num_lanes;
+ ip += num_lanes*stride;
op += num_lanes;
num_remaining_elements -= num_lanes;
}
- if (@mask_to_int@(invalid_mask))
+ if (@mask_to_int@(invalid_mask)) {
npy_set_floatstatus_invalid();
- if (@mask_to_int@(divide_by_zero_mask))
+ }
+ if (@mask_to_int@(divide_by_zero_mask)) {
npy_set_floatstatus_divbyzero();
+ }
}
#endif
/**end repeat**/
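
@ISA@_log_FLOAT receives the identical treatment (index vector, stride == 1 load versus gather, ip += num_lanes*stride), plus braces around the deferred floatstatus calls. The log-specific part is the special-value handling: negative inputs yield NaN and the invalid flag, zeros yield -inf and the divide-by-zero flag, with both flags raised once after the loop. A scalar model of that behaviour (a sketch; feraiseexcept stands in for the npy_set_floatstatus_* calls):

    #include <fenv.h>
    #include <math.h>
    #include <stdbool.h>
    #include <stddef.h>

    /* Scalar model of the special-value handling in @ISA@_log_FLOAT:
     * negative inputs give NaN plus the invalid flag, zeros give -inf plus
     * the divide-by-zero flag, and both flags are raised once after the
     * loop rather than once per element. */
    static void
    log_specials_f32(float *op, const float *ip, ptrdiff_t n, ptrdiff_t stride)
    {
        bool invalid = false, divide_by_zero = false;

        for (ptrdiff_t i = 0; i < n; i++) {
            float x = ip[i * stride];
            if (x < 0.0f) {
                op[i] = NAN;
                invalid = true;
            }
            else if (x == 0.0f) {
                op[i] = -INFINITY;
                divide_by_zero = true;
            }
            else {
                op[i] = logf(x);   /* NaN inputs fall through and stay NaN */
            }
        }
        if (invalid) {
            feraiseexcept(FE_INVALID);      /* npy_set_floatstatus_invalid()   */
        }
        if (divide_by_zero) {
            feraiseexcept(FE_DIVBYZERO);    /* npy_set_floatstatus_divbyzero() */
        }
    }
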
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 2c963b5eb..cd2034d9c 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -13,7 +13,7 @@ import numpy as np
from numpy.testing import (
assert_, assert_equal, assert_raises, assert_raises_regex,
assert_array_equal, assert_almost_equal, assert_array_almost_equal,
- assert_allclose, assert_no_warnings, suppress_warnings,
+ assert_array_max_ulp, assert_allclose, assert_no_warnings, suppress_warnings,
_gen_alignment_data
)
@@ -678,6 +678,31 @@ class TestSpecialFloats(object):
assert_raises(FloatingPointError, np.log, np.float32(-np.inf))
assert_raises(FloatingPointError, np.log, np.float32(-1.0))
+class TestExpLogFloat32(object):
+ def test_exp_float32(self):
+ np.random.seed(42)
+ x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000))
+ x_f64 = np.float64(x_f32)
+ assert_array_max_ulp(np.exp(x_f32), np.float32(np.exp(x_f64)), maxulp=2.6)
+
+ def test_log_float32(self):
+ np.random.seed(42)
+ x_f32 = np.float32(np.random.uniform(low=0.0,high=1000,size=1000000))
+ x_f64 = np.float64(x_f32)
+ assert_array_max_ulp(np.log(x_f32), np.float32(np.log(x_f64)), maxulp=3.9)
+
+ def test_strided_exp_log_float32(self):
+ np.random.seed(42)
+ strides = np.random.randint(low=-100, high=100, size=100)
+ sizes = np.random.randint(low=1, high=2000, size=100)
+ for ii in sizes:
+ x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii))
+ exp_true = np.exp(x_f32)
+ log_true = np.log(x_f32)
+ for jj in strides:
+ assert_equal(np.exp(x_f32[::jj]), exp_true[::jj])
+ assert_equal(np.log(x_f32[::jj]), log_true[::jj])
+
class TestLogAddExp(_FilterInvalids):
def test_logaddexp_values(self):
x = [1, 2, 3, 4, 5]
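
The new tests bound the float32 SIMD kernels against a float64 reference in units in the last place (maxulp=2.6 for exp, 3.9 for log) and check that strided views reproduce the contiguous results exactly. For reference, the ULP distance that assert_array_max_ulp bounds can be modelled by mapping the IEEE-754 bit patterns onto an ordered integer scale; a hedged C sketch of that measure (not NumPy's implementation):

    #include <stdint.h>
    #include <string.h>

    /* Distance in units in the last place between two finite floats,
     * obtained by mapping the IEEE-754 bit patterns onto a monotonically
     * ordered integer scale.  A sketch of the quantity assert_array_max_ulp
     * bounds, not NumPy's own implementation. */
    static int64_t
    ulp_distance_f32(float a, float b)
    {
        int32_t ia, ib;
        memcpy(&ia, &a, sizeof(ia));
        memcpy(&ib, &b, sizeof(ib));

        /* fold negative floats so integer order matches float order */
        if (ia < 0) ia = INT32_MIN - ia;
        if (ib < 0) ib = INT32_MIN - ib;

        int64_t d = (int64_t)ia - (int64_t)ib;
        return d < 0 ? -d : d;
    }
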