-rw-r--r-- | benchmarks/benchmarks/bench_avx.py           |  17
-rw-r--r-- | numpy/core/code_generators/generate_umath.py |   8
-rw-r--r-- | numpy/core/src/umath/loops.c.src             |  16
-rw-r--r-- | numpy/core/src/umath/loops.h.src             |   6
-rw-r--r-- | numpy/core/src/umath/simd.inc.src            | 205
-rw-r--r-- | numpy/core/tests/test_umath.py               |  28
6 files changed, 272 insertions(+), 8 deletions(-)
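This patch adds AVX512 (SKX) vectorized inner loops for `np.ldexp` and `np.frexp` on float32 and float64, along with a benchmark and tests. As a reminder of the semantics being vectorized, the two ufuncs are inverses of each other (illustrative snippet, not part of the patch):

```python
import numpy as np

# frexp splits x into mantissa and exponent; ldexp reassembles them.
x = np.array([0.5, 3.0, -8.0])
mant, exp = np.frexp(x)                  # x == mant * 2**exp, |mant| in [0.5, 1)
assert np.all(np.ldexp(mant, exp) == x)  # round-trips exactly
```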
diff --git a/benchmarks/benchmarks/bench_avx.py b/benchmarks/benchmarks/bench_avx.py
index 4f915f82a..ff105811d 100644
--- a/benchmarks/benchmarks/bench_avx.py
+++ b/benchmarks/benchmarks/bench_avx.py
@@ -14,6 +14,7 @@ avx_ufuncs = ['sin',
               'floor',
               'ceil' ,
               'trunc',
+              'frexp',
               'isnan',
               'isfinite',
               'isinf',
@@ -60,6 +61,22 @@ class AVX_BFunc(Benchmark):
     def time_ufunc(self, ufuncname, dtype, stride):
         self.f(self.arr1[::stride], self.arr2[::stride])
 
+class AVX_ldexp(Benchmark):
+
+    params = [dtype, stride]
+    param_names = ['dtype', 'stride']
+    timeout = 10
+
+    def setup(self, dtype, stride):
+        np.seterr(all='ignore')
+        self.f = getattr(np, 'ldexp')
+        N = 10000
+        self.arr1 = np.array(np.random.rand(stride*N), dtype=dtype)
+        self.arr2 = np.array(np.random.rand(stride*N), dtype='i')
+
+    def time_ufunc(self, dtype, stride):
+        self.f(self.arr1[::stride], self.arr2[::stride])
+
 cmplx_bfuncs = ['add',
                 'subtract',
                 'multiply',
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index b5d5eb94a..2ce2fdb55 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -898,10 +898,10 @@ defdict = {
           docstrings.get('numpy.core.umath.ldexp'),
           None,
           [TypeDescription('e', None, 'ei', 'e'),
-          TypeDescription('f', None, 'fi', 'f'),
+          TypeDescription('f', None, 'fi', 'f', simd=['avx512_skx']),
           TypeDescription('e', FuncNameSuffix('long'), 'el', 'e'),
           TypeDescription('f', FuncNameSuffix('long'), 'fl', 'f'),
-          TypeDescription('d', None, 'di', 'd'),
+          TypeDescription('d', None, 'di', 'd', simd=['avx512_skx']),
           TypeDescription('d', FuncNameSuffix('long'), 'dl', 'd'),
           TypeDescription('g', None, 'gi', 'g'),
           TypeDescription('g', FuncNameSuffix('long'), 'gl', 'g'),
@@ -912,8 +912,8 @@ defdict = {
           docstrings.get('numpy.core.umath.frexp'),
           None,
           [TypeDescription('e', None, 'e', 'ei'),
-          TypeDescription('f', None, 'f', 'fi'),
-          TypeDescription('d', None, 'd', 'di'),
+          TypeDescription('f', None, 'f', 'fi', simd=['avx512_skx']),
+          TypeDescription('d', None, 'd', 'di', simd=['avx512_skx']),
           TypeDescription('g', None, 'g', 'gi'),
           ],
          ),
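The `simd=['avx512_skx']` annotations make the generator select the new `@TYPE@_ldexp_avx512_skx`/`@TYPE@_frexp_avx512_skx` loops (added in loops.c.src below) when the CPU reports AVX512-SKX support; only the single- and double-precision variants are vectorized. For reference, the one-character type codes used by `TypeDescription` map to NumPy dtypes as follows (illustrative only):

```python
import numpy as np

# Type characters used in the TypeDescription entries above:
for ch in 'efdg':
    print(ch, np.dtype(ch))  # e=float16, f=float32, d=float64, g=C long double
print('i', np.dtype('i'))    # 'i' is the platform C int, used for the exponent
```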
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index dbb8dd48e..0cfa1cea7 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -2137,6 +2137,14 @@ NPY_NO_EXPORT void
 }
 
 NPY_NO_EXPORT void
+@TYPE@_frexp_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    if (!run_unary_two_out_avx512_skx_frexp_@TYPE@(args, dimensions, steps)) {
+        @TYPE@_frexp(args, dimensions, steps, func);
+    }
+}
+
+NPY_NO_EXPORT void
 @TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     BINARY_LOOP {
@@ -2147,6 +2155,14 @@ NPY_NO_EXPORT void
 }
 
 NPY_NO_EXPORT void
+@TYPE@_ldexp_avx512_skx(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
+{
+    if (!run_binary_avx512_skx_ldexp_@TYPE@(args, dimensions, steps)) {
+        @TYPE@_ldexp(args, dimensions, steps, func);
+    }
+}
+
+NPY_NO_EXPORT void
 @TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     /*
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index b63d442ef..5dd49c465 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -339,9 +339,15 @@ NPY_NO_EXPORT void
 @TYPE@_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
+@TYPE@_frexp_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
 @TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
+@TYPE@_ldexp_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
 @TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 #define @TYPE@_true_divide @TYPE@_divide
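The new entry points follow the usual try-SIMD-then-fall-back pattern: the `run_*` gate returns 0 when the build lacks AVX512-SKX intrinsics or the stride/overlap preconditions fail, in which case the original scalar loop runs. A rough Python rendering of that control flow, with hypothetical helper names:

```python
# Hypothetical sketch of the dispatch performed by @TYPE@_ldexp_avx512_skx above.
def ldexp_dispatch(args, simd_kernel, scalar_loop):
    # simd_kernel returns True only if its preconditions held and it ran.
    if not simd_kernel(args):
        scalar_loop(args)  # generic BINARY_LOOP fallback
```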
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 2f7574d47..48e89915c 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -120,6 +120,13 @@ nomemoverlap(char *ip,
     (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
     (nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))
 
+#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
+    ((abs(steps[0]) < MAX_STEP_SIZE) && \
+     (abs(steps[1]) < MAX_STEP_SIZE) && \
+     (abs(steps[2]) < MAX_STEP_SIZE) && \
+     (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
+     (nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0])))
+
 /*
  * 1) Output should be contiguous, can handle strided input data
  * 2) Input step should be smaller than MAX_STEP_SIZE for performance
@@ -294,6 +301,42 @@ run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
 
 /**end repeat1**/
 
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
+AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
+
+static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
+AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
+#endif
+
+static NPY_INLINE int
+run_binary_avx512_skx_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+    if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
+        AVX512_SKX_ldexp_@TYPE@(args, dimensions, steps);
+        return 1;
+    }
+    else
+        return 0;
+#endif
+    return 0;
+}
+
+static NPY_INLINE int
+run_unary_two_out_avx512_skx_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+    if (IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP) {
+        AVX512_SKX_frexp_@TYPE@(args, dimensions, steps);
+        return 1;
+    }
+    else
+        return 0;
+#endif
+    return 0;
+}
 /**end repeat**/
 
 /**begin repeat
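`IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP` mirrors the existing binary macro for frexp's one-input/two-output signature: every step must stay below `MAX_STEP_SIZE` (so the per-lane offsets `ii*stride` fit in the int32 gather/scatter indices) and the input may not partially overlap either output. A Python sketch of the step check; the constant's value here is assumed for illustration:

```python
# Sketch of the small-steps precondition; MAX_STEP_SIZE value is assumed.
MAX_STEP_SIZE = 2097152
def small_steps(steps):
    return all(abs(s) < MAX_STEP_SIZE for s in steps)

assert small_steps((4, 4, 8))            # contiguous float32/float64 arrays pass
assert not small_steps((1 << 22, 4, 4))  # a huge stride forces the scalar loop
```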
@@ -2089,13 +2132,167 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps)
  * #num_lanes = 16, 8#
  * #vsuffix = ps, pd#
  * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
+ * #vtype1 = __m512, __m512d#
+ * #vtype2 = __m512i, __m256i#
  * #scale = 4, 8#
  * #vindextype = __m512i, __m256i#
  * #vindexsize = 512, 256#
  * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
+ * #vtype2_load = _mm512_maskz_loadu_epi32, _mm256_maskz_loadu_epi32#
+ * #vtype2_gather = _mm512_mask_i32gather_epi32, _mm256_mmask_i32gather_epi32#
+ * #vtype2_store = _mm512_mask_storeu_epi32, _mm256_mask_storeu_epi32#
+ * #vtype2_scatter = _mm512_mask_i32scatter_epi32, _mm256_mask_i32scatter_epi32#
+ * #setzero = _mm512_setzero_epi32, _mm256_setzero_si256#
  */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
+AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
+    const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(int);
+    const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@);
+    const npy_intp array_size = dimensions[0];
+    npy_intp num_remaining_elements = array_size;
+    @type@* ip1 = (@type@*) args[0];
+    int* ip2 = (int*) args[1];
+    @type@* op = (@type@*) args[2];
+
+    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
+     */
+
+    npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
+    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
+        index_ip1[ii] = ii*stride_ip1;
+        index_ip2[ii] = ii*stride_ip2;
+        index_op[ii] = ii*stride_op;
+    }
+    @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
+    @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
+    @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
+    @vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
+    @vtype2@ zeros = @setzero@();
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < @num_lanes@) {
+            load_mask = avx512_get_partial_load_mask_@vsuffix@(
+                num_remaining_elements, @num_lanes@);
+        }
+        @vtype1@ x1;
+        @vtype2@ x2;
+        if (stride_ip1 == 1) {
+            x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
+        }
+        else {
+            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
+        }
+        if (stride_ip2 == 1) {
+            x2 = @vtype2_load@(load_mask, ip2);
+        }
+        else {
+            x2 = @vtype2_gather@(zeros, load_mask, vindex_ip2, ip2, 4);
+        }
+
+        @vtype1@ out = _mm512_scalef_@vsuffix@(x1, _mm512_cvtepi32_@vsuffix@(x2));
+
+        if (stride_op == 1) {
+            _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
+        }
+        else {
+            /* scatter! */
+            _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
+        }
+
+        ip1 += @num_lanes@*stride_ip1;
+        ip2 += @num_lanes@*stride_ip2;
+        op += @num_lanes@*stride_op;
+        num_remaining_elements -= @num_lanes@;
+    }
+}
+
+static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
+AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
+    const npy_intp stride_op1 = steps[1]/(npy_intp)sizeof(@type@);
+    const npy_intp stride_op2 = steps[2]/(npy_intp)sizeof(int);
+    const npy_intp array_size = dimensions[0];
+    npy_intp num_remaining_elements = array_size;
+    @type@* ip1 = (@type@*) args[0];
+    @type@* op1 = (@type@*) args[1];
+    int* op2 = (int*) args[2];
+
+    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
+     */
+
+    npy_int32 index_ip1[@num_lanes@], index_op1[@num_lanes@], index_op2[@num_lanes@];
+    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
+        index_ip1[ii] = ii*stride_ip1;
+        index_op1[ii] = ii*stride_op1;
+        index_op2[ii] = ii*stride_op2;
+    }
+    @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
+    @vindextype@ vindex_op1 = @vindexload@((@vindextype@*)&index_op1[0]);
+    @vindextype@ vindex_op2 = @vindexload@((@vindextype@*)&index_op2[0]);
+    @vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < @num_lanes@) {
+            load_mask = avx512_get_partial_load_mask_@vsuffix@(
+                num_remaining_elements, @num_lanes@);
+        }
+        @vtype1@ x1;
+        if (stride_ip1 == 1) {
+            x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
+        }
+        else {
+            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
+        }
+
+        /*
+         * The x86 instructions vpgetmant and vpgetexp do not conform
+         * with NumPy's output for special floating points: NAN, +/-INF, +/-0.0
+         * We mask these values with spmask to avoid invalid exceptions.
+         */
+        @mask@ spmask =_mm512_knot(_mm512_fpclass_@vsuffix@_mask(
+            x1, 0b10011111));
+        @vtype1@ out1 = _mm512_maskz_getmant_@vsuffix@(
+            spmask, x1, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
+        out1 = _mm512_mask_mov_@vsuffix@(x1, spmask, out1);
+        @vtype2@ out2 = _mm512_cvt@vsuffix@_epi32(
+            _mm512_maskz_add_@vsuffix@(spmask, _mm512_set1_@vsuffix@(1.0),
+                _mm512_maskz_getexp_@vsuffix@(spmask, x1)));
+        if (stride_op1 == 1) {
+            _mm512_mask_storeu_@vsuffix@(op1, load_mask, out1);
+        }
+        else {
+            _mm512_mask_i32scatter_@vsuffix@(op1, load_mask, vindex_op1, out1, @scale@);
+        }
+        if (stride_op2 == 1) {
+            @vtype2_store@(op2, load_mask, out2);
+        }
+        else {
+            @vtype2_scatter@(op2, load_mask, vindex_op2, out2, 4);
+        }
+
+        ip1 += @num_lanes@*stride_ip1;
+        op1 += @num_lanes@*stride_op1;
+        op2 += @num_lanes@*stride_op2;
+        num_remaining_elements -= @num_lanes@;
+    }
+}
+#endif
+
 /**begin repeat1
  * #func = maximum, minimum#
  * #vectorf = max, min#
  */
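Two semantic points in the kernels above: `_mm512_scalef_@vsuffix@` computes `x1 * 2**x2` per lane, which for the integral exponents produced by `_mm512_cvtepi32_@vsuffix@` is exactly ldexp; and `vgetmant`/`vgetexp` disagree with NumPy for NaN, ±inf and ±0.0, so `spmask` passes those lanes through unchanged with exponent 0. A scalar illustration of both (frexp behaviour on Linux, per the tests below):

```python
import math
import numpy as np

# ldexp is exact scaling by a power of two, the operation scalef vectorizes:
for m, e in [(0.125, 3), (0.7, -2), (1.5, 10)]:
    assert math.ldexp(m, e) == m * 2.0**e

# Special values pass through frexp unchanged, with exponent 0:
mant, exp = np.frexp(np.array([np.nan, np.inf, -0.0, 1.0]))
# mant -> [nan, inf, -0. , 0.5]; exp -> [0, 0, 0, 1]
```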
@@ -2131,14 +2328,14 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
     @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
     @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
     @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
-    @vtype@ zeros_f = _mm512_setzero_@vsuffix@();
+    @vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
 
     while (num_remaining_elements > 0) {
         if (num_remaining_elements < @num_lanes@) {
             load_mask = avx512_get_partial_load_mask_@vsuffix@(
                 num_remaining_elements, @num_lanes@);
         }
-        @vtype@ x1, x2;
+        @vtype1@ x1, x2;
         if (stride_ip1 == 1) {
             x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
         }
@@ -2158,7 +2355,7 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
          * this issue to conform with NumPy behaviour.
          */
         @mask@ nan_mask = _mm512_cmp_@vsuffix@_mask(x1, x1, _CMP_NEQ_UQ);
-        @vtype@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
+        @vtype1@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
         out = _mm512_mask_blend_@vsuffix@(nan_mask, out, x1);
 
         if (stride_op == 1) {
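The remaining simd.inc.src changes are mechanical: `@vtype@` becomes `@vtype1@` in the existing maximum/minimum kernels because the repeat block now also defines the integer vector type `@vtype2@`. Those kernels keep their `nan_mask` blend, which preserves NumPy's NaN propagation where the raw hardware max/min would return the non-NaN operand:

```python
import numpy as np

# NumPy's maximum/minimum propagate NaN; bare vmaxps/vminps would not.
print(np.maximum(np.array([np.nan, 2.0]), np.array([1.0, np.nan])))  # [nan nan]
```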
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 0b9b06d01..91acd6ac3 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -3,6 +3,7 @@ import warnings
 import fnmatch
 import itertools
 import pytest
+import sys
 from fractions import Fraction
 
 import numpy.core.umath as ncu
@@ -789,6 +790,33 @@ class TestFPClass:
         assert_equal(np.isfinite(arr_f32[::stride]), finite[::stride])
         assert_equal(np.isfinite(arr_f64[::stride]), finite[::stride])
 
+class TestLDExp:
+    @pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
+    @pytest.mark.parametrize("dtype", ['f', 'd'])
+    def test_ldexp(self, dtype, stride):
+        mant = np.array([0.125, 0.25, 0.5, 1., 1., 2., 4., 8.], dtype=dtype)
+        exp = np.array([3, 2, 1, 0, 0, -1, -2, -3], dtype='i')
+        out = np.zeros(8, dtype=dtype)
+        assert_equal(np.ldexp(mant[::stride], exp[::stride], out=out[::stride]), np.ones(8, dtype=dtype)[::stride])
+        assert_equal(out[::stride], np.ones(8, dtype=dtype)[::stride])
+
+class TestFRExp:
+    @pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
+    @pytest.mark.parametrize("dtype", ['f', 'd'])
+    @pytest.mark.skipif(not sys.platform.startswith('linux'),
+                        reason="np.frexp gives different answers for NAN/INF on windows and linux")
+    def test_frexp(self, dtype, stride):
+        arr = np.array([np.nan, np.nan, np.inf, -np.inf, 0.0, -0.0, 1.0, -1.0], dtype=dtype)
+        mant_true = np.array([np.nan, np.nan, np.inf, -np.inf, 0.0, -0.0, 0.5, -0.5], dtype=dtype)
+        exp_true = np.array([0, 0, 0, 0, 0, 0, 1, 1], dtype='i')
+        out_mant = np.ones(8, dtype=dtype)
+        out_exp = 2*np.ones(8, dtype='i')
+        mant, exp = np.frexp(arr[::stride], out=(out_mant[::stride], out_exp[::stride]))
+        assert_equal(mant_true[::stride], mant)
+        assert_equal(exp_true[::stride], exp)
+        assert_equal(out_mant[::stride], mant_true[::stride])
+        assert_equal(out_exp[::stride], exp_true[::stride])
+
 # func : [maxulperror, low, high]
 avx_ufuncs = {'sqrt' :[1, 0., 100.],
               'absolute' :[0, -100., 100.],
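The tests sweep positive and negative strides plus `out=` buffers, which exercises both the contiguous load/store and the gather/scatter paths of the new kernels. One way to run just these tests from a built checkout (illustrative invocation):

```python
# Run only the new ldexp/frexp tests (assumes a built NumPy tree on sys.path).
import pytest
pytest.main(["numpy/core/tests/test_umath.py", "-v", "-k", "ldexp or frexp"])
```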