| author | Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> | 2019-10-31 19:07:34 -0700 |
|---|---|---|
| committer | Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> | 2020-01-28 08:52:42 -0800 |
| commit | f7f8e621ce4774722968f2f0a77d305cfabf46d5 (patch) | |
| tree | 8df9e27756d8d5032e22a7c10bbba60dafbd753d /numpy | |
| parent | 71ad52e79b1b35ed13dade1c395d7ad57deb9e3a (diff) | |
| download | numpy-f7f8e621ce4774722968f2f0a77d305cfabf46d5.tar.gz | |
ENH: Use AVX-512 for np.maximum and np.minimum
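The scalar fallback added in loops.c.src preserves NumPy's NaN-propagation rule: np.maximum and np.minimum return NaN whenever either input is NaN (unlike np.fmax/np.fmin), and the vectorized path must reproduce exactly that behaviour. For reference, here is a minimal standalone C sketch of the scalar rule; the helper name `nanprop_maximum` is hypothetical, chosen only for illustration of the loop body in the patch below:

```c
#include <math.h>
#include <stdio.h>

/*
 * Hypothetical helper mirroring the scalar loop body in loops.c.src:
 * for maximum the template expands @OP@ to >=, so the result is in1
 * when in1 >= in2 or when in1 is NaN, and in2 otherwise. A NaN in
 * either operand therefore propagates to the output.
 */
static double nanprop_maximum(double in1, double in2)
{
    /* Order of operations important for MSVC 2015 (per the diff) */
    return (in1 >= in2 || isnan(in1)) ? in1 : in2;
}

int main(void)
{
    printf("%g\n", nanprop_maximum(1.0, 2.0)); /* 2 */
    printf("%g\n", nanprop_maximum(NAN, 2.0)); /* nan: in1 is NaN */
    printf("%g\n", nanprop_maximum(2.0, NAN)); /* nan: 2.0 >= NaN is false, so in2 (NaN) is returned */
    return 0;
}
```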
Diffstat (limited to 'numpy')
| -rw-r--r-- | numpy/core/code_generators/generate_umath.py | 4 |
| -rw-r--r-- | numpy/core/src/umath/loops.c.src | 28 |
| -rw-r--r-- | numpy/core/src/umath/loops.h.src | 8 |
| -rw-r--r-- | numpy/core/src/umath/simd.inc.src | 118 |
4 files changed, 156 insertions(+), 2 deletions(-)
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index f9ee7d993..1fd08241d 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -492,14 +492,14 @@ defdict = {
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.maximum'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
-          TD(noobj),
+          TD(noobj, simd=[('avx512f', 'fd')]),
           TD(O, f='npy_ObjectMax')
           ),
 'minimum':
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.minimum'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
-          TD(noobj),
+          TD(noobj, simd=[('avx512f', 'fd')]),
           TD(O, f='npy_ObjectMin')
           ),
 'clip':
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 3b180ce59..b310d73ff 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1898,6 +1898,34 @@ NPY_NO_EXPORT void
  * #OP =  >=, <=#
  **/
 NPY_NO_EXPORT void
+@TYPE@_@kind@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*  */
+    if (IS_BINARY_REDUCE) {
+        if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
+            BINARY_REDUCE_LOOP(@type@) {
+                const @type@ in2 = *(@type@ *)ip2;
+                /* Order of operations important for MSVC 2015 */
+                io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
+            }
+            *((@type@ *)iop1) = io1;
+        }
+    }
+    else {
+        if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
+            BINARY_LOOP {
+                @type@ in1 = *(@type@ *)ip1;
+                const @type@ in2 = *(@type@ *)ip2;
+                /* Order of operations important for MSVC 2015 */
+                in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
+                *((@type@ *)op1) = in1;
+            }
+        }
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+NPY_NO_EXPORT void
 @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     /*  */
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 8ddf201d7..6c89627ca 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -175,6 +175,14 @@ NPY_NO_EXPORT void
 @TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 /**begin repeat1
+ * #func = maximum, minimum#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+/**end repeat1**/
+
+/**begin repeat1
  * #isa = avx512f, fma#
  */
 
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 5473b58f1..69f003473 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -130,6 +130,35 @@ abs_ptrdiff(char *a, char *b)
  */
 
 /**begin repeat
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ * #EXISTS = 1, 1, 0#
+ */
+
+/**begin repeat1
+ * #func = maximum, minimum#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+static NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
+#endif
+
+static NPY_INLINE int
+run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+    AVX512F_@func@_@TYPE@(args, dimensions, steps);
+    return 1;
+#endif
+    return 0;
+}
+
+
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
  * #ISA = FMA, AVX512F#
  * #isa = fma, avx512f#
  * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
@@ -1671,6 +1700,95 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
 #endif
 /**end repeat**/
 
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #num_lanes = 16, 8#
+ * #vsuffix = ps, pd#
+ * #mask = __mmask16, __mmask8#
+ * #vtype = __m512, __m512d#
+ * #scale = 4, 8#
+ * #vindextype = __m512i, __m256i#
+ * #vindexsize = 512, 256#
+ * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
+ */
+
+/**begin repeat1
+ * #func = maximum, minimum#
+ * #vectorf = max, min#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+    const npy_intp stride_ip1 = steps[0]/sizeof(@type@);
+    const npy_intp stride_ip2 = steps[1]/sizeof(@type@);
+    const npy_intp stride_op = steps[2]/sizeof(@type@);
+    const npy_intp array_size = dimensions[0];
+    npy_intp num_remaining_elements = array_size;
+    @type@* ip1 = (@type@*) args[0];
+    @type@* ip2 = (@type@*) args[1];
+    @type@* op = (@type@*) args[2];
+
+    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+
+    npy_int index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
+    for (npy_int ii = 0; ii < @num_lanes@; ii++) {
+        index_ip1[ii] = ii*stride_ip1;
+        index_ip2[ii] = ii*stride_ip2;
+        index_op[ii] = ii*stride_op;
+    }
+    @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
+    @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
+    @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
+    @vtype@ zeros_f = _mm512_setzero_@vsuffix@();
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < @num_lanes@) {
+            load_mask = avx512_get_partial_load_mask_@vsuffix@(
+                    num_remaining_elements, @num_lanes@);
+        }
+        @vtype@ x1, x2;
+        if (stride_ip1 == 1) {
+            x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
+        }
+        else {
+            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
+        }
+        if (stride_ip2 == 1) {
+            x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
+        }
+        else {
+            x2 = avx512_masked_gather_@vsuffix@(zeros_f, ip2, vindex_ip2, load_mask);
+        }
+
+        /*
+         * when only one of the argument is a nan, the maxps/maxpd instruction
+         * returns the second argument. The additional blend instruction fixes
+         * this issue to conform with NumPy behaviour.
+         */
+        @mask@ nan_mask = _mm512_cmp_@vsuffix@_mask(x1, x1, _CMP_NEQ_UQ);
+        @vtype@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
+        out = _mm512_mask_blend_@vsuffix@(nan_mask, out, x1);
+
+        if (stride_op == 1) {
+            _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
+        }
+        else {
+            /* scatter! */
+            _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
+        }
+
+        ip1 += @num_lanes@*stride_ip1;
+        ip2 += @num_lanes@*stride_ip2;
+        op += @num_lanes@*stride_op;
+        num_remaining_elements -= @num_lanes@;
+    }
+}
+#endif
+/**end repeat**/
+/**end repeat1**/
 
 /**begin repeat
  * #ISA = FMA, AVX512F#
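The key subtlety in the AVX512F kernel is the NaN fix-up: vmaxps/vmaxpd return the second operand when only the first is NaN, which would silently drop NaNs and violate np.maximum semantics (if the second operand is NaN, the instruction already returns NaN, so no fix-up is needed for that case). The patch handles this with a self-compare mask (_CMP_NEQ_UQ is true only for NaN lanes) followed by a blend. Below is a self-contained sketch of that instruction sequence for doubles, outside NumPy's template machinery; it assumes an AVX-512F machine and a compiler flag such as -mavx512f, and the array values are made up for the demo:

```c
#include <immintrin.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    /* one __m512d worth of doubles; lane 1 of a holds a NaN */
    double a[8] = {1.0, NAN, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
    double b[8] = {2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0};
    double out[8];

    __m512d x1 = _mm512_loadu_pd(a);
    __m512d x2 = _mm512_loadu_pd(b);

    /* vmaxpd alone returns the *second* operand when the first is NaN */
    __m512d res = _mm512_max_pd(x1, x2);

    /* NaN is the only value that compares not-equal to itself,
     * so this mask selects exactly the NaN lanes of x1 */
    __mmask8 nan_mask = _mm512_cmp_pd_mask(x1, x1, _CMP_NEQ_UQ);

    /* blend x1 back into those lanes so the NaN propagates,
     * matching np.maximum semantics */
    res = _mm512_mask_blend_pd(nan_mask, res, x1);

    _mm512_storeu_pd(out, res);
    for (int i = 0; i < 8; i++) {
        printf("%g ", out[i]);    /* prints: 2 nan 3 4 5 6 7 8 */
    }
    printf("\n");
    return 0;
}
```

The same masking idea also covers the tail of the array in the kernel above: avx512_get_partial_load_mask_@vsuffix@ shrinks the load/store mask for the final partial iteration, so no scalar cleanup loop is needed.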