-rw-r--r--  benchmarks/benchmarks/bench_avx.py            17
-rw-r--r--  numpy/core/code_generators/generate_umath.py   8
-rw-r--r--  numpy/core/src/umath/loops.c.src              16
-rw-r--r--  numpy/core/src/umath/loops.h.src               6
-rw-r--r--  numpy/core/src/umath/simd.inc.src            205
-rw-r--r--  numpy/core/tests/test_umath.py                28
6 files changed, 272 insertions, 8 deletions
diff --git a/benchmarks/benchmarks/bench_avx.py b/benchmarks/benchmarks/bench_avx.py
index 4f915f82a..ff105811d 100644
--- a/benchmarks/benchmarks/bench_avx.py
+++ b/benchmarks/benchmarks/bench_avx.py
@@ -14,6 +14,7 @@ avx_ufuncs = ['sin',
'floor',
'ceil' ,
'trunc',
+ 'frexp',
'isnan',
'isfinite',
'isinf',
@@ -60,6 +61,22 @@ class AVX_BFunc(Benchmark):
def time_ufunc(self, ufuncname, dtype, stride):
self.f(self.arr1[::stride], self.arr2[::stride])
+class AVX_ldexp(Benchmark):
+
+    params = [dtype, stride]
+    param_names = ['dtype', 'stride']
+    timeout = 10
+
+    def setup(self, dtype, stride):
+        np.seterr(all='ignore')
+        self.f = getattr(np, 'ldexp')
+        N = 10000
+        self.arr1 = np.array(np.random.rand(stride*N), dtype=dtype)
+        self.arr2 = np.array(np.random.rand(stride*N), dtype='i')
+
+    def time_ufunc(self, dtype, stride):
+        self.f(self.arr1[::stride], self.arr2[::stride])
+
cmplx_bfuncs = ['add',
'subtract',
'multiply',
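
The new AVX_ldexp benchmark times np.ldexp over strided views of a floating-point mantissa array and an int32 exponent array. A minimal standalone sketch of the measured operation, assuming nothing beyond the benchmark code above (the concrete N, dtype, and stride values here are illustrative, not part of the suite):

    import numpy as np

    # Roughly what AVX_ldexp.time_ufunc runs for one (dtype, stride) pair.
    N, stride, dtype = 10000, 2, 'd'
    arr1 = np.array(np.random.rand(stride*N), dtype=dtype)   # float mantissas
    arr2 = np.array(np.random.rand(stride*N), dtype='i')     # int32 exponents
    np.ldexp(arr1[::stride], arr2[::stride])                 # arr1 * 2**arr2, elementwise
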
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index b5d5eb94a..2ce2fdb55 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -898,10 +898,10 @@ defdict = {
docstrings.get('numpy.core.umath.ldexp'),
None,
[TypeDescription('e', None, 'ei', 'e'),
- TypeDescription('f', None, 'fi', 'f'),
+ TypeDescription('f', None, 'fi', 'f', simd=['avx512_skx']),
TypeDescription('e', FuncNameSuffix('long'), 'el', 'e'),
TypeDescription('f', FuncNameSuffix('long'), 'fl', 'f'),
- TypeDescription('d', None, 'di', 'd'),
+ TypeDescription('d', None, 'di', 'd', simd=['avx512_skx']),
TypeDescription('d', FuncNameSuffix('long'), 'dl', 'd'),
TypeDescription('g', None, 'gi', 'g'),
TypeDescription('g', FuncNameSuffix('long'), 'gl', 'g'),
@@ -912,8 +912,8 @@ defdict = {
docstrings.get('numpy.core.umath.frexp'),
None,
[TypeDescription('e', None, 'e', 'ei'),
- TypeDescription('f', None, 'f', 'fi'),
- TypeDescription('d', None, 'd', 'di'),
+ TypeDescription('f', None, 'f', 'fi', simd=['avx512_skx']),
+ TypeDescription('d', None, 'd', 'di', simd=['avx512_skx']),
TypeDescription('g', None, 'g', 'gi'),
],
),
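
The simd=['avx512_skx'] annotations tell the generator to substitute the new @TYPE@_ldexp_avx512_skx and @TYPE@_frexp_avx512_skx inner loops for float32 and float64 when the CPU supports AVX-512 SKX. For reference, the two ufuncs are inverses for finite nonzero inputs, which is the behaviour the vector loops must preserve. A small sanity check using only documented NumPy semantics:

    import numpy as np

    # frexp returns (mantissa, exponent) with 0.5 <= |mantissa| < 1 for finite
    # nonzero input, and ldexp(m, e) == m * 2.0**e reconstructs it exactly.
    m, e = np.frexp(np.float32(6.0))          # m == 0.75, e == 3
    assert np.ldexp(m, e) == np.float32(6.0)
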
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index dbb8dd48e..0cfa1cea7 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -2137,6 +2137,14 @@ NPY_NO_EXPORT void
}
NPY_NO_EXPORT void
+@TYPE@_frexp_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    if (!run_unary_two_out_avx512_skx_frexp_@TYPE@(args, dimensions, steps)) {
+        @TYPE@_frexp(args, dimensions, steps, func);
+    }
+}
+
+NPY_NO_EXPORT void
@TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
BINARY_LOOP {
@@ -2147,6 +2155,14 @@ NPY_NO_EXPORT void
}
NPY_NO_EXPORT void
+@TYPE@_ldexp_avx512_skx(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
+{
+    if (!run_binary_avx512_skx_ldexp_@TYPE@(args, dimensions, steps)) {
+        @TYPE@_ldexp(args, dimensions, steps, func);
+    }
+}
+
+NPY_NO_EXPORT void
@TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
/*
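
Both wrappers follow the same pattern as the other AVX-512 loops: the run_*_avx512_skx_* helper returns 1 when the SKX kernel handled the whole loop (the feature was compiled in, all steps are small enough, and inputs/outputs do not overlap) and 0 otherwise, in which case the plain scalar loop runs. A hedged Python sketch of that control flow (the names here are illustrative, not NumPy internals):

    # Hypothetical sketch of the try-vector-path-else-fallback dispatch above.
    def frexp_loop(args, run_vector_kernel, scalar_loop):
        # run_vector_kernel mirrors run_unary_two_out_avx512_skx_frexp_@TYPE@:
        # it returns True only when it processed the data itself.
        if not run_vector_kernel(args):
            scalar_loop(args)   # fall back to the original @TYPE@_frexp loop
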
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index b63d442ef..5dd49c465 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -339,9 +339,15 @@ NPY_NO_EXPORT void
@TYPE@_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
+@TYPE@_frexp_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
@TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
+@TYPE@_ldexp_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
@TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
#define @TYPE@_true_divide @TYPE@_divide
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 2f7574d47..48e89915c 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -120,6 +120,13 @@ nomemoverlap(char *ip,
(nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
(nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))
+#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
+    ((abs(steps[0]) < MAX_STEP_SIZE) && \
+     (abs(steps[1]) < MAX_STEP_SIZE) && \
+     (abs(steps[2]) < MAX_STEP_SIZE) && \
+     (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
+     (nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0])))
+
/*
* 1) Output should be contiguous, can handle strided input data
* 2) Input step should be smaller than MAX_STEP_SIZE for performance
@@ -294,6 +301,42 @@ run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_in
/**end repeat1**/
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
+AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
+
+static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
+AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
+#endif
+
+static NPY_INLINE int
+run_binary_avx512_skx_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+    if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
+        AVX512_SKX_ldexp_@TYPE@(args, dimensions, steps);
+        return 1;
+    }
+    else
+        return 0;
+#endif
+    return 0;
+}
+
+static NPY_INLINE int
+run_unary_two_out_avx512_skx_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+    if (IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP) {
+        AVX512_SKX_frexp_@TYPE@(args, dimensions, steps);
+        return 1;
+    }
+    else
+        return 0;
+#endif
+    return 0;
+}
/**end repeat**/
/**begin repeat
@@ -2089,13 +2132,167 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, co
* #num_lanes = 16, 8#
* #vsuffix = ps, pd#
* #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
+ * #vtype1 = __m512, __m512d#
+ * #vtype2 = __m512i, __m256i#
* #scale = 4, 8#
* #vindextype = __m512i, __m256i#
* #vindexsize = 512, 256#
* #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
+ * #vtype2_load = _mm512_maskz_loadu_epi32, _mm256_maskz_loadu_epi32#
+ * #vtype2_gather = _mm512_mask_i32gather_epi32, _mm256_mmask_i32gather_epi32#
+ * #vtype2_store = _mm512_mask_storeu_epi32, _mm256_mask_storeu_epi32#
+ * #vtype2_scatter = _mm512_mask_i32scatter_epi32, _mm256_mask_i32scatter_epi32#
+ * #setzero = _mm512_setzero_epi32, _mm256_setzero_si256#
*/
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
+AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
+    const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(int);
+    const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@);
+    const npy_intp array_size = dimensions[0];
+    npy_intp num_remaining_elements = array_size;
+    @type@* ip1 = (@type@*) args[0];
+    int* ip2 = (int*) args[1];
+    @type@* op = (@type@*) args[2];
+
+    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
+     */
+
+    npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
+    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
+        index_ip1[ii] = ii*stride_ip1;
+        index_ip2[ii] = ii*stride_ip2;
+        index_op[ii] = ii*stride_op;
+    }
+    @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
+    @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
+    @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
+    @vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
+    @vtype2@ zeros = @setzero@();
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < @num_lanes@) {
+            load_mask = avx512_get_partial_load_mask_@vsuffix@(
+                    num_remaining_elements, @num_lanes@);
+        }
+        @vtype1@ x1;
+        @vtype2@ x2;
+        if (stride_ip1 == 1) {
+            x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
+        }
+        else {
+            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
+        }
+        if (stride_ip2 == 1) {
+            x2 = @vtype2_load@(load_mask, ip2);
+        }
+        else {
+            x2 = @vtype2_gather@(zeros, load_mask, vindex_ip2, ip2, 4);
+        }
+
+        @vtype1@ out = _mm512_scalef_@vsuffix@(x1, _mm512_cvtepi32_@vsuffix@(x2));
+
+        if (stride_op == 1) {
+            _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
+        }
+        else {
+            /* scatter! */
+            _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
+        }
+
+        ip1 += @num_lanes@*stride_ip1;
+        ip2 += @num_lanes@*stride_ip2;
+        op += @num_lanes@*stride_op;
+        num_remaining_elements -= @num_lanes@;
+    }
+}
+
+static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
+AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
+    const npy_intp stride_op1 = steps[1]/(npy_intp)sizeof(@type@);
+    const npy_intp stride_op2 = steps[2]/(npy_intp)sizeof(int);
+    const npy_intp array_size = dimensions[0];
+    npy_intp num_remaining_elements = array_size;
+    @type@* ip1 = (@type@*) args[0];
+    @type@* op1 = (@type@*) args[1];
+    int* op2 = (int*) args[2];
+
+    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP
+     */
+
+    npy_int32 index_ip1[@num_lanes@], index_op1[@num_lanes@], index_op2[@num_lanes@];
+    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
+        index_ip1[ii] = ii*stride_ip1;
+        index_op1[ii] = ii*stride_op1;
+        index_op2[ii] = ii*stride_op2;
+    }
+    @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
+    @vindextype@ vindex_op1 = @vindexload@((@vindextype@*)&index_op1[0]);
+    @vindextype@ vindex_op2 = @vindexload@((@vindextype@*)&index_op2[0]);
+    @vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < @num_lanes@) {
+            load_mask = avx512_get_partial_load_mask_@vsuffix@(
+                    num_remaining_elements, @num_lanes@);
+        }
+        @vtype1@ x1;
+        if (stride_ip1 == 1) {
+            x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
+        }
+        else {
+            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
+        }
+
+        /*
+         * The x86 instructions vgetmant and vgetexp do not conform
+         * with NumPy's output for special floating points: NAN, +/-INF, +/-0.0
+         * We mask these values with spmask to avoid invalid exceptions.
+         */
+        @mask@ spmask = _mm512_knot(_mm512_fpclass_@vsuffix@_mask(
+                                        x1, 0b10011111));
+        @vtype1@ out1 = _mm512_maskz_getmant_@vsuffix@(
+                            spmask, x1, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
+        out1 = _mm512_mask_mov_@vsuffix@(x1, spmask, out1);
+        @vtype2@ out2 = _mm512_cvt@vsuffix@_epi32(
+                            _mm512_maskz_add_@vsuffix@(spmask, _mm512_set1_@vsuffix@(1.0),
+                                _mm512_maskz_getexp_@vsuffix@(spmask, x1)));
+        if (stride_op1 == 1) {
+            _mm512_mask_storeu_@vsuffix@(op1, load_mask, out1);
+        }
+        else {
+            _mm512_mask_i32scatter_@vsuffix@(op1, load_mask, vindex_op1, out1, @scale@);
+        }
+        if (stride_op2 == 1) {
+            @vtype2_store@(op2, load_mask, out2);
+        }
+        else {
+            @vtype2_scatter@(op2, load_mask, vindex_op2, out2, 4);
+        }
+
+        ip1 += @num_lanes@*stride_ip1;
+        op1 += @num_lanes@*stride_op1;
+        op2 += @num_lanes@*stride_op2;
+        num_remaining_elements -= @num_lanes@;
+    }
+}
+#endif
+
/**begin repeat1
* #func = maximum, minimum#
* #vectorf = max, min#
@@ -2131,14 +2328,14 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
@vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
@vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
@vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
- @vtype@ zeros_f = _mm512_setzero_@vsuffix@();
+ @vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
while (num_remaining_elements > 0) {
if (num_remaining_elements < @num_lanes@) {
load_mask = avx512_get_partial_load_mask_@vsuffix@(
num_remaining_elements, @num_lanes@);
}
- @vtype@ x1, x2;
+ @vtype1@ x1, x2;
if (stride_ip1 == 1) {
x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
}
@@ -2158,7 +2355,7 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
* this issue to conform with NumPy behaviour.
*/
@mask@ nan_mask = _mm512_cmp_@vsuffix@_mask(x1, x1, _CMP_NEQ_UQ);
- @vtype@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
+ @vtype1@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
out = _mm512_mask_blend_@vsuffix@(nan_mask, out, x1);
if (stride_op == 1) {
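
Two implementation notes on the kernels above. For ldexp, _mm512_scalef_* computes x1 * 2**floor(x2) directly, so one scaling instruction replaces the scalar libm call. For frexp, getmant/getexp alone would not reproduce NumPy's behaviour for NaN, +/-inf and +/-0.0, so those lanes are detected with the fpclass mask and passed through unchanged with a zero exponent. The NumPy-level behaviour the masking has to preserve, matching the new tests below (the inf/NaN results are the Linux behaviour the tests are restricted to):

    import numpy as np

    assert np.ldexp(np.float64(0.75), 3) == 6.0      # 0.75 * 2**3
    m, e = np.frexp(np.array([np.inf, -0.0, 1.0]))
    # m -> [inf, -0.0, 0.5]; e -> [0, 0, 1]: specials pass through with exponent 0
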
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 0b9b06d01..91acd6ac3 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -3,6 +3,7 @@ import warnings
import fnmatch
import itertools
import pytest
+import sys
from fractions import Fraction
import numpy.core.umath as ncu
@@ -789,6 +790,33 @@ class TestFPClass:
assert_equal(np.isfinite(arr_f32[::stride]), finite[::stride])
assert_equal(np.isfinite(arr_f64[::stride]), finite[::stride])
+class TestLDExp:
+    @pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
+    @pytest.mark.parametrize("dtype", ['f', 'd'])
+    def test_ldexp(self, dtype, stride):
+        mant = np.array([0.125, 0.25, 0.5, 1., 1., 2., 4., 8.], dtype=dtype)
+        exp = np.array([3, 2, 1, 0, 0, -1, -2, -3], dtype='i')
+        out = np.zeros(8, dtype=dtype)
+        assert_equal(np.ldexp(mant[::stride], exp[::stride], out=out[::stride]), np.ones(8, dtype=dtype)[::stride])
+        assert_equal(out[::stride], np.ones(8, dtype=dtype)[::stride])
+
+class TestFRExp:
+    @pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
+    @pytest.mark.parametrize("dtype", ['f', 'd'])
+    @pytest.mark.skipif(not sys.platform.startswith('linux'),
+                        reason="np.frexp gives different answers for NAN/INF on windows and linux")
+    def test_frexp(self, dtype, stride):
+        arr = np.array([np.nan, np.nan, np.inf, -np.inf, 0.0, -0.0, 1.0, -1.0], dtype=dtype)
+        mant_true = np.array([np.nan, np.nan, np.inf, -np.inf, 0.0, -0.0, 0.5, -0.5], dtype=dtype)
+        exp_true = np.array([0, 0, 0, 0, 0, 0, 1, 1], dtype='i')
+        out_mant = np.ones(8, dtype=dtype)
+        out_exp = 2*np.ones(8, dtype='i')
+        mant, exp = np.frexp(arr[::stride], out=(out_mant[::stride], out_exp[::stride]))
+        assert_equal(mant_true[::stride], mant)
+        assert_equal(exp_true[::stride], exp)
+        assert_equal(out_mant[::stride], mant_true[::stride])
+        assert_equal(out_exp[::stride], exp_true[::stride])
+
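
The new tests drive np.ldexp and np.frexp through every strided path the SKX kernels special-case (contiguous loads/stores, gathers, and scatters), including writing through non-contiguous out= views. For completeness, a small usage example of that write-through behaviour, independent of the test suite:

    import numpy as np

    buf = np.zeros(8, dtype='f')
    # out= may be a strided view; results are written through it in place.
    np.ldexp(np.float32(0.5), np.arange(4, dtype='i'), out=buf[::2])
    # buf[::2] is now [0.5, 1.0, 2.0, 4.0]; the other elements remain 0.
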
# func : [maxulperror, low, high]
avx_ufuncs = {'sqrt' :[1, 0., 100.],
'absolute' :[0, -100., 100.],