Diffstat (limited to 'numpy/core')
 numpy/core/code_generators/generate_umath.py |   8
 numpy/core/src/umath/loops.c.src             |  56
 numpy/core/src/umath/loops.h.src             |  13
 numpy/core/src/umath/simd.inc.src            | 404
 numpy/core/tests/test_umath.py               |  90
 5 files changed, 520 insertions, 51 deletions
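The simd.inc.src changes below all follow the same pattern: full vectors are processed directly, the final partial vector is handled with a load/store mask instead of a scalar fallback, and strided input goes through a gather. The following is a minimal sketch of that pattern for the unit-stride float32 sqrt case, assuming an AVX512F target; the function name sqrt_f32_avx512_sketch is illustrative only, and the real loops in the patch additionally handle strides via gathers and substitute 1.0 on masked lanes so that reciprocal cannot raise a divide-by-zero on lanes that are never stored.

#include <immintrin.h>
#include <stddef.h>

/*
 * Sketch of masked tail handling (AVX512F, unit stride, not the exact
 * loop from simd.inc.src).
 */
static void
sqrt_f32_avx512_sketch(float *op, const float *ip, size_t n)
{
    while (n > 0) {
        /* 0xFFFF selects all 16 lanes; a shorter tail selects only n lanes */
        __mmask16 mask = (n >= 16) ? (__mmask16)0xFFFF
                                   : (__mmask16)((1u << n) - 1u);
        __m512 x = _mm512_maskz_loadu_ps(mask, ip);  /* masked-off lanes read as 0.0f */
        __m512 r = _mm512_sqrt_ps(x);
        _mm512_mask_storeu_ps(op, mask, r);          /* only the selected lanes are written */
        ip += 16;
        op += 16;
        n = (n >= 16) ? n - 16 : 0;
    }
}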
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 6729fe197..1c87a148d 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -358,14 +358,14 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.square'), None, - TD(ints+inexact, simd=[('avx2', ints)]), + TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'fd')]), TD(O, f='Py_square'), ), 'reciprocal': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.reciprocal'), None, - TD(ints+inexact, simd=[('avx2', ints)]), + TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]), TD(O, f='Py_reciprocal'), ), # This is no longer used as numpy.ones_like, however it is @@ -395,7 +395,7 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.absolute'), 'PyUFunc_AbsoluteTypeResolver', - TD(bints+flts+timedeltaonly), + TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD(cmplx, out=('f', 'd', 'g')), TD(O, f='PyNumber_Absolute'), ), @@ -762,7 +762,7 @@ defdict = { docstrings.get('numpy.core.umath.sqrt'), None, TD('e', f='sqrt', astype={'e':'f'}), - TD(inexactvec), + TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD('fdg' + cmplx, f='sqrt'), TD(P, f='sqrt'), ), diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 5443223ab..ad2ec9e4a 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1657,6 +1657,60 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE */ /**begin repeat1 + * #TYPE = FLOAT, DOUBLE# + * #type = npy_float, npy_double# + * #typesub = f, # + */ + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_sqrt_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +{ + if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) { + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + *(@type@ *)op1 = npy_sqrt@typesub@(in1); + } + } +} + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_absolute_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +{ + if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) { + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ tmp = in1 > 0 ? 
in1 : -in1; + /* add 0 to clear -0.0 */ + *((@type@ *)op1) = tmp + 0; + } + } +} + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_square_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +{ + if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) { + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + *(@type@ *)op1 = in1*in1; + } + } +} + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_reciprocal_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +{ + if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) { + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + *(@type@ *)op1 = 1.0f/in1; + } + } +} + +/**end repeat1**/ + +/**begin repeat1 * #func = exp, log# * #scalarf = npy_expf, npy_logf# */ @@ -1706,8 +1760,6 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY } /**end repeat1**/ - - /**end repeat**/ /**begin repeat diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 5070ab38b..fe1b1145d 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -175,6 +175,19 @@ NPY_NO_EXPORT void */ NPY_NO_EXPORT void @TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #isa = avx512f, fma# + */ + +/**begin repeat2 + * #func = sqrt, absolute, square, reciprocal# + */ +NPY_NO_EXPORT void +@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); + +/**end repeat2**/ +/**end repeat1**/ /**end repeat**/ /**begin repeat diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 88e5e1f1b..c0dc53dd8 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -139,6 +139,37 @@ abs_ptrdiff(char *a, char *b) /* prototypes */ /**begin repeat1 + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + */ + +/**begin repeat2 + * #func = sqrt, absolute, square, reciprocal# + */ + +#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS +static NPY_INLINE void +@ISA@_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n, const npy_intp stride); +#endif + +static NPY_INLINE int +run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps) +{ +#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS + if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), @REGISTER_SIZE@)) { + @ISA@_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]); + return 1; + } + else + return 0; +#endif + return 0; +} + +/**end repeat2**/ +/**end repeat1**/ + +/**begin repeat1 * #func = exp, log# */ @@ -1144,41 +1175,76 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) #if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_get_full_load_mask(void) +fma_get_full_load_mask_ps(void) { return _mm256_set1_ps(-1.0); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i +fma_get_full_load_mask_pd(void) +{ + return _mm256_castpd_si256(_mm256_set1_pd(-1.0)); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem) +fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes) { float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0, 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; - float* addr = maskint + total_elem - num_lanes; + float* addr = maskint + num_lanes - num_elem; return _mm256_loadu_ps(addr); } +static NPY_INLINE 
NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i +fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes) +{ + npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1}; + npy_int* addr = maskint + 2*num_lanes - 2*num_elem; + return _mm256_loadu_si256((__m256i*) addr); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_masked_gather(__m256 src, - npy_float* addr, - __m256i vindex, - __m256 mask) +fma_masked_gather_ps(__m256 src, + npy_float* addr, + __m256i vindex, + __m256 mask) { return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d +fma_masked_gather_pd(__m256d src, + npy_double* addr, + __m128i vindex, + __m256d mask) +{ + return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_masked_load(__m256 mask, npy_float* addr) +fma_masked_load_ps(__m256 mask, npy_float* addr) { return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask)); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d +fma_masked_load_pd(__m256i mask, npy_double* addr) +{ + return _mm256_maskload_pd(addr, mask); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_set_masked_lanes(__m256 x, __m256 val, __m256 mask) +fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask) { return _mm256_blendv_ps(x, val, mask); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d +fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask) +{ + return _mm256_blendv_pd(x, val, mask); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 fma_blend(__m256 x, __m256 y, __m256 ymask) { @@ -1186,6 +1252,18 @@ fma_blend(__m256 x, __m256 y, __m256 ymask) } static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 +fma_invert_mask_ps(__m256 ymask) +{ + return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0)); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i +fma_invert_mask_pd(__m256i ymask) +{ + return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF)); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp) { return _mm256_cvtepi32_ps( @@ -1290,42 +1368,91 @@ fma_scalef_ps(__m256 poly, __m256 quadrant) } } +/**begin repeat + * #vsub = ps, pd# + * #vtype = __m256, __m256d# + */ +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ +fma_abs_@vsub@(@vtype@ x) +{ + return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ +fma_reciprocal_@vsub@(@vtype@ x) +{ + return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x); +} +/**end repeat**/ #endif #if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 -avx512_get_full_load_mask(void) +avx512_get_full_load_mask_ps(void) { return 0xFFFF; } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 +avx512_get_full_load_mask_pd(void) +{ + return 0xFF; +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 -avx512_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem) +avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem) { return (0x0001 << num_elem) - 0x0001; } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 +avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem) +{ + return (0x01 << num_elem) - 0x01; +} + static NPY_INLINE NPY_GCC_OPT_3 
NPY_GCC_TARGET_AVX512F __m512 -avx512_masked_gather(__m512 src, - npy_float* addr, - __m512i vindex, - __mmask16 kmask) +avx512_masked_gather_ps(__m512 src, + npy_float* addr, + __m512i vindex, + __mmask16 kmask) { return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d +avx512_masked_gather_pd(__m512d src, + npy_double* addr, + __m256i vindex, + __mmask8 kmask) +{ + return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_masked_load(__mmask16 mask, npy_float* addr) +avx512_masked_load_ps(__mmask16 mask, npy_float* addr) { return _mm512_maskz_loadu_ps(mask, (__m512 *)addr); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d +avx512_masked_load_pd(__mmask8 mask, npy_double* addr) +{ + return _mm512_maskz_loadu_pd(mask, (__m512d *)addr); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_set_masked_lanes(__m512 x, __m512 val, __mmask16 mask) +avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask) { return _mm512_mask_blend_ps(mask, x, val); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d +avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask) +{ + return _mm512_mask_blend_pd(mask, x, val); +} + static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 avx512_blend(__m512 x, __m512 y, __mmask16 ymask) { @@ -1333,6 +1460,18 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask) } static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 +avx512_invert_mask_ps(__mmask16 ymask) +{ + return _mm512_knot(ymask); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 +avx512_invert_mask_pd(__mmask8 ymask) +{ + return _mm512_knot(ymask); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp) { return _mm512_cmpeq_epi32_mask(_mm512_and_epi32(k, andop), cmp); @@ -1361,6 +1500,22 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant) { return _mm512_scalef_ps(poly, quadrant); } +/**begin repeat + * #vsub = ps, pd# + * #vtype= __m512, __m512d# + */ +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ +avx512_abs_@vsub@(@vtype@ x) +{ + return _mm512_abs_@vsub@(x); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ +avx512_reciprocal_@vsub@(@vtype@ x) +{ + return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x); +} +/**end repeat**/ #endif /**begin repeat @@ -1438,9 +1593,175 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ sin = @fmadd@(sin, x, x); return sin; } + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ +@isa@_sqrt_ps(@vtype@ x) +{ + return _mm@vsize@_sqrt_ps(x); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d +@isa@_sqrt_pd(@vtype@d x) +{ + return _mm@vsize@_sqrt_pd(x); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ +@isa@_square_ps(@vtype@ x) +{ + return _mm@vsize@_mul_ps(x,x); +} + +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d +@isa@_square_pd(@vtype@d x) +{ + return _mm@vsize@_mul_pd(x,x); +} + #endif /**end repeat**/ + +/**begin repeat + * #ISA = FMA, AVX512F# + * #isa = fma, avx512# + * #vsize = 256, 512# + * #BYTES = 32, 64# + * #cvtps_epi32 = _mm256_cvtps_epi32, # + * #mask = __m256, __mmask16# + * #vsub = , _mask# + * #vtype = __m256, __m512# + * #cvtps_epi32 = _mm256_cvtps_epi32, # + * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps# + * 
#CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# + */ + +/**begin repeat1 + * #func = sqrt, absolute, square, reciprocal# + * #vectorf = sqrt, abs, square, reciprocal# + */ + +#if defined @CHK@ +static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void +@ISA@_@func@_FLOAT(npy_float* op, + npy_float* ip, + const npy_intp array_size, + const npy_intp steps) +{ + const npy_intp stride = steps/sizeof(npy_float); + const npy_int num_lanes = @BYTES@/sizeof(npy_float); + npy_intp num_remaining_elements = array_size; + @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f); + @mask@ load_mask = @isa@_get_full_load_mask_ps(); + @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask); + npy_int indexarr[16]; + for (npy_int ii = 0; ii < 16; ii++) { + indexarr[ii] = ii*stride; + } + @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]); + + while (num_remaining_elements > 0) { + if (num_remaining_elements < num_lanes) { + load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, + num_lanes); + inv_load_mask = @isa@_invert_mask_ps(load_mask); + } + @vtype@ x; + if (stride == 1) { + x = @isa@_masked_load_ps(load_mask, ip); + /* + * Replace masked elements with 1.0f to avoid divide by zero fp + * exception in reciprocal + */ + x = @isa@_set_masked_lanes_ps(x, ones_f, inv_load_mask); + } + else { + x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask); + } + @vtype@ out = @isa@_@vectorf@_ps(x); + @masked_store@(op, @cvtps_epi32@(load_mask), out); + + ip += num_lanes*stride; + op += num_lanes; + num_remaining_elements -= num_lanes; + } +} +#endif +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #ISA = FMA, AVX512F# + * #isa = fma, avx512# + * #vsize = 256, 512# + * #BYTES = 32, 64# + * #cvtps_epi32 = _mm256_cvtps_epi32, # + * #mask = __m256i, __mmask8# + * #vsub = , _mask# + * #vtype = __m256d, __m512d# + * #vindextype = __m128i, __m256i# + * #vindexsize = 128, 256# + * #vindexload = _mm_loadu_si128, _mm256_loadu_si256# + * #cvtps_epi32 = _mm256_cvtpd_epi32, # + * #castmask = _mm256_castsi256_pd, # + * #masked_store = _mm256_maskstore_pd, _mm512_mask_storeu_pd# + * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# + */ + +/**begin repeat1 + * #func = sqrt, absolute, square, reciprocal# + * #vectorf = sqrt, abs, square, reciprocal# + */ + +#if defined @CHK@ +static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void +@ISA@_@func@_DOUBLE(npy_double* op, + npy_double* ip, + const npy_intp array_size, + const npy_intp steps) +{ + const npy_intp stride = steps/sizeof(npy_double); + const npy_int num_lanes = @BYTES@/sizeof(npy_double); + npy_intp num_remaining_elements = array_size; + @mask@ load_mask = @isa@_get_full_load_mask_pd(); + @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask); + @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f); + npy_int indexarr[8]; + for (npy_int ii = 0; ii < 8; ii++) { + indexarr[ii] = ii*stride; + } + @vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]); + + while (num_remaining_elements > 0) { + if (num_remaining_elements < num_lanes) { + load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements, + num_lanes); + inv_load_mask = @isa@_invert_mask_pd(load_mask); + } + @vtype@ x; + if (stride == 1) { + x = @isa@_masked_load_pd(load_mask, ip); + /* + * Replace masked elements with 1.0f to avoid divide by zero fp + * exception in reciprocal + */ + x = @isa@_set_masked_lanes_pd(x, ones_d, @castmask@(inv_load_mask)); + } + else { + x = @isa@_masked_gather_pd(ones_d, ip, 
vindex, @castmask@(load_mask)); + } + @vtype@ out = @isa@_@vectorf@_pd(x); + @masked_store@(op, load_mask, out); + + ip += num_lanes*stride; + op += num_lanes; + num_remaining_elements -= num_lanes; + } +} +#endif +/**end repeat1**/ +/**end repeat**/ + /**begin repeat * #ISA = FMA, AVX512F# * #isa = fma, avx512# @@ -1460,7 +1781,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# */ - /* * Vectorized approximate sine/cosine algorithms: The following code is a * vectorized version of the algorithm presented here: @@ -1519,7 +1839,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @vtype@ quadrant, reduced_x, reduced_x2, cos, sin; @vtype@i iquadrant; @mask@ nan_mask, glibc_mask, sine_mask, negate_mask; - @mask@ load_mask = @isa@_get_full_load_mask(); + @mask@ load_mask = @isa@_get_full_load_mask_ps(); npy_intp num_remaining_elements = array_size; npy_int indexarr[16]; for (npy_int ii = 0; ii < 16; ii++) { @@ -1530,16 +1850,16 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void while (num_remaining_elements > 0) { if (num_remaining_elements < num_lanes) { - load_mask = @isa@_get_partial_load_mask(num_remaining_elements, + load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, num_lanes); } @vtype@ x; if (stride == 1) { - x = @isa@_masked_load(load_mask, ip); + x = @isa@_masked_load_ps(load_mask, ip); } else { - x = @isa@_masked_gather(zero_f, ip, vindex, load_mask); + x = @isa@_masked_gather_ps(zero_f, ip, vindex, load_mask); } /* @@ -1551,7 +1871,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void glibc_mask = @isa@_in_range_mask(x, large_number,-large_number); glibc_mask = @and_masks@(load_mask, glibc_mask); nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ); - x = @isa@_set_masked_lanes(x, zero_f, @or_masks@(nan_mask, glibc_mask)); + x = @isa@_set_masked_lanes_ps(x, zero_f, @or_masks@(nan_mask, glibc_mask)); npy_int iglibc_mask = @mask_to_int@(glibc_mask); if (iglibc_mask != @full_mask@) { @@ -1584,7 +1904,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void /* multiply by -1 for appropriate elements */ negate_mask = @isa@_should_negate(iquadrant, twos, twos); cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask); - cos = @isa@_set_masked_lanes(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask); + cos = @isa@_set_masked_lanes_ps(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask); @masked_store@(op, @cvtps_epi32@(load_mask), cos); } @@ -1662,27 +1982,27 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]); @mask@ xmax_mask, xmin_mask, nan_mask, inf_mask; - @mask@ overflow_mask = @isa@_get_partial_load_mask(0, num_lanes); - @mask@ load_mask = @isa@_get_full_load_mask(); + @mask@ overflow_mask = @isa@_get_partial_load_mask_ps(0, num_lanes); + @mask@ load_mask = @isa@_get_full_load_mask_ps(); npy_intp num_remaining_elements = array_size; while (num_remaining_elements > 0) { if (num_remaining_elements < num_lanes) { - load_mask = @isa@_get_partial_load_mask(num_remaining_elements, - num_lanes); + load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, + num_lanes); } @vtype@ x; if (stride == 1) { - x = @isa@_masked_load(load_mask, ip); + x = @isa@_masked_load_ps(load_mask, ip); } else { - x = @isa@_masked_gather(zeros_f, ip, vindex, load_mask); + x = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask); } nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ); - x = 
@isa@_set_masked_lanes(x, zeros_f, nan_mask); + x = @isa@_set_masked_lanes_ps(x, zeros_f, nan_mask); xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ); xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ); @@ -1690,7 +2010,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void overflow_mask = @or_masks@(overflow_mask, @xor_masks@(xmax_mask, inf_mask)); - x = @isa@_set_masked_lanes(x, zeros_f, @or_masks@( + x = @isa@_set_masked_lanes_ps(x, zeros_f, @or_masks@( @or_masks@(nan_mask, xmin_mask), xmax_mask)); quadrant = _mm@vsize@_mul_ps(x, log2e); @@ -1723,9 +2043,9 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void * elem < xmin; return 0.0f * elem = +/- nan, return nan */ - poly = @isa@_set_masked_lanes(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask); - poly = @isa@_set_masked_lanes(poly, inf, xmax_mask); - poly = @isa@_set_masked_lanes(poly, zeros_f, xmin_mask); + poly = @isa@_set_masked_lanes_ps(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask); + poly = @isa@_set_masked_lanes_ps(poly, inf, xmax_mask); + poly = @isa@_set_masked_lanes_ps(poly, zeros_f, xmin_mask); @masked_store@(op, @cvtps_epi32@(load_mask), poly); @@ -1790,24 +2110,24 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @vtype@ poly, num_poly, denom_poly, exponent; @mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask; - @mask@ invalid_mask = @isa@_get_partial_load_mask(0, num_lanes); + @mask@ invalid_mask = @isa@_get_partial_load_mask_ps(0, num_lanes); @mask@ divide_by_zero_mask = invalid_mask; - @mask@ load_mask = @isa@_get_full_load_mask(); + @mask@ load_mask = @isa@_get_full_load_mask_ps(); npy_intp num_remaining_elements = array_size; while (num_remaining_elements > 0) { if (num_remaining_elements < num_lanes) { - load_mask = @isa@_get_partial_load_mask(num_remaining_elements, - num_lanes); + load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, + num_lanes); } @vtype@ x_in; if (stride == 1) { - x_in = @isa@_masked_load(load_mask, ip); + x_in = @isa@_masked_load_ps(load_mask, ip); } else { - x_in = @isa@_masked_gather(zeros_f, ip, vindex, load_mask); + x_in = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask); } negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ); @@ -1818,7 +2138,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void @and_masks@(zero_mask, load_mask)); invalid_mask = @or_masks@(invalid_mask, negx_mask); - @vtype@ x = @isa@_set_masked_lanes(x_in, zeros_f, negx_mask); + @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zeros_f, negx_mask); /* set x = normalized mantissa */ exponent = @isa@_get_exponent(x); diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index ef48fed05..294a9b1fd 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -694,8 +694,92 @@ class TestSpecialFloats(object): assert_raises(FloatingPointError, np.cos, np.float32(-np.inf)) assert_raises(FloatingPointError, np.cos, np.float32(np.inf)) + def test_sqrt_values(self): + with np.errstate(all='ignore'): + x = [np.nan, np.nan, np.inf, np.nan, 0.] + y = [np.nan, -np.nan, np.inf, -np.inf, 0.] 
+ for dt in ['f', 'd', 'g']: + xf = np.array(x, dtype=dt) + yf = np.array(y, dtype=dt) + assert_equal(np.sqrt(yf), xf) + + with np.errstate(invalid='raise'): + for dt in ['f', 'd', 'g']: + assert_raises(FloatingPointError, np.sqrt, np.array(-100., dtype=dt)) + + def test_abs_values(self): + x = [np.nan, np.nan, np.inf, np.inf, 0., 0., 1.0, 1.0] + y = [np.nan, -np.nan, np.inf, -np.inf, 0., -0., -1.0, 1.0] + for dt in ['f', 'd', 'g']: + xf = np.array(x, dtype=dt) + yf = np.array(y, dtype=dt) + assert_equal(np.abs(yf), xf) + + def test_square_values(self): + x = [np.nan, np.nan, np.inf, np.inf] + y = [np.nan, -np.nan, np.inf, -np.inf] + with np.errstate(all='ignore'): + for dt in ['f', 'd', 'g']: + xf = np.array(x, dtype=dt) + yf = np.array(y, dtype=dt) + assert_equal(np.square(yf), xf) + + with np.errstate(over='raise'): + assert_raises(FloatingPointError, np.square, np.array(1E32, dtype='f')) + assert_raises(FloatingPointError, np.square, np.array(1E200, dtype='d')) -class TestSIMDFloat32(object): + def test_reciprocal_values(self): + with np.errstate(all='ignore'): + x = [np.nan, np.nan, 0.0, -0.0, np.inf, -np.inf] + y = [np.nan, -np.nan, np.inf, -np.inf, 0., -0.] + for dt in ['f', 'd', 'g']: + xf = np.array(x, dtype=dt) + yf = np.array(y, dtype=dt) + assert_equal(np.reciprocal(yf), xf) + + with np.errstate(divide='raise'): + for dt in ['f', 'd', 'g']: + assert_raises(FloatingPointError, np.reciprocal, np.array(-0.0, dtype=dt)) + +# func : [maxulperror, low, high] +avx_ufuncs = {'sqrt' :[1, 0., 100.], + 'absolute' :[0, -100., 100.], + 'reciprocal' :[1, 1., 100.], + 'square' :[1, -100., 100.]} + +class TestAVXUfuncs(object): + def test_avx_based_ufunc(self): + strides = np.array([-4,-3,-2,-1,1,2,3,4]) + np.random.seed(42) + for func, prop in avx_ufuncs.items(): + maxulperr = prop[0] + minval = prop[1] + maxval = prop[2] + # various array sizes to ensure masking in AVX is tested + for size in range(1,32): + myfunc = getattr(np, func) + x_f32 = np.float32(np.random.uniform(low=minval, high=maxval, + size=size)) + x_f64 = np.float64(x_f32) + x_f128 = np.float128(x_f32) + y_true128 = myfunc(x_f128) + if maxulperr == 0: + assert_equal(myfunc(x_f32), np.float32(y_true128)) + assert_equal(myfunc(x_f64), np.float64(y_true128)) + else: + assert_array_max_ulp(myfunc(x_f32), np.float32(y_true128), + maxulp=maxulperr) + assert_array_max_ulp(myfunc(x_f64), np.float64(y_true128), + maxulp=maxulperr) + # various strides to test gather instruction + if size > 1: + y_true32 = myfunc(x_f32) + y_true64 = myfunc(x_f64) + for jj in strides: + assert_equal(myfunc(x_f64[::jj]), y_true64[::jj]) + assert_equal(myfunc(x_f32[::jj]), y_true32[::jj]) + +class TestAVXFloat32Transcendental(object): def test_exp_float32(self): np.random.seed(42) x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000)) @@ -722,8 +806,8 @@ class TestSIMDFloat32(object): def test_strided_float32(self): np.random.seed(42) - strides = np.random.randint(low=-100, high=100, size=100) - sizes = np.random.randint(low=1, high=2000, size=100) + strides = np.array([-4,-3,-2,-1,1,2,3,4]) + sizes = np.arange(2,100) for ii in sizes: x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii)) exp_true = np.exp(x_f32) |
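
The new TestAVXUfuncs class above boils down to two ideas: bound the error of the float32/float64 results in ULPs against a long-double reference, and re-run the ufunc on strided views so the gather-based code path is also covered. A minimal standalone sketch of those checks, assuming a platform where np.float128 is available (sizes and ranges here are illustrative, not taken from the test):

import numpy as np
from numpy.testing import assert_array_max_ulp, assert_equal

np.random.seed(42)
# An odd size such as 31 leaves a partial vector, so the masked tail path is hit.
x32 = np.float32(np.random.uniform(low=1.0, high=100.0, size=31))
x128 = np.float128(x32)          # exact copy of the float32 values in long double

# sqrt/square/reciprocal are allowed 1 ULP of error, matching the avx_ufuncs table.
assert_array_max_ulp(np.sqrt(x32), np.float32(np.sqrt(x128)), maxulp=1)
assert_array_max_ulp(np.reciprocal(x32), np.float32(np.reciprocal(x128)), maxulp=1)
# absolute is expected to be exact.
assert_equal(np.absolute(x32), np.float32(np.absolute(x128)))

# Strided views exercise the gather path rather than the contiguous masked load.
assert_equal(np.sqrt(x32[::2]), np.sqrt(x32)[::2])

Because the reference input is the float32 data widened to long double, the reference result is effectively exact, so a 1-ULP bound measures only the rounding error of the vectorized loop.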