Diffstat (limited to 'numpy/core')

-rw-r--r--  numpy/core/code_generators/generate_umath.py |   8
-rw-r--r--  numpy/core/src/umath/loops.c.src             |  56
-rw-r--r--  numpy/core/src/umath/loops.h.src             |  13
-rw-r--r--  numpy/core/src/umath/simd.inc.src            | 404
-rw-r--r--  numpy/core/tests/test_umath.py               |  90

5 files changed, 520 insertions(+), 51 deletions(-)
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 6729fe197..1c87a148d 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -358,14 +358,14 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.square'),
None,
- TD(ints+inexact, simd=[('avx2', ints)]),
+ TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'fd')]),
TD(O, f='Py_square'),
),
'reciprocal':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.reciprocal'),
None,
- TD(ints+inexact, simd=[('avx2', ints)]),
+ TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]),
TD(O, f='Py_reciprocal'),
),
# This is no longer used as numpy.ones_like, however it is
@@ -395,7 +395,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.absolute'),
'PyUFunc_AbsoluteTypeResolver',
- TD(bints+flts+timedeltaonly),
+ TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD(cmplx, out=('f', 'd', 'g')),
TD(O, f='PyNumber_Absolute'),
),
@@ -762,7 +762,7 @@ defdict = {
docstrings.get('numpy.core.umath.sqrt'),
None,
TD('e', f='sqrt', astype={'e':'f'}),
- TD(inexactvec),
+ TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD('fdg' + cmplx, f='sqrt'),
TD(P, f='sqrt'),
),
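For context: each simd=[('fma', 'fd'), ('avx512f', 'fd')] entry makes generate_umath.py emit a runtime-dispatch snippet into __umath_generated.c that swaps the baseline inner loop for the ISA-specific one when the CPU supports it. A hedged sketch of the shape of that generated code (the array name sqrt_functions and the slot index are illustrative assumptions, not the generator's literal output):

    #ifdef HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
    if (npy_cpu_supports("avx512f")) {
        /* patch the float32 loop slot at module init; index is hypothetical */
        sqrt_functions[2] = FLOAT_sqrt_avx512f;
    }
    #endif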
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 5443223ab..ad2ec9e4a 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1657,6 +1657,60 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE
*/
/**begin repeat1
+ * #TYPE = FLOAT, DOUBLE#
+ * #type = npy_float, npy_double#
+ * #typesub = f, #
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_sqrt_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *(@type@ *)op1 = npy_sqrt@typesub@(in1);
+ }
+ }
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_absolute_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ tmp = in1 > 0 ? in1 : -in1;
+ /* add 0 to clear -0.0 */
+ *((@type@ *)op1) = tmp + 0;
+ }
+ }
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_square_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *(@type@ *)op1 = in1*in1;
+ }
+ }
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_reciprocal_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *(@type@ *)op1 = 1.0f/in1;
+ }
+ }
+}
+
+/**end repeat1**/
+
+/**begin repeat1
* #func = exp, log#
* #scalarf = npy_expf, npy_logf#
*/
@@ -1706,8 +1760,6 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
}
/**end repeat1**/
-
-
/**end repeat**/
/**begin repeat
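To make the templating concrete: substituting @TYPE@ = FLOAT, @type@ = npy_float, @typesub@ = f and @isa@ = avx512f (from the enclosing repeat block) into the first loop above yields the function below. The scalar UNARY_LOOP body is the fallback, taken only when the SIMD helper declines the call:

    NPY_NO_EXPORT NPY_GCC_OPT_3 void
    FLOAT_sqrt_avx512f(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
    {
        /* try the vector kernel first; run the scalar loop only if it bails */
        if (!run_unary_avx512f_sqrt_FLOAT(args, dimensions, steps)) {
            UNARY_LOOP {
                const npy_float in1 = *(npy_float *)ip1;
                *(npy_float *)op1 = npy_sqrtf(in1);
            }
        }
    }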
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 5070ab38b..fe1b1145d 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -175,6 +175,19 @@ NPY_NO_EXPORT void
*/
NPY_NO_EXPORT void
@TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**begin repeat1
+ * #isa = avx512f, fma#
+ */
+
+/**begin repeat2
+ * #func = sqrt, absolute, square, reciprocal#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**end repeat2**/
+/**end repeat1**/
/**end repeat**/
/**begin repeat
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 88e5e1f1b..c0dc53dd8 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -139,6 +139,37 @@ abs_ptrdiff(char *a, char *b)
/* prototypes */
/**begin repeat1
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ */
+
+/**begin repeat2
+ * #func = sqrt, absolute, square, reciprocal#
+ */
+
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE void
+@ISA@_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n, const npy_intp stride);
+#endif
+
+static NPY_INLINE int
+run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+ if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), @REGISTER_SIZE@)) {
+ @ISA@_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
+ return 1;
+ }
+ else
+ return 0;
+#endif
+ return 0;
+}
+
+/**end repeat2**/
+/**end repeat1**/
+
+/**begin repeat1
* #func = exp, log#
*/
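One expanded instance of the run_unary_* stubs above, assuming @ISA@ = AVX512F, @func@ = sqrt, @TYPE@ = FLOAT and @REGISTER_SIZE@ = 64 (the AVX512 register width in bytes; these substitutions come from an enclosing repeat block not shown in this hunk):

    static NPY_INLINE int
    run_unary_avx512f_sqrt_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
    {
    #if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
        /* note the argument order: output (args[1]) first, then input (args[0]) */
        if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), 64)) {
            AVX512F_sqrt_FLOAT((npy_float*)args[1], (npy_float*)args[0],
                               dimensions[0], steps[0]);
            return 1;
        }
        else {
            return 0;
        }
    #endif
        return 0;
    }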
@@ -1144,41 +1175,76 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_full_load_mask(void)
+fma_get_full_load_mask_ps(void)
{
return _mm256_set1_ps(-1.0);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+fma_get_full_load_mask_pd(void)
+{
+ return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
+fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
{
float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
- float* addr = maskint + total_elem - num_lanes;
+ float* addr = maskint + num_lanes - num_elem;
return _mm256_loadu_ps(addr);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
+{
+ npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
+ npy_int* addr = maskint + 2*num_lanes - 2*num_elem;
+ return _mm256_loadu_si256((__m256i*) addr);
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_gather(__m256 src,
- npy_float* addr,
- __m256i vindex,
- __m256 mask)
+fma_masked_gather_ps(__m256 src,
+ npy_float* addr,
+ __m256i vindex,
+ __m256 mask)
{
return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+fma_masked_gather_pd(__m256d src,
+ npy_double* addr,
+ __m128i vindex,
+ __m256d mask)
+{
+ return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_load(__m256 mask, npy_float* addr)
+fma_masked_load_ps(__m256 mask, npy_float* addr)
{
return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+fma_masked_load_pd(__m256i mask, npy_double* addr)
+{
+ return _mm256_maskload_pd(addr, mask);
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
+fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
{
return _mm256_blendv_ps(x, val, mask);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
+{
+ return _mm256_blendv_pd(x, val, mask);
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_blend(__m256 x, __m256 y, __m256 ymask)
{
@@ -1186,6 +1252,18 @@ fma_blend(__m256 x, __m256 y, __m256 ymask)
}
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_invert_mask_ps(__m256 ymask)
+{
+ return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+fma_invert_mask_pd(__m256i ymask)
+{
+ return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
{
return _mm256_cvtepi32_ps(
@@ -1290,42 +1368,91 @@ fma_scalef_ps(__m256 poly, __m256 quadrant)
}
}
+/**begin repeat
+ * #vsub = ps, pd#
+ * #vtype = __m256, __m256d#
+ */
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_abs_@vsub@(@vtype@ x)
+{
+ return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_reciprocal_@vsub@(@vtype@ x)
+{
+ return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x);
+}
+/**end repeat**/
#endif
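fma_abs_@vsub@ above is branch-free: andnot with -0.0 clears exactly the sign bit, which also maps -0.0 to +0.0 and passes NaN payloads through unchanged. A scalar C model of the same bit trick, for illustration only:

    #include <stdint.h>
    #include <string.h>

    static float
    abs_via_andnot(float x)
    {
        uint32_t bits;
        memcpy(&bits, &x, sizeof(bits));  /* type-pun without aliasing UB */
        bits &= 0x7FFFFFFFu;              /* andnot(-0.0f, x): drop the sign bit */
        memcpy(&x, &bits, sizeof(x));
        return x;
    }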
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_full_load_mask(void)
+avx512_get_full_load_mask_ps(void)
{
return 0xFFFF;
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+avx512_get_full_load_mask_pd(void)
+{
+ return 0xFF;
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
+avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
{
return (0x0001 << num_elem) - 0x0001;
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
+{
+ return (0x01 << num_elem) - 0x01;
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_gather(__m512 src,
- npy_float* addr,
- __m512i vindex,
- __mmask16 kmask)
+avx512_masked_gather_ps(__m512 src,
+ npy_float* addr,
+ __m512i vindex,
+ __mmask16 kmask)
{
return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_masked_gather_pd(__m512d src,
+ npy_double* addr,
+ __m256i vindex,
+ __mmask8 kmask)
+{
+ return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_load(__mmask16 mask, npy_float* addr)
+avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
{
return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
+{
+ return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_set_masked_lanes(__m512 x, __m512 val, __mmask16 mask)
+avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
{
return _mm512_mask_blend_ps(mask, x, val);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
+{
+ return _mm512_mask_blend_pd(mask, x, val);
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
{
@@ -1333,6 +1460,18 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
}
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_invert_mask_ps(__mmask16 ymask)
+{
+ return _mm512_knot(ymask);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+avx512_invert_mask_pd(__mmask8 ymask)
+{
+ return _mm512_knot(ymask);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp)
{
return _mm512_cmpeq_epi32_mask(_mm512_and_epi32(k, andop), cmp);
@@ -1361,6 +1500,22 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
{
return _mm512_scalef_ps(poly, quadrant);
}
+/**begin repeat
+ * #vsub = ps, pd#
+ * #vtype= __m512, __m512d#
+ */
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_abs_@vsub@(@vtype@ x)
+{
+ return _mm512_abs_@vsub@(x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_reciprocal_@vsub@(@vtype@ x)
+{
+ return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x);
+}
+/**end repeat**/
#endif
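Unlike the FMA/AVX2 helpers above, which carry masks in full-width vector registers, the AVX512 variants use opmask registers (__mmask16/__mmask8): plain integers with one bit per lane, inverted with knot. The partial-load mask is simply the low num_elem bits set, as in this scalar model (illustrative only; valid for num_elem below the lane count, the only case the kernels use):

    static unsigned int
    partial_mask(int num_elem)
    {
        return (1u << num_elem) - 1u;   /* num_elem = 3  ->  0b111: lanes 0-2 active */
    }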
/**begin repeat
@@ -1438,9 +1593,175 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
sin = @fmadd@(sin, x, x);
return sin;
}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_sqrt_ps(@vtype@ x)
+{
+ return _mm@vsize@_sqrt_ps(x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
+@isa@_sqrt_pd(@vtype@d x)
+{
+ return _mm@vsize@_sqrt_pd(x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_square_ps(@vtype@ x)
+{
+ return _mm@vsize@_mul_ps(x,x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
+@isa@_square_pd(@vtype@d x)
+{
+ return _mm@vsize@_mul_pd(x,x);
+}
+
#endif
/**end repeat**/
+
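For reference, the FMA instantiation of the first wrapper above (@isa@ = fma, @vtype@ = __m256, @vsize@ = 256) is a thin veneer over the corresponding AVX intrinsic:

    static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
    fma_sqrt_ps(__m256 x)
    {
        return _mm256_sqrt_ps(x);   /* one vsqrtps over all eight lanes */
    }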
+/**begin repeat
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
+ * #vsize = 256, 512#
+ * #BYTES = 32, 64#
+ * #cvtps_epi32 = _mm256_cvtps_epi32, #
+ * #mask = __m256, __mmask16#
+ * #vsub = , _mask#
+ * #vtype = __m256, __m512#
+ * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
+ */
+
+/**begin repeat1
+ * #func = sqrt, absolute, square, reciprocal#
+ * #vectorf = sqrt, abs, square, reciprocal#
+ */
+
+#if defined @CHK@
+static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
+@ISA@_@func@_FLOAT(npy_float* op,
+ npy_float* ip,
+ const npy_intp array_size,
+ const npy_intp steps)
+{
+ const npy_intp stride = steps/sizeof(npy_float);
+ const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+ npy_intp num_remaining_elements = array_size;
+ @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
+ @mask@ load_mask = @isa@_get_full_load_mask_ps();
+ @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask);
+ npy_int indexarr[16];
+ for (npy_int ii = 0; ii < 16; ii++) {
+ indexarr[ii] = ii*stride;
+ }
+ @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
+
+ while (num_remaining_elements > 0) {
+ if (num_remaining_elements < num_lanes) {
+ load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
+ num_lanes);
+ inv_load_mask = @isa@_invert_mask_ps(load_mask);
+ }
+ @vtype@ x;
+ if (stride == 1) {
+ x = @isa@_masked_load_ps(load_mask, ip);
+ /*
+ * Replace masked elements with 1.0f to avoid divide by zero fp
+ * exception in reciprocal
+ */
+ x = @isa@_set_masked_lanes_ps(x, ones_f, inv_load_mask);
+ }
+ else {
+ x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask);
+ }
+ @vtype@ out = @isa@_@vectorf@_ps(x);
+ @masked_store@(op, @cvtps_epi32@(load_mask), out);
+
+ ip += num_lanes*stride;
+ op += num_lanes;
+ num_remaining_elements -= num_lanes;
+ }
+}
+#endif
+/**end repeat1**/
+/**end repeat**/
+
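The control flow of this kernel is easier to see stripped of intrinsics: full vectors are processed while at least num_lanes elements remain, then a partial mask covers the tail; masked-off lanes are loaded as 1.0f (so reciprocal never divides by zero) and never stored. A simplified scalar model, illustrative only and not NumPy code:

    static void
    unary_loop_model(float *op, const float *ip, long n, long stride,
                     float (*f)(float))
    {
        const int num_lanes = 8;   /* e.g. eight floats per 256-bit vector */
        long remaining = n;
        while (remaining > 0) {
            int active = remaining < num_lanes ? (int)remaining : num_lanes;
            for (int i = 0; i < num_lanes; i++) {
                /* lanes >= active are masked: read as 1.0f, result discarded */
                float x = (i < active) ? ip[i * stride] : 1.0f;
                float y = f(x);
                if (i < active) {
                    op[i] = y;
                }
            }
            ip += num_lanes * stride;
            op += num_lanes;
            remaining -= num_lanes;
        }
    }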
+/**begin repeat
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
+ * #vsize = 256, 512#
+ * #BYTES = 32, 64#
+ * #mask = __m256i, __mmask8#
+ * #vsub = , _mask#
+ * #vtype = __m256d, __m512d#
+ * #vindextype = __m128i, __m256i#
+ * #vindexsize = 128, 256#
+ * #vindexload = _mm_loadu_si128, _mm256_loadu_si256#
+ * #cvtps_epi32 = _mm256_cvtpd_epi32, #
+ * #castmask = _mm256_castsi256_pd, #
+ * #masked_store = _mm256_maskstore_pd, _mm512_mask_storeu_pd#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
+ */
+
+/**begin repeat1
+ * #func = sqrt, absolute, square, reciprocal#
+ * #vectorf = sqrt, abs, square, reciprocal#
+ */
+
+#if defined @CHK@
+static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
+@ISA@_@func@_DOUBLE(npy_double* op,
+ npy_double* ip,
+ const npy_intp array_size,
+ const npy_intp steps)
+{
+ const npy_intp stride = steps/sizeof(npy_double);
+ const npy_int num_lanes = @BYTES@/sizeof(npy_double);
+ npy_intp num_remaining_elements = array_size;
+ @mask@ load_mask = @isa@_get_full_load_mask_pd();
+ @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask);
+ @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
+ npy_int indexarr[8];
+ for (npy_int ii = 0; ii < 8; ii++) {
+ indexarr[ii] = ii*stride;
+ }
+ @vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]);
+
+ while (num_remaining_elements > 0) {
+ if (num_remaining_elements < num_lanes) {
+ load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements,
+ num_lanes);
+ inv_load_mask = @isa@_invert_mask_pd(load_mask);
+ }
+ @vtype@ x;
+ if (stride == 1) {
+ x = @isa@_masked_load_pd(load_mask, ip);
+ /*
+ * Replace masked elements with 1.0f to avoid divide by zero fp
+ * exception in reciprocal
+ */
+ x = @isa@_set_masked_lanes_pd(x, ones_d, @castmask@(inv_load_mask));
+ }
+ else {
+ x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask));
+ }
+ @vtype@ out = @isa@_@vectorf@_pd(x);
+ @masked_store@(op, load_mask, out);
+
+ ip += num_lanes*stride;
+ op += num_lanes;
+ num_remaining_elements -= num_lanes;
+ }
+}
+#endif
+/**end repeat1**/
+/**end repeat**/
+
/**begin repeat
* #ISA = FMA, AVX512F#
* #isa = fma, avx512#
@@ -1460,7 +1781,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
* #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
*/
-
/*
* Vectorized approximate sine/cosine algorithms: The following code is a
* vectorized version of the algorithm presented here:
@@ -1519,7 +1839,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@vtype@ quadrant, reduced_x, reduced_x2, cos, sin;
@vtype@i iquadrant;
@mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
- @mask@ load_mask = @isa@_get_full_load_mask();
+ @mask@ load_mask = @isa@_get_full_load_mask_ps();
npy_intp num_remaining_elements = array_size;
npy_int indexarr[16];
for (npy_int ii = 0; ii < 16; ii++) {
@@ -1530,16 +1850,16 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
while (num_remaining_elements > 0) {
if (num_remaining_elements < num_lanes) {
- load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
+ load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
num_lanes);
}
@vtype@ x;
if (stride == 1) {
- x = @isa@_masked_load(load_mask, ip);
+ x = @isa@_masked_load_ps(load_mask, ip);
}
else {
- x = @isa@_masked_gather(zero_f, ip, vindex, load_mask);
+ x = @isa@_masked_gather_ps(zero_f, ip, vindex, load_mask);
}
/*
@@ -1551,7 +1871,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
glibc_mask = @isa@_in_range_mask(x, large_number,-large_number);
glibc_mask = @and_masks@(load_mask, glibc_mask);
nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
- x = @isa@_set_masked_lanes(x, zero_f, @or_masks@(nan_mask, glibc_mask));
+ x = @isa@_set_masked_lanes_ps(x, zero_f, @or_masks@(nan_mask, glibc_mask));
npy_int iglibc_mask = @mask_to_int@(glibc_mask);
if (iglibc_mask != @full_mask@) {
@@ -1584,7 +1904,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
/* multiply by -1 for appropriate elements */
negate_mask = @isa@_should_negate(iquadrant, twos, twos);
cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask);
- cos = @isa@_set_masked_lanes(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
+ cos = @isa@_set_masked_lanes_ps(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
@masked_store@(op, @cvtps_epi32@(load_mask), cos);
}
@@ -1662,27 +1982,27 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
@mask@ xmax_mask, xmin_mask, nan_mask, inf_mask;
- @mask@ overflow_mask = @isa@_get_partial_load_mask(0, num_lanes);
- @mask@ load_mask = @isa@_get_full_load_mask();
+ @mask@ overflow_mask = @isa@_get_partial_load_mask_ps(0, num_lanes);
+ @mask@ load_mask = @isa@_get_full_load_mask_ps();
npy_intp num_remaining_elements = array_size;
while (num_remaining_elements > 0) {
if (num_remaining_elements < num_lanes) {
- load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
- num_lanes);
+ load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
+ num_lanes);
}
@vtype@ x;
if (stride == 1) {
- x = @isa@_masked_load(load_mask, ip);
+ x = @isa@_masked_load_ps(load_mask, ip);
}
else {
- x = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
+ x = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask);
}
nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
- x = @isa@_set_masked_lanes(x, zeros_f, nan_mask);
+ x = @isa@_set_masked_lanes_ps(x, zeros_f, nan_mask);
xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ);
xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ);
@@ -1690,7 +2010,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
overflow_mask = @or_masks@(overflow_mask,
@xor_masks@(xmax_mask, inf_mask));
- x = @isa@_set_masked_lanes(x, zeros_f, @or_masks@(
+ x = @isa@_set_masked_lanes_ps(x, zeros_f, @or_masks@(
@or_masks@(nan_mask, xmin_mask), xmax_mask));
quadrant = _mm@vsize@_mul_ps(x, log2e);
@@ -1723,9 +2043,9 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
* elem < xmin; return 0.0f
* elem = +/- nan, return nan
*/
- poly = @isa@_set_masked_lanes(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
- poly = @isa@_set_masked_lanes(poly, inf, xmax_mask);
- poly = @isa@_set_masked_lanes(poly, zeros_f, xmin_mask);
+ poly = @isa@_set_masked_lanes_ps(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
+ poly = @isa@_set_masked_lanes_ps(poly, inf, xmax_mask);
+ poly = @isa@_set_masked_lanes_ps(poly, zeros_f, xmin_mask);
@masked_store@(op, @cvtps_epi32@(load_mask), poly);
@@ -1790,24 +2110,24 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@vtype@ poly, num_poly, denom_poly, exponent;
@mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask;
- @mask@ invalid_mask = @isa@_get_partial_load_mask(0, num_lanes);
+ @mask@ invalid_mask = @isa@_get_partial_load_mask_ps(0, num_lanes);
@mask@ divide_by_zero_mask = invalid_mask;
- @mask@ load_mask = @isa@_get_full_load_mask();
+ @mask@ load_mask = @isa@_get_full_load_mask_ps();
npy_intp num_remaining_elements = array_size;
while (num_remaining_elements > 0) {
if (num_remaining_elements < num_lanes) {
- load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
- num_lanes);
+ load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
+ num_lanes);
}
@vtype@ x_in;
if (stride == 1) {
- x_in = @isa@_masked_load(load_mask, ip);
+ x_in = @isa@_masked_load_ps(load_mask, ip);
}
else {
- x_in = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
+ x_in = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask);
}
negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ);
@@ -1818,7 +2138,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@and_masks@(zero_mask, load_mask));
invalid_mask = @or_masks@(invalid_mask, negx_mask);
- @vtype@ x = @isa@_set_masked_lanes(x_in, zeros_f, negx_mask);
+ @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zeros_f, negx_mask);
/* set x = normalized mantissa */
exponent = @isa@_get_exponent(x);
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index ef48fed05..294a9b1fd 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -694,8 +694,92 @@ class TestSpecialFloats(object):
assert_raises(FloatingPointError, np.cos, np.float32(-np.inf))
assert_raises(FloatingPointError, np.cos, np.float32(np.inf))
+ def test_sqrt_values(self):
+ with np.errstate(all='ignore'):
+ x = [np.nan, np.nan, np.inf, np.nan, 0.]
+ y = [np.nan, -np.nan, np.inf, -np.inf, 0.]
+ for dt in ['f', 'd', 'g']:
+ xf = np.array(x, dtype=dt)
+ yf = np.array(y, dtype=dt)
+ assert_equal(np.sqrt(yf), xf)
+
+ with np.errstate(invalid='raise'):
+ for dt in ['f', 'd', 'g']:
+ assert_raises(FloatingPointError, np.sqrt, np.array(-100., dtype=dt))
+
+ def test_abs_values(self):
+ x = [np.nan, np.nan, np.inf, np.inf, 0., 0., 1.0, 1.0]
+ y = [np.nan, -np.nan, np.inf, -np.inf, 0., -0., -1.0, 1.0]
+ for dt in ['f', 'd', 'g']:
+ xf = np.array(x, dtype=dt)
+ yf = np.array(y, dtype=dt)
+ assert_equal(np.abs(yf), xf)
+
+ def test_square_values(self):
+ x = [np.nan, np.nan, np.inf, np.inf]
+ y = [np.nan, -np.nan, np.inf, -np.inf]
+ with np.errstate(all='ignore'):
+ for dt in ['f', 'd', 'g']:
+ xf = np.array(x, dtype=dt)
+ yf = np.array(y, dtype=dt)
+ assert_equal(np.square(yf), xf)
+
+ with np.errstate(over='raise'):
+ assert_raises(FloatingPointError, np.square, np.array(1E32, dtype='f'))
+ assert_raises(FloatingPointError, np.square, np.array(1E200, dtype='d'))
-class TestSIMDFloat32(object):
+ def test_reciprocal_values(self):
+ with np.errstate(all='ignore'):
+ x = [np.nan, np.nan, 0.0, -0.0, np.inf, -np.inf]
+ y = [np.nan, -np.nan, np.inf, -np.inf, 0., -0.]
+ for dt in ['f', 'd', 'g']:
+ xf = np.array(x, dtype=dt)
+ yf = np.array(y, dtype=dt)
+ assert_equal(np.reciprocal(yf), xf)
+
+ with np.errstate(divide='raise'):
+ for dt in ['f', 'd', 'g']:
+ assert_raises(FloatingPointError, np.reciprocal, np.array(-0.0, dtype=dt))
+
+# func : [maxulperror, low, high]
+avx_ufuncs = {'sqrt' :[1, 0., 100.],
+ 'absolute' :[0, -100., 100.],
+ 'reciprocal' :[1, 1., 100.],
+ 'square' :[1, -100., 100.]}
+
+class TestAVXUfuncs(object):
+ def test_avx_based_ufunc(self):
+ strides = np.array([-4,-3,-2,-1,1,2,3,4])
+ np.random.seed(42)
+ for func, prop in avx_ufuncs.items():
+ maxulperr = prop[0]
+ minval = prop[1]
+ maxval = prop[2]
+ # various array sizes to ensure masking in AVX is tested
+ for size in range(1,32):
+ myfunc = getattr(np, func)
+ x_f32 = np.float32(np.random.uniform(low=minval, high=maxval,
+ size=size))
+ x_f64 = np.float64(x_f32)
+ x_f128 = np.float128(x_f32)
+ y_true128 = myfunc(x_f128)
+ if maxulperr == 0:
+ assert_equal(myfunc(x_f32), np.float32(y_true128))
+ assert_equal(myfunc(x_f64), np.float64(y_true128))
+ else:
+ assert_array_max_ulp(myfunc(x_f32), np.float32(y_true128),
+ maxulp=maxulperr)
+ assert_array_max_ulp(myfunc(x_f64), np.float64(y_true128),
+ maxulp=maxulperr)
+ # various strides to test gather instruction
+ if size > 1:
+ y_true32 = myfunc(x_f32)
+ y_true64 = myfunc(x_f64)
+ for jj in strides:
+ assert_equal(myfunc(x_f64[::jj]), y_true64[::jj])
+ assert_equal(myfunc(x_f32[::jj]), y_true32[::jj])
+
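The maxulp bounds in the avx_ufuncs table are counted in units in the last place. One conventional way to compute that distance for float32 (a sketch of the idea behind assert_array_max_ulp, not NumPy's implementation; ulp_distance_f32 is our name, and it assumes two's complement and non-NaN inputs):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    static int64_t
    ulp_distance_f32(float a, float b)
    {
        int32_t ia, ib;
        memcpy(&ia, &a, sizeof(ia));
        memcpy(&ib, &b, sizeof(ib));
        /* map IEEE bit patterns onto a monotonically ordered integer line */
        if (ia < 0) ia = (int32_t)0x80000000 - ia;
        if (ib < 0) ib = (int32_t)0x80000000 - ib;
        return llabs((int64_t)ia - (int64_t)ib);   /* 0 == bit-identical */
    }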
+class TestAVXFloat32Transcendental(object):
def test_exp_float32(self):
np.random.seed(42)
x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000))
@@ -722,8 +806,8 @@ class TestSIMDFloat32(object):
def test_strided_float32(self):
np.random.seed(42)
- strides = np.random.randint(low=-100, high=100, size=100)
- sizes = np.random.randint(low=1, high=2000, size=100)
+ strides = np.array([-4,-3,-2,-1,1,2,3,4])
+ sizes = np.arange(2,100)
for ii in sizes:
x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii))
exp_true = np.exp(x_f32)