Diffstat (limited to 'numpy/core')

-rw-r--r--  numpy/core/code_generators/generate_umath.py |   8
-rw-r--r--  numpy/core/src/umath/loops.c.src             |  56
-rw-r--r--  numpy/core/src/umath/loops.h.src             |  13
-rw-r--r--  numpy/core/src/umath/simd.inc.src            | 404
-rw-r--r--  numpy/core/tests/test_umath.py               |  90

5 files changed, 520 insertions(+), 51 deletions(-)
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 6729fe197..1c87a148d 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -358,14 +358,14 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.square'),
None,
- TD(ints+inexact, simd=[('avx2', ints)]),
+ TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'fd')]),
TD(O, f='Py_square'),
),
'reciprocal':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.reciprocal'),
None,
- TD(ints+inexact, simd=[('avx2', ints)]),
+ TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]),
TD(O, f='Py_reciprocal'),
),
# This is no longer used as numpy.ones_like, however it is
@@ -395,7 +395,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.absolute'),
'PyUFunc_AbsoluteTypeResolver',
- TD(bints+flts+timedeltaonly),
+ TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD(cmplx, out=('f', 'd', 'g')),
TD(O, f='PyNumber_Absolute'),
),
@@ -762,7 +762,7 @@ defdict = {
docstrings.get('numpy.core.umath.sqrt'),
None,
TD('e', f='sqrt', astype={'e':'f'}),
- TD(inexactvec),
+ TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD('fdg' + cmplx, f='sqrt'),
TD(P, f='sqrt'),
),
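For context: each simd=[('fma', 'fd'), ('avx512f', 'fd')] entry makes generate_umath.py emit a runtime-dispatch snippet into __umath_generated.c that swaps the baseline inner loop for the ISA-specific one when the CPU supports it. A hedged sketch of the shape of that generated code (the array name sqrt_functions and the slot index are illustrative assumptions, not the generator's literal output):

    #ifdef HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
    if (npy_cpu_supports("avx512f")) {
        /* patch the float32 loop slot at module init; index is hypothetical */
        sqrt_functions[2] = FLOAT_sqrt_avx512f;
    }
    #endif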
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 5443223ab..ad2ec9e4a 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1657,6 +1657,60 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE
*/
/**begin repeat1
+ * #TYPE = FLOAT, DOUBLE#
+ * #type = npy_float, npy_double#
+ * #typesub = f, #
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_sqrt_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *(@type@ *)op1 = npy_sqrt@typesub@(in1);
+ }
+ }
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_absolute_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ tmp = in1 > 0 ? in1 : -in1;
+ /* add 0 to clear -0.0 */
+ *((@type@ *)op1) = tmp + 0;
+ }
+ }
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_square_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *(@type@ *)op1 = in1*in1;
+ }
+ }
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_reciprocal_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+ if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *(@type@ *)op1 = 1.0f/in1;
+ }
+ }
+}
+
+/**end repeat1**/
+
+/**begin repeat1
* #func = exp, log#
* #scalarf = npy_expf, npy_logf#
*/
@@ -1706,8 +1760,6 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
}
/**end repeat1**/
-
-
/**end repeat**/
/**begin repeat
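To make the templating concrete: substituting @TYPE@ = FLOAT, @type@ = npy_float, @typesub@ = f and @isa@ = avx512f (from the enclosing repeat block) into the first loop above yields the function below. The scalar UNARY_LOOP body is the fallback, taken only when the SIMD helper declines the call:

    NPY_NO_EXPORT NPY_GCC_OPT_3 void
    FLOAT_sqrt_avx512f(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
    {
        /* try the vector kernel first; run the scalar loop only if it bails */
        if (!run_unary_avx512f_sqrt_FLOAT(args, dimensions, steps)) {
            UNARY_LOOP {
                const npy_float in1 = *(npy_float *)ip1;
                *(npy_float *)op1 = npy_sqrtf(in1);
            }
        }
    }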
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 5070ab38b..fe1b1145d 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -175,6 +175,19 @@ NPY_NO_EXPORT void
*/
NPY_NO_EXPORT void
@TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**begin repeat1
+ * #isa = avx512f, fma#
+ */
+
+/**begin repeat2
+ * #func = sqrt, absolute, square, reciprocal#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**end repeat2**/
+/**end repeat1**/
/**end repeat**/
/**begin repeat
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 88e5e1f1b..c0dc53dd8 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -139,6 +139,37 @@ abs_ptrdiff(char *a, char *b)
/* prototypes */
/**begin repeat1
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ */
+
+/**begin repeat2
+ * #func = sqrt, absolute, square, reciprocal#
+ */
+
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE void
+@ISA@_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n, const npy_intp stride);
+#endif
+
+static NPY_INLINE int
+run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+ if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), @REGISTER_SIZE@)) {
+ @ISA@_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
+ return 1;
+ }
+ else
+ return 0;
+#endif
+ return 0;
+}
+
+/**end repeat2**/
+/**end repeat1**/
+
+/**begin repeat1
* #func = exp, log#
*/
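One expanded instance of the run_unary_* stubs above, assuming @ISA@ = AVX512F, @func@ = sqrt, @TYPE@ = FLOAT and @REGISTER_SIZE@ = 64 (the AVX512 register width in bytes; these substitutions come from an enclosing repeat block not shown in this hunk):

    static NPY_INLINE int
    run_unary_avx512f_sqrt_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
    {
    #if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
        /* note the argument order: output (args[1]) first, then input (args[0]) */
        if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), 64)) {
            AVX512F_sqrt_FLOAT((npy_float*)args[1], (npy_float*)args[0],
                               dimensions[0], steps[0]);
            return 1;
        }
        else {
            return 0;
        }
    #endif
        return 0;
    }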
@@ -1144,41 +1175,76 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_full_load_mask(void)
+fma_get_full_load_mask_ps(void)
{
return _mm256_set1_ps(-1.0);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+fma_get_full_load_mask_pd(void)
+{
+ return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
+fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
{
float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
- float* addr = maskint + total_elem - num_lanes;
+ float* addr = maskint + num_lanes - num_elem;
return _mm256_loadu_ps(addr);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
+{
+ npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
+ npy_int* addr = maskint + 2*num_lanes - 2*num_elem;
+ return _mm256_loadu_si256((__m256i*) addr);
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_gather(__m256 src,
- npy_float* addr,
- __m256i vindex,
- __m256 mask)
+fma_masked_gather_ps(__m256 src,
+ npy_float* addr,
+ __m256i vindex,
+ __m256 mask)
{
return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+fma_masked_gather_pd(__m256d src,
+ npy_double* addr,
+ __m128i vindex,
+ __m256d mask)
+{
+ return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_load(__m256 mask, npy_float* addr)
+fma_masked_load_ps(__m256 mask, npy_float* addr)
{
return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+fma_masked_load_pd(__m256i mask, npy_double* addr)
+{
+ return _mm256_maskload_pd(addr, mask);
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
+fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
{
return _mm256_blendv_ps(x, val, mask);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
+{
+ return _mm256_blendv_pd(x, val, mask);
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_blend(__m256 x, __m256 y, __m256 ymask)
{
@@ -1186,6 +1252,18 @@ fma_blend(__m256 x, __m256 y, __m256 ymask)
}
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_invert_mask_ps(__m256 ymask)
+{
+ return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+fma_invert_mask_pd(__m256i ymask)
+{
+ return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
{
return _mm256_cvtepi32_ps(
@@ -1290,42 +1368,91 @@ fma_scalef_ps(__m256 poly, __m256 quadrant)
}
}
+/**begin repeat
+ * #vsub = ps, pd#
+ * #vtype = __m256, __m256d#
+ */
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_abs_@vsub@(@vtype@ x)
+{
+ return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_reciprocal_@vsub@(@vtype@ x)
+{
+ return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x);
+}
+/**end repeat**/
#endif
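fma_abs_@vsub@ above is branch-free: andnot with -0.0 clears exactly the sign bit, which also maps -0.0 to +0.0 and passes NaN payloads through unchanged. A scalar C model of the same bit trick, for illustration only:

    #include <stdint.h>
    #include <string.h>

    static float
    abs_via_andnot(float x)
    {
        uint32_t bits;
        memcpy(&bits, &x, sizeof(bits));  /* type-pun without aliasing UB */
        bits &= 0x7FFFFFFFu;              /* andnot(-0.0f, x): drop the sign bit */
        memcpy(&x, &bits, sizeof(x));
        return x;
    }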
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_full_load_mask(void)
+avx512_get_full_load_mask_ps(void)
{
return 0xFFFF;
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+avx512_get_full_load_mask_pd(void)
+{
+ return 0xFF;
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
+avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
{
return (0x0001 << num_elem) - 0x0001;
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
+{
+ return (0x01 << num_elem) - 0x01;
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_gather(__m512 src,
- npy_float* addr,
- __m512i vindex,
- __mmask16 kmask)
+avx512_masked_gather_ps(__m512 src,
+ npy_float* addr,
+ __m512i vindex,
+ __mmask16 kmask)
{
return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_masked_gather_pd(__m512d src,
+ npy_double* addr,
+ __m256i vindex,
+ __mmask8 kmask)
+{
+ return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_load(__mmask16 mask, npy_float* addr)
+avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
{
return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
+{
+ return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_set_masked_lanes(__m512 x, __m512 val, __mmask16 mask)
+avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
{
return _mm512_mask_blend_ps(mask, x, val);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
+{
+ return _mm512_mask_blend_pd(mask, x, val);
+}
+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
{
@@ -1333,6 +1460,18 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
}
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_invert_mask_ps(__mmask16 ymask)
+{
+ return _mm512_knot(ymask);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+avx512_invert_mask_pd(__mmask8 ymask)
+{
+ return _mm512_knot(ymask);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp)
{
return _mm512_cmpeq_epi32_mask(_mm512_and_epi32(k, andop), cmp);
@@ -1361,6 +1500,22 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
{
return _mm512_scalef_ps(poly, quadrant);
}
+/**begin repeat
+ * #vsub = ps, pd#
+ * #vtype= __m512, __m512d#
+ */
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_abs_@vsub@(@vtype@ x)
+{
+ return _mm512_abs_@vsub@(x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_reciprocal_@vsub@(@vtype@ x)
+{
+ return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x);
+}
+/**end repeat**/
#endif
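Unlike the FMA/AVX2 helpers above, which carry masks in full-width vector registers, the AVX512 variants use opmask registers (__mmask16/__mmask8): plain integers with one bit per lane, inverted with knot. The partial-load mask is simply the low num_elem bits set, as in this scalar model (illustrative only; valid for num_elem below the lane count, the only case the kernels use):

    static unsigned int
    partial_mask(int num_elem)
    {
        return (1u << num_elem) - 1u;   /* num_elem = 3  ->  0b111: lanes 0-2 active */
    }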
/**begin repeat
@@ -1438,9 +1593,175 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
sin = @fmadd@(sin, x, x);
return sin;
}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_sqrt_ps(@vtype@ x)
+{
+ return _mm@vsize@_sqrt_ps(x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
+@isa@_sqrt_pd(@vtype@d x)
+{
+ return _mm@vsize@_sqrt_pd(x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_square_ps(@vtype@ x)
+{
+ return _mm@vsize@_mul_ps(x,x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
+@isa@_square_pd(@vtype@d x)
+{
+ return _mm@vsize@_mul_pd(x,x);
+}
+
#endif
/**end repeat**/
+
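For reference, the FMA instantiation of the first wrapper above (@isa@ = fma, @vtype@ = __m256, @vsize@ = 256) is a thin veneer over the corresponding AVX intrinsic:

    static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
    fma_sqrt_ps(__m256 x)
    {
        return _mm256_sqrt_ps(x);   /* one vsqrtps over all eight lanes */
    }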
+/**begin repeat
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
+ * #vsize = 256, 512#
+ * #BYTES = 32, 64#
+ * #cvtps_epi32 = _mm256_cvtps_epi32, #
+ * #mask = __m256, __mmask16#
+ * #vsub = , _mask#
+ * #vtype = __m256, __m512#
+ * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
+ */
+
+/**begin repeat1
+ * #func = sqrt, absolute, square, reciprocal#
+ * #vectorf = sqrt, abs, square, reciprocal#
+ */
+
+#if defined @CHK@
+static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
+@ISA@_@func@_FLOAT(npy_float* op,
+ npy_float* ip,
+ const npy_intp array_size,
+ const npy_intp steps)
+{
+ const npy_intp stride = steps/sizeof(npy_float);
+ const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+ npy_intp num_remaining_elements = array_size;
+ @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
+ @mask@ load_mask = @isa@_get_full_load_mask_ps();
+ @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask);
+ npy_int indexarr[16];
+ for (npy_int ii = 0; ii < 16; ii++) {
+ indexarr[ii] = ii*stride;
+ }
+ @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
+
+ while (num_remaining_elements > 0) {
+ if (num_remaining_elements < num_lanes) {
+ load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
+ num_lanes);
+ inv_load_mask = @isa@_invert_mask_ps(load_mask);
+ }
+ @vtype@ x;
+ if (stride == 1) {
+ x = @isa@_masked_load_ps(load_mask, ip);
+ /*
+ * Replace masked elements with 1.0f to avoid divide by zero fp
+ * exception in reciprocal
+ */
+ x = @isa@_set_masked_lanes_ps(x, ones_f, inv_load_mask);
+ }
+ else {
+ x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask);
+ }
+ @vtype@ out = @isa@_@vectorf@_ps(x);
+ @masked_store@(op, @cvtps_epi32@(load_mask), out);
+
+ ip += num_lanes*stride;
+ op += num_lanes;
+ num_remaining_elements -= num_lanes;
+ }
+}
+#endif
+/**end repeat1**/
+/**end repeat**/
+
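The control flow of this kernel is easier to see stripped of intrinsics: full vectors are processed while at least num_lanes elements remain, then a partial mask covers the tail; masked-off lanes are loaded as 1.0f (so reciprocal never divides by zero) and never stored. A simplified scalar model, illustrative only and not NumPy code:

    static void
    unary_loop_model(float *op, const float *ip, long n, long stride,
                     float (*f)(float))
    {
        const int num_lanes = 8;   /* e.g. eight floats per 256-bit vector */
        long remaining = n;
        while (remaining > 0) {
            int active = remaining < num_lanes ? (int)remaining : num_lanes;
            for (int i = 0; i < num_lanes; i++) {
                /* lanes >= active are masked: read as 1.0f, result discarded */
                float x = (i < active) ? ip[i * stride] : 1.0f;
                float y = f(x);
                if (i < active) {
                    op[i] = y;
                }
            }
            ip += num_lanes * stride;
            op += num_lanes;
            remaining -= num_lanes;
        }
    }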
+/**begin repeat
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
+ * #vsize = 256, 512#
+ * #BYTES = 32, 64#
+ * #mask = __m256i, __mmask8#
+ * #vsub = , _mask#
+ * #vtype = __m256d, __m512d#
+ * #vindextype = __m128i, __m256i#
+ * #vindexsize = 128, 256#
+ * #vindexload = _mm_loadu_si128, _mm256_loadu_si256#
+ * #cvtps_epi32 = _mm256_cvtpd_epi32, #
+ * #castmask = _mm256_castsi256_pd, #
+ * #masked_store = _mm256_maskstore_pd, _mm512_mask_storeu_pd#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
+ */
+
+/**begin repeat1
+ * #func = sqrt, absolute, square, reciprocal#
+ * #vectorf = sqrt, abs, square, reciprocal#
+ */
+
+#if defined @CHK@
+static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
+@ISA@_@func@_DOUBLE(npy_double* op,
+ npy_double* ip,
+ const npy_intp array_size,
+ const npy_intp steps)
+{
+ const npy_intp stride = steps/sizeof(npy_double);
+ const npy_int num_lanes = @BYTES@/sizeof(npy_double);
+ npy_intp num_remaining_elements = array_size;
+ @mask@ load_mask = @isa@_get_full_load_mask_pd();
+ @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask);
+ @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
+ npy_int indexarr[8];
+ for (npy_int ii = 0; ii < 8; ii++) {
+ indexarr[ii] = ii*stride;
+ }
+ @vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]);
+
+ while (num_remaining_elements > 0) {
+ if (num_remaining_elements < num_lanes) {
+ load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements,
+ num_lanes);
+ inv_load_mask = @isa@_invert_mask_pd(load_mask);
+ }
+ @vtype@ x;
+ if (stride == 1) {
+ x = @isa@_masked_load_pd(load_mask, ip);
+ /*
+ * Replace masked elements with 1.0f to avoid divide by zero fp
+ * exception in reciprocal
+ */
+ x = @isa@_set_masked_lanes_pd(x, ones_d, @castmask@(inv_load_mask));
+ }
+ else {
+ x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask));
+ }
+ @vtype@ out = @isa@_@vectorf@_pd(x);
+ @masked_store@(op, load_mask, out);
+
+ ip += num_lanes*stride;
+ op += num_lanes;
+ num_remaining_elements -= num_lanes;
+ }
+}
+#endif
+/**end repeat1**/
+/**end repeat**/
+
/**begin repeat
* #ISA = FMA, AVX512F#
* #isa = fma, avx512#
@@ -1460,7 +1781,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
* #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
*/
-
/*
* Vectorized approximate sine/cosine algorithms: The following code is a
* vectorized version of the algorithm presented here:
@@ -1519,7 +1839,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@vtype@ quadrant, reduced_x, reduced_x2, cos, sin;
@vtype@i iquadrant;
@mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
- @mask@ load_mask = @isa@_get_full_load_mask();
+ @mask@ load_mask = @isa@_get_full_load_mask_ps();
npy_intp num_remaining_elements = array_size;
npy_int indexarr[16];
for (npy_int ii = 0; ii < 16; ii++) {
@@ -1530,16 +1850,16 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
while (num_remaining_elements > 0) {
if (num_remaining_elements < num_lanes) {
- load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
+ load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
num_lanes);
}
@vtype@ x;
if (stride == 1) {
- x = @isa@_masked_load(load_mask, ip);
+ x = @isa@_masked_load_ps(load_mask, ip);
}
else {
- x = @isa@_masked_gather(zero_f, ip, vindex, load_mask);
+ x = @isa@_masked_gather_ps(zero_f, ip, vindex, load_mask);
}
/*
@@ -1551,7 +1871,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
glibc_mask = @isa@_in_range_mask(x, large_number,-large_number);
glibc_mask = @and_masks@(load_mask, glibc_mask);
nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
- x = @isa@_set_masked_lanes(x, zero_f, @or_masks@(nan_mask, glibc_mask));
+ x = @isa@_set_masked_lanes_ps(x, zero_f, @or_masks@(nan_mask, glibc_mask));
npy_int iglibc_mask = @mask_to_int@(glibc_mask);
if (iglibc_mask != @full_mask@) {
@@ -1584,7 +1904,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
/* multiply by -1 for appropriate elements */
negate_mask = @isa@_should_negate(iquadrant, twos, twos);
cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask);
- cos = @isa@_set_masked_lanes(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
+ cos = @isa@_set_masked_lanes_ps(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
@masked_store@(op, @cvtps_epi32@(load_mask), cos);
}
@@ -1662,27 +1982,27 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
@mask@ xmax_mask, xmin_mask, nan_mask, inf_mask;
- @mask@ overflow_mask = @isa@_get_partial_load_mask(0, num_lanes);
- @mask@ load_mask = @isa@_get_full_load_mask();
+ @mask@ overflow_mask = @isa@_get_partial_load_mask_ps(0, num_lanes);
+ @mask@ load_mask = @isa@_get_full_load_mask_ps();
npy_intp num_remaining_elements = array_size;
while (num_remaining_elements > 0) {
if (num_remaining_elements < num_lanes) {
- load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
- num_lanes);
+ load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
+ num_lanes);
}
@vtype@ x;
if (stride == 1) {
- x = @isa@_masked_load(load_mask, ip);
+ x = @isa@_masked_load_ps(load_mask, ip);
}
else {
- x = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
+ x = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask);
}
nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
- x = @isa@_set_masked_lanes(x, zeros_f, nan_mask);
+ x = @isa@_set_masked_lanes_ps(x, zeros_f, nan_mask);
xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ);
xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ);
@@ -1690,7 +2010,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
overflow_mask = @or_masks@(overflow_mask,
@xor_masks@(xmax_mask, inf_mask));
- x = @isa@_set_masked_lanes(x, zeros_f, @or_masks@(
+ x = @isa@_set_masked_lanes_ps(x, zeros_f, @or_masks@(
@or_masks@(nan_mask, xmin_mask), xmax_mask));
quadrant = _mm@vsize@_mul_ps(x, log2e);
@@ -1723,9 +2043,9 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
* elem < xmin; return 0.0f
* elem = +/- nan, return nan
*/
- poly = @isa@_set_masked_lanes(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
- poly = @isa@_set_masked_lanes(poly, inf, xmax_mask);
- poly = @isa@_set_masked_lanes(poly, zeros_f, xmin_mask);
+ poly = @isa@_set_masked_lanes_ps(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
+ poly = @isa@_set_masked_lanes_ps(poly, inf, xmax_mask);
+ poly = @isa@_set_masked_lanes_ps(poly, zeros_f, xmin_mask);
@masked_store@(op, @cvtps_epi32@(load_mask), poly);
@@ -1790,24 +2110,24 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@vtype@ poly, num_poly, denom_poly, exponent;
@mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask;
- @mask@ invalid_mask = @isa@_get_partial_load_mask(0, num_lanes);
+ @mask@ invalid_mask = @isa@_get_partial_load_mask_ps(0, num_lanes);
@mask@ divide_by_zero_mask = invalid_mask;
- @mask@ load_mask = @isa@_get_full_load_mask();
+ @mask@ load_mask = @isa@_get_full_load_mask_ps();
npy_intp num_remaining_elements = array_size;
while (num_remaining_elements > 0) {
if (num_remaining_elements < num_lanes) {
- load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
- num_lanes);
+ load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
+ num_lanes);
}
@vtype@ x_in;
if (stride == 1) {
- x_in = @isa@_masked_load(load_mask, ip);
+ x_in = @isa@_masked_load_ps(load_mask, ip);
}
else {
- x_in = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
+ x_in = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask);
}
negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ);
@@ -1818,7 +2138,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@and_masks@(zero_mask, load_mask));
invalid_mask = @or_masks@(invalid_mask, negx_mask);
- @vtype@ x = @isa@_set_masked_lanes(x_in, zeros_f, negx_mask);
+ @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zeros_f, negx_mask);
/* set x = normalized mantissa */
exponent = @isa@_get_exponent(x);
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index ef48fed05..294a9b1fd 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -694,8 +694,92 @@ class TestSpecialFloats(object):
assert_raises(FloatingPointError, np.cos, np.float32(-np.inf))
assert_raises(FloatingPointError, np.cos, np.float32(np.inf))
+ def test_sqrt_values(self):
+ with np.errstate(all='ignore'):
+ x = [np.nan, np.nan, np.inf, np.nan, 0.]
+ y = [np.nan, -np.nan, np.inf, -np.inf, 0.]
+ for dt in ['f', 'd', 'g']:
+ xf = np.array(x, dtype=dt)
+ yf = np.array(y, dtype=dt)
+ assert_equal(np.sqrt(yf), xf)
+
+ with np.errstate(invalid='raise'):
+ for dt in ['f', 'd', 'g']:
+ assert_raises(FloatingPointError, np.sqrt, np.array(-100., dtype=dt))
+
+ def test_abs_values(self):
+ x = [np.nan, np.nan, np.inf, np.inf, 0., 0., 1.0, 1.0]
+ y = [np.nan, -np.nan, np.inf, -np.inf, 0., -0., -1.0, 1.0]
+ for dt in ['f', 'd', 'g']:
+ xf = np.array(x, dtype=dt)
+ yf = np.array(y, dtype=dt)
+ assert_equal(np.abs(yf), xf)
+
+ def test_square_values(self):
+ x = [np.nan, np.nan, np.inf, np.inf]
+ y = [np.nan, -np.nan, np.inf, -np.inf]
+ with np.errstate(all='ignore'):
+ for dt in ['f', 'd', 'g']:
+ xf = np.array(x, dtype=dt)
+ yf = np.array(y, dtype=dt)
+ assert_equal(np.square(yf), xf)
+
+ with np.errstate(over='raise'):
+ assert_raises(FloatingPointError, np.square, np.array(1E32, dtype='f'))
+ assert_raises(FloatingPointError, np.square, np.array(1E200, dtype='d'))
-class TestSIMDFloat32(object):
+ def test_reciprocal_values(self):
+ with np.errstate(all='ignore'):
+ x = [np.nan, np.nan, 0.0, -0.0, np.inf, -np.inf]
+ y = [np.nan, -np.nan, np.inf, -np.inf, 0., -0.]
+ for dt in ['f', 'd', 'g']:
+ xf = np.array(x, dtype=dt)
+ yf = np.array(y, dtype=dt)
+ assert_equal(np.reciprocal(yf), xf)
+
+ with np.errstate(divide='raise'):
+ for dt in ['f', 'd', 'g']:
+ assert_raises(FloatingPointError, np.reciprocal, np.array(-0.0, dtype=dt))
+
+# func : [maxulperror, low, high]
+avx_ufuncs = {'sqrt' :[1, 0., 100.],
+ 'absolute' :[0, -100., 100.],
+ 'reciprocal' :[1, 1., 100.],
+ 'square' :[1, -100., 100.]}
+
+class TestAVXUfuncs(object):
+ def test_avx_based_ufunc(self):
+ strides = np.array([-4,-3,-2,-1,1,2,3,4])
+ np.random.seed(42)
+ for func, prop in avx_ufuncs.items():
+ maxulperr = prop[0]
+ minval = prop[1]
+ maxval = prop[2]
+ # various array sizes to ensure masking in AVX is tested
+ for size in range(1,32):
+ myfunc = getattr(np, func)
+ x_f32 = np.float32(np.random.uniform(low=minval, high=maxval,
+ size=size))
+ x_f64 = np.float64(x_f32)
+ x_f128 = np.float128(x_f32)
+ y_true128 = myfunc(x_f128)
+ if maxulperr == 0:
+ assert_equal(myfunc(x_f32), np.float32(y_true128))
+ assert_equal(myfunc(x_f64), np.float64(y_true128))
+ else:
+ assert_array_max_ulp(myfunc(x_f32), np.float32(y_true128),
+ maxulp=maxulperr)
+ assert_array_max_ulp(myfunc(x_f64), np.float64(y_true128),
+ maxulp=maxulperr)
+ # various strides to test gather instruction
+ if size > 1:
+ y_true32 = myfunc(x_f32)
+ y_true64 = myfunc(x_f64)
+ for jj in strides:
+ assert_equal(myfunc(x_f64[::jj]), y_true64[::jj])
+ assert_equal(myfunc(x_f32[::jj]), y_true32[::jj])
+
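The maxulp bounds in the avx_ufuncs table are counted in units in the last place. One conventional way to compute that distance for float32 (a sketch of the idea behind assert_array_max_ulp, not NumPy's implementation; ulp_distance_f32 is our name, and it assumes two's complement and non-NaN inputs):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    static int64_t
    ulp_distance_f32(float a, float b)
    {
        int32_t ia, ib;
        memcpy(&ia, &a, sizeof(ia));
        memcpy(&ib, &b, sizeof(ib));
        /* map IEEE bit patterns onto a monotonically ordered integer line */
        if (ia < 0) ia = (int32_t)0x80000000 - ia;
        if (ib < 0) ib = (int32_t)0x80000000 - ib;
        return llabs((int64_t)ia - (int64_t)ib);   /* 0 == bit-identical */
    }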
+class TestAVXFloat32Transcendental(object):
def test_exp_float32(self):
np.random.seed(42)
x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000))
@@ -722,8 +806,8 @@ class TestSIMDFloat32(object):
def test_strided_float32(self):
np.random.seed(42)
- strides = np.random.randint(low=-100, high=100, size=100)
- sizes = np.random.randint(low=1, high=2000, size=100)
+ strides = np.array([-4,-3,-2,-1,1,2,3,4])
+ sizes = np.arange(2,100)
for ii in sizes:
x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii))
exp_true = np.exp(x_f32)