23 files changed, 702 insertions, 236 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index dc5c2577a..cb1147b93 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -352,14 +352,14 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.square'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'FDfd')]),
+          TD(ints+inexact, simd=[('avx2', ints), ('avx512f', 'FD')], dispatch=[('loops_unary_fp', 'fd')]),
           TD(O, f='Py_square'),
           ),
 'reciprocal':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.reciprocal'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]),
+          TD(ints+inexact, simd=[('avx2', ints)], dispatch=[('loops_unary_fp', 'fd')]),
           TD(O, f='Py_reciprocal'),
           ),
 # This is no longer used as numpy.ones_like, however it is
@@ -389,7 +389,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.absolute'),
           'PyUFunc_AbsoluteTypeResolver',
-          TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd')]),
           TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')),
           TD(O, f='PyNumber_Absolute'),
           ),
@@ -767,7 +767,7 @@ defdict = {
           docstrings.get('numpy.core.umath.sqrt'),
           None,
           TD('e', f='sqrt', astype={'e':'f'}),
-          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
           TD('fdg' + cmplx, f='sqrt'),
           TD(P, f='sqrt'),
           ),
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 560b82912..78d6d3120 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -1,15 +1,15 @@
 #ifndef _NPY_COMMON_H_
 #define _NPY_COMMON_H_
 
+/* need Python.h for npy_intp, npy_uintp */
+#include <Python.h>
+
 /* numpconfig.h is auto-generated */
 #include "numpyconfig.h"
 #ifdef HAVE_NPY_CONFIG_H
 #include <npy_config.h>
 #endif
 
-/* need Python.h for npy_intp, npy_uintp */
-#include <Python.h>
-
 /*
  * using static inline modifiers when defining npy_math functions
  * allows the compiler to make optimizations when possible
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index a07f49501..dbade058f 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -5,14 +5,12 @@
 extern "C" {
 #endif
 
+#include <numpy/npy_common.h>
+
 #include <math.h>
 #ifdef __SUNPRO_CC
 #include <sunmath.h>
 #endif
-#ifdef HAVE_NPY_CONFIG_H
-#include <npy_config.h>
-#endif
-#include <numpy/npy_common.h>
 
 /* By adding static inline specifiers to npy_math function definitions
    when appropriate, compiler is given the opportunity to optimize */
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 68aa0a851..6ada03f73 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -904,6 +904,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'simd.inc.src'),
             join('src', 'umath', 'loops.h.src'),
             join('src', 'umath', 'loops.c.src'),
+            join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
             join('src', 'umath', 'matmul.h.src'),
             join('src', 'umath', 'matmul.c.src'),
             join('src', 'umath', 'clip.h.src'),
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 3d7af2333..18c383871 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -16,6 +16,7 @@
  * #sfx  = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
  * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
  * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
  * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
  * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
@@ -356,6 +357,17 @@ SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
 SIMD_IMPL_INTRIN_1(sum_@sfx@, @sfx@, v@sfx@)
 #endif // sum_sup
 
+/***************************
+ * Math
+ ***************************/
+#if @fp_only@
+/**begin repeat1
+ * #intrin = sqrt, recip, abs, square#
+ */
+SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+#endif
+
 #endif // simd_sup
 /**end repeat**/
 /***************************
@@ -371,6 +383,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
  * #sfx  = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
  * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
  * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
  * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
  * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
@@ -494,6 +507,17 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
 SIMD_INTRIN_DEF(sum_@sfx@)
 #endif // sum_sup
 
+/***************************
+ * Math
+ ***************************/
+#if @fp_only@
+/**begin repeat1
+ * #intrin = sqrt, recip, abs, square#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif
+
 #endif // simd_sup
 /**end repeat**/
diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h
index 693a9857d..28dd00032 100644
--- a/numpy/core/src/common/npy_cpu_features.h
+++ b/numpy/core/src/common/npy_cpu_features.h
@@ -1,8 +1,8 @@
 #ifndef _NPY_CPU_FEATURES_H_
 #define _NPY_CPU_FEATURES_H_
 
-#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN
 #include <Python.h> // for PyObject
+#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
index 0641f2314..6f0d3c0d9 100644
--- a/numpy/core/src/common/simd/avx2/avx2.h
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -67,3 +67,4 @@ typedef struct { __m256d val[3]; } npyv_f64x3;
 #include "operators.h"
 #include "conversion.h"
 #include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h
new file mode 100644
index 000000000..b3eba6f5f
--- /dev/null
+++ b/numpy/core/src/common/simd/avx2/math.h
@@ -0,0 +1,40 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX2_MATH_H
+#define _NPY_SIMD_AVX2_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm256_sqrt_ps
+#define npyv_sqrt_f64 _mm256_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm256_div_ps(_mm256_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm256_div_pd(_mm256_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+    return _mm256_and_ps(
+        a, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))
+    );
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+    return _mm256_and_pd(
+        a, _mm256_castsi256_pd(npyv_setall_s64(0x7fffffffffffffffLL))
+    );
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm256_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm256_mul_pd(a, a); }
+
+#endif
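For readers unfamiliar with the bit-masking idiom used by npyv_abs_f32/npyv_abs_f64 above: ANDing with 0x7fffffff clears only the IEEE-754 sign bit, which is exactly what the vector _mm256_and_ps does per lane. A minimal scalar sketch of the same trick (illustrative only, not part of the patch):

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* Scalar equivalent of npyv_abs_f32: clear the IEEE-754 sign bit. */
    static float scalar_abs_f32(float a)
    {
        uint32_t bits;
        memcpy(&bits, &a, sizeof(bits)); /* bit-pun without aliasing UB */
        bits &= 0x7fffffffu;             /* drop the sign bit only */
        memcpy(&a, &bits, sizeof(a));
        return a;
    }

    int main(void)
    {
        /* prints 0.000000 3.500000; note -0.0 maps to +0.0 */
        printf("%f %f\n", scalar_abs_f32(-0.0f), scalar_abs_f32(-3.5f));
        return 0;
    }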
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
index b09d772f2..2de33765a 100644
--- a/numpy/core/src/common/simd/avx512/avx512.h
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -72,3 +72,4 @@ typedef struct { __m512d val[3]; } npyv_f64x3;
 #include "operators.h"
 #include "conversion.h"
 #include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h
new file mode 100644
index 000000000..1db710670
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/math.h
@@ -0,0 +1,49 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX512_MATH_H
+#define _NPY_SIMD_AVX512_MATH_H
+
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm512_sqrt_ps
+#define npyv_sqrt_f64 _mm512_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm512_div_ps(_mm512_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm512_div_pd(_mm512_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+#if 0 // def NPY_HAVE_AVX512DQ
+    return _mm512_range_ps(a, a, 8);
+#else
+    return npyv_and_f32(
+        a, _mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff))
+    );
+#endif
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+#if 0 // def NPY_HAVE_AVX512DQ
+    return _mm512_range_pd(a, a, 8);
+#else
+    return npyv_and_f64(
+        a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL))
+    );
+#endif
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm512_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm512_mul_pd(a, a); }
+
+#endif
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index bc14ffb75..87e00d5d1 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -63,14 +63,26 @@
 /***************************
  * Division
 ***************************/
-#ifdef __aarch64__
+#if NPY_SIMD_F64
     #define npyv_div_f32 vdivq_f32
 #else
-    NPY_FINLINE float32x4_t npyv_div_f32(float32x4_t a, float32x4_t b)
+    NPY_FINLINE npyv_f32 npyv_div_f32(npyv_f32 a, npyv_f32 b)
     {
-        float32x4_t recip = vrecpeq_f32(b);
-        recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
-        return vmulq_f32(a, recip);
+        // Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI
+        // estimate to 1/b
+        npyv_f32 recipe = vrecpeq_f32(b);
+        /**
+         * Newton-Raphson iteration:
+         *  x[n+1] = x[n] * (2 - d * x[n])
+         * converges to (1/d) if x0 is the result of VRECPE applied to d.
+         *
+         * NOTE: at least 3 iterations are needed to improve precision
+         */
+        recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+        recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+        recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+        // a/b = a*recip(b)
+        return vmulq_f32(a, recipe);
     }
 #endif
 #define npyv_div_f64 vdivq_f64
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
new file mode 100644
index 000000000..a2bbdf2a5
--- /dev/null
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -0,0 +1,86 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_NEON_MATH_H
+#define _NPY_SIMD_NEON_MATH_H
+
+/***************************
+ * Elementary
+ ***************************/
+// Absolute
+#define npyv_abs_f32 vabsq_f32
+#define npyv_abs_f64 vabsq_f64
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return vmulq_f32(a, a); }
+#if NPY_SIMD_F64
+    NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+    { return vmulq_f64(a, a); }
+#endif
+
+// Square root
+#if NPY_SIMD_F64
+    #define npyv_sqrt_f32 vsqrtq_f32
+    #define npyv_sqrt_f64 vsqrtq_f64
+#else
+    // Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI
+    NPY_FINLINE npyv_f32 npyv_sqrt_f32(npyv_f32 a)
+    {
+        const npyv_f32 zero = vdupq_n_f32(0.0f);
+        const npyv_u32 pinf = vdupq_n_u32(0x7f800000);
+        npyv_u32 is_zero = vceqq_f32(a, zero), is_inf = vceqq_u32(vreinterpretq_u32_f32(a), pinf);
+        // guard against floating-point division-by-zero error
+        npyv_f32 guard_byz = vbslq_f32(is_zero, vreinterpretq_f32_u32(pinf), a);
+        // estimate to (1/√a)
+        npyv_f32 rsqrte = vrsqrteq_f32(guard_byz);
+        /**
+         * Newton-Raphson iteration:
+         *  x[n+1] = x[n] * (3 - d * x[n] * x[n]) / 2
+         * converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
+         *
+         * NOTE: at least 3 iterations are needed to improve precision
+         */
+        rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+        rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+        rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+        // a * (1/√a)
+        npyv_f32 sqrt = vmulq_f32(a, rsqrte);
+        // - return zero if a is zero
+        // - return positive infinity if a is positive infinity
+        return vbslq_f32(vorrq_u32(is_zero, is_inf), a, sqrt);
+    }
+#endif // NPY_SIMD_F64
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{
+#if NPY_SIMD_F64
+    const npyv_f32 one = vdupq_n_f32(1.0f);
+    return npyv_div_f32(one, a);
+#else
+    npyv_f32 recipe = vrecpeq_f32(a);
+    /**
+     * Newton-Raphson iteration:
+     *  x[n+1] = x[n] * (2 - d * x[n])
+     * converges to (1/d) if x0 is the result of VRECPE applied to d.
+     *
+     * NOTE: at least 3 iterations are needed to improve precision
+     */
+    recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+    recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+    recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+    return recipe;
+#endif
+}
+#if NPY_SIMD_F64
+    NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+    {
+        const npyv_f64 one = vdupq_n_f64(1.0);
+        return npyv_div_f64(one, a);
+    }
+#endif // NPY_SIMD_F64
+
+#endif // _NPY_SIMD_NEON_MATH_H
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
index 280a34297..c8ddc92ad 100644
--- a/numpy/core/src/common/simd/neon/neon.h
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -72,3 +72,4 @@ typedef float64x2x3_t npyv_f64x3;
 #include "operators.h"
 #include "conversion.h"
 #include "arithmetic.h"
+#include "math.h"
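The Newton-Raphson refinement used in the NEON fallbacks above can be sanity-checked in scalar C. This sketch (illustrative only; the 0.1 starting guess stands in for VRECPE's rough estimate, and convergence requires 0 < x0 < 2/d) applies x[n+1] = x[n] * (2 - d * x[n]) and shows the quadratic convergence the comments rely on:

    #include <stdio.h>

    int main(void)
    {
        double d = 7.0;
        double x = 0.1;                /* crude initial estimate of 1/d */
        for (int i = 0; i < 5; ++i) {
            x = x * (2.0 - d * x);     /* Newton-Raphson step for 1/d */
            printf("iter %d: x = %.17g\n", i, x);
        }
        printf("true 1/d = %.17g\n", 1.0 / d);
        return 0;
    }

Each step roughly doubles the number of correct bits, which is why the code settles on three iterations to refine the ~8-bit hardware estimate to full single precision.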
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
new file mode 100644
index 000000000..b7203cd89
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -0,0 +1,40 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_MATH_H
+#define _NPY_SIMD_SSE_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm_sqrt_ps
+#define npyv_sqrt_f64 _mm_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm_div_ps(_mm_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm_div_pd(_mm_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+    return _mm_and_ps(
+        a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))
+    );
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+    return _mm_and_pd(
+        a, _mm_castsi128_pd(npyv_setall_s64(0x7fffffffffffffffLL))
+    );
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm_mul_pd(a, a); }
+
+#endif
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index 364b4baf1..132d3d347 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -64,3 +64,4 @@ typedef struct { __m128d val[3]; } npyv_f64x3;
 #include "operators.h"
 #include "conversion.h"
 #include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h
new file mode 100644
index 000000000..7c8610b19
--- /dev/null
+++ b/numpy/core/src/common/simd/vsx/math.h
@@ -0,0 +1,36 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_VSX_MATH_H
+#define _NPY_SIMD_VSX_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 vec_sqrt
+#define npyv_sqrt_f64 vec_sqrt
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{
+    const npyv_f32 one = npyv_setall_f32(1.0f);
+    return vec_div(one, a);
+}
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{
+    const npyv_f64 one = npyv_setall_f64(1.0);
+    return vec_div(one, a);
+}
+
+// Absolute
+#define npyv_abs_f32 vec_abs
+#define npyv_abs_f64 vec_abs
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return vec_mul(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return vec_mul(a, a); }
+
+#endif // _NPY_SIMD_VSX_MATH_H
diff --git a/numpy/core/src/common/simd/vsx/vsx.h b/numpy/core/src/common/simd/vsx/vsx.h
index 5525dc1e6..27dde98e7 100644
--- a/numpy/core/src/common/simd/vsx/vsx.h
+++ b/numpy/core/src/common/simd/vsx/vsx.h
@@ -62,3 +62,4 @@ typedef struct { npyv_f64 val[3]; } npyv_f64x3;
 #include "operators.h"
 #include "conversion.h"
 #include "arithmetic.h"
+#include "math.h"
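With the per-architecture math.h headers in place, any loop written against the npyv_* API compiles to SSE, AVX*, NEON, or VSX from a single source. A hypothetical contiguous kernel composed from the intrinsics added above (a sketch, not code from the patch; it assumes the in-tree simd/simd.h header and falls back to a scalar tail):

    #include "simd/simd.h" // NumPy's universal intrinsics (build-tree header)
    #include <math.h>

    // Hypothetical kernel: dst[i] = 1/sqrt(src[i]) for contiguous buffers.
    static void recip_sqrt_f32(const float *src, float *dst, npy_intp len)
    {
    #if NPY_SIMD
        const int vstep = npyv_nlanes_f32;
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_f32 v = npyv_load_f32(src);
            npyv_store_f32(dst, npyv_recip_f32(npyv_sqrt_f32(v)));
        }
    #endif
        for (; len > 0; --len) {
            *dst++ = 1.0f / sqrtf(*src++); // scalar tail (and non-SIMD fallback)
        }
    }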
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index ef3d5a21a..c9efdeb4e 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1491,26 +1491,6 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
  */
 
 /**begin repeat
- * Float types
- *  #type = npy_float, npy_double#
- *  #TYPE = FLOAT, DOUBLE#
- *  #scalarf = npy_sqrtf, npy_sqrt#
- */
-
-NPY_NO_EXPORT void
-@TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (!run_unary_simd_sqrt_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = @scalarf@(in1);
-        }
-    }
-}
-
-/**end repeat**/
-
-/**begin repeat
  * #func = rint, ceil, floor, trunc#
  * #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
  */
@@ -1579,53 +1559,6 @@ DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void
  * #typesub = f, #
  */
 
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_sqrt_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = npy_sqrt@typesub@(in1);
-        }
-    }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_absolute_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ tmp = in1 > 0 ? in1 : -in1;
-            /* add 0 to clear -0.0 */
-            *((@type@ *)op1) = tmp + 0;
-        }
-    }
-    npy_clear_floatstatus_barrier((char*)dimensions);
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_square_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = in1*in1;
-        }
-    }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_reciprocal_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = 1.0f/in1;
-        }
-    }
-}
-
 /**begin repeat2
  * #func = rint, ceil, floor, trunc#
  * #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
@@ -2048,33 +1981,6 @@ NPY_NO_EXPORT void
 }
 
 NPY_NO_EXPORT void
-@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    char * margs[] = {args[0], args[0], args[1]};
-    npy_intp msteps[] = {steps[0], steps[0], steps[1]};
-    if (!run_binary_simd_multiply_@TYPE@(margs, dimensions, msteps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *((@type@ *)op1) = in1*in1;
-        }
-    }
-}
-
-NPY_NO_EXPORT void
-@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    @type@ one = 1.@c@;
-    char * margs[] = {(char*)&one, args[0], args[1]};
-    npy_intp msteps[] = {0, steps[0], steps[1]};
-    if (!run_binary_simd_divide_@TYPE@(margs, dimensions, msteps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *((@type@ *)op1) = 1/in1;
-        }
-    }
-}
-
-NPY_NO_EXPORT void
 @TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
     OUTPUT_LOOP {
@@ -2092,20 +1998,6 @@ NPY_NO_EXPORT void
 }
 
 NPY_NO_EXPORT void
-@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (!run_unary_simd_absolute_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ tmp = in1 > 0 ? in1 : -in1;
-            /* add 0 to clear -0.0 */
-            *((@type@ *)op1) = tmp + 0;
-        }
-    }
-    npy_clear_floatstatus_barrier((char*)dimensions);
-}
-
-NPY_NO_EXPORT void
 @TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     if (!run_unary_simd_negative_@TYPE@(args, dimensions, steps)) {
@@ -2216,6 +2108,42 @@ NPY_NO_EXPORT void
 
 /*
  *****************************************************************************
+ **                         LONGDOUBLE LOOPS                                **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void
+LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {
+        const npy_longdouble in1 = *(npy_longdouble*)ip1;
+        *((npy_longdouble *)op1) = 1/in1;
+    }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_longdouble in1 = *(npy_longdouble *)ip1;
+        const npy_longdouble tmp = in1 > 0 ? in1 : -in1;
+        /* add 0 to clear -0.0 */
+        *((npy_longdouble *)op1) = tmp + 0;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {
+        const npy_longdouble in1 = *(npy_longdouble *)ip1;
+        *((npy_longdouble *)op1) = in1*in1;
+    }
+}
+
+/*
+ *****************************************************************************
  **                         HALF-FLOAT LOOPS                                **
  *****************************************************************************
  */
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 6b8b77e99..a0b68d168 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -6,6 +6,10 @@
 #ifndef _NPY_UMATH_LOOPS_H_
 #define _NPY_UMATH_LOOPS_H_
 
+#ifndef NPY_NO_EXPORT
+    #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
+#endif
+
 #define BOOL_invert BOOL_logical_not
 #define BOOL_add BOOL_logical_or
 #define BOOL_bitwise_and BOOL_logical_and
@@ -167,32 +171,29 @@ NPY_NO_EXPORT void
  **                             FLOAT LOOPS                                 **
  *****************************************************************************
  */
-
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary_fp.dispatch.h"
+#endif
 /**begin repeat
  * #TYPE = FLOAT, DOUBLE#
  */
-NPY_NO_EXPORT void
-@TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
 /**begin repeat1
- * #func = maximum, minimum#
+ * #kind = sqrt, absolute, square, reciprocal#
  */
-NPY_NO_EXPORT void
-@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
 /**end repeat1**/
+/**end repeat**/
 
-/**begin repeat1
- * #isa = avx512f, fma#
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
  */
-
-/**begin repeat2
- * #func = sqrt, absolute, square, reciprocal#
+/**begin repeat1
+ * #func = maximum, minimum#
  */
 NPY_NO_EXPORT void
-@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
-/**end repeat2**/
 /**end repeat1**/
 /**end repeat**/
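The NPY_CPU_DISPATCH_DECLARE lines above only declare the per-target symbols; a runtime check selects one of them when the ufunc tables are populated. As a rough sketch of the calling side (simplified and hand-written for illustration; the exact macro arguments follow npy_cpu_dispatch.h, and the real call sites are emitted by generate_umath.py, so treat the details as assumptions):

    // Illustrative only: resolving a dispatched symbol at runtime.
    // NPY_CPU_DISPATCH_CALL expands into a chain of NPY_CPU_HAVE(...) checks
    // that picks the highest-priority target compiled into the binary.
    #ifndef NPY_DISABLE_OPTIMIZATION
        #include "loops_unary_fp.dispatch.h"
    #endif

    static void
    call_float_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps)
    {
        NPY_CPU_DISPATCH_CALL(FLOAT_sqrt, (args, dimensions, steps, NULL));
    }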
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
new file mode 100644
index 000000000..3a1ea82f9
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -0,0 +1,219 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 vsx2 neon
+ **/
+/**
+ * Force the use of SSE only on x86, even if AVX2 or AVX512F are enabled
+ * through the baseline, since scatter (AVX512F) and gather are very costly
+ * for the non-contiguous memory accesses of the small operations
+ * this file covers, compared with SSE.
+*/
+#define NPY_SIMD_FORCE_128
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+/**********************************************************
+ ** Scalars
+ **********************************************************/
+#if !NPY_SIMD
+NPY_FINLINE float c_recip_f32(float a)
+{ return 1.0f / a; }
+NPY_FINLINE float c_abs_f32(float a)
+{
+    const float tmp = a > 0 ? a : -a;
+    /* add 0 to clear -0.0 */
+    return tmp + 0;
+}
+NPY_FINLINE float c_square_f32(float a)
+{ return a * a; }
+#endif // !NPY_SIMD
+
+#if !NPY_SIMD_F64
+NPY_FINLINE double c_recip_f64(double a)
+{ return 1.0 / a; }
+NPY_FINLINE double c_abs_f64(double a)
+{
+    const double tmp = a > 0 ? a : -a;
+    /* add 0 to clear -0.0 */
+    return tmp + 0;
+}
+NPY_FINLINE double c_square_f64(double a)
+{ return a * a; }
+#endif // !NPY_SIMD_F64
+/**
+ * MSVC (32-bit mode) requires an explicitly contiguous loop
+ * in order to use SSE; otherwise it uses a soft version of square root
+ * that doesn't raise a domain error.
+ */
+#if defined(_MSC_VER) && defined(_M_IX86) && !NPY_SIMD
+    #include <emmintrin.h>
+    NPY_FINLINE float c_sqrt_f32(float _a)
+    {
+        __m128 a = _mm_load_ss(&_a);
+        __m128 lower = _mm_sqrt_ss(a);
+        return _mm_cvtss_f32(lower);
+    }
+    NPY_FINLINE double c_sqrt_f64(double _a)
+    {
+        __m128d a = _mm_load_sd(&_a);
+        __m128d lower = _mm_sqrt_pd(a);
+        return _mm_cvtsd_f64(lower);
+    }
+#else
+    #define c_sqrt_f32 npy_sqrtf
+    #define c_sqrt_f64 npy_sqrt
+#endif
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/** Notes:
+ * - avoid the use of libmath to unify fp/domain errors
+ *   for both scalars and vectors among all compilers/architectures.
+ * - use the intrinsic npyv_load_till_* instead of npyv_load_tillz_*
+ *   to fill the remaining lanes with 1.0 to avoid the divide-by-zero fp
+ *   exception in reciprocal.
+ */
+#define CONTIG  0
+#define NCONTIG 1
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx  = f32, f64#
+ * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ */
+#if @VCHK@
+/**begin repeat1
+ * #kind     = sqrt, absolute, square, reciprocal#
+ * #intr     = sqrt, abs, square, recip#
+ * #repl_0w1 = 0, 0, 0, 1#
+ */
+/**begin repeat2
+ * #STYPE  = CONTIG, NCONTIG, CONTIG, NCONTIG#
+ * #DTYPE  = CONTIG, CONTIG, NCONTIG, NCONTIG#
+ * #unroll = 4, 4, 2, 2#
+ */
+static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_@sfx@ *src = _src;
+          npyv_lanetype_@sfx@ *dst = _dst;
+
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * @unroll@;
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        /**begin repeat3
+         * #N = 0, 1, 2, 3#
+         */
+        #if @unroll@ > @N@
+            #if @STYPE@ == CONTIG
+                npyv_@sfx@ v_src@N@ = npyv_load_@sfx@(src + vstep*@N@);
+            #else
+                npyv_@sfx@ v_src@N@ = npyv_loadn_@sfx@(src + ssrc*vstep*@N@, ssrc);
+            #endif
+            npyv_@sfx@ v_unary@N@ = npyv_@intr@_@sfx@(v_src@N@);
+        #endif
+        /**end repeat3**/
+        /**begin repeat3
+         * #N = 0, 1, 2, 3#
+         */
+        #if @unroll@ > @N@
+            #if @DTYPE@ == CONTIG
+                npyv_store_@sfx@(dst + vstep*@N@, v_unary@N@);
+            #else
+                npyv_storen_@sfx@(dst + sdst*vstep*@N@, sdst, v_unary@N@);
+            #endif
+        #endif
+        /**end repeat3**/
+    }
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if @STYPE@ == CONTIG
+        #if @repl_0w1@
+            npyv_@sfx@ v_src0 = npyv_load_till_@sfx@(src, len, 1);
+        #else
+            npyv_@sfx@ v_src0 = npyv_load_tillz_@sfx@(src, len);
+        #endif
+    #else
+        #if @repl_0w1@
+            npyv_@sfx@ v_src0 = npyv_loadn_till_@sfx@(src, ssrc, len, 1);
+        #else
+            npyv_@sfx@ v_src0 = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+        #endif
+    #endif
+        npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
+    #if @DTYPE@ == CONTIG
+        npyv_store_till_@sfx@(dst, len, v_unary0);
+    #else
+        npyv_storen_till_@sfx@(dst, sdst, len, v_unary0);
+    #endif
+    }
+    npyv_cleanup();
+}
+/**end repeat2**/
+/**end repeat1**/
+#endif // @VCHK@
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx  = f32, f64#
+ * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ */
+/**begin repeat1
+ * #kind  = sqrt, absolute, square, reciprocal#
+ * #intr  = sqrt, abs, square, recip#
+ * #clear = 0, 1, 0, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    const char *src = args[0]; char *dst = args[1];
+    const npy_intp src_step = steps[0];
+    const npy_intp dst_step = steps[1];
+    npy_intp len = dimensions[0];
+#if @VCHK@
+    const int lsize = sizeof(npyv_lanetype_@sfx@);
+    assert(src_step % lsize == 0 && dst_step % lsize == 0);
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+        goto no_unroll;
+    }
+    const npy_intp ssrc = src_step / lsize;
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_@sfx@(ssrc) || !npyv_storable_stride_@sfx@(sdst)) {
+        goto no_unroll;
+    }
+    if (ssrc == 1 && sdst == 1) {
+        simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) {
+        simd_@TYPE@_@kind@_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) {
+        simd_@TYPE@_@kind@_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else {
+        simd_@TYPE@_@kind@_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear;
+no_unroll:
+#endif // @VCHK@
+    for (; len > 0; --len, src += src_step, dst += dst_step) {
+    #if @VCHK@
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 0, dst, 0, 1);
+    #else
+        const npyv_lanetype_@sfx@ src0 = *(npyv_lanetype_@sfx@*)src;
+        *(npyv_lanetype_@sfx@*)dst = c_@intr@_@sfx@(src0);
+    #endif
+    }
+#if @VCHK@
+clear:;
+#endif
+#if @clear@
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat1**/
+/**end repeat**/
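Because the .src templating can be hard to read, here is roughly what one instantiation of the kernel above expands to (FLOAT/sqrt, contiguous source and destination, with the 4x unrolling elided); this is hand-expanded for illustration, so minor details may differ from the generated code:

    #if NPY_SIMD
    static void simd_FLOAT_sqrt_CONTIG_CONTIG
    (const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
    {
        const npyv_lanetype_f32 *src = _src;
        npyv_lanetype_f32 *dst = _dst;
        const int vstep = npyv_nlanes_f32;
        // full vectors
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_store_f32(dst, npyv_sqrt_f32(npyv_load_f32(src)));
        }
        // partial tail: load_tillz zero-fills the unused lanes
        if (len > 0) {
            npyv_f32 v = npyv_load_tillz_f32(src, len);
            npyv_store_till_f32(dst, len, npyv_sqrt_f32(v));
        }
        npyv_cleanup();
    }
    #endif

Note how the same tail path is reused by the inner functions above to process one element at a time, so the scalar fallback raises exactly the same fp/domain errors as the vector path.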
diff --git a/numpy/core/src/umath/loops_utils.h b/numpy/core/src/umath/loops_utils.h
new file mode 100644
index 000000000..f5540bdae
--- /dev/null
+++ b/numpy/core/src/umath/loops_utils.h
@@ -0,0 +1,42 @@
+#ifndef _NPY_UMATH_LOOPS_UTILS_H_
+#define _NPY_UMATH_LOOPS_UTILS_H_
+
+#include "numpy/npy_common.h" // NPY_FINLINE
+/*
+ * nomemoverlap - returns false if two strided arrays have an overlapping
+ * region in memory. ip_size/op_size = size of the arrays which can be negative
+ * indicating negative steps.
+ */
+NPY_FINLINE npy_bool
+nomemoverlap(char *ip, npy_intp ip_size, char *op, npy_intp op_size)
+{
+    char *ip_start, *ip_end, *op_start, *op_end;
+    if (ip_size < 0) {
+        ip_start = ip + ip_size;
+        ip_end   = ip;
+    }
+    else {
+        ip_start = ip;
+        ip_end   = ip + ip_size;
+    }
+    if (op_size < 0) {
+        op_start = op + op_size;
+        op_end   = op;
+    }
+    else {
+        op_start = op;
+        op_end   = op + op_size;
+    }
+    return (ip_start == op_start && op_end == ip_end) ||
+           (ip_start > op_end) || (op_start > ip_end);
+}
+
+// returns true if two strided arrays have an overlapping region in memory
+// same as `nomemoverlap()` but requires array length and step sizes
+NPY_FINLINE npy_bool
+is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst_step, npy_intp len)
+{
+    return !(nomemoverlap((char*)src, src_step*len, (char*)dst, dst_step*len));
+}
+
+#endif // _NPY_UMATH_LOOPS_UTILS_H_
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 40bb76914..fb4c84c9e 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -29,6 +29,7 @@
 #endif
 #endif
 #include "simd/simd.h"
+#include "loops_utils.h" // nomemoverlap
 #include <assert.h>
 #include <stdlib.h>
 #include <float.h>
@@ -51,37 +52,6 @@
  */
 #define MAX_STEP_SIZE 2097152
 
-/*
- * nomemoverlap - returns true if two strided arrays have an overlapping
- * region in memory. ip_size/op_size = size of the arrays which can be negative
- * indicating negative steps.
- */
-static NPY_INLINE npy_bool
-nomemoverlap(char *ip,
-             npy_intp ip_size,
-             char *op,
-             npy_intp op_size)
-{
-    char *ip_start, *ip_end, *op_start, *op_end;
-    if (ip_size < 0) {
-        ip_start = ip + ip_size;
-        ip_end = ip;
-    }
-    else {
-        ip_start = ip;
-        ip_end = ip + ip_size;
-    }
-    if (op_size < 0) {
-        op_start = op + op_size;
-        op_end = op;
-    }
-    else {
-        op_start = op;
-        op_end = op + op_size;
-    }
-    return (ip_start > op_end) | (op_start > ip_end);
-}
-
 #define IS_BINARY_STRIDE_ONE(esize, vsize) \
     ((steps[0] == esize) && \
      (steps[1] == esize) && \
@@ -390,7 +360,7 @@ run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp c
  */
 
 /**begin repeat2
- * #func = sqrt, absolute, square, reciprocal, rint, floor, ceil, trunc#
+ * #func = rint, floor, ceil, trunc#
 */
 
 #if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -510,9 +480,9 @@ run_unary_avx512f_log_DOUBLE(char **args, npy_intp const *dimensions, npy_intp c
 */
 
 /**begin repeat1
- * #func = sqrt, absolute, negative, minimum, maximum#
- * #check = IS_BLOCKABLE_UNARY*3, IS_BLOCKABLE_REDUCE*2 #
- * #name = unary*3, unary_reduce*2#
+ * #func = absolute, negative, minimum, maximum#
+ * #check = IS_BLOCKABLE_UNARY*2, IS_BLOCKABLE_REDUCE*2 #
+ * #name = unary*2, unary_reduce*2#
 */
 
 #if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -1357,33 +1327,6 @@ sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy
 }
 /**end repeat1**/
 
-static void
-sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
-{
-    /* align output to VECTOR_SIZE_BYTES bytes */
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
-        op[i] = @scalarf@(ip[i]);
-    }
-    assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
-           npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
-    if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
-            @vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
-            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
-        }
-    }
-    else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
-            @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
-            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
-        }
-    }
-    LOOP_BLOCKED_END {
-        op[i] = @scalarf@(ip[i]);
-    }
-}
-
-
 static NPY_INLINE
 @type@ scalar_abs_@type@(@type@ v)
 {
@@ -2458,9 +2401,8 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
 */
 
 /**begin repeat1
- * #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
- * #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
- * #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ * #func = rint, ceil, floor, trunc#
+ * #vectorf = rint, ceil, floor, trunc#
 */
 
 #if defined @CHK@
@@ -2475,10 +2417,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     npy_intp num_remaining_elements = array_size;
     @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
     @mask@ load_mask = @isa@_get_full_load_mask_ps();
-#if @replace_0_with_1@
-    @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask);
-#endif
-
     /*
      * Note: while generally indices are npy_intp, we ensure that our maximum index
      * will fit in an int32 as a precondition for this function via
@@ -2495,20 +2433,10 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
         if (num_remaining_elements < num_lanes) {
            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
                                                       num_lanes);
-#if @replace_0_with_1@
-            inv_load_mask = @isa@_invert_mask_ps(load_mask);
-#endif
        }
        @vtype@ x;
        if (stride == 1) {
            x = @isa@_masked_load_ps(load_mask, ip);
-#if @replace_0_with_1@
-            /*
-             * Replace masked elements with 1.0f to avoid divide by zero fp
-             * exception in reciprocal
-             */
-            x = @isa@_set_masked_lanes_ps(x, ones_f, inv_load_mask);
-#endif
        }
        else {
            x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask);
@@ -2544,9 +2472,8 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 */
 
 /**begin repeat1
- * #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
- * #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
- * #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ * #func = rint, ceil, floor, trunc#
+ * #vectorf = rint, ceil, floor, trunc#
 */
 
 #if defined @CHK@
@@ -2560,9 +2487,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double);
     npy_intp num_remaining_elements = array_size;
     @mask@ load_mask = @isa@_get_full_load_mask_pd();
-#if @replace_0_with_1@
-    @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask);
-#endif
     @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
 
     /*
@@ -2580,20 +2504,10 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
         if (num_remaining_elements < num_lanes) {
            load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements,
                                                       num_lanes);
-#if @replace_0_with_1@
-            inv_load_mask = @isa@_invert_mask_pd(load_mask);
-#endif
        }
        @vtype@ x;
        if (stride == 1) {
            x = @isa@_masked_load_pd(load_mask, ip);
-#if @replace_0_with_1@
-            /*
-             * Replace masked elements with 1.0f to avoid divide by zero fp
-             * exception in reciprocal
-             */
-            x = @isa@_set_masked_lanes_pd(x, ones_d, @castmask@(inv_load_mask));
-#endif
        }
        else {
            x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask));
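A quick illustration of the overlap predicate that moved into loops_utils.h: the helper treats the byte span of each strided array as an interval, and an exactly in-place operation (identical spans) deliberately counts as non-overlapping so in-place ufuncs can still take the SIMD path. A standalone sketch (it assumes compilation inside the NumPy source tree so that loops_utils.h is on the include path):

    #include <stdio.h>
    #include "loops_utils.h" // in-tree header added by this patch

    int main(void)
    {
        char buf[64];
        // identical spans (in-place ufunc): allowed, reported as no overlap
        printf("%d\n", (int)is_mem_overlap(buf, 4, buf, 4, 16));     // 0
        // destination shifted by one byte: spans intersect -> overlap
        printf("%d\n", (int)is_mem_overlap(buf, 4, buf + 1, 4, 16)); // 1
        return 0;
    }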
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 061d0e34f..13e8d5ede 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -1,6 +1,6 @@
 # NOTE: Please avoid the use of numpy.testing since NPYV intrinsics
 # may be involved in their functionality.
-import pytest
+import pytest, math
 
 from numpy.core._simd import targets
 
 class _Test_Utility:
@@ -80,6 +80,18 @@ class _Test_Utility:
         cvt_intrin = "reinterpret_u{0}_{1}"
         return getattr(self.npyv, cvt_intrin.format(sfx[1:], sfx))(vector)
 
+    def _pinfinity(self):
+        v = self.npyv.setall_u32(0x7f800000)
+        return self.npyv.reinterpret_f32_u32(v)[0]
+
+    def _ninfinity(self):
+        v = self.npyv.setall_u32(0xff800000)
+        return self.npyv.reinterpret_f32_u32(v)[0]
+
+    def _nan(self):
+        v = self.npyv.setall_u32(0x7fc00000)
+        return self.npyv.reinterpret_f32_u32(v)[0]
+
 class _SIMD_INT(_Test_Utility):
     """
     To test all integer vector types at once
@@ -150,6 +162,65 @@ class _SIMD_FP(_Test_Utility):
         data_nfms = self.mul(data_fma, self.setall(-1))
         assert nfms == data_nfms
 
+    def test_abs(self):
+        pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+        data = self._data()
+        vdata = self.load(self._data())
+
+        abs_cases = ((-0, 0), (ninf, pinf), (pinf, pinf), (nan, nan))
+        for case, desired in abs_cases:
+            data_abs = [desired]*self.nlanes
+            vabs = self.abs(self.setall(case))
+            assert vabs == pytest.approx(data_abs, nan_ok=True)
+
+        vabs = self.abs(self.mul(vdata, self.setall(-1)))
+        assert vabs == data
+
+    def test_sqrt(self):
+        pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+        data = self._data()
+        vdata = self.load(self._data())
+
+        sqrt_cases = ((-0.0, -0.0), (0.0, 0.0), (-1.0, nan), (ninf, nan), (pinf, pinf))
+        for case, desired in sqrt_cases:
+            data_sqrt = [desired]*self.nlanes
+            sqrt = self.sqrt(self.setall(case))
+            assert sqrt == pytest.approx(data_sqrt, nan_ok=True)
+
+        data_sqrt = self.load([math.sqrt(x) for x in data]) # load to truncate precision
+        sqrt = self.sqrt(vdata)
+        assert sqrt == data_sqrt
+
+    def test_square(self):
+        pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+        data = self._data()
+        vdata = self.load(self._data())
+        # square
+        square_cases = ((nan, nan), (pinf, pinf), (ninf, pinf))
+        for case, desired in square_cases:
+            data_square = [desired]*self.nlanes
+            square = self.square(self.setall(case))
+            assert square == pytest.approx(data_square, nan_ok=True)
+
+        data_square = [x*x for x in data]
+        square = self.square(vdata)
+        assert square == data_square
+
+    def test_reciprocal(self):
+        pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+        data = self._data()
+        vdata = self.load(self._data())
+
+        recip_cases = ((nan, nan), (pinf, 0.0), (ninf, -0.0), (0.0, pinf), (-0.0, ninf))
+        for case, desired in recip_cases:
+            data_recip = [desired]*self.nlanes
+            recip = self.recip(self.setall(case))
+            assert recip == pytest.approx(data_recip, nan_ok=True)
+
+        data_recip = self.load([1/x for x in data]) # load to truncate precision
+        recip = self.recip(vdata)
+        assert recip == data_recip
+
 class _SIMD_ALL(_Test_Utility):
     """
     To test all vector types at once
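The special-value tables in these tests mirror plain IEEE-754 scalar semantics, which is what lets the same expectations hold across every SIMD target. An independent scalar C cross-check of the same expectations (not part of the patch):

    #include <stdio.h>
    #include <math.h>

    int main(void)
    {
        printf("sqrt(-1)  = %f\n", sqrt(-1.0));     /* nan  */
        printf("sqrt(inf) = %f\n", sqrt(INFINITY)); /* inf  */
        printf("sqrt(-0)  = %f\n", sqrt(-0.0));     /* -0.0 */
        printf("1/0       = %f\n", 1.0 /  0.0);     /* inf  */
        printf("1/-0      = %f\n", 1.0 / -0.0);     /* -inf */
        printf("1/inf     = %f\n", 1.0 / INFINITY); /* 0.0  */
        return 0;
    }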
