6 files changed, 128 insertions, 2 deletions
diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py
index 858dcccfc..36d8621e8 100644
--- a/benchmarks/benchmarks/bench_ufunc.py
+++ b/benchmarks/benchmarks/bench_ufunc.py
@@ -294,3 +294,23 @@ class ArgParsingReduce(Benchmark):
 
     def time_add_reduce_arg_parsing(self, arg_pack):
         np.add.reduce(*arg_pack.args, **arg_pack.kwargs)
+
+class BinaryBench(Benchmark):
+    def setup(self):
+        N = 1000000
+        self.a32 = np.random.rand(N).astype(np.float32)
+        self.b32 = np.random.rand(N).astype(np.float32)
+        self.a64 = np.random.rand(N).astype(np.float64)
+        self.b64 = np.random.rand(N).astype(np.float64)
+    
+    def time_pow_32(self):
+        np.power(self.a32, self.b32)
+
+    def time_pow_64(self):
+        np.power(self.a64, self.b64)
+
+    def time_atan2_32(self):
+        np.arctan2(self.a32, self.b32)
+
+    def time_atan2_64(self):
+        np.arctan2(self.a64, self.b64)
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index e9a2cc6f5..89320128b 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -396,6 +396,8 @@ defdict = {
           docstrings.get('numpy.core.umath.power'),
           None,
           TD(ints),
+          TD('e', f='pow', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(inexact, f='pow', astype={'e': 'f'}),
           TD(O, f='npy_ObjectPower'),
           ),
@@ -877,6 +879,7 @@ defdict = {
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.arctan2'),
           None,
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
           TD(flts, f='atan2', astype={'e': 'f'}),
           TD(P, f='arctan2'),
           ),
diff --git a/numpy/core/src/common/npy_svml.h b/numpy/core/src/common/npy_svml.h
index 1111025d7..77e1e1edc 100644
--- a/numpy/core/src/common/npy_svml.h
+++ b/numpy/core/src/common/npy_svml.h
@@ -13,13 +13,14 @@ extern __m512 __svml_tanf16(__m512 x);
 extern __m512 __svml_asinf16(__m512 x);
 extern __m512 __svml_acosf16(__m512 x);
 extern __m512 __svml_atanf16(__m512 x);
-extern __m512 __svml_atan2f16(__m512 x);
+extern __m512 __svml_atan2f16(__m512 x, __m512 y);
 extern __m512 __svml_sinhf16(__m512 x);
 extern __m512 __svml_coshf16(__m512 x);
 extern __m512 __svml_tanhf16(__m512 x);
 extern __m512 __svml_asinhf16(__m512 x);
 extern __m512 __svml_acoshf16(__m512 x);
 extern __m512 __svml_atanhf16(__m512 x);
+extern __m512 __svml_powf16(__m512 x, __m512 y);
 
 extern __m512d __svml_exp8(__m512d x);
 extern __m512d __svml_exp28(__m512d x);
@@ -35,11 +36,12 @@ extern __m512d __svml_tan8(__m512d x);
 extern __m512d __svml_asin8(__m512d x);
 extern __m512d __svml_acos8(__m512d x);
 extern __m512d __svml_atan8(__m512d x);
-extern __m512d __svml_atan28(__m512d x);
+extern __m512d __svml_atan28(__m512d x, __m512d y);
 extern __m512d __svml_sinh8(__m512d x);
 extern __m512d __svml_cosh8(__m512d x);
 extern __m512d __svml_tanh8(__m512d x);
 extern __m512d __svml_asinh8(__m512d x);
 extern __m512d __svml_acosh8(__m512d x);
 extern __m512d __svml_atanh8(__m512d x);
+extern __m512d __svml_pow8(__m512d x, __m512d y);
 #endif
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 5af9f1788..4b363d84b 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -286,6 +286,19 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@,
  *  #TYPE = FLOAT, DOUBLE#
  */
 /**begin repeat1
+ * #func = power, arctan2#
+ */
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ *  #TYPE = FLOAT, DOUBLE#
+ */
+/**begin repeat1
  * #func = maximum, minimum#
  */
 NPY_NO_EXPORT void
diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
index 281b4231d..538b5039f 100644
--- a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
@@ -74,6 +74,47 @@ simd_@func@_f64(const double *src, npy_intp ssrc,
     npyv_cleanup();
 }
 /**end repeat**/
+
+/**begin repeat
+ * #sfx = f32, f64#
+ * #func_suffix = f16, 8#
+ */
+/**begin repeat1
+ * #func = pow, atan2#
+ */
+ 
+static void
+simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src1, npy_intp ssrc1,
+                  const npyv_lanetype_@sfx@ *src2, npy_intp ssrc2,
+                        npyv_lanetype_@sfx@ *dst, npy_intp sdst, npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    for (; len > 0; len -= vstep, src1 += ssrc1*vstep, src2 += ssrc2*vstep, dst += sdst*vstep) {
+        npyv_@sfx@ x1;
+        if (ssrc1 == 1) {
+            x1 = npyv_load_till_@sfx@(src1, len, 1);
+        } else {
+            x1 = npyv_loadn_till_@sfx@(src1, ssrc1, len, 1);
+        }
+        
+        npyv_@sfx@ x2;
+        if (ssrc2 == 1) {
+            x2 = npyv_load_till_@sfx@(src2, len, 1);
+        } else {
+            x2 = npyv_loadn_till_@sfx@(src2, ssrc2, len, 1);
+        }
+        
+        npyv_@sfx@ out = __svml_@func@@func_suffix@(x1, x2);
+        if (sdst == 1) {
+            npyv_store_till_@sfx@(dst, len, out);
+        } else {
+            npyv_storen_till_@sfx@(dst, sdst, len, out);
+        }
+    }
+}
+/**end repeat1**/
+/**end repeat**/
+
 #endif
 
 /**begin repeat
@@ -113,6 +154,45 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
 /**end repeat**/
 
 /**begin repeat
+ *  #TYPE = DOUBLE, FLOAT#
+ *  #type = npy_double, npy_float#
+ *  #vsub = , f#
+ *  #sfx  = f64, f32#
+ */
+/**begin repeat1
+ *  #func = power, arctan2#
+ *  #intrin = pow, atan2#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const @type@ *src1 = (@type@*)args[0];
+    const @type@ *src2 = (@type@*)args[1];
+          @type@ *dst  = (@type@*)args[2];
+    const int lsize = sizeof(src1[0]);
+    const npy_intp ssrc1 = steps[0] / lsize;
+    const npy_intp ssrc2 = steps[1] / lsize;
+    const npy_intp sdst  = steps[2] / lsize;
+    const npy_intp len = dimensions[0];
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    if (!is_mem_overlap(src1, steps[0], dst, steps[2], len) && !is_mem_overlap(src2, steps[1], dst, steps[2], len) &&
+        npyv_loadable_stride_@sfx@(ssrc1) && npyv_loadable_stride_@sfx@(ssrc2) &&
+        npyv_storable_stride_@sfx@(sdst)) {
+        simd_@intrin@_@sfx@(src1, ssrc1, src2, ssrc2, dst, sdst, len);
+        return;
+    }
+#endif
+    BINARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        const @type@ in2 = *(@type@ *)ip2;
+        *(@type@ *)op1 = npy_@intrin@@vsub@(in1, in2);
+    }
+}
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
  *  #func = sin, cos#
  */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index bc8927302..afe42b56a 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -1095,6 +1095,14 @@ class TestPower:
             assert_raises(ValueError, np.power, a, minusone)
             assert_raises(ValueError, np.power, one, b)
             assert_raises(ValueError, np.power, one, minusone)
+    
+    def test_float_to_inf_power(self):
+        for dt in [np.float32, np.float64]:
+            a = np.array([1, 1, 2, 2, -2, -2, np.inf, -np.inf], dt)
+            b = np.array([np.inf, -np.inf, np.inf, -np.inf,
+                                np.inf, -np.inf, np.inf, -np.inf], dt)
+            r = np.array([1, 1, np.inf, 0, np.inf, 0, np.inf, 0], dt)
+            assert_equal(np.power(a, b), r)
 
 
 class TestFloat_power: