7 files changed, 88 insertions, 2 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 54770959c..5c494ae7a 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -381,7 +381,7 @@ SIMD_IMPL_INTRIN_1(sumup_@sfx@, @esfx@, v@sfx@)
  ***************************/
 #if @fp_only@
 /**begin repeat1
- * #intrin = sqrt, recip, abs, square#
+ * #intrin = sqrt, recip, abs, square, ceil#
  */
 SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
 /**end repeat1**/
@@ -615,7 +615,7 @@ SIMD_INTRIN_DEF(sumup_@sfx@)
  ***************************/
 #if @fp_only@
 /**begin repeat1
- * #intrin = sqrt, recip, abs, square#
+ * #intrin = sqrt, recip, abs, square, ceil#
  */
 SIMD_INTRIN_DEF(@intrin@_@sfx@)
 /**end repeat1**/
diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h
index 9460183df..b1f3915a6 100644
--- a/numpy/core/src/common/simd/avx2/math.h
+++ b/numpy/core/src/common/simd/avx2/math.h
@@ -105,4 +105,8 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
     return _mm256_blendv_epi8(a, b, _mm256_cmpgt_epi64(a, b));
 }
 
+// ceil: maps directly onto the AVX ceil intrinsics
+#define npyv_ceil_f32 _mm256_ceil_ps
+#define npyv_ceil_f64 _mm256_ceil_pd
+
 #endif // _NPY_SIMD_AVX2_MATH_H
diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h
index 0949b2b06..c4f8d3410 100644
--- a/numpy/core/src/common/simd/avx512/math.h
+++ b/numpy/core/src/common/simd/avx512/math.h
@@ -112,4 +112,8 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b)
 #define npyv_min_u64 _mm512_min_epu64
 #define npyv_min_s64 _mm512_min_epi64
 
+// ceil: roundscale with the round-toward-+infinity mode (_MM_FROUND_TO_POS_INF)
+#define npyv_ceil_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_POS_INF)
+#define npyv_ceil_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_POS_INF)
+
 #endif // _NPY_SIMD_AVX512_MATH_H
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
index 19ea6f22f..8d370e624 100644
--- a/numpy/core/src/common/simd/neon/math.h
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -153,4 +153,45 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
     return vbslq_s64(npyv_cmplt_s64(a, b), a, b);
 }
 
+// ceil
+#ifdef NPY_HAVE_ASIMD
+    #define npyv_ceil_f32 vrndpq_f32
+#else
+   /**
+    * Round each single-precision lane toward +infinity without ASIMD.
+    * NaN lanes and lanes outside the int32 range (infinities included) are
+    * returned unchanged; the input sign is kept so that e.g. -0.5 gives -0.0.
+    */
+   NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
+   {
+       // bits of -0.0f: the float sign bit, and INT32_MIN when read as int32
+       const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+       const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
+       const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
+       // truncate toward zero through a float -> int32 -> float round trip
+       npyv_s32 conv_trunci = vcvtq_s32_f32(a);
+       npyv_f32 conv_trunc = vcvtq_f32_s32(conv_trunci);
+       // add 1.0 only to the lanes where truncation fell below the input
+       npyv_f32 conv_ceil = vaddq_f32(conv_trunc, vreinterpretq_f32_u32(
+           vandq_u32(vcltq_f32(conv_trunc, a), one)
+       ));
+       // respect signed zero, e.g. -0.5 -> -0.0
+       npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
+           vreinterpretq_s32_f32(conv_ceil),
+           vandq_s32(vreinterpretq_s32_f32(a), szero)
+       ));
+       // NaN is the only value that compares unequal to itself
+       npyv_u32 notnan = vceqq_f32(a, a);
+       // INT32_MIN/INT32_MAX come from lanes the conversion clamped (or -2^31 itself)
+       npyv_u32 overflow = vorrq_u32(
+           vceqq_s32(conv_trunci, szero), vceqq_s32(conv_trunci, max_int)
+       );
+       // vbslq picks the first operand where the mask is set, the second elsewhere
+       return vbslq_f32(vbicq_u32(notnan, overflow), rzero, a);
+   }
+#endif
+#if NPY_SIMD_F64
+    #define npyv_ceil_f64 vrndpq_f64
+#endif // NPY_SIMD_F64
+
 #endif // _NPY_SIMD_NEON_MATH_H
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
index 97d35afc5..02eb06a29 100644
--- a/numpy/core/src/common/simd/sse/math.h
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -143,4 +143,51 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
     return npyv_select_s64(npyv_cmplt_s64(a, b), a, b);
 }
 
+// ceil
+#ifdef NPY_HAVE_SSE41
+    #define npyv_ceil_f32 _mm_ceil_ps
+    #define npyv_ceil_f64 _mm_ceil_pd
+#else
+    /**
+     * Round each single-precision lane toward +infinity using SSE2 only.
+     * Lanes that cannot be converted to int32 (NaN, infinities, out-of-range
+     * magnitudes) are returned unchanged.
+     */
+    NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
+    {
+        const npyv_f32 szero = _mm_set1_ps(-0.0f);
+        const npyv_f32 one = _mm_set1_ps(1.0f);
+        // truncate toward zero; unconvertible lanes come back as 0x80000000
+        npyv_s32 roundi = _mm_cvttps_epi32(a);
+        npyv_f32 round = _mm_cvtepi32_ps(roundi);
+        // add 1.0 only to the lanes where truncation fell below the input
+        npyv_f32 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, a), one));
+        // respect signed zero, e.g. -0.5 -> -0.0
+        npyv_f32 rzero = _mm_or_ps(ceil, _mm_and_ps(a, szero));
+        // if overflow return a (0x80000000 has the same bits as -0.0f)
+        return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero);
+    }
+    /**
+     * Round each double-precision lane toward +infinity using SSE2 only.
+     * Lanes with |a| >= 2^52 (infinities included) are returned unchanged.
+     */
+    NPY_FINLINE npyv_f64 npyv_ceil_f64(npyv_f64 a)
+    {
+        const npyv_f64 szero = _mm_set1_pd(-0.0);
+        const npyv_f64 one = _mm_set1_pd(1.0);
+        const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
+        npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero));
+        // round by add magic number 2^52
+        npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52);
+        npyv_f64 ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, a), one));
+        // respect signed zero, e.g. -0.5 -> -0.0
+        npyv_f64 rzero = _mm_or_pd(ceil, _mm_and_pd(a, szero));
+        // |a| >= 2^52 is already integral, but a + 2^52 is not always exactly
+        // representable there, so the magic-number step can land on a
+        // neighbouring integer; hand those lanes back untouched
+        npyv_f64 large = _mm_cmpge_pd(_mm_andnot_pd(szero, a), two_power_52);
+        return _mm_or_pd(_mm_and_pd(large, a), _mm_andnot_pd(large, rzero));
+    }
+#endif
+
 #endif // _NPY_SIMD_SSE_MATH_H
diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h
index b2e393c7c..f387dac4d 100644
--- a/numpy/core/src/common/simd/vsx/math.h
+++ b/numpy/core/src/common/simd/vsx/math.h
@@ -69,4 +69,8 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
 #define npyv_min_u64 vec_min
 #define npyv_min_s64 vec_min
 
+// ceil: vec_ceil serves both element widths
+#define npyv_ceil_f32 vec_ceil
+#define npyv_ceil_f64 vec_ceil
+
 #endif // _NPY_SIMD_VSX_MATH_H
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 0270ad901..379fef8af 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -330,6 +330,33 @@ class _SIMD_FP(_Test_Utility):
         square = self.square(vdata)
         assert square == data_square
 
+    @pytest.mark.parametrize("intrin, func", [("self.ceil", math.ceil)])
+    def test_rounding(self, intrin, func):
+        """
+        Test intrinsics (each checked against the scalar reference `func`):
+            npyv_ceil_##SFX
+        """
+        intrin = eval(intrin)  # resolve the "self.<name>" string now that self exists
+        pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+        # special cases: NaN and +/-inf are expected back unchanged in every lane
+        round_cases = ((nan, nan), (pinf, pinf), (ninf, ninf))
+        for case, desired in round_cases:
+            data_round = [desired]*self.nlanes
+            _round = intrin(self.setall(case))
+            assert _round == pytest.approx(data_round, nan_ok=True)
+        for x in range(0, 2**20, 256**2):  # 16 base values: 0, 65536, ..., 983040
+            for w in (-1.05, -1.10, -1.15, 1.05, 1.10, 1.15):
+                data = [x*w+a for a in range(self.nlanes)]
+                vdata = self.load(data)
+                data_round = [func(x) for x in data]
+                _round = intrin(vdata)
+                assert _round == data_round
+        # signed zero: expect -0.0; _to_unsigned presumably exposes the sign bit
+        for w in (-0.25, -0.30, -0.45):
+            _round = self._to_unsigned(intrin(self.setall(w)))
+            data_round = self._to_unsigned(self.setall(-0.0))
+            assert _round == data_round
+
     def test_max(self):
         """
         Test intrinsics: