MAINT, TST: Add testing cases for partial/non-contig load and store

author: Sayed Adel <seiko@imavr.com> 2020-10-27 11:43:09 +0000
committer: Sayed Adel <seiko@imavr.com> 2020-10-27 11:46:58 +0000
commit: 5d8c3be00dcc2b4a64dcd1ff108474b992edbc53 (patch)
tree: dac6f14d957ae2becc7ac6d2d59659c04f6e6960 /numpy
parent: 8cc5009857391ab2a11fb1af1042c0b83544e97d (diff)
download: numpy-5d8c3be00dcc2b4a64dcd1ff108474b992edbc53.tar.gz
2 files changed, 316 insertions, 7 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 1989be7e3..45e1762f4 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -20,6 +20,7 @@
  * #mul_sup   = 1,  1,  1,   1,   1,   1,   0,   0,   1,   1#
  * #div_sup   = 0,  0,  0,   0,   0,   0,   0,   0,   1,   1#
  * #fused_sup = 0,  0,  0,   0,   0,   0,   0,   0,   1,   1#
+ * #ncont_sup = 0,  0,  0,   0,   1,   1,   1,   1,   1,   1#
  * #shl_imm   = 0,  0,  15,  15,  31,  31,  63,  63,  0,   0#
  * #shr_imm   = 0,  0,  16,  16,  32,  32,  64,  64,  0,   0#
  */
@@ -61,6 +62,172 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
 }
 /**end repeat1**/
 
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if @ncont_sup@
+// Partial Load (called from Python as load_till(seq, nlane, fill) and load_tillz(seq, nlane))
+SIMD_IMPL_INTRIN_3(load_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@)
+SIMD_IMPL_INTRIN_2(load_tillz_@sfx@, v@sfx@, q@sfx@, u32)
+
+// Partial Store: Python-side store_till(seq, nlane, vec); updates seq in place and returns None
+static PyObject *
+simd__intrin_store_till_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:store_till_@sfx@",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_store_till_@sfx@(
+        seq_arg.data.q@sfx@, nlane_arg.data.u32, vec_arg.data.v@sfx@
+    );
+    // write-back: the store hit the converted C buffer, so push it into the caller's Python sequence
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
+        simd_arg_free(&seq_arg);
+        return NULL;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+}
+
+// Non-contiguous Load: (seq, stride[, nlane[, fill]]) -> vector; lanes are `stride` elements apart
+/**begin repeat1
+ * #intrin = loadn, loadn_till, loadn_tillz#
+ * #till   = 0,     1,          1#
+ * #fill   = 0,     1,          0#
+ * #format = ,    O&O&,         O&#
+ */
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if @till@
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if @fill@
+    simd_arg fill_arg = {.dtype = simd_data_@sfx@};
+#endif // fill
+    if (!PyArg_ParseTuple(
+        args, "@format@O&O&:@intrin@_@sfx@",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if @till@
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if @fill@
+        ,simd_arg_converter, &fill_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len -1; // negative stride: begin at the last element and walk backwards
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // reject sequences too short for the requested stride
+        PyErr_Format(PyExc_ValueError,
+            "@intrin@_@sfx@(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_@sfx@ rvec = npyv_@intrin@_@sfx@(
+        seq_ptr, stride
+    #if @till@
+        , nlane_arg.data.u32
+    #endif
+    #if @fill@
+        , fill_arg.data.@sfx@
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_v@sfx@, .data = {.v@sfx@=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+/**end repeat1**/
+
+// Non-contiguous Store: (seq, stride[, nlane], vec) -> None; seq is updated in place
+/**begin repeat1
+ * #intrin = storen, storen_till#
+ * #till   = 0,      1#
+ * #format = ,       O&#
+ */
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
+#if @till@
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "@format@O&O&O&:@intrin@_@sfx@",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if @till@
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len -1; // negative stride: begin at the last element and walk backwards
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: refuse to store into a sequence too short for the requested stride
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "@intrin@_@sfx@(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_@intrin@_@sfx@(
+        seq_ptr, stride
+    #if @till@
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.v@sfx@
+    );
+    // write-back: the store hit the converted C buffer, so push it into the caller's Python sequence
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+/**end repeat1**/
+#endif // @ncont_sup@
+
+
 /***************************
  * Misc
  ***************************/
@@ -205,6 +372,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
  * #mul_sup   = 1,  1,  1,   1,   1,   1,   0,   0,   1,   1#
  * #div_sup   = 0,  0,  0,   0,   0,   0,   0,   0,   1,   1#
  * #fused_sup = 0,  0,  0,   0,   0,   0,   0,   0,   1,   1#
+ * #ncont_sup = 0,  0,  0,   0,   1,   1,   1,   1,   1,   1#
  * #shl_imm   = 0,  0,  15,  15,  31,  31,  63,  63,  0,   0#
  * #shr_imm   = 0,  0,  16,  16,  32,  32,  64,  64,  0,   0#
  */
@@ -219,6 +387,19 @@ static PyMethodDef simd__intrinsics_methods[] = {
 SIMD_INTRIN_DEF(@intrin@_@sfx@)
 /**end repeat1**/
 
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if @ncont_sup@
+/**begin repeat1
+ * #intrin = load_till, load_tillz, loadn, loadn_till, loadn_tillz,
+ *           store_till, storen, storen_till#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // ncont_sup
+
+
 /***************************
  * Misc
  ***************************/
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 77a636491..50e77a4b8 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -11,18 +11,20 @@ class _Test_Utility:
 
     def __getattr__(self, attr):
         """
-        To call NPV intrinsics without the prefix 'npyv_' and
+        To call NPYV intrinsics without the attribute 'npyv' and
         auto suffixing intrinsics according to class attribute 'sfx'
         """
         return getattr(self.npyv, attr + "_" + self.sfx)
 
-    def _data(self, n=None, reverse=False):
+    def _data(self, start=None, count=None, reverse=False):
         """
         Create list of consecutive numbers according to number of vector's lanes.
         """
-        if n is None:
-            n = 1
-        rng = range(n, n + self.nlanes)
+        if start is None:
+            start = 1
+        if count is None:
+            count = self.nlanes
+        rng = range(start, start + count)
         if reverse:
             rng = reversed(rng)
         if self._is_fp():
@@ -196,6 +198,132 @@ class _SIMD_ALL(_Test_Utility):
         assert store_h[:self.nlanes//2] == data[self.nlanes//2:]
         assert store_h != vdata  # detect overflow
 
+    def test_memory_partial_load(self):
+        if self.sfx in ("u8", "s8", "u16", "s16"):
+            return  # presumably mirrors ncont_sup=0 for 8/16-bit lanes in the C module -- confirm
+
+        data = self._data()
+        lanes = list(range(1, self.nlanes + 1))
+        lanes += [self.nlanes**2, self.nlanes**4] # lane counts above nlanes ("out of range")
+        for n in lanes:
+            load_till  = self.load_till(data, n, 15)
+            data_till  = data[:n] + [15] * (self.nlanes-n)  # for n > nlanes the fill list is empty
+            assert load_till == data_till
+            load_tillz = self.load_tillz(data, n)
+            data_tillz = data[:n] + [0] * (self.nlanes-n)  # same expectation, zero-filled tail
+            assert load_tillz == data_tillz
+
+    def test_memory_partial_store(self):
+        if self.sfx in ("u8", "s8", "u16", "s16"):
+            return  # presumably mirrors ncont_sup=0 for 8/16-bit lanes in the C module -- confirm
+
+        data = self._data()
+        data_rev = self._data(reverse=True)
+        vdata = self.load(data)
+        lanes = list(range(1, self.nlanes + 1))
+        lanes += [self.nlanes**2, self.nlanes**4]  # lane counts above nlanes
+        for n in lanes:
+            data_till = data_rev.copy()
+            data_till[:n] = data[:n]  # expected: first n lanes overwritten, the rest keep reversed data
+            store_till = self._data(reverse=True)
+            self.store_till(store_till, n, vdata)  # result is observed through the list itself
+            assert store_till == data_till
+
+    def test_memory_noncont_load(self):
+        if self.sfx in ("u8", "s8", "u16", "s16"):
+            return  # presumably mirrors ncont_sup=0 for 8/16-bit lanes in the C module -- confirm
+
+        for stride in range(1, 64):
+            data = self._data(count=stride*self.nlanes)
+            data_stride = data[::stride]  # expected lanes: every stride-th element
+            loadn = self.loadn(data, stride)
+            assert loadn == data_stride
+
+        for stride in range(-64, 0):
+            data = self._data(stride, -stride*self.nlanes)  # start value is negative here
+            data_stride = self.load(data[::stride]) # cast unsigned (expected values go through load())
+            loadn = self.loadn(data, stride)
+            assert loadn == data_stride
+
+    def test_memory_noncont_partial_load(self):
+        if self.sfx in ("u8", "s8", "u16", "s16"):
+            return  # presumably mirrors ncont_sup=0 for 8/16-bit lanes in the C module -- confirm
+
+        lanes = list(range(1, self.nlanes + 1))
+        lanes += [self.nlanes**2, self.nlanes**4]  # lane counts above nlanes
+        for stride in range(1, 64):
+            data = self._data(count=stride*self.nlanes)
+            data_stride = data[::stride]  # every stride-th element
+            for n in lanes:
+                data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)  # tail filled with 15
+                loadn_till = self.loadn_till(data, stride, n, 15)
+                assert loadn_till == data_stride_till
+                data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)  # tail filled with 0
+                loadn_tillz = self.loadn_tillz(data, stride, n)
+                assert loadn_tillz == data_stride_tillz
+
+        for stride in range(-64, 0):
+            data = self._data(stride, -stride*self.nlanes)  # start value is negative here
+            data_stride = list(self.load(data[::stride])) # cast unsigned; list() so it can be sliced and concatenated
+            for n in lanes:
+                data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)
+                loadn_till = self.loadn_till(data, stride, n, 15)
+                assert loadn_till == data_stride_till
+                data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)
+                loadn_tillz = self.loadn_tillz(data, stride, n)
+                assert loadn_tillz == data_stride_tillz
+
+    def test_memory_noncont_store(self):
+        if self.sfx in ("u8", "s8", "u16", "s16"):
+            return  # presumably mirrors ncont_sup=0 for 8/16-bit lanes in the C module -- confirm
+
+        vdata = self.load(self._data())
+        for stride in range(1, 64):
+            data = [15] * stride * self.nlanes
+            data[::stride] = vdata  # expected: lanes land on every stride-th slot
+            storen = [15] * stride * self.nlanes
+            storen += [127]*64  # trailing guard region
+            self.storen(storen, stride, vdata)
+            assert storen[:-64] == data
+            assert storen[-64:] == [127]*64 # detect overflow: guard must be untouched
+
+        for stride in range(-64, 0):
+            data = [15] * -stride * self.nlanes
+            data[::stride] = vdata  # negative step fills from the last slot backwards
+            storen = [127]*64  # leading guard region, since writes move towards the front
+            storen += [15] * -stride * self.nlanes
+            self.storen(storen, stride, vdata)
+            assert storen[64:] == data
+            assert storen[:64] == [127]*64 # detect overflow: guard must be untouched
+
+    def test_memory_noncont_partial_store(self):
+        if self.sfx in ("u8", "s8", "u16", "s16"):
+            return  # presumably mirrors ncont_sup=0 for 8/16-bit lanes in the C module -- confirm
+
+        data = self._data()
+        vdata = self.load(data)
+        lanes = list(range(1, self.nlanes + 1))
+        lanes += [self.nlanes**2, self.nlanes**4]  # lane counts above nlanes
+        for stride in range(1, 64):
+            for n in lanes:
+                data_till = [15] * stride * self.nlanes
+                data_till[::stride] = data[:n] + [15] * (self.nlanes-n)  # only the first n lanes change
+                storen_till = [15] * stride * self.nlanes
+                storen_till += [127]*64  # trailing guard region
+                self.storen_till(storen_till, stride, n, vdata)
+                assert storen_till[:-64] == data_till
+                assert storen_till[-64:] == [127]*64 # detect overflow: guard must be untouched
+
+        for stride in range(-64, 0):
+            for n in lanes:
+                data_till = [15] * -stride * self.nlanes
+                data_till[::stride] = data[:n] + [15] * (self.nlanes-n)  # filled from the last slot backwards
+                storen_till = [127]*64  # leading guard region, since writes move towards the front
+                storen_till += [15] * -stride * self.nlanes
+                self.storen_till(storen_till, stride, n, vdata)
+                assert storen_till[64:] == data_till
+                assert storen_till[:64] == [127]*64 # detect overflow: guard must be untouched
+
     def test_misc(self):
         broadcast_zero = self.zero()
         assert broadcast_zero == [0] * self.nlanes
@@ -388,7 +516,7 @@ class _SIMD_ALL(_Test_Utility):
 int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64")
 fp_sfx  = ("f32", "f64")
 all_sfx = int_sfx + fp_sfx
-tests_registery = {
+tests_registry = {
     int_sfx : _SIMD_INT,
     fp_sfx  : _SIMD_FP,
     all_sfx : _SIMD_ALL
@@ -411,7 +539,7 @@ for target_name, npyv in targets.items():
     elif not npyv.simd_f64:
         skip_sfx["f64"] = f"target '{pretty_name}' doesn't support double-precision"
 
-    for sfxes, cls in tests_registery.items():
+    for sfxes, cls in tests_registry.items():
         for sfx in sfxes:
             skip_m = skip_sfx.get(sfx, skip)
             inhr = (cls,)
author	Sayed Adel <seiko@imavr.com>	2020-10-27 11:43:09 +0000
committer	Sayed Adel <seiko@imavr.com>	2020-10-27 11:46:58 +0000
commit	5d8c3be00dcc2b4a64dcd1ff108474b992edbc53 (patch)
tree	dac6f14d957ae2becc7ac6d2d59659c04f6e6960 /numpy
parent	8cc5009857391ab2a11fb1af1042c0b83544e97d (diff)
download	numpy-5d8c3be00dcc2b4a64dcd1ff108474b992edbc53.tar.gz