summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
authorSayed Adel <seiko@imavr.com>2020-10-27 11:43:09 +0000
committerSayed Adel <seiko@imavr.com>2020-10-27 11:46:58 +0000
commit5d8c3be00dcc2b4a64dcd1ff108474b992edbc53 (patch)
treedac6f14d957ae2becc7ac6d2d59659c04f6e6960 /numpy
parent8cc5009857391ab2a11fb1af1042c0b83544e97d (diff)
downloadnumpy-5d8c3be00dcc2b4a64dcd1ff108474b992edbc53.tar.gz
MAINT, TST: Add testing cases for partial/non-contig load and store
Diffstat (limited to 'numpy')
-rw-r--r--numpy/core/src/_simd/_simd.dispatch.c.src181
-rw-r--r--numpy/core/tests/test_simd.py142
2 files changed, 316 insertions, 7 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 1989be7e3..45e1762f4 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -20,6 +20,7 @@
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
*/
@@ -61,6 +62,172 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
}
/**end repeat1**/
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if @ncont_sup@
+// Partial Load
+SIMD_IMPL_INTRIN_3(load_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@)
+SIMD_IMPL_INTRIN_2(load_tillz_@sfx@, v@sfx@, q@sfx@, u32)
+
+// Partial Store
+static PyObject *
+simd__intrin_store_till_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+ simd_arg nlane_arg = {.dtype = simd_data_u32};
+ simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
+ if (!PyArg_ParseTuple(
+ args, "O&O&O&:store_till_@sfx@",
+ simd_arg_converter, &seq_arg,
+ simd_arg_converter, &nlane_arg,
+ simd_arg_converter, &vec_arg
+ )) {
+ return NULL;
+ }
+ npyv_store_till_@sfx@(
+ seq_arg.data.q@sfx@, nlane_arg.data.u32, vec_arg.data.v@sfx@
+ );
+ // write-back
+ if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
+ simd_arg_free(&seq_arg);
+ return NULL;
+ }
+ simd_arg_free(&seq_arg);
+ Py_RETURN_NONE;
+}
+
+// Non-contiguous Load
+/**begin repeat1
+ * #intrin = loadn, loadn_till, loadn_tillz#
+ * #till = 0, 1, 1#
+ * #fill = 0, 1, 0#
+ * #format = , O&O&, O&#
+ */
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+ simd_arg stride_arg = {.dtype = simd_data_s64};
+#if @till@
+ simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if @fill@
+ simd_arg fill_arg = {.dtype = simd_data_@sfx@};
+#endif
+ if (!PyArg_ParseTuple(
+ args, "@format@O&O&:@intrin@_@sfx@",
+ simd_arg_converter, &seq_arg,
+ simd_arg_converter, &stride_arg
+#if @till@
+ ,simd_arg_converter, &nlane_arg
+#endif
+#if @fill@
+ ,simd_arg_converter, &fill_arg
+#endif
+ )) {
+ return NULL;
+ }
+ npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@;
+ npy_intp stride = (npy_intp)stride_arg.data.s64;
+ Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+ Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
+ if (stride < 0) {
+ seq_ptr += cur_seq_len -1;
+ min_seq_len = -min_seq_len;
+ }
+ if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "@intrin@_@sfx@(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+ goto err;
+ }
+ npyv_@sfx@ rvec = npyv_@intrin@_@sfx@(
+ seq_ptr, stride
+ #if @till@
+ , nlane_arg.data.u32
+ #endif
+ #if @fill@
+ , fill_arg.data.@sfx@
+ #endif
+ );
+ simd_arg ret = {
+ .dtype = simd_data_v@sfx@, .data = {.v@sfx@=rvec}
+ };
+ simd_arg_free(&seq_arg);
+ return simd_arg_to_obj(&ret);
+err:
+ simd_arg_free(&seq_arg);
+ return NULL;
+}
+/**end repeat1**/
+
+// Non-contiguous Store
+/**begin repeat1
+ * #intrin = storen, storen_till#
+ * #till = 0, 1#
+ * #format = , O&#
+ */
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+ simd_arg stride_arg = {.dtype = simd_data_s64};
+ simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
+#if @till@
+ simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "@format@O&O&O&:@intrin@_@sfx@",
+ simd_arg_converter, &seq_arg,
+ simd_arg_converter, &stride_arg
+#if @till@
+ ,simd_arg_converter, &nlane_arg
+#endif
+ ,simd_arg_converter, &vec_arg
+ )) {
+ return NULL;
+ }
+ npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@;
+ npy_intp stride = (npy_intp)stride_arg.data.s64;
+ Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+ Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
+ if (stride < 0) {
+ seq_ptr += cur_seq_len -1;
+ min_seq_len = -min_seq_len;
+ }
+ // overflow guard
+ if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "@intrin@_@sfx@(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+ goto err;
+ }
+ npyv_@intrin@_@sfx@(
+ seq_ptr, stride
+ #if @till@
+ ,nlane_arg.data.u32
+ #endif
+ ,vec_arg.data.v@sfx@
+ );
+ // write-back
+ if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
+ goto err;
+ }
+ simd_arg_free(&seq_arg);
+ Py_RETURN_NONE;
+err:
+ simd_arg_free(&seq_arg);
+ return NULL;
+}
+/**end repeat1**/
+#endif // @ncont_sup@
+
+
/***************************
* Misc
***************************/
@@ -205,6 +372,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
*/
@@ -219,6 +387,19 @@ static PyMethodDef simd__intrinsics_methods[] = {
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if @ncont_sup@
+/**begin repeat1
+ * #intrin = load_till, load_tillz, loadn, loadn_till, loadn_tillz,
+ * store_till, storen, storen_till#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // ncont_sup
+
+
/***************************
* Misc
***************************/
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 77a636491..50e77a4b8 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -11,18 +11,20 @@ class _Test_Utility:
def __getattr__(self, attr):
"""
- To call NPV intrinsics without the prefix 'npyv_' and
+ To call NPV intrinsics without the attribute 'npyv' and
auto suffixing intrinsics according to class attribute 'sfx'
"""
return getattr(self.npyv, attr + "_" + self.sfx)
- def _data(self, n=None, reverse=False):
+ def _data(self, start=None, count=None, reverse=False):
"""
Create list of consecutive numbers according to number of vector's lanes.
"""
- if n is None:
- n = 1
- rng = range(n, n + self.nlanes)
+ if start is None:
+ start = 1
+ if count is None:
+ count = self.nlanes
+ rng = range(start, start + count)
if reverse:
rng = reversed(rng)
if self._is_fp():
@@ -196,6 +198,132 @@ class _SIMD_ALL(_Test_Utility):
assert store_h[:self.nlanes//2] == data[self.nlanes//2:]
assert store_h != vdata # detect overflow
+ def test_memory_partial_load(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ data = self._data()
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4] # test out of range
+ for n in lanes:
+ load_till = self.load_till(data, n, 15)
+ data_till = data[:n] + [15] * (self.nlanes-n)
+ assert load_till == data_till
+ load_tillz = self.load_tillz(data, n)
+ data_tillz = data[:n] + [0] * (self.nlanes-n)
+ assert load_tillz == data_tillz
+
+ def test_memory_partial_store(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ data = self._data()
+ data_rev = self._data(reverse=True)
+ vdata = self.load(data)
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4]
+ for n in lanes:
+ data_till = data_rev.copy()
+ data_till[:n] = data[:n]
+ store_till = self._data(reverse=True)
+ self.store_till(store_till, n, vdata)
+ assert store_till == data_till
+
+ def test_memory_noncont_load(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ for stride in range(1, 64):
+ data = self._data(count=stride*self.nlanes)
+ data_stride = data[::stride]
+ loadn = self.loadn(data, stride)
+ assert loadn == data_stride
+
+ for stride in range(-64, 0):
+ data = self._data(stride, -stride*self.nlanes)
+ data_stride = self.load(data[::stride]) # cast unsigned
+ loadn = self.loadn(data, stride)
+ assert loadn == data_stride
+
+ def test_memory_noncont_partial_load(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4]
+ for stride in range(1, 64):
+ data = self._data(count=stride*self.nlanes)
+ data_stride = data[::stride]
+ for n in lanes:
+ data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)
+ loadn_till = self.loadn_till(data, stride, n, 15)
+ assert loadn_till == data_stride_till
+ data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)
+ loadn_tillz = self.loadn_tillz(data, stride, n)
+ assert loadn_tillz == data_stride_tillz
+
+ for stride in range(-64, 0):
+ data = self._data(stride, -stride*self.nlanes)
+ data_stride = list(self.load(data[::stride])) # cast unsigned
+ for n in lanes:
+ data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)
+ loadn_till = self.loadn_till(data, stride, n, 15)
+ assert loadn_till == data_stride_till
+ data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)
+ loadn_tillz = self.loadn_tillz(data, stride, n)
+ assert loadn_tillz == data_stride_tillz
+
+ def test_memory_noncont_store(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ vdata = self.load(self._data())
+ for stride in range(1, 64):
+ data = [15] * stride * self.nlanes
+ data[::stride] = vdata
+ storen = [15] * stride * self.nlanes
+ storen += [127]*64
+ self.storen(storen, stride, vdata)
+ assert storen[:-64] == data
+ assert storen[-64:] == [127]*64 # detect overflow
+
+ for stride in range(-64, 0):
+ data = [15] * -stride * self.nlanes
+ data[::stride] = vdata
+ storen = [127]*64
+ storen += [15] * -stride * self.nlanes
+ self.storen(storen, stride, vdata)
+ assert storen[64:] == data
+ assert storen[:64] == [127]*64 # detect overflow
+
+ def test_memory_noncont_partial_store(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ data = self._data()
+ vdata = self.load(data)
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4]
+ for stride in range(1, 64):
+ for n in lanes:
+ data_till = [15] * stride * self.nlanes
+ data_till[::stride] = data[:n] + [15] * (self.nlanes-n)
+ storen_till = [15] * stride * self.nlanes
+ storen_till += [127]*64
+ self.storen_till(storen_till, stride, n, vdata)
+ assert storen_till[:-64] == data_till
+ assert storen_till[-64:] == [127]*64 # detect overflow
+
+ for stride in range(-64, 0):
+ for n in lanes:
+ data_till = [15] * -stride * self.nlanes
+ data_till[::stride] = data[:n] + [15] * (self.nlanes-n)
+ storen_till = [127]*64
+ storen_till += [15] * -stride * self.nlanes
+ self.storen_till(storen_till, stride, n, vdata)
+ assert storen_till[64:] == data_till
+ assert storen_till[:64] == [127]*64 # detect overflow
+
def test_misc(self):
broadcast_zero = self.zero()
assert broadcast_zero == [0] * self.nlanes
@@ -388,7 +516,7 @@ class _SIMD_ALL(_Test_Utility):
int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64")
fp_sfx = ("f32", "f64")
all_sfx = int_sfx + fp_sfx
-tests_registery = {
+tests_registry = {
int_sfx : _SIMD_INT,
fp_sfx : _SIMD_FP,
all_sfx : _SIMD_ALL
@@ -411,7 +539,7 @@ for target_name, npyv in targets.items():
elif not npyv.simd_f64:
skip_sfx["f64"] = f"target '{pretty_name}' doesn't support double-precision"
- for sfxes, cls in tests_registery.items():
+ for sfxes, cls in tests_registry.items():
for sfx in sfxes:
skip_m = skip_sfx.get(sfx, skip)
inhr = (cls,)