diff options
| author | Sayed Adel <seiko@imavr.com> | 2020-10-27 11:43:09 +0000 |
|---|---|---|
| committer | Sayed Adel <seiko@imavr.com> | 2020-10-27 11:46:58 +0000 |
| commit | 5d8c3be00dcc2b4a64dcd1ff108474b992edbc53 (patch) | |
| tree | dac6f14d957ae2becc7ac6d2d59659c04f6e6960 /numpy | |
| parent | 8cc5009857391ab2a11fb1af1042c0b83544e97d (diff) | |
| download | numpy-5d8c3be00dcc2b4a64dcd1ff108474b992edbc53.tar.gz | |
MAINT, TST: Add testing cases for partial/non-contig load and store
Diffstat (limited to 'numpy')
| -rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src | 181 |
| -rw-r--r-- | numpy/core/tests/test_simd.py | 142 |
2 files changed, 316 insertions, 7 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index 1989be7e3..45e1762f4 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -20,6 +20,7 @@ * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1# * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1# * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# */ @@ -61,6 +62,172 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) } /**end repeat1**/ +/**************************************** + * Non-contiguous/Partial Memory access + ****************************************/ +#if @ncont_sup@ +// Partial Load +SIMD_IMPL_INTRIN_3(load_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@) +SIMD_IMPL_INTRIN_2(load_tillz_@sfx@, v@sfx@, q@sfx@, u32) + +// Partial Store +static PyObject * +simd__intrin_store_till_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) +{ + simd_arg seq_arg = {.dtype = simd_data_q@sfx@}; + simd_arg nlane_arg = {.dtype = simd_data_u32}; + simd_arg vec_arg = {.dtype = simd_data_v@sfx@}; + if (!PyArg_ParseTuple( + args, "O&O&O&:store_till_@sfx@", + simd_arg_converter, &seq_arg, + simd_arg_converter, &nlane_arg, + simd_arg_converter, &vec_arg + )) { + return NULL; + } + npyv_store_till_@sfx@( + seq_arg.data.q@sfx@, nlane_arg.data.u32, vec_arg.data.v@sfx@ + ); + // write-back + if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) { + simd_arg_free(&seq_arg); + return NULL; + } + simd_arg_free(&seq_arg); + Py_RETURN_NONE; +} + +// Non-contiguous Load +/**begin repeat1 + * #intrin = loadn, loadn_till, loadn_tillz# + * #till = 0, 1, 1# + * #fill = 0, 1, 0# + * #format = , O&O&, O&# + */ +static PyObject * +simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) +{ + simd_arg seq_arg = {.dtype = simd_data_q@sfx@}; + simd_arg stride_arg = 
{.dtype = simd_data_s64}; +#if @till@ + simd_arg nlane_arg = {.dtype = simd_data_u32}; +#endif // till +#if @fill@ + simd_arg fill_arg = {.dtype = simd_data_@sfx@}; +#endif + if (!PyArg_ParseTuple( + args, "@format@O&O&:@intrin@_@sfx@", + simd_arg_converter, &seq_arg, + simd_arg_converter, &stride_arg +#if @till@ + ,simd_arg_converter, &nlane_arg +#endif +#if @fill@ + ,simd_arg_converter, &fill_arg +#endif + )) { + return NULL; + } + npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@; + npy_intp stride = (npy_intp)stride_arg.data.s64; + Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr); + Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@; + if (stride < 0) { + seq_ptr += cur_seq_len -1; + min_seq_len = -min_seq_len; + } + if (cur_seq_len < min_seq_len) { + PyErr_Format(PyExc_ValueError, + "@intrin@_@sfx@(), according to provided stride %d, the " + "minimum acceptable size of the required sequence is %d, given(%d)", + stride, min_seq_len, cur_seq_len + ); + goto err; + } + npyv_@sfx@ rvec = npyv_@intrin@_@sfx@( + seq_ptr, stride + #if @till@ + , nlane_arg.data.u32 + #endif + #if @fill@ + , fill_arg.data.@sfx@ + #endif + ); + simd_arg ret = { + .dtype = simd_data_v@sfx@, .data = {.v@sfx@=rvec} + }; + simd_arg_free(&seq_arg); + return simd_arg_to_obj(&ret); +err: + simd_arg_free(&seq_arg); + return NULL; +} +/**end repeat1**/ + +// Non-contiguous Store +/**begin repeat1 + * #intrin = storen, storen_till# + * #till = 0, 1# + * #format = , O&# + */ +static PyObject * +simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) +{ + simd_arg seq_arg = {.dtype = simd_data_q@sfx@}; + simd_arg stride_arg = {.dtype = simd_data_s64}; + simd_arg vec_arg = {.dtype = simd_data_v@sfx@}; +#if @till@ + simd_arg nlane_arg = {.dtype = simd_data_u32}; +#endif + if (!PyArg_ParseTuple( + args, "@format@O&O&O&:storen_@sfx@", + simd_arg_converter, &seq_arg, + simd_arg_converter, &stride_arg +#if @till@ + ,simd_arg_converter, &nlane_arg +#endif + ,simd_arg_converter, 
&vec_arg + )) { + return NULL; + } + npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@; + npy_intp stride = (npy_intp)stride_arg.data.s64; + Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr); + Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@; + if (stride < 0) { + seq_ptr += cur_seq_len -1; + min_seq_len = -min_seq_len; + } + // overflow guard + if (cur_seq_len < min_seq_len) { + PyErr_Format(PyExc_ValueError, + "@intrin@_@sfx@(), according to provided stride %d, the" + "minimum acceptable size of the required sequence is %d, given(%d)", + stride, min_seq_len, cur_seq_len + ); + goto err; + } + npyv_@intrin@_@sfx@( + seq_ptr, stride + #if @till@ + ,nlane_arg.data.u32 + #endif + ,vec_arg.data.v@sfx@ + ); + // write-back + if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) { + goto err; + } + simd_arg_free(&seq_arg); + Py_RETURN_NONE; +err: + simd_arg_free(&seq_arg); + return NULL; +} +/**end repeat1**/ +#endif // @ncont_sup@ + + /*************************** * Misc ***************************/ @@ -205,6 +372,7 @@ static PyMethodDef simd__intrinsics_methods[] = { * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1# * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1# * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# */ @@ -219,6 +387,19 @@ static PyMethodDef simd__intrinsics_methods[] = { SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ +/**************************************** + * Non-contiguous/Partial Memory access + ****************************************/ +#if @ncont_sup@ +/**begin repeat1 + * #intrin = load_till, load_tillz, loadn, loadn_till, loadn_tillz, + * store_till, storen, storen_till# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ +#endif // ncont_sup + + /*************************** * Misc ***************************/ diff --git a/numpy/core/tests/test_simd.py 
b/numpy/core/tests/test_simd.py index 77a636491..50e77a4b8 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -11,18 +11,20 @@ class _Test_Utility: def __getattr__(self, attr): """ - To call NPV intrinsics without the prefix 'npyv_' and + To call NPV intrinsics without the attribute 'npyv' and auto suffixing intrinsics according to class attribute 'sfx' """ return getattr(self.npyv, attr + "_" + self.sfx) - def _data(self, n=None, reverse=False): + def _data(self, start=None, count=None, reverse=False): """ Create list of consecutive numbers according to number of vector's lanes. """ - if n is None: - n = 1 - rng = range(n, n + self.nlanes) + if start is None: + start = 1 + if count is None: + count = self.nlanes + rng = range(start, start + count) if reverse: rng = reversed(rng) if self._is_fp(): @@ -196,6 +198,132 @@ class _SIMD_ALL(_Test_Utility): assert store_h[:self.nlanes//2] == data[self.nlanes//2:] assert store_h != vdata # detect overflow + def test_memory_partial_load(self): + if self.sfx in ("u8", "s8", "u16", "s16"): + return + + data = self._data() + lanes = list(range(1, self.nlanes + 1)) + lanes += [self.nlanes**2, self.nlanes**4] # test out of range + for n in lanes: + load_till = self.load_till(data, n, 15) + data_till = data[:n] + [15] * (self.nlanes-n) + assert load_till == data_till + load_tillz = self.load_tillz(data, n) + data_tillz = data[:n] + [0] * (self.nlanes-n) + assert load_tillz == data_tillz + + def test_memory_partial_store(self): + if self.sfx in ("u8", "s8", "u16", "s16"): + return + + data = self._data() + data_rev = self._data(reverse=True) + vdata = self.load(data) + lanes = list(range(1, self.nlanes + 1)) + lanes += [self.nlanes**2, self.nlanes**4] + for n in lanes: + data_till = data_rev.copy() + data_till[:n] = data[:n] + store_till = self._data(reverse=True) + self.store_till(store_till, n, vdata) + assert store_till == data_till + + def test_memory_noncont_load(self): + if self.sfx in ("u8", 
"s8", "u16", "s16"): + return + + for stride in range(1, 64): + data = self._data(count=stride*self.nlanes) + data_stride = data[::stride] + loadn = self.loadn(data, stride) + assert loadn == data_stride + + for stride in range(-64, 0): + data = self._data(stride, -stride*self.nlanes) + data_stride = self.load(data[::stride]) # cast unsigned + loadn = self.loadn(data, stride) + assert loadn == data_stride + + def test_memory_noncont_partial_load(self): + if self.sfx in ("u8", "s8", "u16", "s16"): + return + + lanes = list(range(1, self.nlanes + 1)) + lanes += [self.nlanes**2, self.nlanes**4] + for stride in range(1, 64): + data = self._data(count=stride*self.nlanes) + data_stride = data[::stride] + for n in lanes: + data_stride_till = data_stride[:n] + [15] * (self.nlanes-n) + loadn_till = self.loadn_till(data, stride, n, 15) + assert loadn_till == data_stride_till + data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n) + loadn_tillz = self.loadn_tillz(data, stride, n) + assert loadn_tillz == data_stride_tillz + + for stride in range(-64, 0): + data = self._data(stride, -stride*self.nlanes) + data_stride = list(self.load(data[::stride])) # cast unsigned + for n in lanes: + data_stride_till = data_stride[:n] + [15] * (self.nlanes-n) + loadn_till = self.loadn_till(data, stride, n, 15) + assert loadn_till == data_stride_till + data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n) + loadn_tillz = self.loadn_tillz(data, stride, n) + assert loadn_tillz == data_stride_tillz + + def test_memory_noncont_store(self): + if self.sfx in ("u8", "s8", "u16", "s16"): + return + + vdata = self.load(self._data()) + for stride in range(1, 64): + data = [15] * stride * self.nlanes + data[::stride] = vdata + storen = [15] * stride * self.nlanes + storen += [127]*64 + self.storen(storen, stride, vdata) + assert storen[:-64] == data + assert storen[-64:] == [127]*64 # detect overflow + + for stride in range(-64, 0): + data = [15] * -stride * self.nlanes + data[::stride] = 
vdata + storen = [127]*64 + storen += [15] * -stride * self.nlanes + self.storen(storen, stride, vdata) + assert storen[64:] == data + assert storen[:64] == [127]*64 # detect overflow + + def test_memory_noncont_partial_store(self): + if self.sfx in ("u8", "s8", "u16", "s16"): + return + + data = self._data() + vdata = self.load(data) + lanes = list(range(1, self.nlanes + 1)) + lanes += [self.nlanes**2, self.nlanes**4] + for stride in range(1, 64): + for n in lanes: + data_till = [15] * stride * self.nlanes + data_till[::stride] = data[:n] + [15] * (self.nlanes-n) + storen_till = [15] * stride * self.nlanes + storen_till += [127]*64 + self.storen_till(storen_till, stride, n, vdata) + assert storen_till[:-64] == data_till + assert storen_till[-64:] == [127]*64 # detect overflow + + for stride in range(-64, 0): + for n in lanes: + data_till = [15] * -stride * self.nlanes + data_till[::stride] = data[:n] + [15] * (self.nlanes-n) + storen_till = [127]*64 + storen_till += [15] * -stride * self.nlanes + self.storen_till(storen_till, stride, n, vdata) + assert storen_till[64:] == data_till + assert storen_till[:64] == [127]*64 # detect overflow + def test_misc(self): broadcast_zero = self.zero() assert broadcast_zero == [0] * self.nlanes @@ -388,7 +516,7 @@ class _SIMD_ALL(_Test_Utility): int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64") fp_sfx = ("f32", "f64") all_sfx = int_sfx + fp_sfx -tests_registery = { +tests_registry = { int_sfx : _SIMD_INT, fp_sfx : _SIMD_FP, all_sfx : _SIMD_ALL @@ -411,7 +539,7 @@ for target_name, npyv in targets.items(): elif not npyv.simd_f64: skip_sfx["f64"] = f"target '{pretty_name}' doesn't support double-precision" - for sfxes, cls in tests_registery.items(): + for sfxes, cls in tests_registry.items(): for sfx in sfxes: skip_m = skip_sfx.get(sfx, skip) inhr = (cls,) |
