author     Sayed Adel <seiko@imavr.com>  2022-01-18 04:31:01 +0200
committer  Sayed Adel <seiko@imavr.com>  2022-02-13 14:32:52 +0200
commit     762538cf6c00c02d16c29a0c380fd0a91bc9e96a
tree       ec16884df895514071b5dcd41940a8424e540fcf
parent     211111082d1ee21492a1704699ea4b17c4a64ead
ENH, SIMD: improve argmax/argmin performance
for all integers, f32 and f64 data types on all supported architectures via universal intrinsics.
 benchmarks/benchmarks/bench_reduce.py              |  14
 numpy/core/setup.py                                |   4
 numpy/core/src/multiarray/argfunc.dispatch.c.src   | 392
 numpy/core/src/multiarray/arraytypes.c.src         | 127
 numpy/core/src/multiarray/arraytypes.h.src         |  21  (renamed from numpy/core/src/multiarray/arraytypes.h)
 numpy/core/tests/test_multiarray.py                | 155
 6 files changed, 599 insertions(+), 114 deletions(-)
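
Before the diff itself, a sketch of the idea behind all of the new kernels. Each one runs a vector body that keeps per-lane running extrema alongside the index at which each was found, then reduces across lanes, preferring the lowest index on ties. The following plain C model is illustrative only (a hypothetical 4-lane width stands in for npyv_nlanes_f32; it is not the committed code):

    #include <stddef.h>

    #define LANES 4  /* stand-in for npyv_nlanes_f32 */

    /* requires len >= 1 */
    static size_t model_argmax(const float *ip, size_t len)
    {
        float  acc[LANES];   /* per-lane running maximum */
        size_t idx[LANES];   /* index where each lane found it */
        for (int l = 0; l < LANES; ++l) {
            acc[l] = ip[0];
            idx[l] = 0;
        }
        size_t i = 0;
        /* vector body: each lane scans its own strided slice */
        for (size_t n = len - (len % LANES); i < n; i += LANES) {
            for (int l = 0; l < LANES; ++l) {
                if (ip[i + l] > acc[l]) {  /* strict '>' keeps the lowest index */
                    acc[l] = ip[i + l];
                    idx[l] = i + l;
                }
            }
        }
        /* lane reduction: highest value wins, lowest index breaks ties */
        float  best = acc[0];
        size_t best_idx = idx[0];
        for (int l = 1; l < LANES; ++l) {
            if (acc[l] > best || (acc[l] == best && idx[l] < best_idx)) {
                best = acc[l];
                best_idx = idx[l];
            }
        }
        /* scalar tail for the remaining len % LANES elements */
        for (; i < len; ++i) {
            if (ip[i] > best) {
                best = ip[i];
                best_idx = i;
            }
        }
        return best_idx;
    }

The committed kernels additionally unroll by four vectors, split 8/16-bit inputs into blocks so lane indices fit their lane width, and give NaN special treatment; those pieces are sketched at the relevant hunks below.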
diff --git a/benchmarks/benchmarks/bench_reduce.py b/benchmarks/benchmarks/bench_reduce.py
index 81316c492..ca07bd180 100644
--- a/benchmarks/benchmarks/bench_reduce.py
+++ b/benchmarks/benchmarks/bench_reduce.py
@@ -73,7 +73,8 @@ class FMinMax(Benchmark):
np.fmax.reduce(self.d)
class ArgMax(Benchmark):
- params = [np.float32, np.float64, bool]
+ params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32,
+ np.int64, np.uint64, np.float32, np.float64, bool]
param_names = ['dtype']
def setup(self, dtype):
@@ -82,6 +83,17 @@ class ArgMax(Benchmark):
def time_argmax(self, dtype):
np.argmax(self.d)
+class ArgMin(Benchmark):
+ params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32,
+ np.int64, np.uint64, np.float32, np.float64, bool]
+ param_names = ['dtype']
+
+ def setup(self, dtype):
+ self.d = np.ones(200000, dtype=dtype)
+
+ def time_argmin(self, dtype):
+ np.argmin(self.d)
+
class SmallReduction(Benchmark):
def setup(self):
self.d = np.ones(100, dtype=np.float32)
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 46b77d477..219bca57c 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -830,7 +830,7 @@ def configuration(parent_package='',top_path=None):
multiarray_deps = [
join('src', 'multiarray', 'abstractdtypes.h'),
join('src', 'multiarray', 'arrayobject.h'),
- join('src', 'multiarray', 'arraytypes.h'),
+ join('src', 'multiarray', 'arraytypes.h.src'),
join('src', 'multiarray', 'arrayfunction_override.h'),
join('src', 'multiarray', 'array_coercion.h'),
join('src', 'multiarray', 'array_method.h'),
@@ -892,7 +892,9 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'abstractdtypes.c'),
join('src', 'multiarray', 'alloc.c'),
join('src', 'multiarray', 'arrayobject.c'),
+ join('src', 'multiarray', 'arraytypes.h.src'),
join('src', 'multiarray', 'arraytypes.c.src'),
+ join('src', 'multiarray', 'argfunc.dispatch.c.src'),
join('src', 'multiarray', 'array_coercion.c'),
join('src', 'multiarray', 'array_method.c'),
join('src', 'multiarray', 'array_assign_scalar.c'),
diff --git a/numpy/core/src/multiarray/argfunc.dispatch.c.src b/numpy/core/src/multiarray/argfunc.dispatch.c.src
new file mode 100644
index 000000000..39222bc9a
--- /dev/null
+++ b/numpy/core/src/multiarray/argfunc.dispatch.c.src
@@ -0,0 +1,392 @@
+/* -*- c -*- */
+/*@targets
+ ** $maxopt baseline
+ ** sse2 sse42 xop avx2 avx512_skx
+ ** vsx2
+ ** neon asimd
+ **/
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "numpy/npy_math.h"
+
+#include "arraytypes.h"
+
+#define MIN(a,b) (((a)<(b))?(a):(b))
+
+#if NPY_SIMD
+#if NPY_SIMD > 512 || NPY_SIMD < 0
+ #error "the following 8/16-bit argmax/argmin kernels aren't applicable for larger SIMD"
+ // TODO: add a special loop for larger SIMD widths;
+ // i.e. skipping the x4 unroll should be numerically safe up to a 2048-bit SIMD width,
+ // or the indices could be expanded to 32|64-bit vectors (slower).
+#endif
+/**begin repeat
+ * #sfx = u8, s8, u16, s16#
+ * #usfx = u8, u8, u16, u16#
+ * #bsfx = b8, b8, b16, b16#
+ * #idx_max = NPY_MAX_UINT8*2, NPY_MAX_UINT16*2#
+ */
+/**begin repeat1
+ * #intrin = cmpgt, cmplt#
+ * #func = argmax, argmin#
+ * #op = >, <#
+ */
+static inline npy_intp
+simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len)
+{
+ npyv_lanetype_@sfx@ s_acc = *ip;
+ npy_intp ret_idx = 0, i = 0;
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep*4;
+ npyv_lanetype_@usfx@ d_vindices[npyv_nlanes_@sfx@*4];
+ for (int vi = 0; vi < wstep; ++vi) {
+ d_vindices[vi] = vi;
+ }
+ const npyv_@usfx@ vindices_0 = npyv_load_@usfx@(d_vindices);
+ const npyv_@usfx@ vindices_1 = npyv_load_@usfx@(d_vindices + vstep);
+ const npyv_@usfx@ vindices_2 = npyv_load_@usfx@(d_vindices + vstep*2);
+ const npyv_@usfx@ vindices_3 = npyv_load_@usfx@(d_vindices + vstep*3);
+
+ const npy_intp max_block = @idx_max@*wstep & -wstep;
+ npy_intp len0 = len & -wstep;
+ while (i < len0) {
+ npyv_@sfx@ acc = npyv_setall_@sfx@(s_acc);
+ npyv_@usfx@ acc_indices = npyv_zero_@usfx@();
+ npyv_@usfx@ acc_indices_scale = npyv_zero_@usfx@();
+
+ npy_intp n = i + MIN(len0 - i, max_block);
+ npy_intp ik = i, i2 = 0;
+ for (; i < n; i += wstep, ++i2) {
+ npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i2);
+ npyv_@sfx@ a = npyv_load_@sfx@(ip + i);
+ npyv_@sfx@ b = npyv_load_@sfx@(ip + i + vstep);
+ npyv_@sfx@ c = npyv_load_@sfx@(ip + i + vstep*2);
+ npyv_@sfx@ d = npyv_load_@sfx@(ip + i + vstep*3);
+
+ // compare in reversed order so that, on equal values, the lower index wins
+ npyv_@bsfx@ m_ba = npyv_@intrin@_@sfx@(b, a);
+ npyv_@bsfx@ m_dc = npyv_@intrin@_@sfx@(d, c);
+ npyv_@sfx@ x_ba = npyv_select_@sfx@(m_ba, b, a);
+ npyv_@sfx@ x_dc = npyv_select_@sfx@(m_dc, d, c);
+ npyv_@bsfx@ m_dcba = npyv_@intrin@_@sfx@(x_dc, x_ba);
+ npyv_@sfx@ x_dcba = npyv_select_@sfx@(m_dcba, x_dc, x_ba);
+
+ npyv_@usfx@ idx_ba = npyv_select_@usfx@(m_ba, vindices_1, vindices_0);
+ npyv_@usfx@ idx_dc = npyv_select_@usfx@(m_dc, vindices_3, vindices_2);
+ npyv_@usfx@ idx_dcba = npyv_select_@usfx@(m_dcba, idx_dc, idx_ba);
+ npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(x_dcba, acc);
+ acc = npyv_select_@sfx@(m_acc, x_dcba, acc);
+ acc_indices = npyv_select_@usfx@(m_acc, idx_dcba, acc_indices);
+ acc_indices_scale = npyv_select_@usfx@(m_acc, vi, acc_indices_scale);
+ }
+ // reduce
+ npyv_lanetype_@sfx@ dacc[npyv_nlanes_@sfx@];
+ npyv_lanetype_@usfx@ dacc_i[npyv_nlanes_@sfx@];
+ npyv_lanetype_@usfx@ dacc_s[npyv_nlanes_@sfx@];
+ npyv_store_@sfx@(dacc, acc);
+ npyv_store_@usfx@(dacc_i, acc_indices);
+ npyv_store_@usfx@(dacc_s, acc_indices_scale);
+
+ for (int vi = 0; vi < vstep; ++vi) {
+ if (dacc[vi] @op@ s_acc) {
+ s_acc = dacc[vi];
+ ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+ }
+ }
+ // get the lowest index in case of matched values
+ for (int vi = 0; vi < vstep; ++vi) {
+ npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+ if (s_acc == dacc[vi] && ret_idx > idx) {
+ ret_idx = idx;
+ }
+ }
+ }
+ for (; i < len; ++i) {
+ npyv_lanetype_@sfx@ a = ip[i];
+ if (a @op@ s_acc) {
+ s_acc = a;
+ ret_idx = i;
+ }
+ }
+ return ret_idx;
+}
+/**end repeat1**/
+/**end repeat**/
+#endif
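
The subtlety in the kernel above is that a u8 lane can only hold an index up to 255, so the array is cut into outer blocks of at most @idx_max@*wstep elements; inside a block each winning position is stored as a block-relative (scale, lane-offset) pair and reconstructed as ik + scale*wstep + lane. A worked example, assuming a 128-bit SIMD width (vstep = 16, wstep = 64 are illustrative constants; the real values come from npyv_nlanes_u8):

    #include <assert.h>

    int main(void)
    {
        const long vstep = 16, wstep = vstep * 4;       /* 64 elements per iteration */
        const long max_block = (255 * wstep) & -wstep;  /* 16320: scale stays <= 254 */

        /* suppose the maximum lives at global position 20000 */
        long target = 20000;
        long ik  = (target / max_block) * max_block;    /* outer block start: 16320 */
        long off = target - ik;                         /* 3680, block-relative */
        unsigned char scale = (unsigned char)(off / wstep);  /* 57, fits a u8 lane */
        unsigned char lane  = (unsigned char)(off % wstep);  /* 32, fits a u8 lane */

        /* the kernel's reduction formula recovers the global index exactly */
        assert(ik + (long)scale * wstep + lane == target);
        return 0;
    }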
+
+/**begin repeat
+ * #sfx = u32, s32, u64, s64, f32, f64#
+ * #usfx = u32, u32, u64, u64, u32, u64#
+ * #bsfx = b32, b32, b64, b64, b32, b64#
+ * #is_fp = 0*4, 1*2#
+ * #is_idx32 = 1*2, 0*2, 1, 0#
+ * #chk_simd = NPY_SIMD*5, NPY_SIMD_F64#
+ */
+#if @chk_simd@
+/**begin repeat1
+ * #intrin = cmpgt, cmplt#
+ * #func = argmax, argmin#
+ * #op = >, <#
+ * #iop = <, >#
+ */
+static inline npy_intp
+simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len)
+{
+ npyv_lanetype_@sfx@ s_acc = *ip;
+ npy_intp ret_idx = 0, i = 0;
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep*4;
+ // a scalar loop performs better for small arrays
+ if (len < wstep) {
+ goto scalar_loop;
+ }
+ npy_intp len0 = len;
+ // guard against wraparound vector addition for 32-bit indices
+ // in case the array is larger than 16 GB (more than NPY_MAX_UINT32 elements)
+#if @is_idx32@
+ if (len0 > NPY_MAX_UINT32) {
+ len0 = NPY_MAX_UINT32;
+ }
+#endif
+ // set up the constant lane indices 0..wstep-1
+ npyv_lanetype_@usfx@ d_vindices[npyv_nlanes_@sfx@*4];
+ for (int vi = 0; vi < wstep; ++vi) {
+ d_vindices[vi] = vi;
+ }
+ const npyv_@usfx@ vindices_0 = npyv_load_@usfx@(d_vindices);
+ const npyv_@usfx@ vindices_1 = npyv_load_@usfx@(d_vindices + vstep);
+ const npyv_@usfx@ vindices_2 = npyv_load_@usfx@(d_vindices + vstep*2);
+ const npyv_@usfx@ vindices_3 = npyv_load_@usfx@(d_vindices + vstep*3);
+ // initialize the vector accumulators for the extreme values and their indices
+ npyv_@usfx@ acc_indices = npyv_zero_@usfx@();
+ npyv_@sfx@ acc = npyv_setall_@sfx@(s_acc);
+ for (npy_intp n = len0 & -wstep; i < n; i += wstep) {
+ npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i);
+ npyv_@sfx@ a = npyv_load_@sfx@(ip + i);
+ npyv_@sfx@ b = npyv_load_@sfx@(ip + i + vstep);
+ npyv_@sfx@ c = npyv_load_@sfx@(ip + i + vstep*2);
+ npyv_@sfx@ d = npyv_load_@sfx@(ip + i + vstep*3);
+
+ // compare in reversed order so that, on equal values, the lower index wins
+ npyv_@bsfx@ m_ba = npyv_@intrin@_@sfx@(b, a);
+ npyv_@bsfx@ m_dc = npyv_@intrin@_@sfx@(d, c);
+ npyv_@sfx@ x_ba = npyv_select_@sfx@(m_ba, b, a);
+ npyv_@sfx@ x_dc = npyv_select_@sfx@(m_dc, d, c);
+ npyv_@bsfx@ m_dcba = npyv_@intrin@_@sfx@(x_dc, x_ba);
+ npyv_@sfx@ x_dcba = npyv_select_@sfx@(m_dcba, x_dc, x_ba);
+
+ npyv_@usfx@ idx_ba = npyv_select_@usfx@(m_ba, vindices_1, vindices_0);
+ npyv_@usfx@ idx_dc = npyv_select_@usfx@(m_dc, vindices_3, vindices_2);
+ npyv_@usfx@ idx_dcba = npyv_select_@usfx@(m_dcba, idx_dc, idx_ba);
+ npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(x_dcba, acc);
+ acc = npyv_select_@sfx@(m_acc, x_dcba, acc);
+ acc_indices = npyv_select_@usfx@(m_acc, npyv_add_@usfx@(vi, idx_dcba), acc_indices);
+
+ #if @is_fp@
+ npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a);
+ npyv_@bsfx@ nnan_b = npyv_notnan_@sfx@(b);
+ npyv_@bsfx@ nnan_c = npyv_notnan_@sfx@(c);
+ npyv_@bsfx@ nnan_d = npyv_notnan_@sfx@(d);
+ npyv_@bsfx@ nnan_ab = npyv_and_@bsfx@(nnan_a, nnan_b);
+ npyv_@bsfx@ nnan_cd = npyv_and_@bsfx@(nnan_c, nnan_d);
+ npy_uint64 nnan = npyv_tobits_@bsfx@(npyv_and_@bsfx@(nnan_ab, nnan_cd));
+ if (nnan != ((1LL << vstep) - 1)) {
+ npy_uint64 nnan_4[4];
+ nnan_4[0] = npyv_tobits_@bsfx@(nnan_a);
+ nnan_4[1] = npyv_tobits_@bsfx@(nnan_b);
+ nnan_4[2] = npyv_tobits_@bsfx@(nnan_c);
+ nnan_4[3] = npyv_tobits_@bsfx@(nnan_d);
+ for (int ni = 0; ni < 4; ++ni) {
+ for (int vi = 0; vi < vstep; ++vi) {
+ if (!((nnan_4[ni] >> vi) & 1)) {
+ return i + ni*vstep + vi;
+ }
+ }
+ }
+ }
+ #endif
+ }
+ for (npy_intp n = len0 & -vstep; i < n; i += vstep) {
+ npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i);
+ npyv_@sfx@ a = npyv_load_@sfx@(ip + i);
+ npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(a, acc);
+ acc = npyv_select_@sfx@(m_acc, a, acc);
+ acc_indices = npyv_select_@usfx@(m_acc, npyv_add_@usfx@(vi, vindices_0), acc_indices);
+ #if @is_fp@
+ npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a);
+ npy_uint64 nnan = npyv_tobits_@bsfx@(nnan_a);
+ if (nnan != ((1LL << vstep) - 1)) {
+ for (int vi = 0; vi < vstep; ++vi) {
+ if (!((nnan >> vi) & 1)) {
+ return i + vi;
+ }
+ }
+ }
+ #endif
+ }
+
+ // reduce
+ npyv_lanetype_@sfx@ dacc[npyv_nlanes_@sfx@];
+ npyv_lanetype_@usfx@ dacc_i[npyv_nlanes_@sfx@];
+ npyv_store_@usfx@(dacc_i, acc_indices);
+ npyv_store_@sfx@(dacc, acc);
+
+ s_acc = dacc[0];
+ ret_idx = dacc_i[0];
+ for (int vi = 1; vi < vstep; ++vi) {
+ if (dacc[vi] @op@ s_acc) {
+ s_acc = dacc[vi];
+ ret_idx = (npy_intp)dacc_i[vi];
+ }
+ }
+ // get the lowest index in case of matched values
+ for (int vi = 0; vi < vstep; ++vi) {
+ if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) {
+ ret_idx = dacc_i[vi];
+ }
+ }
+scalar_loop:
+ for (; i < len; ++i) {
+ npyv_lanetype_@sfx@ a = ip[i];
+ #if @is_fp@
+ if (!(a @iop@= s_acc)) { // negated, for correct nan handling
+ #else
+ if (a @op@ s_acc) {
+ #endif
+ s_acc = a;
+ ret_idx = i;
+ #if @is_fp@
+ if (npy_isnan(s_acc)) {
+ // nan encountered, it's maximal|minimal
+ return ret_idx;
+ }
+ #endif
+ }
+ }
+ return ret_idx;
+}
+/**end repeat1**/
+#endif // chk_simd
+/**end repeat**/
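
For the float kernels, np.argmax/np.argmin must return the index of the first NaN when one is present. The vector body handles this by testing npyv_notnan and bailing out to the first NaN lane; the scalar tail gets the same behaviour from a single negated comparison, since every IEEE comparison involving NaN is false. A minimal standalone illustration of that tail logic (a sketch, not the committed code):

    #include <math.h>
    #include <stddef.h>

    /* requires len >= 1 */
    static size_t argmax_nan_aware(const double *ip, size_t len)
    {
        double mp = ip[0];
        size_t idx = 0;
        if (isnan(mp)) {
            return 0;  /* nan encountered, it's maximal */
        }
        for (size_t i = 1; i < len; ++i) {
            /* for non-NaN this is plain 'ip[i] > mp'; against NaN the <=
             * compare is false, so the negation is true and the NaN is taken */
            if (!(ip[i] <= mp)) {
                mp = ip[i];
                idx = i;
                if (isnan(mp)) {
                    return idx;  /* first NaN: nothing later can beat it */
                }
            }
        }
        return idx;
    }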
+
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * FLOAT, DOUBLE, LONGDOUBLE#
+ *
+ * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * FLOAT, DOUBLE, LONGDOUBLE#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_float, npy_double, npy_longdouble#
+ *
+ * #is_fp = 0*10, 1*3#
+ * #is_unsigned = 1*5, 0*5, 0*3#
+ */
+#undef TO_SIMD_SFX
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@
+ #if @is_fp@
+ #define TO_SIMD_SFX(X) X##_f@len@
+ #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
+ #undef TO_SIMD_SFX
+ #endif
+ #elif @is_unsigned@
+ #define TO_SIMD_SFX(X) X##_u@len@
+ #else
+ #define TO_SIMD_SFX(X) X##_s@len@
+ #endif
+/**end repeat1**/
+#endif
+
+/**begin repeat1
+ * #func = argmax, argmin#
+ * #op = >, <#
+ * #iop = <, >#
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
+(@type@ *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+#if @is_fp@
+ if (npy_isnan(*ip)) {
+ // nan encountered; it's maximal|minimal
+ *mindx = 0;
+ return 0;
+ }
+#endif
+#ifdef TO_SIMD_SFX
+ *mindx = TO_SIMD_SFX(simd_@func@)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+#else
+ @type@ mp = *ip;
+ *mindx = 0;
+ npy_intp i = 1;
+
+ for (; i < n; ++i) {
+ @type@ a = ip[i];
+ #if @is_fp@
+ if (!(a @iop@= mp)) { // negated, for correct nan handling
+ #else
+ if (a @op@ mp) {
+ #endif
+ mp = a;
+ *mindx = i;
+ #if @is_fp@
+ if (npy_isnan(mp)) {
+ // nan encountered, it's maximal|minimal
+ break;
+ }
+ #endif
+ }
+ }
+#endif // TO_SIMD_SFX
+ return 0;
+}
+/**end repeat1**/
+/**end repeat**/
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax)
+(npy_bool *ip, npy_intp len, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+ npy_intp i = 0;
+#if NPY_SIMD
+ const npyv_u8 zero = npyv_zero_u8();
+ const int vstep = npyv_nlanes_u8;
+ const int wstep = vstep * 4;
+ for (npy_intp n = len & -wstep; i < n; i += wstep) {
+ npyv_u8 a = npyv_load_u8(ip + i + vstep*0);
+ npyv_u8 b = npyv_load_u8(ip + i + vstep*1);
+ npyv_u8 c = npyv_load_u8(ip + i + vstep*2);
+ npyv_u8 d = npyv_load_u8(ip + i + vstep*3);
+ npyv_b8 m_a = npyv_cmpeq_u8(a, zero);
+ npyv_b8 m_b = npyv_cmpeq_u8(b, zero);
+ npyv_b8 m_c = npyv_cmpeq_u8(c, zero);
+ npyv_b8 m_d = npyv_cmpeq_u8(d, zero);
+ npyv_b8 m_ab = npyv_and_b8(m_a, m_b);
+ npyv_b8 m_cd = npyv_and_b8(m_c, m_d);
+ npy_uint64 m = npyv_tobits_b8(npyv_and_b8(m_ab, m_cd));
+ #if NPY_SIMD == 512
+ if (m != NPY_MAX_UINT64) {
+ #else
+ if ((npy_int64)m != ((1LL << vstep) - 1)) {
+ #endif
+ break;
+ }
+ }
+#endif // NPY_SIMD
+ for (; i < len; ++i) {
+ if (ip[i]) {
+ *mindx = i;
+ return 0;
+ }
+ }
+ *mindx = 0;
+ return 0;
+}
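
BOOL_argmax reduces to "find the first non-zero byte": the vector loop above only exists to skip over long all-false prefixes, comparing four vectors against zero per iteration and leaving as soon as any byte is set. A portable word-at-a-time model of the same early-exit scan (sketch only; the committed code uses the npyv_* intrinsics):

    #include <stdint.h>
    #include <string.h>
    #include <stddef.h>

    static size_t first_true(const unsigned char *ip, size_t len)
    {
        size_t i = 0;
        /* skip 8 bytes at a time while everything is false */
        for (; i + 8 <= len; i += 8) {
            uint64_t w;
            memcpy(&w, ip + i, 8);  /* aliasing-safe unaligned load */
            if (w != 0) {
                break;  /* some byte in this word is set */
            }
        }
        /* byte loop pinpoints the exact index */
        for (; i < len; ++i) {
            if (ip[i]) {
                return i;
            }
        }
        return 0;  /* all false: index 0, matching np.argmax semantics */
    }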
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 71401c60e..1dc6c9bb1 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -27,12 +27,6 @@
#include "arrayobject.h"
#include "alloc.h"
#include "typeinfo.h"
-#if defined(__ARM_NEON__) || defined (__ARM_NEON)
-#include <arm_neon.h>
-#endif
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-#include <emmintrin.h>
-#endif
#include "npy_longdouble.h"
#include "numpyos.h"
@@ -42,7 +36,7 @@
#include "npy_cblas.h"
#include "npy_buffer.h"
-
+#include "arraytypes.h"
/*
* Define a stack allocated dummy array with only the minimum information set:
* 1. The descr, the main field interesting here.
@@ -3176,77 +3170,21 @@ finish:
** ARGFUNC **
*****************************************************************************
*/
-#if defined(__ARM_NEON__) || defined (__ARM_NEON)
- int32_t _mm_movemask_epi8_neon(uint8x16_t input)
- {
- int8x8_t m0 = vcreate_s8(0x0706050403020100ULL);
- uint8x16_t v0 = vshlq_u8(vshrq_n_u8(input, 7), vcombine_s8(m0, m0));
- uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
- return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
- }
-#endif
-#define _LESS_THAN_OR_EQUAL(a,b) ((a) <= (b))
-static int
-BOOL_argmax(npy_bool *ip, npy_intp n, npy_intp *max_ind,
- PyArrayObject *NPY_UNUSED(aip))
-
-{
- npy_intp i = 0;
- /* memcmp like logical_and on i386 is maybe slower for small arrays */
-#ifdef NPY_HAVE_SSE2_INTRINSICS
- const __m128i zero = _mm_setzero_si128();
- for (; i < n - (n % 32); i+=32) {
- __m128i d1 = _mm_loadu_si128((__m128i*)&ip[i]);
- __m128i d2 = _mm_loadu_si128((__m128i*)&ip[i + 16]);
- d1 = _mm_cmpeq_epi8(d1, zero);
- d2 = _mm_cmpeq_epi8(d2, zero);
- if (_mm_movemask_epi8(_mm_min_epu8(d1, d2)) != 0xFFFF) {
- break;
- }
- }
-#else
- #if defined(__ARM_NEON__) || defined (__ARM_NEON)
- uint8x16_t zero = vdupq_n_u8(0);
- for(; i < n - (n % 32); i+=32) {
- uint8x16_t d1 = vld1q_u8((uint8_t *)&ip[i]);
- uint8x16_t d2 = vld1q_u8((uint8_t *)&ip[i + 16]);
- d1 = vceqq_u8(d1, zero);
- d2 = vceqq_u8(d2, zero);
- if(_mm_movemask_epi8_neon(vminq_u8(d1, d2)) != 0xFFFF) {
- break;
- }
- }
- #endif
-#endif
- for (; i < n; i++) {
- if (ip[i]) {
- *max_ind = i;
- return 0;
- }
- }
- *max_ind = 0;
- return 0;
-}
+#define _LESS_THAN_OR_EQUAL(a,b) ((a) <= (b))
/**begin repeat
*
- * #fname = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
- * LONG, ULONG, LONGLONG, ULONGLONG,
- * HALF, FLOAT, DOUBLE, LONGDOUBLE,
- * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * #fname = HALF, CFLOAT, CDOUBLE, CLONGDOUBLE,
* DATETIME, TIMEDELTA#
- * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
- * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
- * npy_half, npy_float, npy_double, npy_longdouble,
- * npy_float, npy_double, npy_longdouble,
+ * #type = npy_half, npy_float, npy_double, npy_longdouble,
* npy_datetime, npy_timedelta#
- * #isfloat = 0*10, 1*7, 0*2#
- * #isnan = nop*10, npy_half_isnan, npy_isnan*6, nop*2#
- * #le = _LESS_THAN_OR_EQUAL*10, npy_half_le, _LESS_THAN_OR_EQUAL*8#
- * #iscomplex = 0*14, 1*3, 0*2#
- * #incr = ip++*14, ip+=2*3, ip++*2#
- * #isdatetime = 0*17, 1*2#
+ * #isfloat = 1*4, 0*2#
+ * #isnan = npy_half_isnan, npy_isnan*3, nop*2#
+ * #le = npy_half_le, _LESS_THAN_OR_EQUAL*5#
+ * #iscomplex = 0, 1*3, 0*2#
+ * #incr = ip++, ip+=2*3, ip++*2#
+ * #isdatetime = 0*4, 1*2#
*/
static int
@fname@_argmax(@type@ *ip, npy_intp n, npy_intp *max_ind,
@@ -3337,22 +3275,16 @@ BOOL_argmin(npy_bool *ip, npy_intp n, npy_intp *min_ind,
/**begin repeat
*
- * #fname = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
- * LONG, ULONG, LONGLONG, ULONGLONG,
- * HALF, FLOAT, DOUBLE, LONGDOUBLE,
- * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * #fname = HALF, CFLOAT, CDOUBLE, CLONGDOUBLE,
* DATETIME, TIMEDELTA#
- * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
- * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
- * npy_half, npy_float, npy_double, npy_longdouble,
- * npy_float, npy_double, npy_longdouble,
+ * #type = npy_half, npy_float, npy_double, npy_longdouble,
* npy_datetime, npy_timedelta#
- * #isfloat = 0*10, 1*7, 0*2#
- * #isnan = nop*10, npy_half_isnan, npy_isnan*6, nop*2#
- * #le = _LESS_THAN_OR_EQUAL*10, npy_half_le, _LESS_THAN_OR_EQUAL*8#
- * #iscomplex = 0*14, 1*3, 0*2#
- * #incr = ip++*14, ip+=2*3, ip++*2#
- * #isdatetime = 0*17, 1*2#
+ * #isfloat = 1*4, 0*2#
+ * #isnan = npy_half_isnan, npy_isnan*3, nop*2#
+ * #le = npy_half_le, _LESS_THAN_OR_EQUAL*5#
+ * #iscomplex = 0, 1*3, 0*2#
+ * #incr = ip++, ip+=2*3, ip++*2#
+ * #isdatetime = 0*4, 1*2#
*/
static int
@fname@_argmin(@type@ *ip, npy_intp n, npy_intp *min_ind,
@@ -3409,7 +3341,7 @@ static int
*min_ind = i;
break;
}
-#endif
+#endif
if (!@le@(mp, *ip)) { /* negated, for correct nan handling */
mp = *ip;
*min_ind = i;
@@ -4494,6 +4426,27 @@ set_typeinfo(PyObject *dict)
PyArray_Descr *dtype;
PyObject *cobj, *key;
+ // SIMD runtime dispatching
+ #ifndef NPY_DISABLE_OPTIMIZATION
+ #include "argfunc.dispatch.h"
+ #endif
+ /**begin repeat
+ * #FROM = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * FLOAT, DOUBLE, LONGDOUBLE#
+ *
+ * #NAME = Byte, UByte, Short, UShort, Int, UInt,
+ * Long, ULong, LongLong, ULongLong,
+ * Float, Double, LongDouble#
+ */
+ /**begin repeat1
+ * #func = argmax, argmin#
+ */
+ NPY_CPU_DISPATCH_CALL_XB(_Py@NAME@_ArrFuncs.@func@ = (PyArray_ArgFunc*)@FROM@_@func@);
+ /**end repeat1**/
+ /**end repeat**/
+ NPY_CPU_DISPATCH_CALL_XB(_PyBool_ArrFuncs.argmax = (PyArray_ArgFunc*)BOOL_argmax);
+
/*
* Override the base class for all types, eventually all of this logic
* should be defined on the class and inherited to the scalar.
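
The hunk above is where the new kernels are wired in: at module set-up, NPY_CPU_DISPATCH_CALL_XB overwrites the argmax/argmin slots of each dtype's ArrFuncs table with the selected compiled variant. A conceptual model of that slot patching (hypothetical names and signatures, not NumPy's internals):

    /* each dtype carries a table of function pointers; registration simply
     * stores the chosen kernel into the right slots, once, at startup */
    typedef int (*arg_func)(void *ip, long n, long *extreme_idx, void *arr);

    struct arr_funcs {
        arg_func argmax;
        arg_func argmin;
    };

    static void register_argfuncs(struct arr_funcs *slots,
                                  arg_func max_kernel, arg_func min_kernel)
    {
        slots->argmax = max_kernel;  /* e.g. the AVX2 build of FLOAT_argmax */
        slots->argmin = min_kernel;
    }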
diff --git a/numpy/core/src/multiarray/arraytypes.h b/numpy/core/src/multiarray/arraytypes.h.src
index b3a13b297..4c7487189 100644
--- a/numpy/core/src/multiarray/arraytypes.h
+++ b/numpy/core/src/multiarray/arraytypes.h.src
@@ -28,4 +28,25 @@ small_correlate(const char * d_, npy_intp dstride,
npy_intp nk, enum NPY_TYPES ktype,
char * out_, npy_intp ostride);
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "argfunc.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * FLOAT, DOUBLE, LONGDOUBLE#
+ * #type = byte, ubyte, short, ushort, int, uint,
+ * long, ulong, longlong, ulonglong,
+ * float, double, longdouble#
+ */
+/**begin repeat1
+ * #func = argmax, argmin#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int @TYPE@_@func@,
+ (npy_@type@ *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+/**end repeat1**/
+/**end repeat**/
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int BOOL_argmax,
+ (npy_bool *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
#endif /* NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ */
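
NPY_CPU_DISPATCH_DECLARE above expands to one prototype per enabled target (the baseline plus each entry of the @targets block), and the dispatcher picks among them at runtime from the detected CPU features. A self-contained sketch of that selection pattern using GCC's feature probes (illustrative only; NumPy's macros generate equivalent machinery, and the names here are hypothetical):

    typedef long (*argmax_f32_fn)(const float *ip, long len);

    /* per-target builds of the same kernel */
    long argmax_f32_avx512(const float *ip, long len);
    long argmax_f32_avx2(const float *ip, long len);
    long argmax_f32_baseline(const float *ip, long len);

    static argmax_f32_fn resolve_argmax_f32(void)
    {
    #if defined(__GNUC__) && defined(__x86_64__)
        __builtin_cpu_init();
        if (__builtin_cpu_supports("avx512f")) {
            return argmax_f32_avx512;   /* widest kernel the CPU can run */
        }
        if (__builtin_cpu_supports("avx2")) {
            return argmax_f32_avx2;
        }
    #endif
        return argmax_f32_baseline;     /* always-safe fallback */
    }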
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 4e2362547..137fb02ec 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -4190,7 +4190,8 @@ class TestArgmaxArgminCommon:
sizes = [(), (3,), (3, 2), (2, 3),
(3, 3), (2, 3, 4), (4, 3, 2),
(1, 2, 3, 4), (2, 3, 4, 1),
- (3, 4, 1, 2), (4, 1, 2, 3)]
+ (3, 4, 1, 2), (4, 1, 2, 3),
+ (64,), (128,), (256,)]
@pytest.mark.parametrize("size, axis", itertools.chain(*[[(size, axis)
for axis in list(range(-len(size), len(size))) + [None]]
@@ -4304,9 +4305,9 @@ class TestArgmaxArgminCommon:
@pytest.mark.parametrize('ndim', [0, 1])
@pytest.mark.parametrize('method', ['argmax', 'argmin'])
def test_ret_is_out(self, ndim, method):
- a = np.ones((4,) + (3,)*ndim)
+ a = np.ones((4,) + (256,)*ndim)
arg_method = getattr(a, method)
- out = np.empty((3,)*ndim, dtype=np.intp)
+ out = np.empty((256,)*ndim, dtype=np.intp)
ret = arg_method(axis=0, out=out)
assert ret is out
@@ -4357,12 +4358,44 @@ class TestArgmaxArgminCommon:
assert_equal(arg_method(), 1)
class TestArgmax:
-
- nan_arr = [
- ([0, 1, 2, 3, np.nan], 4),
- ([0, 1, 2, np.nan, 3], 3),
- ([np.nan, 0, 1, 2, 3], 0),
- ([np.nan, 0, np.nan, 2, 3], 0),
+ usg_data = [
+ ([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], 0),
+ ([3, 3, 3, 3, 2, 2, 2, 2], 0),
+ ([0, 1, 2, 3, 4, 5, 6, 7], 7),
+ ([7, 6, 5, 4, 3, 2, 1, 0], 0)
+ ]
+ sg_data = usg_data + [
+ ([1, 2, 3, 4, -4, -3, -2, -1], 3),
+ ([1, 2, 3, 4, -1, -2, -3, -4], 3)
+ ]
+ darr = [(np.array(d[0], dtype=t), d[1]) for d, t in (
+ itertools.product(usg_data, (
+ np.uint8, np.uint16, np.uint32, np.uint64
+ ))
+ )]
+ darr = darr + [(np.array(d[0], dtype=t), d[1]) for d, t in (
+ itertools.product(sg_data, (
+ np.int8, np.int16, np.int32, np.int64, np.float32, np.float64
+ ))
+ )]
+ darr = darr + [(np.array(d[0], dtype=t), d[1]) for d, t in (
+ itertools.product((
+ ([0, 1, 2, 3, np.nan], 4),
+ ([0, 1, 2, np.nan, 3], 3),
+ ([np.nan, 0, 1, 2, 3], 0),
+ ([np.nan, 0, np.nan, 2, 3], 0),
+ # To hit the tail of SIMD multi-level (x4, x1) inner loops
+ # on variant SIMD widths
+ ([1] * (2*5-1) + [np.nan], 2*5-1),
+ ([1] * (4*5-1) + [np.nan], 4*5-1),
+ ([1] * (8*5-1) + [np.nan], 8*5-1),
+ ([1] * (16*5-1) + [np.nan], 16*5-1),
+ ([1] * (32*5-1) + [np.nan], 32*5-1)
+ ), (
+ np.float32, np.float64
+ ))
+ )]
+ nan_arr = darr + [
([0, 1, 2, 3, complex(0, np.nan)], 4),
([0, 1, 2, 3, complex(np.nan, 0)], 4),
([0, 1, 2, complex(np.nan, 0), 3], 3),
@@ -4432,28 +4465,80 @@ class TestArgmax:
assert_equal(np.argmax(arr), pos, err_msg="%r" % arr)
assert_equal(arr[np.argmax(arr)], val, err_msg="%r" % arr)
+ # add padding to test SIMD loops
+ rarr = np.repeat(arr, 129)
+ rpos = pos * 129
+ assert_equal(np.argmax(rarr), rpos, err_msg="%r" % rarr)
+ assert_equal(rarr[np.argmax(rarr)], val, err_msg="%r" % rarr)
+
+ padd = np.repeat(np.min(arr), 513)
+ rarr = np.concatenate((arr, padd))
+ rpos = pos
+ assert_equal(np.argmax(rarr), rpos, err_msg="%r" % rarr)
+ assert_equal(rarr[np.argmax(rarr)], val, err_msg="%r" % rarr)
+
+
def test_maximum_signed_integers(self):
a = np.array([1, 2**7 - 1, -2**7], dtype=np.int8)
assert_equal(np.argmax(a), 1)
+ a = a.repeat(129)
+ assert_equal(np.argmax(a), 129)
a = np.array([1, 2**15 - 1, -2**15], dtype=np.int16)
assert_equal(np.argmax(a), 1)
+ a = a.repeat(129)
+ assert_equal(np.argmax(a), 129)
a = np.array([1, 2**31 - 1, -2**31], dtype=np.int32)
assert_equal(np.argmax(a), 1)
+ a = a.repeat(129)
+ assert_equal(np.argmax(a), 129)
a = np.array([1, 2**63 - 1, -2**63], dtype=np.int64)
assert_equal(np.argmax(a), 1)
-
+ a = a.repeat(129)
+ assert_equal(np.argmax(a), 129)
class TestArgmin:
-
- nan_arr = [
- ([0, 1, 2, 3, np.nan], 4),
- ([0, 1, 2, np.nan, 3], 3),
- ([np.nan, 0, 1, 2, 3], 0),
- ([np.nan, 0, np.nan, 2, 3], 0),
+ usg_data = [
+ ([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], 8),
+ ([3, 3, 3, 3, 2, 2, 2, 2], 4),
+ ([0, 1, 2, 3, 4, 5, 6, 7], 0),
+ ([7, 6, 5, 4, 3, 2, 1, 0], 7)
+ ]
+ sg_data = usg_data + [
+ ([1, 2, 3, 4, -4, -3, -2, -1], 4),
+ ([1, 2, 3, 4, -1, -2, -3, -4], 7)
+ ]
+ darr = [(np.array(d[0], dtype=t), d[1]) for d, t in (
+ itertools.product(usg_data, (
+ np.uint8, np.uint16, np.uint32, np.uint64
+ ))
+ )]
+ darr = darr + [(np.array(d[0], dtype=t), d[1]) for d, t in (
+ itertools.product(sg_data, (
+ np.int8, np.int16, np.int32, np.int64, np.float32, np.float64
+ ))
+ )]
+ darr = darr + [(np.array(d[0], dtype=t), d[1]) for d, t in (
+ itertools.product((
+ ([0, 1, 2, 3, np.nan], 4),
+ ([0, 1, 2, np.nan, 3], 3),
+ ([np.nan, 0, 1, 2, 3], 0),
+ ([np.nan, 0, np.nan, 2, 3], 0),
+ # To hit the tail of SIMD multi-level (x4, x1) inner loops
+ # on variant SIMD widths
+ ([1] * (2*5-1) + [np.nan], 2*5-1),
+ ([1] * (4*5-1) + [np.nan], 4*5-1),
+ ([1] * (8*5-1) + [np.nan], 8*5-1),
+ ([1] * (16*5-1) + [np.nan], 16*5-1),
+ ([1] * (32*5-1) + [np.nan], 32*5-1)
+ ), (
+ np.float32, np.float64
+ ))
+ )]
+ nan_arr = darr + [
([0, 1, 2, 3, complex(0, np.nan)], 4),
([0, 1, 2, 3, complex(np.nan, 0)], 4),
([0, 1, 2, complex(np.nan, 0), 3], 3),
@@ -4512,30 +4597,50 @@ class TestArgmin:
([False, True, False, True, True], 0),
]
- def test_combinations(self):
- for arr, pos in self.nan_arr:
- with suppress_warnings() as sup:
- sup.filter(RuntimeWarning,
- "invalid value encountered in reduce")
- min_val = np.min(arr)
+ @pytest.mark.parametrize('data', nan_arr)
+ def test_combinations(self, data):
+ arr, pos = data
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning,
+ "invalid value encountered in reduce")
+ min_val = np.min(arr)
+
+ assert_equal(np.argmin(arr), pos, err_msg="%r" % arr)
+ assert_equal(arr[np.argmin(arr)], min_val, err_msg="%r" % arr)
- assert_equal(np.argmin(arr), pos, err_msg="%r" % arr)
- assert_equal(arr[np.argmin(arr)], min_val, err_msg="%r" % arr)
+ # add padding to test SIMD loops
+ rarr = np.repeat(arr, 129)
+ rpos = pos * 129
+ assert_equal(np.argmin(rarr), rpos, err_msg="%r" % rarr)
+ assert_equal(rarr[np.argmin(rarr)], min_val, err_msg="%r" % rarr)
+
+ padd = np.repeat(np.max(arr), 513)
+ rarr = np.concatenate((arr, padd))
+ rpos = pos
+ assert_equal(np.argmin(rarr), rpos, err_msg="%r" % rarr)
+ assert_equal(rarr[np.argmin(rarr)], min_val, err_msg="%r" % rarr)
def test_minimum_signed_integers(self):
a = np.array([1, -2**7, -2**7 + 1, 2**7 - 1], dtype=np.int8)
assert_equal(np.argmin(a), 1)
+ a = a.repeat(129)
+ assert_equal(np.argmin(a), 129)
a = np.array([1, -2**15, -2**15 + 1, 2**15 - 1], dtype=np.int16)
assert_equal(np.argmin(a), 1)
+ a = a.repeat(129)
+ assert_equal(np.argmin(a), 129)
a = np.array([1, -2**31, -2**31 + 1, 2**31 - 1], dtype=np.int32)
assert_equal(np.argmin(a), 1)
+ a = a.repeat(129)
+ assert_equal(np.argmin(a), 129)
a = np.array([1, -2**63, -2**63 + 1, 2**63 - 1], dtype=np.int64)
assert_equal(np.argmin(a), 1)
-
+ a = a.repeat(129)
+ assert_equal(np.argmin(a), 129)
class TestMinMax: