summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
authorSayed Adel <seiko@imavr.com>2020-07-08 09:27:13 +0200
committerSayed Adel <seiko@imavr.com>2020-10-27 11:46:58 +0000
commitcb3efe8e03b53dbab457a99be1a48384312abe16 (patch)
tree519f83bd1bda84f52fba88516561dd79e0f36826 /numpy
parentfcba5a6c901717110b9767b418df410d7c8c6e73 (diff)
downloadnumpy-cb3efe8e03b53dbab457a99be1a48384312abe16.tar.gz
ENH: Expose the NumPy C SIMD vectorization interface "NPYV" to Python
'_simd' is a new module to bring the NumPy C SIMD vectorization interface "NPYV" to Python. The module is designed to be extremely flexible so that it can accommodate any kind of intrinsics, and to generate a Python interface almost identical to the C interface. The main purpose of this module is to test NPYV intrinsics in Python, but it can still be used as an effective solution for designing SIMD kernels. Also add a new command-line argument `--simd-test` to control the targeted CPU features for the `_simd` module. Co-authored-by: Matti Picus <matti.picus@gmail.com> Co-authored-by: Eric Wieser <wieser.eric@gmail.com>
Diffstat (limited to 'numpy')
-rw-r--r--numpy/core/setup.py23
-rw-r--r--numpy/core/src/_simd/_simd.c63
-rw-r--r--numpy/core/src/_simd/_simd.dispatch.c.src351
-rw-r--r--numpy/core/src/_simd/_simd.h30
-rw-r--r--numpy/core/src/_simd/_simd_inc.h.src395
-rw-r--r--numpy/core/src/_simd/_simd_inc_arg.h108
-rw-r--r--numpy/core/src/_simd/_simd_inc_convert.h206
-rw-r--r--numpy/core/src/_simd/_simd_inc_data.h.src91
-rw-r--r--numpy/core/src/_simd/_simd_inc_easyintrin.h229
-rw-r--r--numpy/core/src/_simd/_simd_inc_vector.h187
-rw-r--r--numpy/distutils/command/build.py12
-rw-r--r--numpy/distutils/command/build_ext.py8
12 files changed, 1701 insertions, 2 deletions
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index b3e17baed..e9a9a4e46 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -626,6 +626,7 @@ def configuration(parent_package='',top_path=None):
config.add_include_dirs(join('src', 'multiarray'))
config.add_include_dirs(join('src', 'umath'))
config.add_include_dirs(join('src', 'npysort'))
+ config.add_include_dirs(join('src', '_simd'))
config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process
config.add_define_macros([("HAVE_NPY_CONFIG_H", "1")])
@@ -974,6 +975,28 @@ def configuration(parent_package='',top_path=None):
config.add_extension('_operand_flag_tests',
sources=[join('src', 'umath', '_operand_flag_tests.c.src')])
+ #######################################################################
+ # SIMD module #
+ #######################################################################
+
+ config.add_extension('_simd', sources=[
+ join('src', 'common', 'npy_cpu_features.c.src'),
+ join('src', '_simd', '_simd.c'),
+ join('src', '_simd', '_simd_inc.h.src'),
+ join('src', '_simd', '_simd_inc_data.h.src'),
+ join('src', '_simd', '_simd.dispatch.c.src'),
+ ], depends=[
+ join('src', 'common', 'npy_cpu_dispatch.h'),
+ join('src', 'common', 'simd', 'simd.h'),
+ join('src', '_simd', '_simd.h'),
+ join('src', '_simd', '_simd_inc.h.src'),
+ join('src', '_simd', '_simd_inc_data.h.src'),
+ join('src', '_simd', '_simd_inc_arg.h'),
+ join('src', '_simd', '_simd_inc_convert.h'),
+ join('src', '_simd', '_simd_inc_easyintrin.h'),
+ join('src', '_simd', '_simd_inc_vector.h'),
+ ])
+
config.add_subpackage('tests')
config.add_data_dir('tests/data')
config.add_data_dir('tests/examples')
diff --git a/numpy/core/src/_simd/_simd.c b/numpy/core/src/_simd/_simd.c
new file mode 100644
index 000000000..e5cb582b3
--- /dev/null
+++ b/numpy/core/src/_simd/_simd.c
@@ -0,0 +1,63 @@
+#include "_simd.h"
+
+/*
+ * Module init for '_simd'.
+ *
+ * Exposes a dict attribute 'targets' mapping each CPU dispatch target name
+ * to its NPYV sub-module, or to None when the running CPU lacks the
+ * required features. The baseline build is stored under key "baseline".
+ */
+PyMODINIT_FUNC PyInit__simd(void)
+{
+    static struct PyModuleDef defs = {
+        .m_base = PyModuleDef_HEAD_INIT,
+        .m_name = "_simd",
+        .m_size = -1
+    };
+    // detect runtime CPU features before any dispatch decision
+    if (npy_cpu_init() < 0) {
+        return NULL;
+    }
+    PyObject *m = PyModule_Create(&defs);
+    if (m == NULL) {
+        return NULL;
+    }
+    PyObject *targets = PyDict_New();
+    if (targets == NULL) {
+        goto err;
+    }
+    /* PyModule_AddObject steals the reference to 'targets' on success only,
+     * so an explicit DECREF is needed on the failure path. */
+    if (PyModule_AddObject(m, "targets", targets) < 0) {
+        Py_DECREF(targets);
+        goto err;
+    }
+    // add keys for non-supported optimizations with None value
+    /* NOTE: PyDict_SetItemString does NOT steal a reference; it increments
+     * the value's refcount. Drop our own reference unconditionally after
+     * the insert, otherwise each sub-module leaks one reference. */
+    #define ATTACH_MODULE(TESTED_FEATURES, TARGET_NAME, MAKE_MSVC_HAPPY) \
+    { \
+        PyObject *simd_mod; \
+        if (!TESTED_FEATURES) { \
+            Py_INCREF(Py_None); \
+            simd_mod = Py_None; \
+        } else { \
+            simd_mod = NPY_CAT(simd_create_module_, TARGET_NAME)(); \
+            if (simd_mod == NULL) { \
+                goto err; \
+            } \
+        } \
+        const char *target_name = NPY_TOSTRING(TARGET_NAME); \
+        int attach_err = PyDict_SetItemString(targets, target_name, simd_mod); \
+        Py_DECREF(simd_mod); \
+        if (attach_err < 0) { \
+            goto err; \
+        } \
+    }
+
+    #define ATTACH_BASELINE_MODULE(MAKE_MSVC_HAPPY) \
+    { \
+        PyObject *simd_mod = simd_create_module(); \
+        if (simd_mod == NULL) { \
+            goto err; \
+        } \
+        int attach_err = PyDict_SetItemString(targets, "baseline", simd_mod); \
+        Py_DECREF(simd_mod); \
+        if (attach_err < 0) { \
+            goto err; \
+        } \
+    }
+
+    NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY)
+    NPY__CPU_DISPATCH_BASELINE_CALL(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY)
+    return m;
+err:
+    // 'targets' is owned by the module at this point; freeing 'm' frees it
+    Py_DECREF(m);
+    return NULL;
+}
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
new file mode 100644
index 000000000..69a9ffd45
--- /dev/null
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -0,0 +1,351 @@
+/*@targets $werror #simd_test*/
+#include "_simd.h"
+#include "_simd_inc.h"
+
+#if NPY_SIMD
+#include "_simd_inc_data.h"
+#include "_simd_inc_convert.h"
+#include "_simd_inc_vector.h"
+#include "_simd_inc_arg.h"
+#include "_simd_inc_easyintrin.h"
+
+/*************************************************************************
+ * Defining NPYV intrinsics as module functions
+ *************************************************************************/
+/**begin repeat
+ * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
+ * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
+ * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
+ * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
+ * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
+ */
+#if @simd_sup@
+/***************************
+ * Memory
+ ***************************/
+/**begin repeat1
+ * # intrin = load, loada, loads, loadl#
+ */
+SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, q@sfx@)
+/**end repeat1**/
+/**begin repeat1
+ * # intrin = store, storea, stores, storel, storeh#
+ */
+// special definition due to the nature of @intrin@
+// special definition due to the nature of @intrin@
+// (store intrinsics write through a caller-supplied sequence and return
+// nothing, so the generic SIMD_IMPL_INTRIN_* macros do not fit)
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ // arg0: writable sequence (converted to a C array), arg1: vector to store
+ simd_arg req_args[] = {
+ {.dtype = simd_data_q@sfx@},
+ {.dtype = simd_data_v@sfx@},
+ };
+ // nonzero return means conversion failed and an exception is already set
+ if (simd_args_from_tuple(args, req_args, 2, "@intrin@_@sfx@")) {
+ return NULL;
+ }
+ npyv_@intrin@_@sfx@(
+ req_args[0].data.q@sfx@, req_args[1].data.v@sfx@
+ );
+ // write-back
+ // copy the mutated C array back into the original Python sequence object
+ if (simd_sequence_fill_obj(req_args[0].obj, req_args[0].data.q@sfx@, simd_data_q@sfx@)) {
+ simd_args_sequence_free(req_args, 2);
+ return NULL;
+ }
+ // free the C array allocated for the sequence argument
+ simd_args_sequence_free(req_args, 2);
+ Py_RETURN_NONE;
+}
+/**end repeat1**/
+
+/***************************
+ * Misc
+ ***************************/
+SIMD_IMPL_INTRIN_0(zero_@sfx@, v@sfx@)
+SIMD_IMPL_INTRIN_1(setall_@sfx@, v@sfx@, @sfx@)
+SIMD_IMPL_INTRIN_3(select_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@)
+
+/**begin repeat1
+ * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ */
+#if @simd_sup2@
+SIMD_IMPL_INTRIN_1(reinterpret_@sfx_to@_@sfx@, v@sfx_to@, v@sfx@)
+#endif // simd_sup2
+/**end repeat1**/
+
+/**
+ * special definition due to the nature of intrinsics
+ * npyv_setf_@sfx@ and npyv_set_@sfx@.
+*/
+/**begin repeat1
+ * #intrin = setf, set#
+ */
+// Wrapper for the variadic set/setf intrinsics: converts the argument tuple
+// itself into a C array and forwards a fixed number of lanes.
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ // the args tuple is consumed directly as a sequence of scalars;
+ // only npyv_nlanes_@sfx@ elements are enforced as a minimum
+ npyv_lanetype_@sfx@ *data = simd_sequence_from_obj(args, simd_data_q@sfx@, npyv_nlanes_@sfx@);
+ if (data == NULL) {
+ return NULL;
+ }
+ // NOTE(review): indices up to data[64] are read below, but only
+ // npyv_nlanes_@sfx@ elements are guaranteed above — presumably callers
+ // always pass enough arguments (or the allocation covers it); TODO confirm
+ simd_data r = {.v@sfx@ = npyv_@intrin@_@sfx@(
+ data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
+ data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15],
+ data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],
+ data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],
+ data[32], data[33], data[34], data[35], data[36], data[37], data[38], data[39],
+ data[40], data[41], data[42], data[43], data[44], data[45], data[46], data[47],
+ data[48], data[49], data[50], data[51], data[52], data[53], data[54], data[55],
+ data[56], data[57], data[58], data[59], data[60], data[61], data[62], data[63],
+ data[64] // for setf
+ )};
+ simd_sequence_free(data);
+ return (PyObject*)simd_vector_to_obj(r, simd_data_v@sfx@);
+}
+/**end repeat1**/
+
+/***************************
+ * Reorder
+ ***************************/
+/**begin repeat1
+ * # intrin = combinel, combineh#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+/**begin repeat1
+ * # intrin = combine, zip#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Operators
+ ***************************/
+#if @shl_imm@ > 0
+SIMD_IMPL_INTRIN_2(shl_@sfx@, v@sfx@, v@sfx@, u8)
+SIMD_IMPL_INTRIN_2(shr_@sfx@, v@sfx@, v@sfx@, u8)
+// immediate constant
+SIMD_IMPL_INTRIN_2IMM(shli_@sfx@, v@sfx@, v@sfx@, @shl_imm@)
+SIMD_IMPL_INTRIN_2IMM(shri_@sfx@, v@sfx@, v@sfx@, @shr_imm@)
+#endif // shl_imm
+
+/**begin repeat1
+ * #intrin = and, or, xor#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@)
+
+/**begin repeat1
+ * #intrin = cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_IMPL_INTRIN_1(cvt_@sfx@_@bsfx@, v@sfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_1(cvt_@bsfx@_@sfx@, v@bsfx@, v@sfx@)
+
+/***************************
+ * Arithmetic
+ ***************************/
+/**begin repeat1
+ * #intrin = add, sub#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+#if @sat_sup@
+/**begin repeat1
+ * #intrin = adds, subs#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+#endif // sat_sup
+
+#if @mul_sup@
+SIMD_IMPL_INTRIN_2(mul_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+#endif // mul_sup
+
+#if @div_sup@
+SIMD_IMPL_INTRIN_2(div_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+#endif // div_sup
+
+#endif // simd_sup
+/**end repeat**/
+
+/***************************
+ * Variant
+ ***************************/
+SIMD_IMPL_INTRIN_0N(cleanup)
+
+/*************************************************************************
+ * Attach module functions
+ *************************************************************************/
+/*
+ * Method table for the per-target NPYV module. Entry names are generated
+ * from the template suffixes; SIMD_INTRIN_DEF pairs each generated name
+ * with its 'simd__intrin_*' wrapper defined above.
+ */
+static PyMethodDef simd__intrinsics_methods[] = {
+/**begin repeat
+ * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
+ * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
+ * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
+ * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
+ * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
+ */
+#if @simd_sup@
+
+/***************************
+ * Memory
+ ***************************/
+/**begin repeat1
+ * # intrin = load, loada, loads, loadl, store, storea, stores, storel, storeh#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Misc
+ ***************************/
+/**begin repeat1
+ * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ */
+#if @simd_sup2@
+SIMD_INTRIN_DEF(reinterpret_@sfx_to@_@sfx@)
+#endif // simd_sup2
+/**end repeat1**/
+
+/**begin repeat1
+ * # intrin = set, setf, setall, zero, select#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Reorder
+ ***************************/
+/**begin repeat1
+ * # intrin = combinel, combineh, combine, zip#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Operators
+ ***************************/
+#if @shl_imm@ > 0
+/**begin repeat1
+ * # intrin = shl, shr, shli, shri#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // shl_imm
+
+/**begin repeat1
+ * #intrin = and, or, xor, not, cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Conversion
+ ***************************/
+// cvt entries are registered exactly once, here; they were previously
+// also emitted under the Reorder section, producing duplicate
+// PyMethodDef entries with the same name
+SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@)
+SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@)
+
+/***************************
+ * Arithmetic
+ ***************************/
+/**begin repeat1
+ * #intrin = add, sub#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+#if @sat_sup@
+/**begin repeat1
+ * #intrin = adds, subs#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // sat_sup
+
+#if @mul_sup@
+SIMD_INTRIN_DEF(mul_@sfx@)
+#endif // mul_sup
+
+#if @div_sup@
+SIMD_INTRIN_DEF(div_@sfx@)
+#endif // div_sup
+
+#endif // simd_sup
+/**end repeat**/
+
+/***************************
+ * Variant
+ ***************************/
+SIMD_INTRIN_DEF(cleanup)
+/***************************/
+{NULL, NULL, 0, NULL}
+}; // PyMethodDef
+
+#endif // NPY_SIMD
+
+/*************************************************************************
+ * Defining a separate module for each target
+ *************************************************************************/
+/*
+ * Create the NPYV module for the dispatch target this translation unit is
+ * compiled for (name "NPYV_<TARGET>", or "NPYV_BASELINE" for the baseline
+ * build). Always exposes the constants 'simd', 'simd_f64' and 'simd_width';
+ * intrinsics, the vector type and 'nlanes_*' only when NPY_SIMD is enabled.
+ * Returns a new reference, or NULL with an exception set.
+ */
+NPY_VISIBILITY_HIDDEN PyObject *
+NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
+{
+ static struct PyModuleDef defs = {
+ .m_base = PyModuleDef_HEAD_INIT,
+ .m_size = -1,
+ #ifdef NPY__CPU_TARGET_CURRENT
+ .m_name = "NPYV_" NPY_TOSTRING(NPY__CPU_TARGET_CURRENT),
+ #else
+ .m_name = "NPYV_BASELINE",
+ #endif
+ #if NPY_SIMD
+ .m_methods = simd__intrinsics_methods
+ #else
+ .m_methods = NULL
+ #endif
+ };
+ PyObject *m = PyModule_Create(&defs);
+ if (m == NULL) {
+ return NULL;
+ }
+ if (PyModule_AddIntConstant(m, "simd", NPY_SIMD)) {
+ goto err;
+ }
+ if (PyModule_AddIntConstant(m, "simd_f64", NPY_SIMD_F64)) {
+ goto err;
+ }
+ if (PyModule_AddIntConstant(m, "simd_width", NPY_SIMD_WIDTH)) {
+ goto err;
+ }
+#if NPY_SIMD
+ // register the 'vector_type' PyTypeObject on the module
+ if (simd_vector_register(m)) {
+ goto err;
+ }
+ /**begin repeat
+ * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ */
+ if (PyModule_AddIntConstant(m, "nlanes_@sfx@", npyv_nlanes_@sfx@)) {
+ goto err;
+ }
+ /**end repeat**/
+#endif // NPY_SIMD
+ return m;
+err:
+ Py_DECREF(m);
+ return NULL;
+}
diff --git a/numpy/core/src/_simd/_simd.h b/numpy/core/src/_simd/_simd.h
new file mode 100644
index 000000000..d9905c801
--- /dev/null
+++ b/numpy/core/src/_simd/_simd.h
@@ -0,0 +1,30 @@
+/**
+ * A module to expose the NumPy C SIMD vectorization interface "NPYV" for testing purposes.
+ *
+ * Please keep this module independent from other c-extension modules,
+ * since NPYV intrinsics may be involved in their functionality,
+ * which increases the degree of complexity in tracking and detecting errors.
+ *
+ * TODO: Add an independent sphinx doc.
+ *
+ * Please add any new NPYV intrinsics in '_simd.dispatch.c.src'.
+ */
+#ifndef _SIMD_SIMD_H_
+#define _SIMD_SIMD_H_
+
+#include <Python.h>
+#include "numpy/npy_common.h"
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+// autogenerated, required for CPU dispatch macros
+#include "_simd.dispatch.h"
+#endif
+/**
+ * Create a new module for each required optimization, containing all NPYV intrinsics.
+ *
+ * If the required optimization is not supported by NPYV, the module will still provide
+ * access to the NPYV constants NPY_SIMD, NPY_SIMD_F64, and NPY_SIMD_WIDTH, but without
+ * any intrinsics.
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_VISIBILITY_HIDDEN PyObject *simd_create_module, (void))
+#endif // _SIMD_SIMD_H_
diff --git a/numpy/core/src/_simd/_simd_inc.h.src b/numpy/core/src/_simd/_simd_inc.h.src
new file mode 100644
index 000000000..4261ce148
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_inc.h.src
@@ -0,0 +1,395 @@
+/**
+ * This header works only through '_simd.dispatch.c'
+ */
+#include <Python.h>
+#include "simd/simd.h"
+
+#if NPY_SIMD
+/************************************
+ ** Types
+ ************************************/
+/**
+ * Gather all data types supported by the module.
+*/
+// One slot per supported C representation; members overlap, so only the
+// member matching the current simd_data_type id is meaningful.
+typedef union
+{
+ // scalars
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ npyv_lanetype_@sfx@ @sfx@;
+ /**end repeat**/
+ // sequence
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ npyv_lanetype_@sfx@ *q@sfx@;
+ /**end repeat**/
+ // vectors
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, b8, b16, b32, b64#
+ */
+ npyv_@sfx@ v@sfx@;
+ /**end repeat**/
+ // multi-vectors x2
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32#
+ */
+ npyv_@sfx@x2 v@sfx@x2;
+ /**end repeat**/
+ // multi-vectors x3
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32#
+ */
+ npyv_@sfx@x3 v@sfx@x3;
+ /**end repeat**/
+// f64 members exist only when double precision is supported by the target
+#if NPY_SIMD_F64
+ npyv_f64 vf64;
+ npyv_f64x2 vf64x2;
+ npyv_f64x3 vf64x3;
+#endif
+} simd_data;
+
+/**
+ * Data types IDs and suffixes. Must be same data types as the ones
+ * in union 'simd_data' to fit the macros in '_simd_inc_easyintrin.h'.
+*/
+// NOTE: the ordering must mirror union 'simd_data' (see comment above);
+// the easyintrin macros rely on that correspondence.
+typedef enum
+{
+ simd_data_none = 0,
+ // scalars
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_@sfx@,
+ /**end repeat**/
+ // sequences
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_q@sfx@,
+ /**end repeat**/
+ // vectors
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64, b8, b16, b32, b64#
+ */
+ simd_data_v@sfx@,
+ /**end repeat**/
+ // multi-vectors x2
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_v@sfx@x2,
+ /**end repeat**/
+ // multi-vectors x3
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_v@sfx@x3,
+ /**end repeat**/
+ // sentinel marking the end of valid type ids
+ simd_data_end,
+} simd_data_type;
+/************************************
+ ** Declarations (inc_data)
+ ************************************/
+/**
+ * simd_data_type information
+ */
+// Static metadata describing one simd_data_type id (see simd_data_getinfo()).
+typedef struct
+{
+ // type name compatible with python style
+ const char *pyname;
+ // 1 if the type represents an unsigned integer
+ int is_unsigned:1;
+ // 1 if the type represents a signed integer
+ int is_signed:1;
+ // 1 if the type represents a single or double precision float
+ int is_float:1;
+ // 1 if the type represents a boolean
+ int is_bool:1;
+ // 1 if the type represents a sequence
+ int is_sequence:1;
+ // 1 if the type represents a scalar
+ int is_scalar:1;
+ // 1 if the type represents a vector
+ int is_vector:1;
+ // the len of the multi-vector if the type represents an x2 or x3 vector,
+ // otherwise 0, e.g. 2 if the data type is simd_data_vu8x2
+ int is_vectorx;
+ // the equivalent scalar data type, e.g. simd_data_vu8 -> simd_data_u8
+ simd_data_type to_scalar;
+ // the equivalent vector data type, e.g. simd_data_s8 -> simd_data_vs8
+ // NOTE: will return the equivalent "unsigned" vector type in case of a
+ // "boolean" vector, e.g. simd_data_vb8 -> simd_data_vu8
+ simd_data_type to_vector;
+ // number of vector lanes
+ int nlanes;
+ // sizeof lane type
+ int lane_size;
+} simd_data_info;
+
+/**
+ * Returns data info of certain dtype.
+ *
+ * Example:
+ ** const simd_data_info *info = simd_data_getinfo(simd_data_vu8);
+ ** if (info->is_vector && info->is_unsigned) {
+ ** ...
+ ** }
+ */
+static const simd_data_info *
+simd_data_getinfo(simd_data_type dtype);
+
+/************************************
+ ** Declarations (inc_vector)
+ ************************************/
+// Python object wrapping one NPYV vector: the vector's raw bytes are kept
+// in 'data' so they can be reloaded with an aligned load on access.
+typedef struct
+{
+ PyObject_HEAD
+ // vector type id
+ simd_data_type type;
+ // vector data, aligned for safe casting
+ npyv_lanetype_u8 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) data[NPY_SIMD_WIDTH];
+} simd_vector;
+/**
+ * convert simd_data to PyObject(simd_vector),
+ * raise Python exception on failure and returns NULL.
+ */
+static simd_vector *
+simd_vector_to_obj(simd_data data, simd_data_type vtype);
+/**
+ * convert PyObject(simd_vector) to simd_data,
+ * raise Python exception on failure.
+ */
+static simd_data
+simd_vector_from_obj(simd_vector *vec, simd_data_type vtype);
+/**
+ * initialize and register vector type(PyTypeObject) to PyModule,
+ * vector type can be reached through attribute 'vector_type'.
+ * return -1 on error, 0 on success.
+ */
+static int
+simd_vector_register(PyObject *module);
+
+/************************************
+ ** Declarations (inc_convert)
+ ************************************/
+/**
+ * Return a C scalar(simd_data) representation of `obj` and
+ * according to the scalar data type `dtype` on range (simd_data_[u8:f64]).
+ * Raise a Python exception on failure.
+ *
+ * Example:
+ ** simd_data data = simd_scalar_from_obj(obj, simd_data_f32);
+ ** if (!PyErr_Occurred()) {
+ ** printf("I have a valid float %d\n", data.f32);
+ ** }
+ */
+static simd_data
+simd_scalar_from_obj(PyObject *obj, simd_data_type dtype);
+/**
+ * Create a Python scalar from a C scalar based on the contents
+ * of `data`(simd_data) and according to the scalar data type `dtype`
+ * on range(simd_data_[u8:f64]).
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ *
+ * Example:
+ ** simd_data data = {.u32 = 0x7fffffff};
+ ** PyObject *obj = simd_scalar_to_obj(data, simd_data_s32);
+ ** if (obj != NULL) {
+ ** printf("I have a valid Python integer %d\n", PyLong_AsLong(obj));
+ ** Py_DECREF(obj);
+ ** }
+ */
+static PyObject *
+simd_scalar_to_obj(simd_data data, simd_data_type dtype);
+/**
+ * Allocate a C array in memory according to number of elements `len`
+ * and sequence data type `dtype` on range(simd_data_[qu8:qf64]).
+ *
+ * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL
+ * with a Python exception on failure.
+ *
+ * Example:
+ ** npyv_lanetype_f64 *aligned_ptr = simd_sequence_new(npyv_nlanes_f64, simd_data_f64);
+ ** if (aligned_ptr != NULL) {
+ ** // aligned store
+ ** npyv_storea_f64(aligned_ptr, npyv_setall_f64(1.0));
+ ** printf("The first element of my array %f\n", aligned_ptr[0]);
+ ** simd_sequence_free(aligned_ptr);
+ ** }
+ */
+static void *
+simd_sequence_new(Py_ssize_t len, simd_data_type dtype);
+/**
+ * Return the number of elements of the allocated C array `ptr`
+ * by `simd_sequence_new()` or `simd_sequence_from_obj()`.
+ */
+static size_t
+simd_sequence_len(const void *ptr);
+/**
+ * Free the allocated C array by `simd_sequence_new()` or
+ * `simd_sequence_from_obj()`.
+ */
+static void
+simd_sequence_free(void *ptr);
+/**
+ * Return a C array representation of a PyObject sequence `obj` and
+ * according to the sequence data type `dtype` on range (simd_data_[qu8:qf64]).
+ *
+ * Note: parameter `min_size` takes the number of minimum acceptable elements.
+ *
+ * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL
+ * with a Python exception on failure.
+ *
+ * Example:
+ ** npyv_lanetype_u32 *ptr = simd_sequence_from_obj(seq_obj, simd_data_qu32, npyv_nlanes_u32);
+ ** if (ptr != NULL) {
+ ** npyv_u32 a = npyv_load_u32(ptr);
+ ** ...
+ ** simd_sequence_free(ptr);
+ ** }
+ **
+ */
+static void *
+simd_sequence_from_obj(PyObject *obj, simd_data_type dtype, unsigned min_size);
+/**
+ * Fill a Python sequence object `obj` with a C array `ptr` allocated by
+ * `simd_sequence_new()` or `simd_sequence_from_obj()` according to
+ * the sequence data type `dtype` on range (simd_data_[qu8:qf64]).
+ *
+ * Return 0 on success or -1 with a Python exception on failure.
+ */
+static int
+simd_sequence_fill_obj(PyObject *obj, const void *ptr, simd_data_type dtype);
+/**
+ * Create a Python list from a C array `ptr` allocated by
+ * `simd_sequence_new()` or `simd_sequence_from_obj()` according to
+ * the sequence data type `dtype` on range (simd_data_[qu8:qf64]).
+ *
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ */
+static PyObject *
+simd_sequence_to_obj(const void *ptr, simd_data_type dtype);
+/**
+ * Return a SIMD multi-vector(simd_data) representation of Python tuple of
+ * (simd_vector*,) `obj` according to the scalar data type `dtype`
+ * on range (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]).
+ *
+ * Raise a Python exception on failure.
+ *
+ * Example:
+ ** simd_data data = simd_vectorx_from_obj(tuple_obj, simd_data_vf32x2);
+ ** if (!PyErr_Occurred()) {
+ ** npyv_f32 sum = npyv_add_f32(data.vf32x2.val[0], data.vf32x2.val[1]);
+ ** ...
+ ** }
+ **
+ */
+static simd_data
+simd_vectorx_from_obj(PyObject *obj, simd_data_type dtype);
+/**
+ * Create a Python tuple of 'simd_vector' from a SIMD multi-vector
+ * based on the contents of `data`(simd_data) and according to
+ * the multi-vector data type `dtype` on range
+ * (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]).
+ *
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ */
+static PyObject *
+simd_vectorx_to_obj(simd_data data, simd_data_type dtype);
+
+/************************************
+ ** Declarations (inc_arg)
+ ************************************/
+// One method argument: expected/actual data type, its converted C value,
+// and the originating Python object.
+typedef struct
+{
+ simd_data_type dtype;
+ simd_data data;
+ // set by simd_args_from_tuple()
+ PyObject *obj;
+} simd_arg;
+/**
+ * The following functions gather all conversions between all data types
+ * and they can be used instead of all of the above functions.
+ */
+/**
+ * Convert a Python object `obj` into simd_data `arg->data` according to the
+ * required data type `arg->dtype`.
+ *
+ * Return -1 and raise Python exception on failure, otherwise return 0.
+ *
+ * Notes:
+ * - requires `simd_args_sequence_free()` or `simd_sequence_free()`
+ * to free allocated C array, in case of sequence data types.
+ * - the number of minimum acceptable elements for sequence data
+ * types is the number of lanes of the equivalent vector data type.
+ *
+ * Example #1:
+ ** simd_arg arg = {.dtype = simd_data_qu8};
+ ** if (simd_arg_from_obj(seq_obj, &arg) < 0) {
+ ** // fails to convert a python sequence object to C array of uint8
+ ** return;
+ ** }
+ ** npyv_u8 v_u8 = npyv_load_u8(arg->data.qu8);
+ ** ...
+ ** simd_args_sequence_free(&arg, 1);
+ *
+ * Example #2:
+ ** simd_arg arg = {.dtype = simd_data_vf32};
+ ** if (simd_arg_from_obj(vector_obj, &arg) < 0) {
+ ** // fails to convert a python simd_vector to NPYV vector
+ ** return;
+ ** }
+ ** npyv_f32 add_one = npyv_add_f32(arg->data.vu8, npyv_setall_f32(1));
+ ** ...
+ */
+static int
+simd_arg_from_obj(PyObject *obj, simd_arg *arg);
+/**
+ * Convert a simd_data `arg->data` to into a Python object according to the
+ * required data type `arg->dtype`.
+ *
+ * Return NULL and raise Python exception on failure, otherwise return
+ * new reference.
+ *
+ * Example:
+ ** simd_arg arg = {.dtype = simd_data_u32, .data = {.u32 = 0xffffffff}};
+ ** PyObject *obj = simd_arg_to_obj(&arg);
+ ** if (obj == NULL) {
+ ** // fails convert C uint32 to Python integer.
+ ** return;
+ ** }
+ **
+ */
+static PyObject *
+simd_arg_to_obj(const simd_arg *arg);
+/**
+ * Similar to simd_arg_from_obj() but convert multiple objects
+ * from a Python tuple.
+ * This function is used to parse method parameters.
+ *
+ * Notes:
+ * - requires `simd_args_sequence_free()` or `simd_sequence_free()`
+ * to free allocated C array, in case of sequence data types.
+ * - the number of minimum acceptable elements for sequence data
+ * types is the number of lanes of the equivalent vector data type.
+ * - use 'arg->obj' to retrieve the parameter obj.
+ *
+ * Parameters:
+ * - `tuple_obj`: a valid Python tuple
+ * - `args`: array of 'simd_arg' contain valid data types
+ * - `args_len`: length of `args`.
+ * - `method_name`: method name, required for exception message.
+ *
+ * Return -1 and raise Python exception on failure, otherwise return 0.
+ */
+static int
+simd_args_from_tuple(PyObject *tuple_obj, simd_arg *args, int args_len, const char *method_name);
+/**
+ * Free the allocated C array for sequence data types.
+ */
+static void
+simd_args_sequence_free(simd_arg *args, int args_len);
+
+#endif // NPY_SIMD
diff --git a/numpy/core/src/_simd/_simd_inc_arg.h b/numpy/core/src/_simd/_simd_inc_arg.h
new file mode 100644
index 000000000..1db1ff202
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_inc_arg.h
@@ -0,0 +1,108 @@
+#if !NPY_SIMD
+ #error "Not a standalone header, only works through 'simd.dispatch.c.src'"
+#endif
+
+/************************************
+ ** Protected Definitions
+ ************************************/
+// Convert 'obj' into arg->data according to arg->dtype; dispatches to the
+// scalar/sequence/vector/multi-vector converters declared in _simd_inc.h.src.
+// Returns 0 on success, -1 with a Python exception set on failure.
+static int
+simd_arg_from_obj(PyObject *obj, simd_arg *arg)
+{
+ assert(arg->dtype != 0);
+ const simd_data_info *info = simd_data_getinfo(arg->dtype);
+ if (info->is_scalar) {
+ arg->data = simd_scalar_from_obj(obj, arg->dtype);
+ }
+ else if (info->is_sequence) {
+ unsigned min_seq_size = simd_data_getinfo(info->to_vector)->nlanes;
+ // qu8 acts as the generic pointer slot of the union for any sequence type
+ arg->data.qu8 = simd_sequence_from_obj(obj, arg->dtype, min_seq_size);
+ }
+ else if (info->is_vectorx) {
+ arg->data = simd_vectorx_from_obj(obj, arg->dtype);
+ }
+ else if (info->is_vector) {
+ arg->data = simd_vector_from_obj((simd_vector*)obj, arg->dtype);
+ } else {
+ arg->data.u64 = 0;
+ PyErr_Format(PyExc_RuntimeError,
+ "unhandled arg from obj type id:%d, name:%s", arg->dtype, info->pyname
+ );
+ return -1;
+ }
+ // the converters above signal failure via a raised exception,
+ // not via their return value
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+ return 0;
+}
+
+// Convert arg->data into a new Python object according to arg->dtype.
+// Returns a new reference, or NULL with a Python exception set.
+static PyObject *
+simd_arg_to_obj(const simd_arg *arg)
+{
+ assert(arg->dtype != 0);
+ const simd_data_info *info = simd_data_getinfo(arg->dtype);
+ if (info->is_scalar) {
+ return simd_scalar_to_obj(arg->data, arg->dtype);
+ }
+ if (info->is_sequence) {
+ return simd_sequence_to_obj(arg->data.qu8, arg->dtype);
+ }
+ if (info->is_vectorx) {
+ return simd_vectorx_to_obj(arg->data, arg->dtype);
+ }
+ if (info->is_vector) {
+ return (PyObject*)simd_vector_to_obj(arg->data, arg->dtype);
+ }
+ // unreachable for valid dtypes; kept as a guard against new type ids
+ PyErr_Format(PyExc_RuntimeError,
+ "unhandled arg to object type id:%d, name:%s", arg->dtype, info->pyname
+ );
+ return NULL;
+}
+
+// Release the C arrays owned by any sequence-typed entries in 'args';
+// non-sequence entries own no allocation and are skipped.
+static void
+simd_args_sequence_free(simd_arg *args, int args_len)
+{
+    assert(args_len > 0);
+    for (int i = args_len - 1; i >= 0; --i) {
+        const simd_data_info *info = simd_data_getinfo(args[i].dtype);
+        if (info->is_sequence) {
+            simd_sequence_free(args[i].data.qu8);
+        }
+    }
+}
+
+/*
+ * Convert every item of 'tuple_obj' into the corresponding entry of 'args'
+ * (dtype must be pre-set by the caller; 'obj' is filled in here).
+ * On failure, frees any sequence allocations made for earlier arguments.
+ * Returns 0 on success, -1 with a Python exception set on failure.
+ */
+static int
+simd_args_from_tuple(PyObject *tuple_obj, simd_arg *args, int args_len, const char *method_name)
+{
+    assert(args_len > 0);
+    assert(PyTuple_Check(tuple_obj));
+
+    Py_ssize_t obj_arg_len = PyTuple_GET_SIZE(tuple_obj);
+    if (obj_arg_len != args_len) {
+        // %zd is the correct PyErr_Format specifier for Py_ssize_t;
+        // %d would read it as an int and print garbage on LP64/LLP64
+        if (args_len == 1) {
+            PyErr_Format(PyExc_TypeError,
+                "%s() takes only one argument (%zd given)", method_name, obj_arg_len
+            );
+            return -1;
+        }
+        PyErr_Format(PyExc_TypeError,
+            "%s() takes exactly %d arguments (%zd given)", method_name, args_len, obj_arg_len
+        );
+        return -1;
+    }
+    for (int arg_pos = 0; arg_pos < args_len; ++arg_pos) {
+        simd_arg *arg = &args[arg_pos];
+        // borrowed reference; the tuple outlives this call
+        arg->obj = PyTuple_GET_ITEM(tuple_obj, arg_pos);
+        assert(arg->obj != NULL);
+        if (simd_arg_from_obj(arg->obj, arg) != 0) {
+            // free previous args
+            if (arg_pos > 0) {
+                simd_args_sequence_free(args, arg_pos);
+            }
+            // TODO: improve log by adding the argument number and method name
+            return -1;
+        }
+    }
+    return 0;
+}
diff --git a/numpy/core/src/_simd/_simd_inc_convert.h b/numpy/core/src/_simd/_simd_inc_convert.h
new file mode 100644
index 000000000..360101247
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_inc_convert.h
@@ -0,0 +1,206 @@
+#if !NPY_SIMD
+ #error "Not a standalone header, only works through 'simd.dispatch.c.src'"
+#endif
+
+/************************************
+ ** Protected Definitions
+ ************************************/
+// Convert a Python number `obj` into a scalar of type `dtype`.
+// Conversion errors are left pending on the Python error indicator;
+// callers are expected to check PyErr_Occurred() afterwards.
+static simd_data
+simd_scalar_from_obj(PyObject *obj, simd_data_type dtype)
+{
+    const simd_data_info *info = simd_data_getinfo(dtype);
+    assert(info->is_scalar && info->lane_size > 0);
+    simd_data data;
+    if (info->is_float) {
+        data.f64 = PyFloat_AsDouble(obj);
+        if (dtype == simd_data_f32){
+            // narrow through the union so f32 lanes see the right bits
+            data.f32 = (float)data.f64;
+        }
+    } else {
+        // the Mask variant wraps out-of-range values instead of raising
+        data.u64 = PyLong_AsUnsignedLongLongMask(obj);
+    }
+    return data;
+}
+
+// Convert a scalar of type `dtype` back into a Python float or int.
+// Returns a new reference, or NULL with an exception set.
+static PyObject *
+simd_scalar_to_obj(simd_data data, simd_data_type dtype)
+{
+    const simd_data_info *info = simd_data_getinfo(dtype);
+    assert(info->is_scalar && info->lane_size > 0);
+    if (info->is_float) {
+        if (dtype == simd_data_f32) {
+            return PyFloat_FromDouble(data.f32);
+        }
+        return PyFloat_FromDouble(data.f64);
+    }
+    // number of bits beyond the lane width inside the 64-bit storage
+    int leftb = (sizeof(npyv_lanetype_u64) - info->lane_size) * 8;
+    data.u64 <<= leftb;
+    if (info->is_signed) {
+        // shift left then arithmetic-shift right to sign-extend the lane
+        // NOTE(review): relies on arithmetic right shift of signed values
+        // (implementation-defined in C, but universal on supported compilers)
+        return PyLong_FromLongLong(data.s64 >> leftb);
+    }
+    return PyLong_FromUnsignedLongLong(data.u64 >> leftb);
+}
+
+// Allocate a SIMD-aligned buffer able to hold `len` lanes of `dtype`.
+// Memory layout: [len][padding...][back-pointer][aligned lanes...].
+// The returned pointer points at the aligned lane storage; the slot just
+// before it stores a pointer back into the malloc'd header so that
+// simd_sequence_len()/simd_sequence_free() can recover the allocation.
+// Returns NULL with MemoryError set on allocation failure.
+static void *
+simd_sequence_new(Py_ssize_t len, simd_data_type dtype)
+{
+    const simd_data_info *info = simd_data_getinfo(dtype);
+    assert(info->is_sequence && info->lane_size > 0);
+
+    // header (length + back-pointer) plus worst-case alignment padding
+    size_t size = NPY_SIMD_WIDTH + sizeof(size_t) + sizeof(size_t*);
+    size += len * info->lane_size;
+
+    size_t *ptr = malloc(size);
+    if (ptr == NULL) {
+        return PyErr_NoMemory(); // PyErr_NoMemory() returns NULL
+    }
+    *(ptr++) = len; // store the lane count in the header
+    // round up to the next NPY_SIMD_WIDTH boundary past the header
+    size_t **a_ptr = (size_t**)(
+        ((size_t)ptr + NPY_SIMD_WIDTH) & ~(size_t)(NPY_SIMD_WIDTH-1)
+    );
+    a_ptr[-1] = ptr; // back-pointer consumed by len()/free()
+    return a_ptr;
+}
+
+// Return the number of lanes stored in a buffer created by simd_sequence_new().
+static size_t
+simd_sequence_len(const void *ptr)
+{
+    // recover the header via the back-pointer stored just before the data
+    size_t *ptrz = ((size_t**)ptr)[-1];
+    return *(ptrz-1);
+}
+
+// Release a buffer created by simd_sequence_new().
+static void
+simd_sequence_free(void *ptr)
+{
+    size_t *ptrz = ((size_t**)ptr)[-1];
+    free(ptrz-1); // ptrz points one slot past the original allocation
+}
+
+// Create a sequence buffer (via simd_sequence_new) from any Python sequence.
+// `min_size` is the minimum accepted number of elements. On success the
+// caller owns the returned buffer and must release it with
+// simd_sequence_free(); returns NULL with an exception set on failure.
+static void *
+simd_sequence_from_obj(PyObject *obj, simd_data_type dtype, unsigned min_size)
+{
+    const simd_data_info *info = simd_data_getinfo(dtype);
+    assert(info->is_sequence && info->lane_size > 0);
+    PyObject *seq_obj = PySequence_Fast(obj, "expected a sequence");
+    if (seq_obj == NULL) {
+        return NULL;
+    }
+    Py_ssize_t seq_size = PySequence_Fast_GET_SIZE(seq_obj);
+    if (seq_size < (Py_ssize_t)min_size) {
+        // %u/%zd match the actual argument types (unsigned / Py_ssize_t)
+        PyErr_Format(PyExc_ValueError,
+            "minimum acceptable size of the required sequence is %u, given(%zd)",
+            min_size, seq_size
+        );
+        Py_DECREF(seq_obj); // was previously leaked on this error path
+        return NULL;
+    }
+    npyv_lanetype_u8 *dst = simd_sequence_new(seq_size, dtype);
+    if (dst == NULL) {
+        Py_DECREF(seq_obj); // was previously leaked on this error path
+        return NULL;
+    }
+    PyObject **seq_items = PySequence_Fast_ITEMS(seq_obj);
+    for (Py_ssize_t i = 0; i < seq_size; ++i) {
+        simd_data data = simd_scalar_from_obj(seq_items[i], info->to_scalar);
+        npyv_lanetype_u8 *sdst = dst + i * info->lane_size;
+        memcpy(sdst, &data.u64, info->lane_size);
+    }
+    Py_DECREF(seq_obj);
+
+    // simd_scalar_from_obj() leaves conversion errors pending
+    if (PyErr_Occurred()) {
+        simd_sequence_free(dst);
+        return NULL;
+    }
+    return dst;
+}
+
+// Copy the lanes of a sequence buffer `ptr` into an existing Python
+// sequence `obj`, element by element. Returns 0 on success, -1 with an
+// exception set on failure.
+static int
+simd_sequence_fill_obj(PyObject *obj, const void *ptr, simd_data_type dtype)
+{
+    const simd_data_info *info = simd_data_getinfo(dtype);
+    if (!PySequence_Check(obj)) {
+        PyErr_Format(PyExc_TypeError,
+            "a sequence object is required to fill %s", info->pyname
+        );
+        return -1;
+    }
+    const npyv_lanetype_u8 *src = ptr;
+    Py_ssize_t seq_len = (Py_ssize_t)simd_sequence_len(ptr);
+    for (Py_ssize_t i = 0; i < seq_len; ++i) {
+        const npyv_lanetype_u8 *ssrc = src + i * info->lane_size;
+        simd_data data;
+        memcpy(&data.u64, ssrc, info->lane_size);
+        PyObject *item = simd_scalar_to_obj(data, info->to_scalar);
+        if (item == NULL) {
+            return -1;
+        }
+        if (PySequence_SetItem(obj, i, item) < 0) {
+            Py_DECREF(item);
+            return -1;
+        }
+        // PySequence_SetItem does NOT steal the reference; without this
+        // DECREF every stored item was leaked
+        Py_DECREF(item);
+    }
+    return 0;
+}
+
+// Convert a sequence buffer into a new Python list of scalars.
+// Returns a new reference, or NULL with an exception set.
+static PyObject *
+simd_sequence_to_obj(const void *ptr, simd_data_type dtype)
+{
+    PyObject *list = PyList_New((Py_ssize_t)simd_sequence_len(ptr));
+    if (list == NULL) {
+        return NULL;
+    }
+    if (simd_sequence_fill_obj(list, ptr, dtype) < 0) {
+        Py_DECREF(list);
+        return NULL;
+    }
+    return list;
+}
+
+// Convert a tuple of vector objects into a multi-vector (npyv_..x2/x3).
+// On error a Python exception is set and zero-initialized data returned;
+// callers must check PyErr_Occurred().
+static simd_data
+simd_vectorx_from_obj(PyObject *obj, simd_data_type dtype)
+{
+    const simd_data_info *info = simd_data_getinfo(dtype);
+    // NPYV currently only supports x2 and x3
+    assert(info->is_vectorx > 1 && info->is_vectorx < 4);
+
+    simd_data data = {.u64 = 0};
+    if (!PyTuple_Check(obj) || PyTuple_GET_SIZE(obj) != info->is_vectorx) {
+        PyErr_Format(PyExc_TypeError,
+            "a tuple of %d vector type %s is required",
+            info->is_vectorx, simd_data_getinfo(info->to_vector)->pyname
+        );
+        return data;
+    }
+    for (int i = 0; i < info->is_vectorx; ++i) {
+        PyObject *item = PyTuple_GET_ITEM(obj, i); // borrowed reference
+        // get the max multi-vec and let the compiler do the rest
+        data.vu64x3.val[i] = simd_vector_from_obj((simd_vector*)item, info->to_vector).vu64;
+        // simd_vector_from_obj() reports failure via the error indicator
+        if (PyErr_Occurred()) {
+            return data;
+        }
+    }
+    return data;
+}
+
+// Convert a multi-vector (npyv_..x2/x3) into a new Python tuple of
+// simd_vector objects. Returns a new reference, or NULL with an
+// exception set.
+static PyObject *
+simd_vectorx_to_obj(simd_data data, simd_data_type dtype)
+{
+    const simd_data_info *info = simd_data_getinfo(dtype);
+    // NPYV currently only supports x2 and x3
+    assert(info->is_vectorx > 1 && info->is_vectorx < 4);
+
+    PyObject *tuple = PyTuple_New(info->is_vectorx);
+    if (tuple == NULL) {
+        return NULL;
+    }
+    for (int i = 0; i < info->is_vectorx; ++i) {
+        // get the max multi-vector and let the compiler handle the rest
+        simd_data vdata = {.vu64 = data.vu64x3.val[i]};
+        PyObject *item = (PyObject*)simd_vector_to_obj(vdata, info->to_vector);
+        if (item == NULL) {
+            // TODO: improve the log by adding the item number
+            Py_DECREF(tuple);
+            return NULL;
+        }
+        PyTuple_SET_ITEM(tuple, i, item); // steals the reference to item
+    }
+    return tuple;
+}
diff --git a/numpy/core/src/_simd/_simd_inc_data.h.src b/numpy/core/src/_simd/_simd_inc_data.h.src
new file mode 100644
index 000000000..eefac483b
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_inc_data.h.src
@@ -0,0 +1,91 @@
+#if !NPY_SIMD
+ #error "Not a standalone header, only works through 'simd.dispatch.c.src'"
+#endif
+
+/************************************
+ ** Private Definitions
+ ************************************/
+// Lookup table describing every `simd_data_type`: its Python-facing name,
+// category flags (scalar/sequence/vector/multi-vector), signedness, and
+// lane geometry. Indexed directly by the enum value; expanded by the
+// numpy .src template preprocessor over all lane suffixes.
+static simd_data_info simd__data_registry[simd_data_end] =
+{
+    [simd_data_none] = {.pyname="none"},
+    // scalars
+    /**begin repeat
+     * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+     * #sig = 0*4, 1*4, 0*2#
+     * #fp = 0*4, 0*4, 1*2#
+     * #name = int*8, float, float#
+     */
+    [simd_data_@sfx@] = {
+        .pyname="@name@", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+        .is_scalar=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+        .lane_size = sizeof(npyv_lanetype_@sfx@)
+    },
+    /**end repeat**/
+    // sequences
+    /**begin repeat
+     * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+     * #sig = 0*4, 1*4, 0*2#
+     * #fp = 0*4, 0*4, 1*2#
+     * #name = int*8, float, float#
+     */
+    [simd_data_q@sfx@] = {
+        .pyname="[@name@]", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+        .is_sequence=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+        .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@)
+    },
+    /**end repeat**/
+    // vectors
+    /**begin repeat
+     * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+     * #sig = 0*4, 1*4, 0*2#
+     */
+    [simd_data_v@sfx@] = {
+        .pyname="npyv_@sfx@", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+        .is_vector=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+        .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@)
+    },
+    /**end repeat**/
+    // boolean vectors, treated as unsigned and converted internally
+    // to add compatibility among all SIMD extensions
+    /**begin repeat
+     * #sfx = u8, u16, u32, u64#
+     * #bsfx = b8, b16, b32, b64#
+     */
+    [simd_data_v@bsfx@] = {
+        .pyname="npyv_@bsfx@", .is_bool=1, .is_vector=1,
+        .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+        .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@)
+    },
+    /**end repeat**/
+    // multi-vectors x2
+    /**begin repeat
+     * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+     * #sig = 0*4, 1*4, 0*2#
+     * #fp = 0*4, 0*4, 1*2#
+     */
+    [simd_data_v@sfx@x2] = {
+        .pyname="npyv_@sfx@x2", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+        .is_vectorx=2, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+        .nlanes=2, .lane_size = sizeof(npyv_lanetype_@sfx@)
+    },
+    /**end repeat**/
+    // multi-vectors x3
+    /**begin repeat
+     * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+     * #sig = 0*4, 1*4, 0*2#
+     * #fp = 0*4, 0*4, 1*2#
+     */
+    [simd_data_v@sfx@x3] = {
+        .pyname="npyv_@sfx@x3", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+        .is_vectorx=3, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+        .nlanes=3, .lane_size = sizeof(npyv_lanetype_@sfx@)
+    },
+    /**end repeat**/
+};
+
+/************************************
+ ** Protected Definitions
+ ************************************/
+// Return the registry entry describing `dtype` (never NULL).
+static const simd_data_info *
+simd_data_getinfo(simd_data_type dtype)
+{ return &simd__data_registry[dtype]; }
diff --git a/numpy/core/src/_simd/_simd_inc_easyintrin.h b/numpy/core/src/_simd/_simd_inc_easyintrin.h
new file mode 100644
index 000000000..7216b373a
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_inc_easyintrin.h
@@ -0,0 +1,229 @@
+#if !NPY_SIMD
+ #error "Not a standalone header, only works through 'simd.dispatch.c.src'"
+#endif
+
+// Expand to a PyMethodDef entry for the wrapper generated by the
+// SIMD_IMPL_INTRIN_* macros below (the trailing comma is intentional).
+#define SIMD_INTRIN_DEF(NAME) \
+    { NPY_TOSTRING(NAME), simd__intrin_##NAME, METH_VARARGS, NULL } , // comma
+
+// Validate that a METH_VARARGS call received no positional arguments.
+// Returns 0 when `args` is NULL or an empty tuple, otherwise sets a
+// Python exception and returns -1.
+static int simd__no_arguments(PyObject *args, const char* method_name)
+{
+    if (args == NULL) {
+        return 0;
+    }
+    assert(PyTuple_Check(args));
+    Py_ssize_t obj_arg_len = PyTuple_GET_SIZE(args);
+    if (obj_arg_len != 0) {
+        // %zd: obj_arg_len is Py_ssize_t, passing it through %d is
+        // undefined behavior where Py_ssize_t is wider than int
+        PyErr_Format(PyExc_RuntimeError,
+            "%s(), takes no arguments, given(%zd)", method_name, obj_arg_len
+        );
+        return -1;
+    }
+    return 0;
+}
+
+// Define a Python wrapper for a zero-argument intrinsic `npyv_NAME()`
+// whose result has data type RET.
+#define SIMD_IMPL_INTRIN_0(NAME, RET) \
+    static PyObject *simd__intrin_##NAME \
+    (PyObject* NPY_UNUSED(self), PyObject *args) \
+    { \
+        if (simd__no_arguments( \
+            args, NPY_TOSTRING(NAME) \
+        )) return NULL; \
+        simd_arg a = { \
+            .dtype = simd_data_##RET, \
+            .data = {.RET = npyv_##NAME()}, \
+        }; \
+        return simd_arg_to_obj(&a); \
+    }
+
+// Define a Python wrapper for a zero-argument intrinsic `npyv_NAME()`
+// with no return value; the wrapper returns None.
+#define SIMD_IMPL_INTRIN_0N(NAME) \
+    static PyObject *simd__intrin_##NAME \
+    (PyObject* NPY_UNUSED(self), PyObject *args) \
+    { \
+        if (simd__no_arguments( \
+            args, NPY_TOSTRING(NAME) \
+        )) return NULL; \
+        npyv_##NAME(); \
+        Py_RETURN_NONE; \
+    }
+
+// Define a Python wrapper for a one-argument intrinsic
+// RET npyv_NAME(IN0). The req_args[0] slot is reused to carry the
+// result into simd_arg_to_obj() after its sequence buffer (if any)
+// has been released.
+#define SIMD_IMPL_INTRIN_1(NAME, RET, IN0)       \
+    static PyObject *simd__intrin_##NAME         \
+    (PyObject* NPY_UNUSED(self), PyObject *args) \
+    {                                            \
+        simd_arg req_args[] = {                  \
+            {.dtype = simd_data_##IN0},          \
+        };                                       \
+        if (simd_args_from_tuple(                \
+            args, req_args, 1, NPY_TOSTRING(NAME)) \
+        ) return NULL;                           \
+        simd_data r = {.RET = npyv_##NAME(       \
+            req_args[0].data.IN0                 \
+        )};                                      \
+        simd_args_sequence_free(req_args, 1);    \
+        req_args[0].data  = r;                   \
+        req_args[0].dtype = simd_data_##RET;     \
+        return simd_arg_to_obj(req_args);        \
+    }
+
+// Define a Python wrapper for a two-argument intrinsic
+// RET npyv_NAME(IN0, IN1). Sequence buffers owned by the converted
+// arguments are released before the result is boxed.
+#define SIMD_IMPL_INTRIN_2(NAME, RET, IN0, IN1)  \
+    static PyObject *simd__intrin_##NAME         \
+    (PyObject* NPY_UNUSED(self), PyObject *args) \
+    {                                            \
+        simd_arg req_args[] = {                  \
+            {.dtype = simd_data_##IN0},          \
+            {.dtype = simd_data_##IN1},          \
+        };                                       \
+        if (simd_args_from_tuple(                \
+            args, req_args, 2, NPY_TOSTRING(NAME)) \
+        ) {                                      \
+            return NULL;                         \
+        }                                        \
+        simd_data r = {.RET = npyv_##NAME(       \
+            req_args[0].data.IN0,                \
+            req_args[1].data.IN1                 \
+        )};                                      \
+        simd_args_sequence_free(req_args, 2);    \
+        req_args[0].data  = r;                   \
+        req_args[0].dtype = simd_data_##RET;     \
+        return simd_arg_to_obj(req_args);        \
+    }
+
+// One arm of the chained-ternary expansion: if the runtime value of the
+// second argument equals the compile-time constant C, call the intrinsic
+// with C as a literal immediate.
+#define SIMD__REPEAT_2IMM(C, NAME, IN0) \
+    C == req_args[1].data.u8 ? NPY_CAT(npyv_, NAME)(req_args[0].data.IN0, C) :
+
+// Define a Python wrapper for an intrinsic whose second argument must be
+// an immediate constant. SIMD__IMPL_COUNT_<CONST_RNG> expands
+// SIMD__REPEAT_2IMM for every constant in [0, CONST_RNG], producing a
+// chain of ternaries that maps the runtime value onto a literal; the
+// trailing call with 0 is the fallback terminating the chain.
+#define SIMD_IMPL_INTRIN_2IMM(NAME, RET, IN0, CONST_RNG) \
+    static PyObject *simd__intrin_##NAME                 \
+    (PyObject* NPY_UNUSED(self), PyObject *args)         \
+    {                                                    \
+        simd_arg req_args[] = {                          \
+            {.dtype = simd_data_##IN0},                  \
+            {.dtype = simd_data_u8},                     \
+        };                                               \
+        if (simd_args_from_tuple(                        \
+            args, req_args, 2, NPY_TOSTRING(NAME))       \
+        ) {                                              \
+            return NULL;                                 \
+        }                                                \
+        simd_data r;                                     \
+        r.RET = NPY_CAT(SIMD__IMPL_COUNT_, CONST_RNG)(   \
+            SIMD__REPEAT_2IMM, NAME, IN0                 \
+        ) npyv_##NAME(req_args[0].data.IN0, 0);          \
+        simd_args_sequence_free(req_args, 2);            \
+        req_args[0].data  = r;                           \
+        req_args[0].dtype = simd_data_##RET;             \
+        return simd_arg_to_obj(req_args);                \
+    }
+
+// Define a Python wrapper for a three-argument intrinsic
+// RET npyv_NAME(IN0, IN1, IN2).
+#define SIMD_IMPL_INTRIN_3(NAME, RET, IN0, IN1, IN2) \
+    static PyObject *simd__intrin_##NAME             \
+    (PyObject* NPY_UNUSED(self), PyObject *args)     \
+    {                                                \
+        simd_arg req_args[] = {                      \
+            {.dtype = simd_data_##IN0},              \
+            {.dtype = simd_data_##IN1},              \
+            {.dtype = simd_data_##IN2},              \
+        };                                           \
+        if (simd_args_from_tuple(                    \
+            args, req_args, 3, NPY_TOSTRING(NAME))   \
+        ) {                                          \
+            return NULL;                             \
+        }                                            \
+        simd_data r = {.RET = npyv_##NAME(           \
+            req_args[0].data.IN0,                    \
+            req_args[1].data.IN1,                    \
+            req_args[2].data.IN2                     \
+        )};                                          \
+        simd_args_sequence_free(req_args, 3);        \
+        req_args[0].data  = r;                       \
+        req_args[0].dtype = simd_data_##RET;         \
+        return simd_arg_to_obj(req_args);            \
+    }
+/**
+ * Helper macros for repeating and expand a certain macro.
+ * Mainly used for converting a scalar to an immediate constant.
+ *
+ * SIMD__IMPL_COUNT_N(FN, ...) expands FN(i, ...) for every i in [0, N].
+ * The odd/even split exists because the even variants (8/16/32/48/64)
+ * start at 1 and append the final count, while the odd variants
+ * (7/15/31/47/63) prepend FN(0, ...); the trailing-underscore macros
+ * below hold the shared [1, N] expansions.
+ */
+#define SIMD__IMPL_COUNT_7(FN, ...)      \
+    NPY_EXPAND(FN(0,  __VA_ARGS__))      \
+    SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_8(FN, ...)      \
+    SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) \
+    NPY_EXPAND(FN(8,  __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_15(FN, ...)      \
+    NPY_EXPAND(FN(0,  __VA_ARGS__))       \
+    SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_16(FN, ...)      \
+    SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) \
+    NPY_EXPAND(FN(16, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_31(FN, ...)      \
+    NPY_EXPAND(FN(0,  __VA_ARGS__))       \
+    SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_32(FN, ...)      \
+    SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) \
+    NPY_EXPAND(FN(32, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_47(FN, ...)      \
+    NPY_EXPAND(FN(0,  __VA_ARGS__))       \
+    SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_48(FN, ...)      \
+    SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) \
+    NPY_EXPAND(FN(48, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_63(FN, ...)      \
+    NPY_EXPAND(FN(0,  __VA_ARGS__))       \
+    SIMD__IMPL_COUNT_63_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_64(FN, ...)      \
+    SIMD__IMPL_COUNT_63_(FN, __VA_ARGS__) \
+    NPY_EXPAND(FN(64, __VA_ARGS__))
+
+// shared expansions over [1, N] used by the public macros above
+#define SIMD__IMPL_COUNT_7_(FN, ...)                          \
+    NPY_EXPAND(FN(1,  __VA_ARGS__))                           \
+    NPY_EXPAND(FN(2,  __VA_ARGS__)) NPY_EXPAND(FN(3,  __VA_ARGS__)) \
+    NPY_EXPAND(FN(4,  __VA_ARGS__)) NPY_EXPAND(FN(5,  __VA_ARGS__)) \
+    NPY_EXPAND(FN(6,  __VA_ARGS__)) NPY_EXPAND(FN(7,  __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_15_(FN, ...)                         \
+    SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__)                      \
+    NPY_EXPAND(FN(8,  __VA_ARGS__)) NPY_EXPAND(FN(9,  __VA_ARGS__)) \
+    NPY_EXPAND(FN(10, __VA_ARGS__)) NPY_EXPAND(FN(11, __VA_ARGS__)) \
+    NPY_EXPAND(FN(12, __VA_ARGS__)) NPY_EXPAND(FN(13, __VA_ARGS__)) \
+    NPY_EXPAND(FN(14, __VA_ARGS__)) NPY_EXPAND(FN(15, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_31_(FN, ...)                         \
+    SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__)                     \
+    NPY_EXPAND(FN(16, __VA_ARGS__)) NPY_EXPAND(FN(17, __VA_ARGS__)) \
+    NPY_EXPAND(FN(18, __VA_ARGS__)) NPY_EXPAND(FN(19, __VA_ARGS__)) \
+    NPY_EXPAND(FN(20, __VA_ARGS__)) NPY_EXPAND(FN(21, __VA_ARGS__)) \
+    NPY_EXPAND(FN(22, __VA_ARGS__)) NPY_EXPAND(FN(23, __VA_ARGS__)) \
+    NPY_EXPAND(FN(24, __VA_ARGS__)) NPY_EXPAND(FN(25, __VA_ARGS__)) \
+    NPY_EXPAND(FN(26, __VA_ARGS__)) NPY_EXPAND(FN(27, __VA_ARGS__)) \
+    NPY_EXPAND(FN(28, __VA_ARGS__)) NPY_EXPAND(FN(29, __VA_ARGS__)) \
+    NPY_EXPAND(FN(30, __VA_ARGS__)) NPY_EXPAND(FN(31, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_47_(FN, ...)                         \
+    SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__)                     \
+    NPY_EXPAND(FN(32, __VA_ARGS__)) NPY_EXPAND(FN(33, __VA_ARGS__)) \
+    NPY_EXPAND(FN(34, __VA_ARGS__)) NPY_EXPAND(FN(35, __VA_ARGS__)) \
+    NPY_EXPAND(FN(36, __VA_ARGS__)) NPY_EXPAND(FN(37, __VA_ARGS__)) \
+    NPY_EXPAND(FN(38, __VA_ARGS__)) NPY_EXPAND(FN(39, __VA_ARGS__)) \
+    NPY_EXPAND(FN(40, __VA_ARGS__)) NPY_EXPAND(FN(41, __VA_ARGS__)) \
+    NPY_EXPAND(FN(42, __VA_ARGS__)) NPY_EXPAND(FN(43, __VA_ARGS__)) \
+    NPY_EXPAND(FN(44, __VA_ARGS__)) NPY_EXPAND(FN(45, __VA_ARGS__)) \
+    NPY_EXPAND(FN(46, __VA_ARGS__)) NPY_EXPAND(FN(47, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_63_(FN, ...)                         \
+    SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__)                     \
+    NPY_EXPAND(FN(48, __VA_ARGS__)) NPY_EXPAND(FN(49, __VA_ARGS__)) \
+    NPY_EXPAND(FN(50, __VA_ARGS__)) NPY_EXPAND(FN(51, __VA_ARGS__)) \
+    NPY_EXPAND(FN(52, __VA_ARGS__)) NPY_EXPAND(FN(53, __VA_ARGS__)) \
+    NPY_EXPAND(FN(54, __VA_ARGS__)) NPY_EXPAND(FN(55, __VA_ARGS__)) \
+    NPY_EXPAND(FN(56, __VA_ARGS__)) NPY_EXPAND(FN(57, __VA_ARGS__)) \
+    NPY_EXPAND(FN(58, __VA_ARGS__)) NPY_EXPAND(FN(59, __VA_ARGS__)) \
+    NPY_EXPAND(FN(60, __VA_ARGS__)) NPY_EXPAND(FN(61, __VA_ARGS__)) \
+    NPY_EXPAND(FN(62, __VA_ARGS__)) NPY_EXPAND(FN(63, __VA_ARGS__))
diff --git a/numpy/core/src/_simd/_simd_inc_vector.h b/numpy/core/src/_simd/_simd_inc_vector.h
new file mode 100644
index 000000000..b0fa17b9a
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_inc_vector.h
@@ -0,0 +1,187 @@
+#if !NPY_SIMD
+ #error "Not a standalone header, only works through 'simd.dispatch.c.src'"
+#endif
+
+/************************************
+ ** Private Definitions
+ ************************************/
+// PySequenceMethods
+// sq_length: the vector exposes one Python element per lane.
+static Py_ssize_t
+simd__vector_length(simd_vector *self)
+{
+    return simd_data_getinfo(self->type)->nlanes;
+}
+// sq_item: return lane `i` as a Python scalar.
+static PyObject *
+simd__vector_item(simd_vector *self, Py_ssize_t i)
+{
+    const simd_data_info *info = simd_data_getinfo(self->type);
+    int nlanes = info->nlanes;
+    // NOTE(review): negative `i` is not rejected here; CPython's sequence
+    // protocol normally adjusts negative indices via sq_length before
+    // calling sq_item — confirm no raw negative index can reach this point
+    if (i >= nlanes) {
+        PyErr_SetString(PyExc_IndexError, "list index out of range");
+        return NULL;
+    }
+    npyv_lanetype_u8 *src = self->data + i * info->lane_size;
+    simd_data data;
+    memcpy(&data.u64, src, info->lane_size);
+    return simd_scalar_to_obj(data, info->to_scalar);
+}
+
+// Sequence protocol: only length and item access are supported, which is
+// enough for iteration, list()/tuple() conversion and indexing.
+static PySequenceMethods simd__vector_as_sequence = {
+    (lenfunc) simd__vector_length,             /* sq_length */
+    (binaryfunc) NULL,                         /* sq_concat */
+    (ssizeargfunc) NULL,                       /* sq_repeat */
+    (ssizeargfunc) simd__vector_item,          /* sq_item */
+    (ssizessizeargfunc) NULL,                  /* sq_slice */
+    (ssizeobjargproc) NULL,                    /* sq_ass_item */
+    (ssizessizeobjargproc) NULL,               /* sq_ass_slice */
+    (objobjproc) NULL,                         /* sq_contains */
+    (binaryfunc) NULL,                         /* sq_inplace_concat */
+    (ssizeargfunc) NULL,                       /* sq_inplace_repeat */
+};
+
+// PyGetSetDef
+// getter for `__name__`: the registry's Python-facing type name,
+// e.g. "npyv_f32".
+static PyObject *
+simd__vector_name(simd_vector *self)
+{
+    return PyUnicode_FromString(simd_data_getinfo(self->type)->pyname);
+}
+// expose `__name__` as a read-only attribute on vector instances
+static PyGetSetDef simd__vector_getset[] = {
+    { "__name__", (getter)simd__vector_name, NULL, NULL, NULL },
+    { NULL, NULL, NULL, NULL, NULL }
+};
+
+// PyTypeObject(simd__vector_type)
+// tp_repr: render the vector like a Python list of its lanes.
+static PyObject *
+simd__vector_repr(PyObject *self)
+{
+    // PySequence_Fast returns Tuple in PyPy
+    PyObject *obj = PySequence_List(self);
+    if (obj != NULL) {
+        PyObject *repr = PyObject_Str(obj);
+        Py_DECREF(obj);
+        return repr;
+    }
+    return obj; // NULL: propagate the conversion failure
+}
+// tp_richcompare: convert self to a container matching the other
+// operand's type (tuple vs list) and delegate to PyObject_RichCompare,
+// so vectors compare naturally against plain Python sequences.
+static PyObject *
+simd__vector_compare(PyObject *self, PyObject *other, int cmp_op)
+{
+    PyObject *obj;
+    if (PyTuple_Check(other)) {
+        obj = PySequence_Tuple(self);
+    } else if (PyList_Check(other)) {
+        obj = PySequence_List(self);
+    } else {
+        obj = PySequence_Fast(self, "invalid argument, expected a vector");
+    }
+    if (obj != NULL) {
+        PyObject *rich = PyObject_RichCompare(obj, other, cmp_op);
+        Py_DECREF(obj);
+        return rich;
+    }
+    return obj; // NULL: propagate the conversion failure
+}
+// The vector wrapper type; its name is suffixed with the dispatched
+// CPU target (via NPY_CPU_DISPATCH_CURFX) so each build is distinct.
+static PyTypeObject simd__vector_type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(VECTOR)),
+    .tp_basicsize = sizeof(simd_vector),
+    .tp_repr = (reprfunc)simd__vector_repr,
+    .tp_as_sequence = &simd__vector_as_sequence,
+    .tp_flags = Py_TPFLAGS_DEFAULT,
+    .tp_richcompare = simd__vector_compare,
+    .tp_getset = simd__vector_getset
+};
+
+/************************************
+ ** Protected Definitions
+ ************************************/
+// Box raw vector `data` of type `vtype` into a new simd_vector object.
+// Returns a new reference, or NULL with MemoryError set.
+static simd_vector *
+simd_vector_to_obj(simd_data data, simd_data_type vtype)
+{
+    const simd_data_info *info = simd_data_getinfo(vtype);
+    assert(info->is_vector && info->nlanes > 0);
+
+    simd_vector *vec = PyObject_New(simd_vector, &simd__vector_type);
+    if (vec == NULL) {
+        return (simd_vector*)PyErr_NoMemory();
+    }
+    vec->type = vtype;
+    if (info->is_bool) {
+        // boolean vectors are internally treated as unsigned
+        // vectors to add compatibility among all SIMD extensions
+        switch(vtype) {
+        case simd_data_vb8:
+            data.vu8 = npyv_cvt_u8_b8(data.vb8);
+            break;
+        case simd_data_vb16:
+            data.vu16 = npyv_cvt_u16_b16(data.vb16);
+            break;
+        case simd_data_vb32:
+            data.vu32 = npyv_cvt_u32_b32(data.vb32);
+            break;
+        default:
+            data.vu64 = npyv_cvt_u64_b64(data.vb64);
+        }
+    }
+    // store the raw bytes through the widest unsigned view of the union
+    npyv_store_u8(vec->data, data.vu8);
+    return vec;
+}
+
+// Unbox a simd_vector object into raw vector data of type `vtype`.
+// On type mismatch a TypeError is set and zero-initialized data is
+// returned; callers must check PyErr_Occurred().
+static simd_data
+simd_vector_from_obj(simd_vector *vec, simd_data_type vtype)
+{
+    const simd_data_info *info = simd_data_getinfo(vtype);
+    assert(info->is_vector && info->nlanes > 0);
+
+    simd_data data = {.u64 = 0};
+    if (!PyObject_IsInstance(
+        (PyObject *)vec, (PyObject *)&simd__vector_type
+    )) {
+        PyErr_Format(PyExc_TypeError,
+            "a vector type %s is required", info->pyname
+        );
+        return data;
+    }
+    if (vec->type != vtype) {
+        PyErr_Format(PyExc_TypeError,
+            "a vector type %s is required, got(%s)",
+            info->pyname, simd_data_getinfo(vec->type)->pyname
+        );
+        return data;
+    }
+
+    // load the raw bytes through the widest unsigned view of the union
+    data.vu8 = npyv_load_u8(vec->data);
+    if (info->is_bool) {
+        // boolean vectors are internally treated as unsigned
+        // vectors to add compatibility among all SIMD extensions
+        switch(vtype) {
+        case simd_data_vb8:
+            data.vb8 = npyv_cvt_b8_u8(data.vu8);
+            break;
+        case simd_data_vb16:
+            data.vb16 = npyv_cvt_b16_u16(data.vu16);
+            break;
+        case simd_data_vb32:
+            data.vb32 = npyv_cvt_b32_u32(data.vu32);
+            break;
+        default:
+            data.vb64 = npyv_cvt_b64_u64(data.vu64);
+        }
+    }
+    return data;
+}
+
+// Finalize the vector type and expose it on `module` as "vector_type".
+// Returns 0 on success, -1 with a Python exception set on failure.
+static int
+simd_vector_register(PyObject *module)
+{
+    // the type must be fully initialized before it is used or exposed
+    if (PyType_Ready(&simd__vector_type)) {
+        return -1;
+    }
+    // PyModule_AddObject steals this reference on success only
+    Py_INCREF(&simd__vector_type);
+    if (PyModule_AddObject(
+        module, "vector_type",(PyObject *)&simd__vector_type
+    )) {
+        Py_DECREF(&simd__vector_type); // don't leak the reference on failure
+        return -1;
+    }
+    return 0;
+}
diff --git a/numpy/distutils/command/build.py b/numpy/distutils/command/build.py
index 60ba4c917..6025586cd 100644
--- a/numpy/distutils/command/build.py
+++ b/numpy/distutils/command/build.py
@@ -22,6 +22,8 @@ class build(old_build):
"specify a list of dispatched CPU optimizations"),
('disable-optimization', None,
"disable CPU optimized code(dispatch,simd,fast...)"),
+ ('simd-test=', None,
+ "specify a list of CPU optimizations to be tested against NumPy SIMD interface"),
]
help_options = old_build.help_options + [
@@ -36,6 +38,16 @@ class build(old_build):
self.cpu_baseline = "min"
self.cpu_dispatch = "max -xop -fma4" # drop AMD legacy features by default
self.disable_optimization = False
+ """
+    the '_simd' module is very large. Adding more dispatched features
+    will increase binary size and compile time. By default we minimize
+    the targeted features to those most commonly used by the NumPy SIMD interface (NPYV).
+ NOTE: any specified features will be ignored if they're:
+ - part of the baseline(--cpu-baseline)
+ - not part of dispatch-able features(--cpu-dispatch)
+ - not supported by compiler or platform
+ """
+ self.simd_test = "BASELINE SSE2 SSE41 SSE42 XOP (FMA3 AVX2) AVX512F AVX512_SKX VSX VSX2 VSX3 NEON ASIMD"
def finalize_options(self):
build_scripts = self.build_scripts
diff --git a/numpy/distutils/command/build_ext.py b/numpy/distutils/command/build_ext.py
index 1a881c56a..ca6f8bcd2 100644
--- a/numpy/distutils/command/build_ext.py
+++ b/numpy/distutils/command/build_ext.py
@@ -19,8 +19,7 @@ from numpy.distutils.misc_util import (
has_cxx_sources, has_f_sources, is_sequence
)
from numpy.distutils.command.config_compiler import show_fortran_compilers
-from numpy.distutils.ccompiler_opt import new_ccompiler_opt
-
+from numpy.distutils.ccompiler_opt import new_ccompiler_opt, CCompilerOpt
class build_ext (old_build_ext):
@@ -39,6 +38,8 @@ class build_ext (old_build_ext):
"specify a list of dispatched CPU optimizations"),
('disable-optimization', None,
"disable CPU optimized code(dispatch,simd,fast...)"),
+ ('simd-test=', None,
+ "specify a list of CPU optimizations to be tested against NumPy SIMD interface"),
]
help_options = old_build_ext.help_options + [
@@ -56,6 +57,7 @@ class build_ext (old_build_ext):
self.cpu_baseline = None
self.cpu_dispatch = None
self.disable_optimization = None
+ self.simd_test = None
def finalize_options(self):
if self.parallel:
@@ -87,7 +89,9 @@ class build_ext (old_build_ext):
('cpu_baseline', 'cpu_baseline'),
('cpu_dispatch', 'cpu_dispatch'),
('disable_optimization', 'disable_optimization'),
+ ('simd_test', 'simd_test')
)
+ CCompilerOpt.conf_target_groups["simd_test"] = self.simd_test
def run(self):
if not self.extensions: