diff options
| author | Sayed Adel <seiko@imavr.com> | 2020-07-08 09:27:13 +0200 |
|---|---|---|
| committer | Sayed Adel <seiko@imavr.com> | 2020-10-27 11:46:58 +0000 |
| commit | cb3efe8e03b53dbab457a99be1a48384312abe16 (patch) | |
| tree | 519f83bd1bda84f52fba88516561dd79e0f36826 /numpy | |
| parent | fcba5a6c901717110b9767b418df410d7c8c6e73 (diff) | |
| download | numpy-cb3efe8e03b53dbab457a99be1a48384312abe16.tar.gz | |
ENH: Expose the NumPy C SIMD vectorization interface "NPYV" to Python
'_simd' is a new module that brings the NumPy C SIMD vectorization interface "NPYV" to Python.
The module is designed to be extremely flexible so that it can accommodate any kind of
intrinsics, and to generate a Python interface that closely mirrors the C interface.
The main purpose of this module is to test NPYV intrinsics in Python,
but it can still be used as an effective solution for designing SIMD kernels.
Also add a new command-line argument `--simd-test` to control the targeted CPU features
for the `_simd` module.
Co-authored-by: Matti Picus <matti.picus@gmail.com>
Co-authored-by: Eric Wieser <wieser.eric@gmail.com>
Diffstat (limited to 'numpy')
| -rw-r--r-- | numpy/core/setup.py | 23 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd.c | 63 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src | 351 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd.h | 30 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd_inc.h.src | 395 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd_inc_arg.h | 108 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd_inc_convert.h | 206 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd_inc_data.h.src | 91 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd_inc_easyintrin.h | 229 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd_inc_vector.h | 187 | ||||
| -rw-r--r-- | numpy/distutils/command/build.py | 12 | ||||
| -rw-r--r-- | numpy/distutils/command/build_ext.py | 8 |
12 files changed, 1701 insertions, 2 deletions
diff --git a/numpy/core/setup.py b/numpy/core/setup.py index b3e17baed..e9a9a4e46 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -626,6 +626,7 @@ def configuration(parent_package='',top_path=None): config.add_include_dirs(join('src', 'multiarray')) config.add_include_dirs(join('src', 'umath')) config.add_include_dirs(join('src', 'npysort')) + config.add_include_dirs(join('src', '_simd')) config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process config.add_define_macros([("HAVE_NPY_CONFIG_H", "1")]) @@ -974,6 +975,28 @@ def configuration(parent_package='',top_path=None): config.add_extension('_operand_flag_tests', sources=[join('src', 'umath', '_operand_flag_tests.c.src')]) + ####################################################################### + # SIMD module # + ####################################################################### + + config.add_extension('_simd', sources=[ + join('src', 'common', 'npy_cpu_features.c.src'), + join('src', '_simd', '_simd.c'), + join('src', '_simd', '_simd_inc.h.src'), + join('src', '_simd', '_simd_inc_data.h.src'), + join('src', '_simd', '_simd.dispatch.c.src'), + ], depends=[ + join('src', 'common', 'npy_cpu_dispatch.h'), + join('src', 'common', 'simd', 'simd.h'), + join('src', '_simd', '_simd.h'), + join('src', '_simd', '_simd_inc.h.src'), + join('src', '_simd', '_simd_inc_data.h.src'), + join('src', '_simd', '_simd_inc_arg.h'), + join('src', '_simd', '_simd_inc_convert.h'), + join('src', '_simd', '_simd_inc_easyintrin.h'), + join('src', '_simd', '_simd_inc_vector.h'), + ]) + config.add_subpackage('tests') config.add_data_dir('tests/data') config.add_data_dir('tests/examples') diff --git a/numpy/core/src/_simd/_simd.c b/numpy/core/src/_simd/_simd.c new file mode 100644 index 000000000..e5cb582b3 --- /dev/null +++ b/numpy/core/src/_simd/_simd.c @@ -0,0 +1,63 @@ +#include "_simd.h" + +PyMODINIT_FUNC PyInit__simd(void) +{ + static struct PyModuleDef defs = { + 
.m_base = PyModuleDef_HEAD_INIT, + .m_name = "_simd", + .m_size = -1 + }; + if (npy_cpu_init() < 0) { + return NULL; + } + PyObject *m = PyModule_Create(&defs); + if (m == NULL) { + return NULL; + } + PyObject *targets = PyDict_New(); + if (targets == NULL) { + goto err; + } + if (PyModule_AddObject(m, "targets", targets) < 0) { + Py_DECREF(targets); + goto err; + } + // add keys for non-supported optimizations with None value + #define ATTACH_MODULE(TESTED_FEATURES, TARGET_NAME, MAKE_MSVC_HAPPY) \ + { \ + PyObject *simd_mod; \ + if (!TESTED_FEATURES) { \ + Py_INCREF(Py_None); \ + simd_mod = Py_None; \ + } else { \ + simd_mod = NPY_CAT(simd_create_module_, TARGET_NAME)(); \ + if (simd_mod == NULL) { \ + goto err; \ + } \ + } \ + const char *target_name = NPY_TOSTRING(TARGET_NAME); \ + if (PyDict_SetItemString(targets, target_name, simd_mod) < 0) { \ + Py_DECREF(simd_mod); \ + goto err; \ + } \ + } + + #define ATTACH_BASELINE_MODULE(MAKE_MSVC_HAPPY) \ + { \ + PyObject *simd_mod = simd_create_module(); \ + if (simd_mod == NULL) { \ + goto err; \ + } \ + if (PyDict_SetItemString(targets, "baseline", simd_mod) < 0) { \ + Py_DECREF(simd_mod); \ + goto err; \ + } \ + } + + NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY) + NPY__CPU_DISPATCH_BASELINE_CALL(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY) + return m; +err: + Py_DECREF(m); + return NULL; +} diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src new file mode 100644 index 000000000..69a9ffd45 --- /dev/null +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -0,0 +1,351 @@ +/*@targets $werror #simd_test*/ +#include "_simd.h" +#include "_simd_inc.h" + +#if NPY_SIMD +#include "_simd_inc_data.h" +#include "_simd_inc_convert.h" +#include "_simd_inc_vector.h" +#include "_simd_inc_arg.h" +#include "_simd_inc_easyintrin.h" + +/************************************************************************* + * Defining NPYV intrinsics as module functions + 
*************************************************************************/ +/**begin repeat + * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64# + * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0# + * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1# + * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# + * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# + */ +#if @simd_sup@ +/*************************** + * Memory + ***************************/ +/**begin repeat1 + * # intrin = load, loada, loads, loadl# + */ +SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, q@sfx@) +/**end repeat1**/ +/**begin repeat1 + * # intrin = store, storea, stores, storel, storeh# + */ +// special definition due to the nature of @intrin@ +static PyObject * +simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) +{ + simd_arg req_args[] = { + {.dtype = simd_data_q@sfx@}, + {.dtype = simd_data_v@sfx@}, + }; + if (simd_args_from_tuple(args, req_args, 2, "@intrin@_@sfx@")) { + return NULL; + } + npyv_@intrin@_@sfx@( + req_args[0].data.q@sfx@, req_args[1].data.v@sfx@ + ); + // write-back + if (simd_sequence_fill_obj(req_args[0].obj, req_args[0].data.q@sfx@, simd_data_q@sfx@)) { + simd_args_sequence_free(req_args, 2); + return NULL; + } + simd_args_sequence_free(req_args, 2); + Py_RETURN_NONE; +} +/**end repeat1**/ + +/*************************** + * Misc + ***************************/ +SIMD_IMPL_INTRIN_0(zero_@sfx@, v@sfx@) +SIMD_IMPL_INTRIN_1(setall_@sfx@, v@sfx@, @sfx@) +SIMD_IMPL_INTRIN_3(select_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@) + +/**begin repeat1 + * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + */ +#if @simd_sup2@ +SIMD_IMPL_INTRIN_1(reinterpret_@sfx_to@_@sfx@, v@sfx_to@, v@sfx@) +#endif // simd_sup2 +/**end repeat1**/ + +/** + * special definition due to 
the nature of intrinsics + * npyv_setf_@sfx@ and npy_set_@sfx@. +*/ +/**begin repeat1 + * #intrin = setf, set# + */ +static PyObject * +simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) +{ + npyv_lanetype_@sfx@ *data = simd_sequence_from_obj(args, simd_data_q@sfx@, npyv_nlanes_@sfx@); + if (data == NULL) { + return NULL; + } + simd_data r = {.v@sfx@ = npyv_@intrin@_@sfx@( + data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], + data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15], + data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23], + data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31], + data[32], data[33], data[34], data[35], data[36], data[37], data[38], data[39], + data[40], data[41], data[42], data[43], data[44], data[45], data[46], data[47], + data[48], data[49], data[50], data[51], data[52], data[53], data[54], data[55], + data[56], data[57], data[58], data[59], data[60], data[61], data[62], data[63], + data[64] // for setf + )}; + simd_sequence_free(data); + return (PyObject*)simd_vector_to_obj(r, simd_data_v@sfx@); +} +/**end repeat1**/ + +/*************************** + * Reorder + ***************************/ +/**begin repeat1 + * # intrin = combinel, combineh# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) +/**end repeat1**/ + +/**begin repeat1 + * # intrin = combine, zip# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@) +/**end repeat1**/ + +/*************************** + * Operators + ***************************/ +#if @shl_imm@ > 0 +SIMD_IMPL_INTRIN_2(shl_@sfx@, v@sfx@, v@sfx@, u8) +SIMD_IMPL_INTRIN_2(shr_@sfx@, v@sfx@, v@sfx@, u8) +// immediate constant +SIMD_IMPL_INTRIN_2IMM(shli_@sfx@, v@sfx@, v@sfx@, @shl_imm@) +SIMD_IMPL_INTRIN_2IMM(shri_@sfx@, v@sfx@, v@sfx@, @shr_imm@) +#endif // shl_imm + +/**begin repeat1 + * #intrin = and, or, xor# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, 
v@sfx@) +/**end repeat1**/ + +SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@) + +/**begin repeat1 + * #intrin = cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@) +/**end repeat1**/ + +/*************************** + * Conversion + ***************************/ +SIMD_IMPL_INTRIN_1(cvt_@sfx@_@bsfx@, v@sfx@, v@bsfx@) +SIMD_IMPL_INTRIN_1(cvt_@bsfx@_@sfx@, v@bsfx@, v@sfx@) + +/*************************** + * Arithmetic + ***************************/ +/**begin repeat1 + * #intrin = add, sub# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) +/**end repeat1**/ + +#if @sat_sup@ +/**begin repeat1 + * #intrin = adds, subs# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) +/**end repeat1**/ +#endif // sat_sup + +#if @mul_sup@ +SIMD_IMPL_INTRIN_2(mul_@sfx@, v@sfx@, v@sfx@, v@sfx@) +#endif // mul_sup + +#if @div_sup@ +SIMD_IMPL_INTRIN_2(div_@sfx@, v@sfx@, v@sfx@, v@sfx@) +#endif // div_sup + +#endif // simd_sup +/**end repeat**/ + +/*************************** + * Variant + ***************************/ +SIMD_IMPL_INTRIN_0N(cleanup) + +/************************************************************************* + * Attach module functions + *************************************************************************/ +static PyMethodDef simd__intrinsics_methods[] = { +/**begin repeat + * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64# + * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0# + * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1# + * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# + * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# + */ +#if @simd_sup@ + +/*************************** + * Memory + ***************************/ +/**begin repeat1 + * # intrin = load, loada, loads, loadl, store, storea, stores, storel, storeh# + */ 
+SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ + +/*************************** + * Misc + ***************************/ +/**begin repeat1 + * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + */ +#if @simd_sup2@ +SIMD_INTRIN_DEF(reinterpret_@sfx_to@_@sfx@) +#endif // simd_sup2 +/**end repeat1**/ + +/**begin repeat1 + * # intrin = set, setf, setall, zero, select# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ + +/*************************** + * Reorder + ***************************/ +/**begin repeat1 + * # intrin = combinel, combineh, combine, zip# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ + +SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@) +SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@) + +/*************************** + * Operators + ***************************/ +#if @shl_imm@ > 0 +/**begin repeat1 + * # intrin = shl, shr, shli, shri# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ +#endif // shl_imm + +/**begin repeat1 + * #intrin = and, or, xor, not, cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ + +/*************************** + * Conversion + ***************************/ +SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@) +SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@) + +/*************************** + * Arithmetic + ***************************/ +/**begin repeat1 + * #intrin = add, sub# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ + +#if @sat_sup@ +/**begin repeat1 + * #intrin = adds, subs# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ +#endif // sat_sup + +#if @mul_sup@ +SIMD_INTRIN_DEF(mul_@sfx@) +#endif // mul_sup + +#if @div_sup@ +SIMD_INTRIN_DEF(div_@sfx@) +#endif // div_sup + +#endif // simd_sup +/**end repeat**/ + +/*************************** + * Variant + ***************************/ +SIMD_INTRIN_DEF(cleanup) +/***************************/ +{NULL, NULL, 0, NULL} +}; // PyMethodDef + +#endif // NPY_SIMD + 
+/************************************************************************* + * Defining a separate module for each target + *************************************************************************/ +NPY_VISIBILITY_HIDDEN PyObject * +NPY_CPU_DISPATCH_CURFX(simd_create_module)(void) +{ + static struct PyModuleDef defs = { + .m_base = PyModuleDef_HEAD_INIT, + .m_size = -1, + #ifdef NPY__CPU_TARGET_CURRENT + .m_name = "NPYV_" NPY_TOSTRING(NPY__CPU_TARGET_CURRENT), + #else + .m_name = "NPYV_BASELINE", + #endif + #if NPY_SIMD + .m_methods = simd__intrinsics_methods + #else + .m_methods = NULL + #endif + }; + PyObject *m = PyModule_Create(&defs); + if (m == NULL) { + return NULL; + } + if (PyModule_AddIntConstant(m, "simd", NPY_SIMD)) { + goto err; + } + if (PyModule_AddIntConstant(m, "simd_f64", NPY_SIMD_F64)) { + goto err; + } + if (PyModule_AddIntConstant(m, "simd_width", NPY_SIMD_WIDTH)) { + goto err; + } +#if NPY_SIMD + if (simd_vector_register(m)) { + goto err; + } + /**begin repeat + * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + */ + if (PyModule_AddIntConstant(m, "nlanes_@sfx@", npyv_nlanes_@sfx@)) { + goto err; + } + /**end repeat**/ +#endif // NPY_SIMD + return m; +err: + Py_DECREF(m); + return NULL; +} diff --git a/numpy/core/src/_simd/_simd.h b/numpy/core/src/_simd/_simd.h new file mode 100644 index 000000000..d9905c801 --- /dev/null +++ b/numpy/core/src/_simd/_simd.h @@ -0,0 +1,30 @@ +/** + * A module to expose the NumPy C SIMD vectorization interface "NPYV" for testing purposes. + * + * Please keep this module independent from other c-extension modules, + * since NPYV intrinsics may be involved in their functionality, + * which increases the degree of complexity in tracking and detecting errors. + * + * TODO: Add an independent sphinx doc. + * + * Please add any new NPYV intrinsics in '_simd.dispatch.c.src'. 
+ */ +#ifndef _SIMD_SIMD_H_ +#define _SIMD_SIMD_H_ + +#include <Python.h> +#include "numpy/npy_common.h" + +#ifndef NPY_DISABLE_OPTIMIZATION +// autogenerated, required for CPU dispatch macros +#include "_simd.dispatch.h" +#endif +/** + * Create a new module for each required optimization which contains all NPYV intrinsics, + * + * If required optimization is not supported by NPYV, the module will still provides + * access to NPYV constants NPY_SIMD, NPY_SIMD_F64, and NPY_SIMD_WIDTH but without + * any intrinsics. + */ +NPY_CPU_DISPATCH_DECLARE(NPY_VISIBILITY_HIDDEN PyObject *simd_create_module, (void)) +#endif // _SIMD_SIMD_H_ diff --git a/numpy/core/src/_simd/_simd_inc.h.src b/numpy/core/src/_simd/_simd_inc.h.src new file mode 100644 index 000000000..4261ce148 --- /dev/null +++ b/numpy/core/src/_simd/_simd_inc.h.src @@ -0,0 +1,395 @@ +/** + * This header works only through '_simd.dispatch.c' + */ +#include <Python.h> +#include "simd/simd.h" + +#if NPY_SIMD +/************************************ + ** Types + ************************************/ +/** + * Gather all data types supported by the module. 
+*/ +typedef union +{ + // scalars + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + npyv_lanetype_@sfx@ @sfx@; + /**end repeat**/ + // sequence + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + npyv_lanetype_@sfx@ *q@sfx@; + /**end repeat**/ + // vectors + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, b8, b16, b32, b64# + */ + npyv_@sfx@ v@sfx@; + /**end repeat**/ + // multi-vectors x2 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32# + */ + npyv_@sfx@x2 v@sfx@x2; + /**end repeat**/ + // multi-vectors x3 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32# + */ + npyv_@sfx@x3 v@sfx@x3; + /**end repeat**/ +#if NPY_SIMD_F64 + npyv_f64 vf64; + npyv_f64x2 vf64x2; + npyv_f64x3 vf64x3; +#endif +} simd_data; + +/** + * Data types IDs and suffixes. Must be same data types as the ones + * in union 'simd_data' to fit the macros in '_simd_inc_easyintrin.h'. +*/ +typedef enum +{ + simd_data_none = 0, + // scalars + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + simd_data_@sfx@, + /**end repeat**/ + // sequences + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + simd_data_q@sfx@, + /**end repeat**/ + // vectors + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64, b8, b16, b32, b64# + */ + simd_data_v@sfx@, + /**end repeat**/ + // multi-vectors x2 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + simd_data_v@sfx@x2, + /**end repeat**/ + // multi-vectors x3 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + simd_data_v@sfx@x3, + /**end repeat**/ + simd_data_end, +} simd_data_type; +/************************************ + ** Declarations (inc_data) + ************************************/ +/** + * simd_data_type information + */ +typedef struct +{ + // type name compatible with python style + 
const char *pyname; + // returns '1' if the type represent a unsigned integer + int is_unsigned:1; + // returns '1' if the type represent a signed integer + int is_signed:1; + // returns '1' if the type represent a single or double precision + int is_float:1; + // returns '1' if the type represent a boolean + int is_bool:1; + // returns '1' if the type represent a sequence + int is_sequence:1; + // returns '1' if the type represent a scalar + int is_scalar:1; + // returns '1' if the type represent a vector + int is_vector:1; + // returns the len of multi-vector if the type reprsent x2 or x3 vector + // otherwise returns 0, e.g. returns 2 if data type is simd_data_vu8x2 + int is_vectorx; + // returns the equivalent scalar data type e.g. simd_data_vu8 -> simd_data_u8 + simd_data_type to_scalar; + // returns the equivalent scalar data type e.g. simd_data_s8 -> simd_data_vs8 + // NOTE: returns the will equivalent "unsigned" vector type in case of "boolean" vector + // e.g. simd_data_vb8 -> simd_data_vu8 + simd_data_type to_vector; + // number of vector lanes + int nlanes; + // sizeof lane type + int lane_size; +} simd_data_info; + +/** + * Returns data info of certain dtype. + * + * Example: + ** const simd_data_info *info = simd_data_getinfo(simd_data_vu8); + ** if (info->is_vector && info->is_unsigned) { + ** ... + ** } + */ +static const simd_data_info * +simd_data_getinfo(simd_data_type dtype); + +/************************************ + ** Declarations (inc_vector) + ************************************/ +typedef struct +{ + PyObject_HEAD + // vector type id + simd_data_type type; + // vector data, aligned for safe casting + npyv_lanetype_u8 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) data[NPY_SIMD_WIDTH]; +} simd_vector; +/** + * convert simd_data to PyObject(simd_vector), + * raise Python exception on failure and returns NULL. 
+ */ +static simd_vector * +simd_vector_to_obj(simd_data data, simd_data_type vtype); +/** + * convert PyObject(simd_vector) to simd_data, + * raise Python exception on failure. + */ +static simd_data +simd_vector_from_obj(simd_vector *vec, simd_data_type vtype); +/** + * initialize and register vector type(PyTypeObject) to PyModule, + * vector type can be reached through attribute 'vector_type'. + * return -1 on error, 0 on success. + */ +static int +simd_vector_register(PyObject *module); + +/************************************ + ** Declarations (inc_convert) + ************************************/ +/** + * Return a C scalar(simd_data) representation of `obj` and + * according to the scalar data type `dtype` on range (simd_data_[u8:f64]). + * Raise a Python exception on failure. + * + * Example: + ** simd_data data = simd_scalar_from_obj(obj, simd_data_f32); + ** if (!PyErr_Occurred()) { + ** printf("I have a valid float %d\n", data.f32); + ** } + */ +static simd_data +simd_scalar_from_obj(PyObject *obj, simd_data_type dtype); +/** + * Create a Python scalar from a C scalar based on the contents + * of `data`(simd_data) and according to the scalar data type `dtype` + * on range(simd_data_[u8:f64]). + * Return NULL and a Python exception on failure, otherwise new reference. + * + * Example: + ** simd_data data = {.u32 = 0x7fffffff}; + ** PyObject *obj = simd_scalar_to_obj(data, simd_data_s32); + ** if (obj != NULL) { + ** printf("I have a valid Python integer %d\n", PyLong_AsLong(obj)); + ** Py_DECREF(obj); + ** } + */ +static PyObject * +simd_scalar_to_obj(simd_data data, simd_data_type dtype); +/** + * Allocate a C array in memory according to number of elements `len` + * and sequence data type `dtype` on range(simd_data_[qu8:qf64]). + * + * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL + * with a Python exception on failure. 
+ * + * Example: + ** npyv_lanetype_f64 *aligned_ptr = simd_sequence_new(npyv_nlanes_f64, simd_data_f64); + ** if (aligned_ptr != NULL) { + ** // aligned store + ** npyv_storea_f64(aligned_ptr, npyv_setall_f64(1.0)); + ** printf("The first element of my array %f\n", aligned_ptr[0]); + ** simd_sequence_free(aligned_ptr); + ** } + */ +static void * +simd_sequence_new(Py_ssize_t len, simd_data_type dtype); +/** + * Return the number of elements of the allocated C array `ptr` + * by `simd_sequence_new()` or `simd_sequence_from_obj()`. + */ +static size_t +simd_sequence_len(const void *ptr); +/** + * Free the allocated C array by `simd_sequence_new()` or + * `simd_sequence_from_obj()`. + */ +static void +simd_sequence_free(void *ptr); +/** + * Return a C array representation of a PyObject sequence `obj` and + * according to the sequence data type `dtype` on range (simd_data_[qu8:qf64]). + * + * Note: parameter `min_size` takes the number of minimum acceptable elements. + * + * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL + * with a Python exception on failure. + * + * Example: + ** npyv_lanetype_u32 *ptr = simd_sequence_from_obj(seq_obj, simd_data_qu32, npyv_nlanes_u32); + ** if (ptr != NULL) { + ** npyv_u32 a = npyv_load_u32(ptr); + ** ... + ** simd_sequence_free(ptr); + ** } + ** + */ +static void * +simd_sequence_from_obj(PyObject *obj, simd_data_type dtype, unsigned min_size); +/** + * Fill a Python sequence object `obj` with a C array `ptr` allocated by + * `simd_sequence_new()` or `simd_sequence_from_obj()` according to + * to the sequence data type `dtype` on range (simd_data_[qu8:qf64]). + * + * Return 0 on success or -1 with a Python exception on failure. + */ +static int +simd_sequence_fill_obj(PyObject *obj, const void *ptr, simd_data_type dtype); +/** + * Create a Python list from a C array `ptr` allocated by + * `simd_sequence_new()` or `simd_sequence_from_obj()` according to + * to the sequence data type `dtype` on range (simd_data_[qu8:qf64]). 
+ * + * Return NULL and a Python exception on failure, otherwise new reference. + */ +static PyObject * +simd_sequence_to_obj(const void *ptr, simd_data_type dtype); +/** + * Return a SIMD multi-vector(simd_data) representation of Python tuple of + * (simd_vector*,) `obj` according to the scalar data type `dtype` + * on range (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]). + * + * Raise a Python exception on failure. + * + * Example: + ** simd_data data = simd_vectorx_from_obj(tuple_obj, simd_data_vf32x2); + ** if (!PyErr_Occurred()) { + ** npyv_f32 sum = npyv_add_f32(data.vf32x2.val[0], data.vf32x2.val[1]); + ** ... + ** } + ** + */ +static simd_data +simd_vectorx_from_obj(PyObject *obj, simd_data_type dtype); +/** + * Create a Python tuple of 'simd_vector' from a SIMD multi-vector + * based on the contents of `data`(simd_data) and according to + * the multi-vector data type `dtype` on range + * (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]). + * + * Return NULL and a Python exception on failure, otherwise new reference. + */ +static PyObject * +simd_vectorx_to_obj(simd_data data, simd_data_type dtype); + +/************************************ + ** Declarations (inc_arg) + ************************************/ +typedef struct +{ + simd_data_type dtype; + simd_data data; + // set by simd_args_from_tuple() + PyObject *obj; +} simd_arg; +/** + * The following functions gather all conversions between all data types + * and they can used instead of all above functions. + */ +/** + * Convert a Python object `obj` into simd_data `arg->data` according to the + * required data type `arg->dtype`. + * + * Return -1 and raise Python exception on failure, otherwise return 0. + * + * Notes: + * - requires `simd_args_sequence_free()` or `simd_sequence_free()` + * to free allocated C array, in case of sequence data types. + * - the number of minimum acceptable elements for sequence data + * types is the number of lanes of the equivalent vector data type. 
+ * + * Example #1: + ** simd_arg arg = {.dtype = simd_data_qu8}; + ** if (simd_arg_from_obj(seq_obj, &arg) < 0) { + ** // fails to convert a python sequence object to C array of uint8 + ** return; + ** } + ** npyv_u8 v_u8 = npyv_load_u8(arg->data.qu8); + ** ... + ** simd_args_sequence_free(&arg, 1); + * + * Example #2: + ** simd_arg arg = {.dtype = simd_data_vf32}; + ** if (simd_arg_from_obj(vector_obj, &arg) < 0) { + ** // fails to convert a python simd_vector to NPYV vector + ** return; + ** } + ** npyv_f32 add_one = npyv_add_f32(arg->data.vu8, npyv_setall_f32(1)); + ** ... + */ +static int +simd_arg_from_obj(PyObject *obj, simd_arg *arg); +/** + * Convert a simd_data `arg->data` to into a Python object according to the + * required data type `arg->dtype`. + * + * Return NULL and raise Python exception on failure, otherwise return + * new reference. + * + * Example: + ** simd_arg arg = {.dtype = simd_data_u32, .data = {.u32 = 0xffffffff}}; + ** PyObject *obj = simd_arg_to_obj(&arg); + ** if (obj == NULL) { + ** // fails convert C uint32 to Python integer. + ** return; + ** } + ** + */ +static PyObject * +simd_arg_to_obj(const simd_arg *arg); +/** + * Similar to simd_arg_from_obj() but convert multiple objects + * from a Python tuple. + * This function is used to parse method parameters. + * + * Notes: + * - requires `simd_args_sequence_free()` or `simd_sequence_free()` + * to free allocated C array, in case of sequence data types. + * - the number of minimum acceptable elements for sequence data + * types is the number of lanes of the equivalent vector data type. + * - use 'arg->obj' to retrieve the parameter obj. + * + * Parameters: + * - `tuple_obj`: a valid Python tuple + * - `args`: array of 'simd_arg' contain valid data types + * - `args_len`: length of `args`. + * - `method_name`: method name, required for exception message. + * + * Return -1 and raise Python exception on failure, otherwise return 0. 
+ */ +static int +simd_args_from_tuple(PyObject *tuple_obj, simd_arg *args, int args_len, const char *method_name); +/** + * Free the allocated C array for sequence data types. + */ +static void +simd_args_sequence_free(simd_arg *args, int args_len); + +#endif // NPY_SIMD diff --git a/numpy/core/src/_simd/_simd_inc_arg.h b/numpy/core/src/_simd/_simd_inc_arg.h new file mode 100644 index 000000000..1db1ff202 --- /dev/null +++ b/numpy/core/src/_simd/_simd_inc_arg.h @@ -0,0 +1,108 @@ +#if !NPY_SIMD + #error "Not a standalone header, only works through 'simd.dispatch.c.src'" +#endif + +/************************************ + ** Protected Definitions + ************************************/ +static int +simd_arg_from_obj(PyObject *obj, simd_arg *arg) +{ + assert(arg->dtype != 0); + const simd_data_info *info = simd_data_getinfo(arg->dtype); + if (info->is_scalar) { + arg->data = simd_scalar_from_obj(obj, arg->dtype); + } + else if (info->is_sequence) { + unsigned min_seq_size = simd_data_getinfo(info->to_vector)->nlanes; + arg->data.qu8 = simd_sequence_from_obj(obj, arg->dtype, min_seq_size); + } + else if (info->is_vectorx) { + arg->data = simd_vectorx_from_obj(obj, arg->dtype); + } + else if (info->is_vector) { + arg->data = simd_vector_from_obj((simd_vector*)obj, arg->dtype); + } else { + arg->data.u64 = 0; + PyErr_Format(PyExc_RuntimeError, + "unhandled arg from obj type id:%d, name:%s", arg->dtype, info->pyname + ); + return -1; + } + if (PyErr_Occurred()) { + return -1; + } + return 0; +} + +static PyObject * +simd_arg_to_obj(const simd_arg *arg) +{ + assert(arg->dtype != 0); + const simd_data_info *info = simd_data_getinfo(arg->dtype); + if (info->is_scalar) { + return simd_scalar_to_obj(arg->data, arg->dtype); + } + if (info->is_sequence) { + return simd_sequence_to_obj(arg->data.qu8, arg->dtype); + } + if (info->is_vectorx) { + return simd_vectorx_to_obj(arg->data, arg->dtype); + } + if (info->is_vector) { + return (PyObject*)simd_vector_to_obj(arg->data, 
arg->dtype); + } + PyErr_Format(PyExc_RuntimeError, + "unhandled arg to object type id:%d, name:%s", arg->dtype, info->pyname + ); + return NULL; +} + +static void +simd_args_sequence_free(simd_arg *args, int args_len) +{ + assert(args_len > 0); + while (--args_len >= 0) { + simd_arg *arg = &args[args_len]; + const simd_data_info *info = simd_data_getinfo(arg->dtype); + if (!info->is_sequence) { + continue; + } + simd_sequence_free(arg->data.qu8); + } +} + +static int +simd_args_from_tuple(PyObject *tuple_obj, simd_arg *args, int args_len, const char *method_name) +{ + assert(args_len > 0); + assert(PyTuple_Check(tuple_obj)); + + Py_ssize_t obj_arg_len = PyTuple_GET_SIZE(tuple_obj); + if (obj_arg_len != args_len) { + if (args_len == 1) { + PyErr_Format(PyExc_TypeError, + "%s() takes only one argument (%d given)", method_name, obj_arg_len + ); + return -1; + } + PyErr_Format(PyExc_TypeError, + "%s() takes exactly %d arguments (%d given)", method_name, args_len, obj_arg_len + ); + return -1; + } + for (int arg_pos = 0; arg_pos < args_len; ++arg_pos) { + simd_arg *arg = &args[arg_pos]; + arg->obj = PyTuple_GET_ITEM(tuple_obj, arg_pos); + assert(arg->obj != NULL); + if (simd_arg_from_obj(arg->obj, arg) != 0) { + // free previous args + if (arg_pos > 0) { + simd_args_sequence_free(args, arg_pos); + } + // TODO: improve log by add argument number and method name + return -1; + } + } + return 0; +} diff --git a/numpy/core/src/_simd/_simd_inc_convert.h b/numpy/core/src/_simd/_simd_inc_convert.h new file mode 100644 index 000000000..360101247 --- /dev/null +++ b/numpy/core/src/_simd/_simd_inc_convert.h @@ -0,0 +1,206 @@ +#if !NPY_SIMD + #error "Not a standalone header, only works through 'simd.dispatch.c.src'" +#endif + +/************************************ + ** Protected Definitions + ************************************/ +static simd_data +simd_scalar_from_obj(PyObject *obj, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + 
assert(info->is_scalar && info->lane_size > 0); + simd_data data; + if (info->is_float) { + data.f64 = PyFloat_AsDouble(obj); + if (dtype == simd_data_f32){ + data.f32 = (float)data.f64; + } + } else { + data.u64 = PyLong_AsUnsignedLongLongMask(obj); + } + return data; +} + +static PyObject * +simd_scalar_to_obj(simd_data data, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + assert(info->is_scalar && info->lane_size > 0); + if (info->is_float) { + if (dtype == simd_data_f32) { + return PyFloat_FromDouble(data.f32); + } + return PyFloat_FromDouble(data.f64); + } + int leftb = (sizeof(npyv_lanetype_u64) - info->lane_size) * 8; + data.u64 <<= leftb; + if (info->is_signed) { + return PyLong_FromLongLong(data.s64 >> leftb); + } + return PyLong_FromUnsignedLongLong(data.u64 >> leftb); +} + +static void * +simd_sequence_new(Py_ssize_t len, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + assert(info->is_sequence && info->lane_size > 0); + + size_t size = NPY_SIMD_WIDTH + sizeof(size_t) + sizeof(size_t*); + size += len * info->lane_size; + + size_t *ptr = malloc(size); + if (ptr == NULL) { + return PyErr_NoMemory(); + } + *(ptr++) = len; + size_t **a_ptr = (size_t**)( + ((size_t)ptr + NPY_SIMD_WIDTH) & ~(size_t)(NPY_SIMD_WIDTH-1) + ); + a_ptr[-1] = ptr; + return a_ptr; +} + +static size_t +simd_sequence_len(const void *ptr) +{ + size_t *ptrz = ((size_t**)ptr)[-1]; + return *(ptrz-1); +} + +static void +simd_sequence_free(void *ptr) +{ + size_t *ptrz = ((size_t**)ptr)[-1]; + free(ptrz-1); +} + +static void * +simd_sequence_from_obj(PyObject *obj, simd_data_type dtype, unsigned min_size) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + assert(info->is_sequence && info->lane_size > 0); + PyObject *seq_obj = PySequence_Fast(obj, "expected a sequence"); + if (seq_obj == NULL) { + return NULL; + } + Py_ssize_t seq_size = PySequence_Fast_GET_SIZE(seq_obj); + if (seq_size < (Py_ssize_t)min_size) { 
+ PyErr_Format(PyExc_ValueError, + "minimum acceptable size of the required sequence is %d, given(%d)", + min_size, seq_size + ); + return NULL; + } + npyv_lanetype_u8 *dst = simd_sequence_new(seq_size, dtype); + if (dst == NULL) { + return NULL; + } + PyObject **seq_items = PySequence_Fast_ITEMS(seq_obj); + for (Py_ssize_t i = 0; i < seq_size; ++i) { + simd_data data = simd_scalar_from_obj(seq_items[i], info->to_scalar); + npyv_lanetype_u8 *sdst = dst + i * info->lane_size; + memcpy(sdst, &data.u64, info->lane_size); + } + Py_DECREF(seq_obj); + + if (PyErr_Occurred()) { + simd_sequence_free(dst); + return NULL; + } + return dst; +} + +static int +simd_sequence_fill_obj(PyObject *obj, const void *ptr, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + if (!PySequence_Check(obj)) { + PyErr_Format(PyExc_TypeError, + "a sequence object is required to fill %s", info->pyname + ); + return -1; + } + const npyv_lanetype_u8 *src = ptr; + Py_ssize_t seq_len = (Py_ssize_t)simd_sequence_len(ptr); + for (Py_ssize_t i = 0; i < seq_len; ++i) { + const npyv_lanetype_u8 *ssrc = src + i * info->lane_size; + simd_data data; + memcpy(&data.u64, ssrc, info->lane_size); + PyObject *item = simd_scalar_to_obj(data, info->to_scalar); + if (item == NULL) { + return -1; + } + if (PySequence_SetItem(obj, i, item) < 0) { + Py_DECREF(item); + return -1; + } + } + return 0; +} + +static PyObject * +simd_sequence_to_obj(const void *ptr, simd_data_type dtype) +{ + PyObject *list = PyList_New((Py_ssize_t)simd_sequence_len(ptr)); + if (list == NULL) { + return NULL; + } + if (simd_sequence_fill_obj(list, ptr, dtype) < 0) { + Py_DECREF(list); + return NULL; + } + return list; +} + +static simd_data +simd_vectorx_from_obj(PyObject *obj, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + // NPYV currently only supports x2 and x3 + assert(info->is_vectorx > 1 && info->is_vectorx < 4); + + simd_data data = {.u64 = 0}; + if 
(!PyTuple_Check(obj) || PyTuple_GET_SIZE(obj) != info->is_vectorx) { + PyErr_Format(PyExc_TypeError, + "a tuple of %d vector type %s is required", + info->is_vectorx, simd_data_getinfo(info->to_vector)->pyname + ); + return data; + } + for (int i = 0; i < info->is_vectorx; ++i) { + PyObject *item = PyTuple_GET_ITEM(obj, i); + // get the max multi-vec and let the compiler do the rest + data.vu64x3.val[i] = simd_vector_from_obj((simd_vector*)item, info->to_vector).vu64; + if (PyErr_Occurred()) { + return data; + } + } + return data; +} + +static PyObject * +simd_vectorx_to_obj(simd_data data, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + // NPYV currently only supports x2 and x3 + assert(info->is_vectorx > 1 && info->is_vectorx < 4); + + PyObject *tuple = PyTuple_New(info->is_vectorx); + if (tuple == NULL) { + return NULL; + } + for (int i = 0; i < info->is_vectorx; ++i) { + // get the max multi-vector and let the compiler handle the rest + simd_data vdata = {.vu64 = data.vu64x3.val[i]}; + PyObject *item = (PyObject*)simd_vector_to_obj(vdata, info->to_vector); + if (item == NULL) { + // TODO: improve log add item number + Py_DECREF(tuple); + return NULL; + } + PyTuple_SET_ITEM(tuple, i, item); + } + return tuple; +} diff --git a/numpy/core/src/_simd/_simd_inc_data.h.src b/numpy/core/src/_simd/_simd_inc_data.h.src new file mode 100644 index 000000000..eefac483b --- /dev/null +++ b/numpy/core/src/_simd/_simd_inc_data.h.src @@ -0,0 +1,91 @@ +#if !NPY_SIMD + #error "Not a standalone header, only works through 'simd.dispatch.c.src'" +#endif + +/************************************ + ** Private Definitions + ************************************/ +static simd_data_info simd__data_registry[simd_data_end] = +{ + [simd_data_none] = {.pyname="none"}, + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + * #sig = 0*4, 1*4, 0*2# + * #fp = 0*4, 0*4, 1*2# + * #name = int*8, float, float# + */ + [simd_data_@sfx@] = { + 
.pyname="@name@", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@, + .is_scalar=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ + // sequences + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + * #sig = 0*4, 1*4, 0*2# + * #fp = 0*4, 0*4, 1*2# + * #name = int*8, float, float# + */ + [simd_data_q@sfx@] = { + .pyname="[@name@]", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@, + .is_sequence=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ + // vectors + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + * #sig = 0*4, 1*4, 0*2# + * #fp = 0*4, 0*4, 1*2# + */ + [simd_data_v@sfx@] = { + .pyname="npyv_@sfx@", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@, + .is_vector=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ + // boolean vectors, treated as unsigned and converted internally + // to add compatibility among all SIMD extensions + /**begin repeat + * #sfx = u8, u16, u32, u64# + * #bsfx = b8, b16, b32, b64# + */ + [simd_data_v@bsfx@] = { + .pyname="npyv_@bsfx@", .is_bool=1, .is_vector=1, + .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ + // multi-vectors x2 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + * #sig = 0*4, 1*4, 0*2# + * #fp = 0*4, 0*4, 1*2# + */ + [simd_data_v@sfx@x2] = { + .pyname="npyv_@sfx@x2", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@, + .is_vectorx=2, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .nlanes=2, .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ + // multi-vectors x3 
+ /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + * #sig = 0*4, 1*4, 0*2# + * #fp = 0*4, 0*4, 1*2# + */ + [simd_data_v@sfx@x3] = { + .pyname="npyv_@sfx@x3", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@, + .is_vectorx=3, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .nlanes=3, .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ +}; + +/************************************ + ** Protected Definitions + ************************************/ +static const simd_data_info * +simd_data_getinfo(simd_data_type dtype) +{ return &simd__data_registry[dtype]; } diff --git a/numpy/core/src/_simd/_simd_inc_easyintrin.h b/numpy/core/src/_simd/_simd_inc_easyintrin.h new file mode 100644 index 000000000..7216b373a --- /dev/null +++ b/numpy/core/src/_simd/_simd_inc_easyintrin.h @@ -0,0 +1,229 @@ +#if !NPY_SIMD + #error "Not a standalone header, only works through 'simd.dispatch.c.src'" +#endif + +#define SIMD_INTRIN_DEF(NAME) \ + { NPY_TOSTRING(NAME), simd__intrin_##NAME, METH_VARARGS, NULL } , // comma + +static int simd__no_arguments(PyObject *args, const char* method_name) +{ + if (args == NULL) { + return 0; + } + assert(PyTuple_Check(args)); + Py_ssize_t obj_arg_len = PyTuple_GET_SIZE(args); + if (obj_arg_len != 0) { + PyErr_Format(PyExc_RuntimeError, + "%s(), takes no arguments, given(%d)", method_name, obj_arg_len + ); + return -1; + } + return 0; +} + +#define SIMD_IMPL_INTRIN_0(NAME, RET) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + if (simd__no_arguments( \ + args, NPY_TOSTRING(NAME) \ + )) return NULL; \ + simd_arg a = { \ + .dtype = simd_data_##RET, \ + .data = {.RET = npyv_##NAME()}, \ + }; \ + return simd_arg_to_obj(&a); \ + } + +#define SIMD_IMPL_INTRIN_0N(NAME) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + if (simd__no_arguments( \ + args, NPY_TOSTRING(NAME) \ + )) return NULL; \ + 
npyv_##NAME(); \ + Py_RETURN_NONE; \ + } + +#define SIMD_IMPL_INTRIN_1(NAME, RET, IN0) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + simd_arg req_args[] = { \ + {.dtype = simd_data_##IN0}, \ + }; \ + if (simd_args_from_tuple( \ + args, req_args, 1, NPY_TOSTRING(NAME)) \ + ) return NULL; \ + simd_data r = {.RET = npyv_##NAME( \ + req_args[0].data.IN0 \ + )}; \ + simd_args_sequence_free(req_args, 1); \ + req_args[0].data = r; \ + req_args[0].dtype = simd_data_##RET; \ + return simd_arg_to_obj(req_args); \ + } + +#define SIMD_IMPL_INTRIN_2(NAME, RET, IN0, IN1) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + simd_arg req_args[] = { \ + {.dtype = simd_data_##IN0}, \ + {.dtype = simd_data_##IN1}, \ + }; \ + if (simd_args_from_tuple( \ + args, req_args, 2, NPY_TOSTRING(NAME)) \ + ) { \ + return NULL; \ + } \ + simd_data r = {.RET = npyv_##NAME( \ + req_args[0].data.IN0, \ + req_args[1].data.IN1 \ + )}; \ + simd_args_sequence_free(req_args, 2); \ + req_args[0].data = r; \ + req_args[0].dtype = simd_data_##RET; \ + return simd_arg_to_obj(req_args); \ + } + +#define SIMD__REPEAT_2IMM(C, NAME, IN0) \ + C == req_args[1].data.u8 ? 
NPY_CAT(npyv_, NAME)(req_args[0].data.IN0, C) : + +#define SIMD_IMPL_INTRIN_2IMM(NAME, RET, IN0, CONST_RNG) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + simd_arg req_args[] = { \ + {.dtype = simd_data_##IN0}, \ + {.dtype = simd_data_u8}, \ + }; \ + if (simd_args_from_tuple( \ + args, req_args, 2, NPY_TOSTRING(NAME)) \ + ) { \ + return NULL; \ + } \ + simd_data r; \ + r.RET = NPY_CAT(SIMD__IMPL_COUNT_, CONST_RNG)( \ + SIMD__REPEAT_2IMM, NAME, IN0 \ + ) npyv_##NAME(req_args[0].data.IN0, 0); \ + simd_args_sequence_free(req_args, 2); \ + req_args[0].data = r; \ + req_args[0].dtype = simd_data_##RET; \ + return simd_arg_to_obj(req_args); \ + } + +#define SIMD_IMPL_INTRIN_3(NAME, RET, IN0, IN1, IN2) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + simd_arg req_args[] = { \ + {.dtype = simd_data_##IN0}, \ + {.dtype = simd_data_##IN1}, \ + {.dtype = simd_data_##IN2}, \ + }; \ + if (simd_args_from_tuple( \ + args, req_args, 3, NPY_TOSTRING(NAME)) \ + ) { \ + return NULL; \ + } \ + simd_data r = {.RET = npyv_##NAME( \ + req_args[0].data.IN0, \ + req_args[1].data.IN1, \ + req_args[2].data.IN2 \ + )}; \ + simd_args_sequence_free(req_args, 3); \ + req_args[0].data = r; \ + req_args[0].dtype = simd_data_##RET; \ + return simd_arg_to_obj(req_args); \ + } +/** + * Helper macros for repeating and expand a certain macro. + * Mainly used for converting a scalar to an immediate constant. + */ +#define SIMD__IMPL_COUNT_7(FN, ...) \ + NPY_EXPAND(FN(0, __VA_ARGS__)) \ + SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) + +#define SIMD__IMPL_COUNT_8(FN, ...) \ + SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(8, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_15(FN, ...) \ + NPY_EXPAND(FN(0, __VA_ARGS__)) \ + SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) + +#define SIMD__IMPL_COUNT_16(FN, ...) 
\ + SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(16, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_31(FN, ...) \ + NPY_EXPAND(FN(0, __VA_ARGS__)) \ + SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) + +#define SIMD__IMPL_COUNT_32(FN, ...) \ + SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(32, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_47(FN, ...) \ + NPY_EXPAND(FN(0, __VA_ARGS__)) \ + SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) + +#define SIMD__IMPL_COUNT_48(FN, ...) \ + SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(48, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_63(FN, ...) \ + NPY_EXPAND(FN(0, __VA_ARGS__)) \ + SIMD__IMPL_COUNT_63_(FN, __VA_ARGS__) + +#define SIMD__IMPL_COUNT_64(FN, ...) \ + SIMD__IMPL_COUNT_63_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(64, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_7_(FN, ...) \ + NPY_EXPAND(FN(1, __VA_ARGS__)) \ + NPY_EXPAND(FN(2, __VA_ARGS__)) NPY_EXPAND(FN(3, __VA_ARGS__)) \ + NPY_EXPAND(FN(4, __VA_ARGS__)) NPY_EXPAND(FN(5, __VA_ARGS__)) \ + NPY_EXPAND(FN(6, __VA_ARGS__)) NPY_EXPAND(FN(7, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_15_(FN, ...) \ + SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(8, __VA_ARGS__)) NPY_EXPAND(FN(9, __VA_ARGS__)) \ + NPY_EXPAND(FN(10, __VA_ARGS__)) NPY_EXPAND(FN(11, __VA_ARGS__)) \ + NPY_EXPAND(FN(12, __VA_ARGS__)) NPY_EXPAND(FN(13, __VA_ARGS__)) \ + NPY_EXPAND(FN(14, __VA_ARGS__)) NPY_EXPAND(FN(15, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_31_(FN, ...) 
\ + SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(16, __VA_ARGS__)) NPY_EXPAND(FN(17, __VA_ARGS__)) \ + NPY_EXPAND(FN(18, __VA_ARGS__)) NPY_EXPAND(FN(19, __VA_ARGS__)) \ + NPY_EXPAND(FN(20, __VA_ARGS__)) NPY_EXPAND(FN(21, __VA_ARGS__)) \ + NPY_EXPAND(FN(22, __VA_ARGS__)) NPY_EXPAND(FN(23, __VA_ARGS__)) \ + NPY_EXPAND(FN(24, __VA_ARGS__)) NPY_EXPAND(FN(25, __VA_ARGS__)) \ + NPY_EXPAND(FN(26, __VA_ARGS__)) NPY_EXPAND(FN(27, __VA_ARGS__)) \ + NPY_EXPAND(FN(28, __VA_ARGS__)) NPY_EXPAND(FN(29, __VA_ARGS__)) \ + NPY_EXPAND(FN(30, __VA_ARGS__)) NPY_EXPAND(FN(31, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_47_(FN, ...) \ + SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(32, __VA_ARGS__)) NPY_EXPAND(FN(33, __VA_ARGS__)) \ + NPY_EXPAND(FN(34, __VA_ARGS__)) NPY_EXPAND(FN(35, __VA_ARGS__)) \ + NPY_EXPAND(FN(36, __VA_ARGS__)) NPY_EXPAND(FN(37, __VA_ARGS__)) \ + NPY_EXPAND(FN(38, __VA_ARGS__)) NPY_EXPAND(FN(39, __VA_ARGS__)) \ + NPY_EXPAND(FN(40, __VA_ARGS__)) NPY_EXPAND(FN(41, __VA_ARGS__)) \ + NPY_EXPAND(FN(42, __VA_ARGS__)) NPY_EXPAND(FN(43, __VA_ARGS__)) \ + NPY_EXPAND(FN(44, __VA_ARGS__)) NPY_EXPAND(FN(45, __VA_ARGS__)) \ + NPY_EXPAND(FN(46, __VA_ARGS__)) NPY_EXPAND(FN(47, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_63_(FN, ...) 
\ + SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(48, __VA_ARGS__)) NPY_EXPAND(FN(49, __VA_ARGS__)) \ + NPY_EXPAND(FN(50, __VA_ARGS__)) NPY_EXPAND(FN(51, __VA_ARGS__)) \ + NPY_EXPAND(FN(52, __VA_ARGS__)) NPY_EXPAND(FN(53, __VA_ARGS__)) \ + NPY_EXPAND(FN(54, __VA_ARGS__)) NPY_EXPAND(FN(55, __VA_ARGS__)) \ + NPY_EXPAND(FN(56, __VA_ARGS__)) NPY_EXPAND(FN(57, __VA_ARGS__)) \ + NPY_EXPAND(FN(58, __VA_ARGS__)) NPY_EXPAND(FN(59, __VA_ARGS__)) \ + NPY_EXPAND(FN(60, __VA_ARGS__)) NPY_EXPAND(FN(61, __VA_ARGS__)) \ + NPY_EXPAND(FN(62, __VA_ARGS__)) NPY_EXPAND(FN(63, __VA_ARGS__)) diff --git a/numpy/core/src/_simd/_simd_inc_vector.h b/numpy/core/src/_simd/_simd_inc_vector.h new file mode 100644 index 000000000..b0fa17b9a --- /dev/null +++ b/numpy/core/src/_simd/_simd_inc_vector.h @@ -0,0 +1,187 @@ +#if !NPY_SIMD + #error "Not a standalone header, only works through 'simd.dispatch.c.src'" +#endif + +/************************************ + ** Private Definitions + ************************************/ +// PySequenceMethods +static Py_ssize_t +simd__vector_length(simd_vector *self) +{ + return simd_data_getinfo(self->type)->nlanes; +} +static PyObject * +simd__vector_item(simd_vector *self, Py_ssize_t i) +{ + const simd_data_info *info = simd_data_getinfo(self->type); + int nlanes = info->nlanes; + if (i >= nlanes) { + PyErr_SetString(PyExc_IndexError, "list index out of range"); + return NULL; + } + npyv_lanetype_u8 *src = self->data + i * info->lane_size; + simd_data data; + memcpy(&data.u64, src, info->lane_size); + return simd_scalar_to_obj(data, info->to_scalar); +} + +static PySequenceMethods simd__vector_as_sequence = { + (lenfunc) simd__vector_length, /* sq_length */ + (binaryfunc) NULL, /* sq_concat */ + (ssizeargfunc) NULL, /* sq_repeat */ + (ssizeargfunc) simd__vector_item, /* sq_item */ + (ssizessizeargfunc) NULL, /* sq_slice */ + (ssizeobjargproc) NULL, /* sq_ass_item */ + (ssizessizeobjargproc) NULL, /* sq_ass_slice */ + (objobjproc) NULL, /* sq_contains 
*/ + (binaryfunc) NULL, /* sq_inplace_concat */ + (ssizeargfunc) NULL, /* sq_inplace_repeat */ +}; + +// PyGetSetDef +static PyObject * +simd__vector_name(simd_vector *self) +{ + return PyUnicode_FromString(simd_data_getinfo(self->type)->pyname); +} +static PyGetSetDef simd__vector_getset[] = { + { "__name__", (getter)simd__vector_name, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL, NULL } +}; + +// PyTypeObject(simd__vector_type) +static PyObject * +simd__vector_repr(PyObject *self) +{ + // PySequence_Fast returns Tuple in PyPy + PyObject *obj = PySequence_List(self); + if (obj != NULL) { + PyObject *repr = PyObject_Str(obj); + Py_DECREF(obj); + return repr; + } + return obj; +} +static PyObject * +simd__vector_compare(PyObject *self, PyObject *other, int cmp_op) +{ + PyObject *obj; + if (PyTuple_Check(other)) { + obj = PySequence_Tuple(self); + } else if (PyList_Check(other)) { + obj = PySequence_List(self); + } else { + obj = PySequence_Fast(self, "invalid argument, expected a vector"); + } + if (obj != NULL) { + PyObject *rich = PyObject_RichCompare(obj, other, cmp_op); + Py_DECREF(obj); + return rich; + } + return obj; +} +static PyTypeObject simd__vector_type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(VECTOR)), + .tp_basicsize = sizeof(simd_vector), + .tp_repr = (reprfunc)simd__vector_repr, + .tp_as_sequence = &simd__vector_as_sequence, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_richcompare = simd__vector_compare, + .tp_getset = simd__vector_getset +}; + +/************************************ + ** Protected Definitions + ************************************/ +static simd_vector * +simd_vector_to_obj(simd_data data, simd_data_type vtype) +{ + const simd_data_info *info = simd_data_getinfo(vtype); + assert(info->is_vector && info->nlanes > 0); + + simd_vector *vec = PyObject_New(simd_vector, &simd__vector_type); + if (vec == NULL) { + return (simd_vector*)PyErr_NoMemory(); + } + vec->type = vtype; + if (info->is_bool) { 
+ // boolean vectors are internally treated as unsigned + // vectors to add compatibility among all SIMD extensions + switch(vtype) { + case simd_data_vb8: + data.vu8 = npyv_cvt_u8_b8(data.vb8); + break; + case simd_data_vb16: + data.vu16 = npyv_cvt_u16_b16(data.vb16); + break; + case simd_data_vb32: + data.vu32 = npyv_cvt_u32_b32(data.vb32); + break; + default: + data.vu64 = npyv_cvt_u64_b64(data.vb64); + } + } + npyv_store_u8(vec->data, data.vu8); + return vec; +} + +static simd_data +simd_vector_from_obj(simd_vector *vec, simd_data_type vtype) +{ + const simd_data_info *info = simd_data_getinfo(vtype); + assert(info->is_vector && info->nlanes > 0); + + simd_data data = {.u64 = 0}; + if (!PyObject_IsInstance( + (PyObject *)vec, (PyObject *)&simd__vector_type + )) { + PyErr_Format(PyExc_TypeError, + "a vector type %s is required", info->pyname + ); + return data; + } + if (vec->type != vtype) { + PyErr_Format(PyExc_TypeError, + "a vector type %s is required, got(%s)", + info->pyname, simd_data_getinfo(vec->type)->pyname + ); + return data; + } + + data.vu8 = npyv_load_u8(vec->data); + if (info->is_bool) { + // boolean vectors are internally treated as unsigned + // vectors to add compatibility among all SIMD extensions + switch(vtype) { + case simd_data_vb8: + data.vb8 = npyv_cvt_b8_u8(data.vu8); + break; + case simd_data_vb16: + data.vb16 = npyv_cvt_b16_u16(data.vu16); + break; + case simd_data_vb32: + data.vb32 = npyv_cvt_b32_u32(data.vu32); + break; + default: + data.vb64 = npyv_cvt_b64_u64(data.vu64); + } + } + return data; +} + +static int +simd_vector_register(PyObject *module) +{ + Py_INCREF(&simd__vector_type); + if (PyType_Ready(&simd__vector_type)) { + return -1; + } + if (PyModule_AddObject( + module, "vector_type",(PyObject *)&simd__vector_type + )) { + return -1; + } + return 0; +} diff --git a/numpy/distutils/command/build.py b/numpy/distutils/command/build.py index 60ba4c917..6025586cd 100644 --- a/numpy/distutils/command/build.py +++ 
b/numpy/distutils/command/build.py @@ -22,6 +22,8 @@ class build(old_build): "specify a list of dispatched CPU optimizations"), ('disable-optimization', None, "disable CPU optimized code(dispatch,simd,fast...)"), + ('simd-test=', None, + "specify a list of CPU optimizations to be tested against NumPy SIMD interface"), ] help_options = old_build.help_options + [ @@ -36,6 +38,16 @@ class build(old_build): self.cpu_baseline = "min" self.cpu_dispatch = "max -xop -fma4" # drop AMD legacy features by default self.disable_optimization = False + """ + the '_simd' module is a very large. Adding more dispatched features + will increase binary size and compile time. By default we minimize + the targeted features to those most commonly used by the NumPy SIMD interface(NPYV), + NOTE: any specified features will be ignored if they're: + - part of the baseline(--cpu-baseline) + - not part of dispatch-able features(--cpu-dispatch) + - not supported by compiler or platform + """ + self.simd_test = "BASELINE SSE2 SSE41 SSE42 XOP (FMA3 AVX2) AVX512F AVX512_SKX VSX VSX2 VSX3 NEON ASIMD" def finalize_options(self): build_scripts = self.build_scripts diff --git a/numpy/distutils/command/build_ext.py b/numpy/distutils/command/build_ext.py index 1a881c56a..ca6f8bcd2 100644 --- a/numpy/distutils/command/build_ext.py +++ b/numpy/distutils/command/build_ext.py @@ -19,8 +19,7 @@ from numpy.distutils.misc_util import ( has_cxx_sources, has_f_sources, is_sequence ) from numpy.distutils.command.config_compiler import show_fortran_compilers -from numpy.distutils.ccompiler_opt import new_ccompiler_opt - +from numpy.distutils.ccompiler_opt import new_ccompiler_opt, CCompilerOpt class build_ext (old_build_ext): @@ -39,6 +38,8 @@ class build_ext (old_build_ext): "specify a list of dispatched CPU optimizations"), ('disable-optimization', None, "disable CPU optimized code(dispatch,simd,fast...)"), + ('simd-test=', None, + "specify a list of CPU optimizations to be tested against NumPy SIMD interface"), 
] help_options = old_build_ext.help_options + [ @@ -56,6 +57,7 @@ class build_ext (old_build_ext): self.cpu_baseline = None self.cpu_dispatch = None self.disable_optimization = None + self.simd_test = None def finalize_options(self): if self.parallel: @@ -87,7 +89,9 @@ class build_ext (old_build_ext): ('cpu_baseline', 'cpu_baseline'), ('cpu_dispatch', 'cpu_dispatch'), ('disable_optimization', 'disable_optimization'), + ('simd_test', 'simd_test') ) + CCompilerOpt.conf_target_groups["simd_test"] = self.simd_test def run(self): if not self.extensions: |
