diff options
| author | Sayed Adel <seiko@imavr.com> | 2020-07-08 09:27:13 +0200 |
|---|---|---|
| committer | Sayed Adel <seiko@imavr.com> | 2020-10-27 11:46:58 +0000 |
| commit | cb3efe8e03b53dbab457a99be1a48384312abe16 (patch) | |
| tree | 519f83bd1bda84f52fba88516561dd79e0f36826 /numpy | |
| parent | fcba5a6c901717110b9767b418df410d7c8c6e73 (diff) | |
| download | numpy-cb3efe8e03b53dbab457a99be1a48384312abe16.tar.gz | |
ENH: Expose the NumPy C SIMD vectorization interface "NPYV" to Python
'_simd' is a new module that brings the NumPy C SIMD vectorization interface "NPYV" to Python.
The module is designed to be extremely flexible so that it can accommodate any kind of
intrinsics, and to generate a Python interface that closely mirrors the C interface.
The main purpose of this module is to test NPYV intrinsics in Python,
but it can still be used as an effective solution for designing SIMD kernels.
Also add a new command-line argument `--simd-test` to control the targeted CPU features
for the `_simd` module.
Co-authored-by: Matti Picus <matti.picus@gmail.com>
Co-authored-by: Eric Wieser <wieser.eric@gmail.com>
Diffstat (limited to 'numpy')
| -rw-r--r-- | numpy/core/setup.py | 23 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd.c | 63 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src | 351 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd.h | 30 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd_inc.h.src | 395 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd_inc_arg.h | 108 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd_inc_convert.h | 206 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd_inc_data.h.src | 91 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd_inc_easyintrin.h | 229 | ||||
| -rw-r--r-- | numpy/core/src/_simd/_simd_inc_vector.h | 187 | ||||
| -rw-r--r-- | numpy/distutils/command/build.py | 12 | ||||
| -rw-r--r-- | numpy/distutils/command/build_ext.py | 8 |
12 files changed, 1701 insertions, 2 deletions
diff --git a/numpy/core/setup.py b/numpy/core/setup.py index b3e17baed..e9a9a4e46 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -626,6 +626,7 @@ def configuration(parent_package='',top_path=None): config.add_include_dirs(join('src', 'multiarray')) config.add_include_dirs(join('src', 'umath')) config.add_include_dirs(join('src', 'npysort')) + config.add_include_dirs(join('src', '_simd')) config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process config.add_define_macros([("HAVE_NPY_CONFIG_H", "1")]) @@ -974,6 +975,28 @@ def configuration(parent_package='',top_path=None): config.add_extension('_operand_flag_tests', sources=[join('src', 'umath', '_operand_flag_tests.c.src')]) + ####################################################################### + # SIMD module # + ####################################################################### + + config.add_extension('_simd', sources=[ + join('src', 'common', 'npy_cpu_features.c.src'), + join('src', '_simd', '_simd.c'), + join('src', '_simd', '_simd_inc.h.src'), + join('src', '_simd', '_simd_inc_data.h.src'), + join('src', '_simd', '_simd.dispatch.c.src'), + ], depends=[ + join('src', 'common', 'npy_cpu_dispatch.h'), + join('src', 'common', 'simd', 'simd.h'), + join('src', '_simd', '_simd.h'), + join('src', '_simd', '_simd_inc.h.src'), + join('src', '_simd', '_simd_inc_data.h.src'), + join('src', '_simd', '_simd_inc_arg.h'), + join('src', '_simd', '_simd_inc_convert.h'), + join('src', '_simd', '_simd_inc_easyintrin.h'), + join('src', '_simd', '_simd_inc_vector.h'), + ]) + config.add_subpackage('tests') config.add_data_dir('tests/data') config.add_data_dir('tests/examples') diff --git a/numpy/core/src/_simd/_simd.c b/numpy/core/src/_simd/_simd.c new file mode 100644 index 000000000..e5cb582b3 --- /dev/null +++ b/numpy/core/src/_simd/_simd.c @@ -0,0 +1,63 @@ +#include "_simd.h" + +PyMODINIT_FUNC PyInit__simd(void) +{ + static struct PyModuleDef defs = { + 
.m_base = PyModuleDef_HEAD_INIT, + .m_name = "_simd", + .m_size = -1 + }; + if (npy_cpu_init() < 0) { + return NULL; + } + PyObject *m = PyModule_Create(&defs); + if (m == NULL) { + return NULL; + } + PyObject *targets = PyDict_New(); + if (targets == NULL) { + goto err; + } + if (PyModule_AddObject(m, "targets", targets) < 0) { + Py_DECREF(targets); + goto err; + } + // add keys for non-supported optimizations with None value + #define ATTACH_MODULE(TESTED_FEATURES, TARGET_NAME, MAKE_MSVC_HAPPY) \ + { \ + PyObject *simd_mod; \ + if (!TESTED_FEATURES) { \ + Py_INCREF(Py_None); \ + simd_mod = Py_None; \ + } else { \ + simd_mod = NPY_CAT(simd_create_module_, TARGET_NAME)(); \ + if (simd_mod == NULL) { \ + goto err; \ + } \ + } \ + const char *target_name = NPY_TOSTRING(TARGET_NAME); \ + if (PyDict_SetItemString(targets, target_name, simd_mod) < 0) { \ + Py_DECREF(simd_mod); \ + goto err; \ + } \ + } + + #define ATTACH_BASELINE_MODULE(MAKE_MSVC_HAPPY) \ + { \ + PyObject *simd_mod = simd_create_module(); \ + if (simd_mod == NULL) { \ + goto err; \ + } \ + if (PyDict_SetItemString(targets, "baseline", simd_mod) < 0) { \ + Py_DECREF(simd_mod); \ + goto err; \ + } \ + } + + NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY) + NPY__CPU_DISPATCH_BASELINE_CALL(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY) + return m; +err: + Py_DECREF(m); + return NULL; +} diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src new file mode 100644 index 000000000..69a9ffd45 --- /dev/null +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -0,0 +1,351 @@ +/*@targets $werror #simd_test*/ +#include "_simd.h" +#include "_simd_inc.h" + +#if NPY_SIMD +#include "_simd_inc_data.h" +#include "_simd_inc_convert.h" +#include "_simd_inc_vector.h" +#include "_simd_inc_arg.h" +#include "_simd_inc_easyintrin.h" + +/************************************************************************* + * Defining NPYV intrinsics as module functions + 
*************************************************************************/ +/**begin repeat + * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64# + * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0# + * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1# + * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# + * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# + */ +#if @simd_sup@ +/*************************** + * Memory + ***************************/ +/**begin repeat1 + * # intrin = load, loada, loads, loadl# + */ +SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, q@sfx@) +/**end repeat1**/ +/**begin repeat1 + * # intrin = store, storea, stores, storel, storeh# + */ +// special definition due to the nature of @intrin@ +static PyObject * +simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) +{ + simd_arg req_args[] = { + {.dtype = simd_data_q@sfx@}, + {.dtype = simd_data_v@sfx@}, + }; + if (simd_args_from_tuple(args, req_args, 2, "@intrin@_@sfx@")) { + return NULL; + } + npyv_@intrin@_@sfx@( + req_args[0].data.q@sfx@, req_args[1].data.v@sfx@ + ); + // write-back + if (simd_sequence_fill_obj(req_args[0].obj, req_args[0].data.q@sfx@, simd_data_q@sfx@)) { + simd_args_sequence_free(req_args, 2); + return NULL; + } + simd_args_sequence_free(req_args, 2); + Py_RETURN_NONE; +} +/**end repeat1**/ + +/*************************** + * Misc + ***************************/ +SIMD_IMPL_INTRIN_0(zero_@sfx@, v@sfx@) +SIMD_IMPL_INTRIN_1(setall_@sfx@, v@sfx@, @sfx@) +SIMD_IMPL_INTRIN_3(select_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@) + +/**begin repeat1 + * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + */ +#if @simd_sup2@ +SIMD_IMPL_INTRIN_1(reinterpret_@sfx_to@_@sfx@, v@sfx_to@, v@sfx@) +#endif // simd_sup2 +/**end repeat1**/ + +/** + * special definition due to 
the nature of intrinsics + * npyv_setf_@sfx@ and npy_set_@sfx@. +*/ +/**begin repeat1 + * #intrin = setf, set# + */ +static PyObject * +simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) +{ + npyv_lanetype_@sfx@ *data = simd_sequence_from_obj(args, simd_data_q@sfx@, npyv_nlanes_@sfx@); + if (data == NULL) { + return NULL; + } + simd_data r = {.v@sfx@ = npyv_@intrin@_@sfx@( + data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], + data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15], + data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23], + data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31], + data[32], data[33], data[34], data[35], data[36], data[37], data[38], data[39], + data[40], data[41], data[42], data[43], data[44], data[45], data[46], data[47], + data[48], data[49], data[50], data[51], data[52], data[53], data[54], data[55], + data[56], data[57], data[58], data[59], data[60], data[61], data[62], data[63], + data[64] // for setf + )}; + simd_sequence_free(data); + return (PyObject*)simd_vector_to_obj(r, simd_data_v@sfx@); +} +/**end repeat1**/ + +/*************************** + * Reorder + ***************************/ +/**begin repeat1 + * # intrin = combinel, combineh# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) +/**end repeat1**/ + +/**begin repeat1 + * # intrin = combine, zip# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@) +/**end repeat1**/ + +/*************************** + * Operators + ***************************/ +#if @shl_imm@ > 0 +SIMD_IMPL_INTRIN_2(shl_@sfx@, v@sfx@, v@sfx@, u8) +SIMD_IMPL_INTRIN_2(shr_@sfx@, v@sfx@, v@sfx@, u8) +// immediate constant +SIMD_IMPL_INTRIN_2IMM(shli_@sfx@, v@sfx@, v@sfx@, @shl_imm@) +SIMD_IMPL_INTRIN_2IMM(shri_@sfx@, v@sfx@, v@sfx@, @shr_imm@) +#endif // shl_imm + +/**begin repeat1 + * #intrin = and, or, xor# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, 
v@sfx@) +/**end repeat1**/ + +SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@) + +/**begin repeat1 + * #intrin = cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@) +/**end repeat1**/ + +/*************************** + * Conversion + ***************************/ +SIMD_IMPL_INTRIN_1(cvt_@sfx@_@bsfx@, v@sfx@, v@bsfx@) +SIMD_IMPL_INTRIN_1(cvt_@bsfx@_@sfx@, v@bsfx@, v@sfx@) + +/*************************** + * Arithmetic + ***************************/ +/**begin repeat1 + * #intrin = add, sub# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) +/**end repeat1**/ + +#if @sat_sup@ +/**begin repeat1 + * #intrin = adds, subs# + */ +SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) +/**end repeat1**/ +#endif // sat_sup + +#if @mul_sup@ +SIMD_IMPL_INTRIN_2(mul_@sfx@, v@sfx@, v@sfx@, v@sfx@) +#endif // mul_sup + +#if @div_sup@ +SIMD_IMPL_INTRIN_2(div_@sfx@, v@sfx@, v@sfx@, v@sfx@) +#endif // div_sup + +#endif // simd_sup +/**end repeat**/ + +/*************************** + * Variant + ***************************/ +SIMD_IMPL_INTRIN_0N(cleanup) + +/************************************************************************* + * Attach module functions + *************************************************************************/ +static PyMethodDef simd__intrinsics_methods[] = { +/**begin repeat + * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64# + * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0# + * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1# + * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# + * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# + */ +#if @simd_sup@ + +/*************************** + * Memory + ***************************/ +/**begin repeat1 + * # intrin = load, loada, loads, loadl, store, storea, stores, storel, storeh# + */ 
+SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ + +/*************************** + * Misc + ***************************/ +/**begin repeat1 + * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + */ +#if @simd_sup2@ +SIMD_INTRIN_DEF(reinterpret_@sfx_to@_@sfx@) +#endif // simd_sup2 +/**end repeat1**/ + +/**begin repeat1 + * # intrin = set, setf, setall, zero, select# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ + +/*************************** + * Reorder + ***************************/ +/**begin repeat1 + * # intrin = combinel, combineh, combine, zip# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ + +SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@) +SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@) + +/*************************** + * Operators + ***************************/ +#if @shl_imm@ > 0 +/**begin repeat1 + * # intrin = shl, shr, shli, shri# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ +#endif // shl_imm + +/**begin repeat1 + * #intrin = and, or, xor, not, cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ + +/*************************** + * Conversion + ***************************/ +SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@) +SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@) + +/*************************** + * Arithmetic + ***************************/ +/**begin repeat1 + * #intrin = add, sub# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ + +#if @sat_sup@ +/**begin repeat1 + * #intrin = adds, subs# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ +#endif // sat_sup + +#if @mul_sup@ +SIMD_INTRIN_DEF(mul_@sfx@) +#endif // mul_sup + +#if @div_sup@ +SIMD_INTRIN_DEF(div_@sfx@) +#endif // div_sup + +#endif // simd_sup +/**end repeat**/ + +/*************************** + * Variant + ***************************/ +SIMD_INTRIN_DEF(cleanup) +/***************************/ +{NULL, NULL, 0, NULL} +}; // PyMethodDef + +#endif // NPY_SIMD + 
+/************************************************************************* + * Defining a separate module for each target + *************************************************************************/ +NPY_VISIBILITY_HIDDEN PyObject * +NPY_CPU_DISPATCH_CURFX(simd_create_module)(void) +{ + static struct PyModuleDef defs = { + .m_base = PyModuleDef_HEAD_INIT, + .m_size = -1, + #ifdef NPY__CPU_TARGET_CURRENT + .m_name = "NPYV_" NPY_TOSTRING(NPY__CPU_TARGET_CURRENT), + #else + .m_name = "NPYV_BASELINE", + #endif + #if NPY_SIMD + .m_methods = simd__intrinsics_methods + #else + .m_methods = NULL + #endif + }; + PyObject *m = PyModule_Create(&defs); + if (m == NULL) { + return NULL; + } + if (PyModule_AddIntConstant(m, "simd", NPY_SIMD)) { + goto err; + } + if (PyModule_AddIntConstant(m, "simd_f64", NPY_SIMD_F64)) { + goto err; + } + if (PyModule_AddIntConstant(m, "simd_width", NPY_SIMD_WIDTH)) { + goto err; + } +#if NPY_SIMD + if (simd_vector_register(m)) { + goto err; + } + /**begin repeat + * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# + */ + if (PyModule_AddIntConstant(m, "nlanes_@sfx@", npyv_nlanes_@sfx@)) { + goto err; + } + /**end repeat**/ +#endif // NPY_SIMD + return m; +err: + Py_DECREF(m); + return NULL; +} diff --git a/numpy/core/src/_simd/_simd.h b/numpy/core/src/_simd/_simd.h new file mode 100644 index 000000000..d9905c801 --- /dev/null +++ b/numpy/core/src/_simd/_simd.h @@ -0,0 +1,30 @@ +/** + * A module to expose the NumPy C SIMD vectorization interface "NPYV" for testing purposes. + * + * Please keep this module independent from other c-extension modules, + * since NPYV intrinsics may be involved in their functionality, + * which increases the degree of complexity in tracking and detecting errors. + * + * TODO: Add an independent sphinx doc. + * + * Please add any new NPYV intrinsics in '_simd.dispatch.c.src'. 
+ */ +#ifndef _SIMD_SIMD_H_ +#define _SIMD_SIMD_H_ + +#include <Python.h> +#include "numpy/npy_common.h" + +#ifndef NPY_DISABLE_OPTIMIZATION +// autogenerated, required for CPU dispatch macros +#include "_simd.dispatch.h" +#endif +/** + * Create a new module for each required optimization which contains all NPYV intrinsics, + * + * If required optimization is not supported by NPYV, the module will still provides + * access to NPYV constants NPY_SIMD, NPY_SIMD_F64, and NPY_SIMD_WIDTH but without + * any intrinsics. + */ +NPY_CPU_DISPATCH_DECLARE(NPY_VISIBILITY_HIDDEN PyObject *simd_create_module, (void)) +#endif // _SIMD_SIMD_H_ diff --git a/numpy/core/src/_simd/_simd_inc.h.src b/numpy/core/src/_simd/_simd_inc.h.src new file mode 100644 index 000000000..4261ce148 --- /dev/null +++ b/numpy/core/src/_simd/_simd_inc.h.src @@ -0,0 +1,395 @@ +/** + * This header works only through '_simd.dispatch.c' + */ +#include <Python.h> +#include "simd/simd.h" + +#if NPY_SIMD +/************************************ + ** Types + ************************************/ +/** + * Gather all data types supported by the module. 
+*/ +typedef union +{ + // scalars + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + npyv_lanetype_@sfx@ @sfx@; + /**end repeat**/ + // sequence + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + npyv_lanetype_@sfx@ *q@sfx@; + /**end repeat**/ + // vectors + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, b8, b16, b32, b64# + */ + npyv_@sfx@ v@sfx@; + /**end repeat**/ + // multi-vectors x2 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32# + */ + npyv_@sfx@x2 v@sfx@x2; + /**end repeat**/ + // multi-vectors x3 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32# + */ + npyv_@sfx@x3 v@sfx@x3; + /**end repeat**/ +#if NPY_SIMD_F64 + npyv_f64 vf64; + npyv_f64x2 vf64x2; + npyv_f64x3 vf64x3; +#endif +} simd_data; + +/** + * Data types IDs and suffixes. Must be same data types as the ones + * in union 'simd_data' to fit the macros in '_simd_inc_easyintrin.h'. +*/ +typedef enum +{ + simd_data_none = 0, + // scalars + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + simd_data_@sfx@, + /**end repeat**/ + // sequences + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + simd_data_q@sfx@, + /**end repeat**/ + // vectors + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64, b8, b16, b32, b64# + */ + simd_data_v@sfx@, + /**end repeat**/ + // multi-vectors x2 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + simd_data_v@sfx@x2, + /**end repeat**/ + // multi-vectors x3 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + */ + simd_data_v@sfx@x3, + /**end repeat**/ + simd_data_end, +} simd_data_type; +/************************************ + ** Declarations (inc_data) + ************************************/ +/** + * simd_data_type information + */ +typedef struct +{ + // type name compatible with python style + 
const char *pyname; + // returns '1' if the type represent a unsigned integer + int is_unsigned:1; + // returns '1' if the type represent a signed integer + int is_signed:1; + // returns '1' if the type represent a single or double precision + int is_float:1; + // returns '1' if the type represent a boolean + int is_bool:1; + // returns '1' if the type represent a sequence + int is_sequence:1; + // returns '1' if the type represent a scalar + int is_scalar:1; + // returns '1' if the type represent a vector + int is_vector:1; + // returns the len of multi-vector if the type reprsent x2 or x3 vector + // otherwise returns 0, e.g. returns 2 if data type is simd_data_vu8x2 + int is_vectorx; + // returns the equivalent scalar data type e.g. simd_data_vu8 -> simd_data_u8 + simd_data_type to_scalar; + // returns the equivalent scalar data type e.g. simd_data_s8 -> simd_data_vs8 + // NOTE: returns the will equivalent "unsigned" vector type in case of "boolean" vector + // e.g. simd_data_vb8 -> simd_data_vu8 + simd_data_type to_vector; + // number of vector lanes + int nlanes; + // sizeof lane type + int lane_size; +} simd_data_info; + +/** + * Returns data info of certain dtype. + * + * Example: + ** const simd_data_info *info = simd_data_getinfo(simd_data_vu8); + ** if (info->is_vector && info->is_unsigned) { + ** ... + ** } + */ +static const simd_data_info * +simd_data_getinfo(simd_data_type dtype); + +/************************************ + ** Declarations (inc_vector) + ************************************/ +typedef struct +{ + PyObject_HEAD + // vector type id + simd_data_type type; + // vector data, aligned for safe casting + npyv_lanetype_u8 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) data[NPY_SIMD_WIDTH]; +} simd_vector; +/** + * convert simd_data to PyObject(simd_vector), + * raise Python exception on failure and returns NULL. 
+ */ +static simd_vector * +simd_vector_to_obj(simd_data data, simd_data_type vtype); +/** + * convert PyObject(simd_vector) to simd_data, + * raise Python exception on failure. + */ +static simd_data +simd_vector_from_obj(simd_vector *vec, simd_data_type vtype); +/** + * initialize and register vector type(PyTypeObject) to PyModule, + * vector type can be reached through attribute 'vector_type'. + * return -1 on error, 0 on success. + */ +static int +simd_vector_register(PyObject *module); + +/************************************ + ** Declarations (inc_convert) + ************************************/ +/** + * Return a C scalar(simd_data) representation of `obj` and + * according to the scalar data type `dtype` on range (simd_data_[u8:f64]). + * Raise a Python exception on failure. + * + * Example: + ** simd_data data = simd_scalar_from_obj(obj, simd_data_f32); + ** if (!PyErr_Occurred()) { + ** printf("I have a valid float %d\n", data.f32); + ** } + */ +static simd_data +simd_scalar_from_obj(PyObject *obj, simd_data_type dtype); +/** + * Create a Python scalar from a C scalar based on the contents + * of `data`(simd_data) and according to the scalar data type `dtype` + * on range(simd_data_[u8:f64]). + * Return NULL and a Python exception on failure, otherwise new reference. + * + * Example: + ** simd_data data = {.u32 = 0x7fffffff}; + ** PyObject *obj = simd_scalar_to_obj(data, simd_data_s32); + ** if (obj != NULL) { + ** printf("I have a valid Python integer %d\n", PyLong_AsLong(obj)); + ** Py_DECREF(obj); + ** } + */ +static PyObject * +simd_scalar_to_obj(simd_data data, simd_data_type dtype); +/** + * Allocate a C array in memory according to number of elements `len` + * and sequence data type `dtype` on range(simd_data_[qu8:qf64]). + * + * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL + * with a Python exception on failure. 
+ * + * Example: + ** npyv_lanetype_f64 *aligned_ptr = simd_sequence_new(npyv_nlanes_f64, simd_data_f64); + ** if (aligned_ptr != NULL) { + ** // aligned store + ** npyv_storea_f64(aligned_ptr, npyv_setall_f64(1.0)); + ** printf("The first element of my array %f\n", aligned_ptr[0]); + ** simd_sequence_free(aligned_ptr); + ** } + */ +static void * +simd_sequence_new(Py_ssize_t len, simd_data_type dtype); +/** + * Return the number of elements of the allocated C array `ptr` + * by `simd_sequence_new()` or `simd_sequence_from_obj()`. + */ +static size_t +simd_sequence_len(const void *ptr); +/** + * Free the allocated C array by `simd_sequence_new()` or + * `simd_sequence_from_obj()`. + */ +static void +simd_sequence_free(void *ptr); +/** + * Return a C array representation of a PyObject sequence `obj` and + * according to the sequence data type `dtype` on range (simd_data_[qu8:qf64]). + * + * Note: parameter `min_size` takes the number of minimum acceptable elements. + * + * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL + * with a Python exception on failure. + * + * Example: + ** npyv_lanetype_u32 *ptr = simd_sequence_from_obj(seq_obj, simd_data_qu32, npyv_nlanes_u32); + ** if (ptr != NULL) { + ** npyv_u32 a = npyv_load_u32(ptr); + ** ... + ** simd_sequence_free(ptr); + ** } + ** + */ +static void * +simd_sequence_from_obj(PyObject *obj, simd_data_type dtype, unsigned min_size); +/** + * Fill a Python sequence object `obj` with a C array `ptr` allocated by + * `simd_sequence_new()` or `simd_sequence_from_obj()` according to + * to the sequence data type `dtype` on range (simd_data_[qu8:qf64]). + * + * Return 0 on success or -1 with a Python exception on failure. + */ +static int +simd_sequence_fill_obj(PyObject *obj, const void *ptr, simd_data_type dtype); +/** + * Create a Python list from a C array `ptr` allocated by + * `simd_sequence_new()` or `simd_sequence_from_obj()` according to + * to the sequence data type `dtype` on range (simd_data_[qu8:qf64]). 
+ * + * Return NULL and a Python exception on failure, otherwise new reference. + */ +static PyObject * +simd_sequence_to_obj(const void *ptr, simd_data_type dtype); +/** + * Return a SIMD multi-vector(simd_data) representation of Python tuple of + * (simd_vector*,) `obj` according to the scalar data type `dtype` + * on range (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]). + * + * Raise a Python exception on failure. + * + * Example: + ** simd_data data = simd_vectorx_from_obj(tuple_obj, simd_data_vf32x2); + ** if (!PyErr_Occurred()) { + ** npyv_f32 sum = npyv_add_f32(data.vf32x2.val[0], data.vf32x2.val[1]); + ** ... + ** } + ** + */ +static simd_data +simd_vectorx_from_obj(PyObject *obj, simd_data_type dtype); +/** + * Create a Python tuple of 'simd_vector' from a SIMD multi-vector + * based on the contents of `data`(simd_data) and according to + * the multi-vector data type `dtype` on range + * (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]). + * + * Return NULL and a Python exception on failure, otherwise new reference. + */ +static PyObject * +simd_vectorx_to_obj(simd_data data, simd_data_type dtype); + +/************************************ + ** Declarations (inc_arg) + ************************************/ +typedef struct +{ + simd_data_type dtype; + simd_data data; + // set by simd_args_from_tuple() + PyObject *obj; +} simd_arg; +/** + * The following functions gather all conversions between all data types + * and they can used instead of all above functions. + */ +/** + * Convert a Python object `obj` into simd_data `arg->data` according to the + * required data type `arg->dtype`. + * + * Return -1 and raise Python exception on failure, otherwise return 0. + * + * Notes: + * - requires `simd_args_sequence_free()` or `simd_sequence_free()` + * to free allocated C array, in case of sequence data types. + * - the number of minimum acceptable elements for sequence data + * types is the number of lanes of the equivalent vector data type. 
+ * + * Example #1: + ** simd_arg arg = {.dtype = simd_data_qu8}; + ** if (simd_arg_from_obj(seq_obj, &arg) < 0) { + ** // fails to convert a python sequence object to C array of uint8 + ** return; + ** } + ** npyv_u8 v_u8 = npyv_load_u8(arg->data.qu8); + ** ... + ** simd_args_sequence_free(&arg, 1); + * + * Example #2: + ** simd_arg arg = {.dtype = simd_data_vf32}; + ** if (simd_arg_from_obj(vector_obj, &arg) < 0) { + ** // fails to convert a python simd_vector to NPYV vector + ** return; + ** } + ** npyv_f32 add_one = npyv_add_f32(arg->data.vu8, npyv_setall_f32(1)); + ** ... + */ +static int +simd_arg_from_obj(PyObject *obj, simd_arg *arg); +/** + * Convert a simd_data `arg->data` to into a Python object according to the + * required data type `arg->dtype`. + * + * Return NULL and raise Python exception on failure, otherwise return + * new reference. + * + * Example: + ** simd_arg arg = {.dtype = simd_data_u32, .data = {.u32 = 0xffffffff}}; + ** PyObject *obj = simd_arg_to_obj(&arg); + ** if (obj == NULL) { + ** // fails convert C uint32 to Python integer. + ** return; + ** } + ** + */ +static PyObject * +simd_arg_to_obj(const simd_arg *arg); +/** + * Similar to simd_arg_from_obj() but convert multiple objects + * from a Python tuple. + * This function is used to parse method parameters. + * + * Notes: + * - requires `simd_args_sequence_free()` or `simd_sequence_free()` + * to free allocated C array, in case of sequence data types. + * - the number of minimum acceptable elements for sequence data + * types is the number of lanes of the equivalent vector data type. + * - use 'arg->obj' to retrieve the parameter obj. + * + * Parameters: + * - `tuple_obj`: a valid Python tuple + * - `args`: array of 'simd_arg' contain valid data types + * - `args_len`: length of `args`. + * - `method_name`: method name, required for exception message. + * + * Return -1 and raise Python exception on failure, otherwise return 0. 
+ */ +static int +simd_args_from_tuple(PyObject *tuple_obj, simd_arg *args, int args_len, const char *method_name); +/** + * Free the allocated C array for sequence data types. + */ +static void +simd_args_sequence_free(simd_arg *args, int args_len); + +#endif // NPY_SIMD diff --git a/numpy/core/src/_simd/_simd_inc_arg.h b/numpy/core/src/_simd/_simd_inc_arg.h new file mode 100644 index 000000000..1db1ff202 --- /dev/null +++ b/numpy/core/src/_simd/_simd_inc_arg.h @@ -0,0 +1,108 @@ +#if !NPY_SIMD + #error "Not a standalone header, only works through 'simd.dispatch.c.src'" +#endif + +/************************************ + ** Protected Definitions + ************************************/ +static int +simd_arg_from_obj(PyObject *obj, simd_arg *arg) +{ + assert(arg->dtype != 0); + const simd_data_info *info = simd_data_getinfo(arg->dtype); + if (info->is_scalar) { + arg->data = simd_scalar_from_obj(obj, arg->dtype); + } + else if (info->is_sequence) { + unsigned min_seq_size = simd_data_getinfo(info->to_vector)->nlanes; + arg->data.qu8 = simd_sequence_from_obj(obj, arg->dtype, min_seq_size); + } + else if (info->is_vectorx) { + arg->data = simd_vectorx_from_obj(obj, arg->dtype); + } + else if (info->is_vector) { + arg->data = simd_vector_from_obj((simd_vector*)obj, arg->dtype); + } else { + arg->data.u64 = 0; + PyErr_Format(PyExc_RuntimeError, + "unhandled arg from obj type id:%d, name:%s", arg->dtype, info->pyname + ); + return -1; + } + if (PyErr_Occurred()) { + return -1; + } + return 0; +} + +static PyObject * +simd_arg_to_obj(const simd_arg *arg) +{ + assert(arg->dtype != 0); + const simd_data_info *info = simd_data_getinfo(arg->dtype); + if (info->is_scalar) { + return simd_scalar_to_obj(arg->data, arg->dtype); + } + if (info->is_sequence) { + return simd_sequence_to_obj(arg->data.qu8, arg->dtype); + } + if (info->is_vectorx) { + return simd_vectorx_to_obj(arg->data, arg->dtype); + } + if (info->is_vector) { + return (PyObject*)simd_vector_to_obj(arg->data, 
arg->dtype); + } + PyErr_Format(PyExc_RuntimeError, + "unhandled arg to object type id:%d, name:%s", arg->dtype, info->pyname + ); + return NULL; +} + +static void +simd_args_sequence_free(simd_arg *args, int args_len) +{ + assert(args_len > 0); + while (--args_len >= 0) { + simd_arg *arg = &args[args_len]; + const simd_data_info *info = simd_data_getinfo(arg->dtype); + if (!info->is_sequence) { + continue; + } + simd_sequence_free(arg->data.qu8); + } +} + +static int +simd_args_from_tuple(PyObject *tuple_obj, simd_arg *args, int args_len, const char *method_name) +{ + assert(args_len > 0); + assert(PyTuple_Check(tuple_obj)); + + Py_ssize_t obj_arg_len = PyTuple_GET_SIZE(tuple_obj); + if (obj_arg_len != args_len) { + if (args_len == 1) { + PyErr_Format(PyExc_TypeError, + "%s() takes only one argument (%d given)", method_name, obj_arg_len + ); + return -1; + } + PyErr_Format(PyExc_TypeError, + "%s() takes exactly %d arguments (%d given)", method_name, args_len, obj_arg_len + ); + return -1; + } + for (int arg_pos = 0; arg_pos < args_len; ++arg_pos) { + simd_arg *arg = &args[arg_pos]; + arg->obj = PyTuple_GET_ITEM(tuple_obj, arg_pos); + assert(arg->obj != NULL); + if (simd_arg_from_obj(arg->obj, arg) != 0) { + // free previous args + if (arg_pos > 0) { + simd_args_sequence_free(args, arg_pos); + } + // TODO: improve log by add argument number and method name + return -1; + } + } + return 0; +} diff --git a/numpy/core/src/_simd/_simd_inc_convert.h b/numpy/core/src/_simd/_simd_inc_convert.h new file mode 100644 index 000000000..360101247 --- /dev/null +++ b/numpy/core/src/_simd/_simd_inc_convert.h @@ -0,0 +1,206 @@ +#if !NPY_SIMD + #error "Not a standalone header, only works through 'simd.dispatch.c.src'" +#endif + +/************************************ + ** Protected Definitions + ************************************/ +static simd_data +simd_scalar_from_obj(PyObject *obj, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + 
assert(info->is_scalar && info->lane_size > 0); + simd_data data; + if (info->is_float) { + data.f64 = PyFloat_AsDouble(obj); + if (dtype == simd_data_f32){ + data.f32 = (float)data.f64; + } + } else { + data.u64 = PyLong_AsUnsignedLongLongMask(obj); + } + return data; +} + +static PyObject * +simd_scalar_to_obj(simd_data data, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + assert(info->is_scalar && info->lane_size > 0); + if (info->is_float) { + if (dtype == simd_data_f32) { + return PyFloat_FromDouble(data.f32); + } + return PyFloat_FromDouble(data.f64); + } + int leftb = (sizeof(npyv_lanetype_u64) - info->lane_size) * 8; + data.u64 <<= leftb; + if (info->is_signed) { + return PyLong_FromLongLong(data.s64 >> leftb); + } + return PyLong_FromUnsignedLongLong(data.u64 >> leftb); +} + +static void * +simd_sequence_new(Py_ssize_t len, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + assert(info->is_sequence && info->lane_size > 0); + + size_t size = NPY_SIMD_WIDTH + sizeof(size_t) + sizeof(size_t*); + size += len * info->lane_size; + + size_t *ptr = malloc(size); + if (ptr == NULL) { + return PyErr_NoMemory(); + } + *(ptr++) = len; + size_t **a_ptr = (size_t**)( + ((size_t)ptr + NPY_SIMD_WIDTH) & ~(size_t)(NPY_SIMD_WIDTH-1) + ); + a_ptr[-1] = ptr; + return a_ptr; +} + +static size_t +simd_sequence_len(const void *ptr) +{ + size_t *ptrz = ((size_t**)ptr)[-1]; + return *(ptrz-1); +} + +static void +simd_sequence_free(void *ptr) +{ + size_t *ptrz = ((size_t**)ptr)[-1]; + free(ptrz-1); +} + +static void * +simd_sequence_from_obj(PyObject *obj, simd_data_type dtype, unsigned min_size) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + assert(info->is_sequence && info->lane_size > 0); + PyObject *seq_obj = PySequence_Fast(obj, "expected a sequence"); + if (seq_obj == NULL) { + return NULL; + } + Py_ssize_t seq_size = PySequence_Fast_GET_SIZE(seq_obj); + if (seq_size < (Py_ssize_t)min_size) { 
+ PyErr_Format(PyExc_ValueError, + "minimum acceptable size of the required sequence is %d, given(%d)", + min_size, seq_size + ); + return NULL; + } + npyv_lanetype_u8 *dst = simd_sequence_new(seq_size, dtype); + if (dst == NULL) { + return NULL; + } + PyObject **seq_items = PySequence_Fast_ITEMS(seq_obj); + for (Py_ssize_t i = 0; i < seq_size; ++i) { + simd_data data = simd_scalar_from_obj(seq_items[i], info->to_scalar); + npyv_lanetype_u8 *sdst = dst + i * info->lane_size; + memcpy(sdst, &data.u64, info->lane_size); + } + Py_DECREF(seq_obj); + + if (PyErr_Occurred()) { + simd_sequence_free(dst); + return NULL; + } + return dst; +} + +static int +simd_sequence_fill_obj(PyObject *obj, const void *ptr, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + if (!PySequence_Check(obj)) { + PyErr_Format(PyExc_TypeError, + "a sequence object is required to fill %s", info->pyname + ); + return -1; + } + const npyv_lanetype_u8 *src = ptr; + Py_ssize_t seq_len = (Py_ssize_t)simd_sequence_len(ptr); + for (Py_ssize_t i = 0; i < seq_len; ++i) { + const npyv_lanetype_u8 *ssrc = src + i * info->lane_size; + simd_data data; + memcpy(&data.u64, ssrc, info->lane_size); + PyObject *item = simd_scalar_to_obj(data, info->to_scalar); + if (item == NULL) { + return -1; + } + if (PySequence_SetItem(obj, i, item) < 0) { + Py_DECREF(item); + return -1; + } + } + return 0; +} + +static PyObject * +simd_sequence_to_obj(const void *ptr, simd_data_type dtype) +{ + PyObject *list = PyList_New((Py_ssize_t)simd_sequence_len(ptr)); + if (list == NULL) { + return NULL; + } + if (simd_sequence_fill_obj(list, ptr, dtype) < 0) { + Py_DECREF(list); + return NULL; + } + return list; +} + +static simd_data +simd_vectorx_from_obj(PyObject *obj, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + // NPYV currently only supports x2 and x3 + assert(info->is_vectorx > 1 && info->is_vectorx < 4); + + simd_data data = {.u64 = 0}; + if 
(!PyTuple_Check(obj) || PyTuple_GET_SIZE(obj) != info->is_vectorx) { + PyErr_Format(PyExc_TypeError, + "a tuple of %d vector type %s is required", + info->is_vectorx, simd_data_getinfo(info->to_vector)->pyname + ); + return data; + } + for (int i = 0; i < info->is_vectorx; ++i) { + PyObject *item = PyTuple_GET_ITEM(obj, i); + // get the max multi-vec and let the compiler do the rest + data.vu64x3.val[i] = simd_vector_from_obj((simd_vector*)item, info->to_vector).vu64; + if (PyErr_Occurred()) { + return data; + } + } + return data; +} + +static PyObject * +simd_vectorx_to_obj(simd_data data, simd_data_type dtype) +{ + const simd_data_info *info = simd_data_getinfo(dtype); + // NPYV currently only supports x2 and x3 + assert(info->is_vectorx > 1 && info->is_vectorx < 4); + + PyObject *tuple = PyTuple_New(info->is_vectorx); + if (tuple == NULL) { + return NULL; + } + for (int i = 0; i < info->is_vectorx; ++i) { + // get the max multi-vector and let the compiler handle the rest + simd_data vdata = {.vu64 = data.vu64x3.val[i]}; + PyObject *item = (PyObject*)simd_vector_to_obj(vdata, info->to_vector); + if (item == NULL) { + // TODO: improve log add item number + Py_DECREF(tuple); + return NULL; + } + PyTuple_SET_ITEM(tuple, i, item); + } + return tuple; +} diff --git a/numpy/core/src/_simd/_simd_inc_data.h.src b/numpy/core/src/_simd/_simd_inc_data.h.src new file mode 100644 index 000000000..eefac483b --- /dev/null +++ b/numpy/core/src/_simd/_simd_inc_data.h.src @@ -0,0 +1,91 @@ +#if !NPY_SIMD + #error "Not a standalone header, only works through 'simd.dispatch.c.src'" +#endif + +/************************************ + ** Private Definitions + ************************************/ +static simd_data_info simd__data_registry[simd_data_end] = +{ + [simd_data_none] = {.pyname="none"}, + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + * #sig = 0*4, 1*4, 0*2# + * #fp = 0*4, 0*4, 1*2# + * #name = int*8, float, float# + */ + [simd_data_@sfx@] = { + 
.pyname="@name@", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@, + .is_scalar=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ + // sequences + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + * #sig = 0*4, 1*4, 0*2# + * #fp = 0*4, 0*4, 1*2# + * #name = int*8, float, float# + */ + [simd_data_q@sfx@] = { + .pyname="[@name@]", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@, + .is_sequence=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ + // vectors + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + * #sig = 0*4, 1*4, 0*2# + * #fp = 0*4, 0*4, 1*2# + */ + [simd_data_v@sfx@] = { + .pyname="npyv_@sfx@", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@, + .is_vector=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ + // boolean vectors, treated as unsigned and converted internally + // to add compatibility among all SIMD extensions + /**begin repeat + * #sfx = u8, u16, u32, u64# + * #bsfx = b8, b16, b32, b64# + */ + [simd_data_v@bsfx@] = { + .pyname="npyv_@bsfx@", .is_bool=1, .is_vector=1, + .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ + // multi-vectors x2 + /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + * #sig = 0*4, 1*4, 0*2# + * #fp = 0*4, 0*4, 1*2# + */ + [simd_data_v@sfx@x2] = { + .pyname="npyv_@sfx@x2", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@, + .is_vectorx=2, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .nlanes=2, .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ + // multi-vectors x3 
+ /**begin repeat + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64# + * #sig = 0*4, 1*4, 0*2# + * #fp = 0*4, 0*4, 1*2# + */ + [simd_data_v@sfx@x3] = { + .pyname="npyv_@sfx@x3", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@, + .is_vectorx=3, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@, + .nlanes=3, .lane_size = sizeof(npyv_lanetype_@sfx@) + }, + /**end repeat**/ +}; + +/************************************ + ** Protected Definitions + ************************************/ +static const simd_data_info * +simd_data_getinfo(simd_data_type dtype) +{ return &simd__data_registry[dtype]; } diff --git a/numpy/core/src/_simd/_simd_inc_easyintrin.h b/numpy/core/src/_simd/_simd_inc_easyintrin.h new file mode 100644 index 000000000..7216b373a --- /dev/null +++ b/numpy/core/src/_simd/_simd_inc_easyintrin.h @@ -0,0 +1,229 @@ +#if !NPY_SIMD + #error "Not a standalone header, only works through 'simd.dispatch.c.src'" +#endif + +#define SIMD_INTRIN_DEF(NAME) \ + { NPY_TOSTRING(NAME), simd__intrin_##NAME, METH_VARARGS, NULL } , // comma + +static int simd__no_arguments(PyObject *args, const char* method_name) +{ + if (args == NULL) { + return 0; + } + assert(PyTuple_Check(args)); + Py_ssize_t obj_arg_len = PyTuple_GET_SIZE(args); + if (obj_arg_len != 0) { + PyErr_Format(PyExc_RuntimeError, + "%s(), takes no arguments, given(%d)", method_name, obj_arg_len + ); + return -1; + } + return 0; +} + +#define SIMD_IMPL_INTRIN_0(NAME, RET) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + if (simd__no_arguments( \ + args, NPY_TOSTRING(NAME) \ + )) return NULL; \ + simd_arg a = { \ + .dtype = simd_data_##RET, \ + .data = {.RET = npyv_##NAME()}, \ + }; \ + return simd_arg_to_obj(&a); \ + } + +#define SIMD_IMPL_INTRIN_0N(NAME) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + if (simd__no_arguments( \ + args, NPY_TOSTRING(NAME) \ + )) return NULL; \ + 
npyv_##NAME(); \ + Py_RETURN_NONE; \ + } + +#define SIMD_IMPL_INTRIN_1(NAME, RET, IN0) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + simd_arg req_args[] = { \ + {.dtype = simd_data_##IN0}, \ + }; \ + if (simd_args_from_tuple( \ + args, req_args, 1, NPY_TOSTRING(NAME)) \ + ) return NULL; \ + simd_data r = {.RET = npyv_##NAME( \ + req_args[0].data.IN0 \ + )}; \ + simd_args_sequence_free(req_args, 1); \ + req_args[0].data = r; \ + req_args[0].dtype = simd_data_##RET; \ + return simd_arg_to_obj(req_args); \ + } + +#define SIMD_IMPL_INTRIN_2(NAME, RET, IN0, IN1) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + simd_arg req_args[] = { \ + {.dtype = simd_data_##IN0}, \ + {.dtype = simd_data_##IN1}, \ + }; \ + if (simd_args_from_tuple( \ + args, req_args, 2, NPY_TOSTRING(NAME)) \ + ) { \ + return NULL; \ + } \ + simd_data r = {.RET = npyv_##NAME( \ + req_args[0].data.IN0, \ + req_args[1].data.IN1 \ + )}; \ + simd_args_sequence_free(req_args, 2); \ + req_args[0].data = r; \ + req_args[0].dtype = simd_data_##RET; \ + return simd_arg_to_obj(req_args); \ + } + +#define SIMD__REPEAT_2IMM(C, NAME, IN0) \ + C == req_args[1].data.u8 ? 
NPY_CAT(npyv_, NAME)(req_args[0].data.IN0, C) : + +#define SIMD_IMPL_INTRIN_2IMM(NAME, RET, IN0, CONST_RNG) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + simd_arg req_args[] = { \ + {.dtype = simd_data_##IN0}, \ + {.dtype = simd_data_u8}, \ + }; \ + if (simd_args_from_tuple( \ + args, req_args, 2, NPY_TOSTRING(NAME)) \ + ) { \ + return NULL; \ + } \ + simd_data r; \ + r.RET = NPY_CAT(SIMD__IMPL_COUNT_, CONST_RNG)( \ + SIMD__REPEAT_2IMM, NAME, IN0 \ + ) npyv_##NAME(req_args[0].data.IN0, 0); \ + simd_args_sequence_free(req_args, 2); \ + req_args[0].data = r; \ + req_args[0].dtype = simd_data_##RET; \ + return simd_arg_to_obj(req_args); \ + } + +#define SIMD_IMPL_INTRIN_3(NAME, RET, IN0, IN1, IN2) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + simd_arg req_args[] = { \ + {.dtype = simd_data_##IN0}, \ + {.dtype = simd_data_##IN1}, \ + {.dtype = simd_data_##IN2}, \ + }; \ + if (simd_args_from_tuple( \ + args, req_args, 3, NPY_TOSTRING(NAME)) \ + ) { \ + return NULL; \ + } \ + simd_data r = {.RET = npyv_##NAME( \ + req_args[0].data.IN0, \ + req_args[1].data.IN1, \ + req_args[2].data.IN2 \ + )}; \ + simd_args_sequence_free(req_args, 3); \ + req_args[0].data = r; \ + req_args[0].dtype = simd_data_##RET; \ + return simd_arg_to_obj(req_args); \ + } +/** + * Helper macros for repeating and expand a certain macro. + * Mainly used for converting a scalar to an immediate constant. + */ +#define SIMD__IMPL_COUNT_7(FN, ...) \ + NPY_EXPAND(FN(0, __VA_ARGS__)) \ + SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) + +#define SIMD__IMPL_COUNT_8(FN, ...) \ + SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(8, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_15(FN, ...) \ + NPY_EXPAND(FN(0, __VA_ARGS__)) \ + SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) + +#define SIMD__IMPL_COUNT_16(FN, ...) 
\ + SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(16, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_31(FN, ...) \ + NPY_EXPAND(FN(0, __VA_ARGS__)) \ + SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) + +#define SIMD__IMPL_COUNT_32(FN, ...) \ + SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(32, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_47(FN, ...) \ + NPY_EXPAND(FN(0, __VA_ARGS__)) \ + SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) + +#define SIMD__IMPL_COUNT_48(FN, ...) \ + SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(48, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_63(FN, ...) \ + NPY_EXPAND(FN(0, __VA_ARGS__)) \ + SIMD__IMPL_COUNT_63_(FN, __VA_ARGS__) + +#define SIMD__IMPL_COUNT_64(FN, ...) \ + SIMD__IMPL_COUNT_63_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(64, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_7_(FN, ...) \ + NPY_EXPAND(FN(1, __VA_ARGS__)) \ + NPY_EXPAND(FN(2, __VA_ARGS__)) NPY_EXPAND(FN(3, __VA_ARGS__)) \ + NPY_EXPAND(FN(4, __VA_ARGS__)) NPY_EXPAND(FN(5, __VA_ARGS__)) \ + NPY_EXPAND(FN(6, __VA_ARGS__)) NPY_EXPAND(FN(7, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_15_(FN, ...) \ + SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(8, __VA_ARGS__)) NPY_EXPAND(FN(9, __VA_ARGS__)) \ + NPY_EXPAND(FN(10, __VA_ARGS__)) NPY_EXPAND(FN(11, __VA_ARGS__)) \ + NPY_EXPAND(FN(12, __VA_ARGS__)) NPY_EXPAND(FN(13, __VA_ARGS__)) \ + NPY_EXPAND(FN(14, __VA_ARGS__)) NPY_EXPAND(FN(15, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_31_(FN, ...) 
\ + SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(16, __VA_ARGS__)) NPY_EXPAND(FN(17, __VA_ARGS__)) \ + NPY_EXPAND(FN(18, __VA_ARGS__)) NPY_EXPAND(FN(19, __VA_ARGS__)) \ + NPY_EXPAND(FN(20, __VA_ARGS__)) NPY_EXPAND(FN(21, __VA_ARGS__)) \ + NPY_EXPAND(FN(22, __VA_ARGS__)) NPY_EXPAND(FN(23, __VA_ARGS__)) \ + NPY_EXPAND(FN(24, __VA_ARGS__)) NPY_EXPAND(FN(25, __VA_ARGS__)) \ + NPY_EXPAND(FN(26, __VA_ARGS__)) NPY_EXPAND(FN(27, __VA_ARGS__)) \ + NPY_EXPAND(FN(28, __VA_ARGS__)) NPY_EXPAND(FN(29, __VA_ARGS__)) \ + NPY_EXPAND(FN(30, __VA_ARGS__)) NPY_EXPAND(FN(31, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_47_(FN, ...) \ + SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(32, __VA_ARGS__)) NPY_EXPAND(FN(33, __VA_ARGS__)) \ + NPY_EXPAND(FN(34, __VA_ARGS__)) NPY_EXPAND(FN(35, __VA_ARGS__)) \ + NPY_EXPAND(FN(36, __VA_ARGS__)) NPY_EXPAND(FN(37, __VA_ARGS__)) \ + NPY_EXPAND(FN(38, __VA_ARGS__)) NPY_EXPAND(FN(39, __VA_ARGS__)) \ + NPY_EXPAND(FN(40, __VA_ARGS__)) NPY_EXPAND(FN(41, __VA_ARGS__)) \ + NPY_EXPAND(FN(42, __VA_ARGS__)) NPY_EXPAND(FN(43, __VA_ARGS__)) \ + NPY_EXPAND(FN(44, __VA_ARGS__)) NPY_EXPAND(FN(45, __VA_ARGS__)) \ + NPY_EXPAND(FN(46, __VA_ARGS__)) NPY_EXPAND(FN(47, __VA_ARGS__)) + +#define SIMD__IMPL_COUNT_63_(FN, ...) 
\ + SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) \ + NPY_EXPAND(FN(48, __VA_ARGS__)) NPY_EXPAND(FN(49, __VA_ARGS__)) \ + NPY_EXPAND(FN(50, __VA_ARGS__)) NPY_EXPAND(FN(51, __VA_ARGS__)) \ + NPY_EXPAND(FN(52, __VA_ARGS__)) NPY_EXPAND(FN(53, __VA_ARGS__)) \ + NPY_EXPAND(FN(54, __VA_ARGS__)) NPY_EXPAND(FN(55, __VA_ARGS__)) \ + NPY_EXPAND(FN(56, __VA_ARGS__)) NPY_EXPAND(FN(57, __VA_ARGS__)) \ + NPY_EXPAND(FN(58, __VA_ARGS__)) NPY_EXPAND(FN(59, __VA_ARGS__)) \ + NPY_EXPAND(FN(60, __VA_ARGS__)) NPY_EXPAND(FN(61, __VA_ARGS__)) \ + NPY_EXPAND(FN(62, __VA_ARGS__)) NPY_EXPAND(FN(63, __VA_ARGS__)) diff --git a/numpy/core/src/_simd/_simd_inc_vector.h b/numpy/core/src/_simd/_simd_inc_vector.h new file mode 100644 index 000000000..b0fa17b9a --- /dev/null +++ b/numpy/core/src/_simd/_simd_inc_vector.h @@ -0,0 +1,187 @@ +#if !NPY_SIMD + #error "Not a standalone header, only works through 'simd.dispatch.c.src'" +#endif + +/************************************ + ** Private Definitions + ************************************/ +// PySequenceMethods +static Py_ssize_t +simd__vector_length(simd_vector *self) +{ + return simd_data_getinfo(self->type)->nlanes; +} +static PyObject * +simd__vector_item(simd_vector *self, Py_ssize_t i) +{ + const simd_data_info *info = simd_data_getinfo(self->type); + int nlanes = info->nlanes; + if (i >= nlanes) { + PyErr_SetString(PyExc_IndexError, "list index out of range"); + return NULL; + } + npyv_lanetype_u8 *src = self->data + i * info->lane_size; + simd_data data; + memcpy(&data.u64, src, info->lane_size); + return simd_scalar_to_obj(data, info->to_scalar); +} + +static PySequenceMethods simd__vector_as_sequence = { + (lenfunc) simd__vector_length, /* sq_length */ + (binaryfunc) NULL, /* sq_concat */ + (ssizeargfunc) NULL, /* sq_repeat */ + (ssizeargfunc) simd__vector_item, /* sq_item */ + (ssizessizeargfunc) NULL, /* sq_slice */ + (ssizeobjargproc) NULL, /* sq_ass_item */ + (ssizessizeobjargproc) NULL, /* sq_ass_slice */ + (objobjproc) NULL, /* sq_contains 
*/ + (binaryfunc) NULL, /* sq_inplace_concat */ + (ssizeargfunc) NULL, /* sq_inplace_repeat */ +}; + +// PyGetSetDef +static PyObject * +simd__vector_name(simd_vector *self) +{ + return PyUnicode_FromString(simd_data_getinfo(self->type)->pyname); +} +static PyGetSetDef simd__vector_getset[] = { + { "__name__", (getter)simd__vector_name, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL, NULL } +}; + +// PyTypeObject(simd__vector_type) +static PyObject * +simd__vector_repr(PyObject *self) +{ + // PySequence_Fast returns Tuple in PyPy + PyObject *obj = PySequence_List(self); + if (obj != NULL) { + PyObject *repr = PyObject_Str(obj); + Py_DECREF(obj); + return repr; + } + return obj; +} +static PyObject * +simd__vector_compare(PyObject *self, PyObject *other, int cmp_op) +{ + PyObject *obj; + if (PyTuple_Check(other)) { + obj = PySequence_Tuple(self); + } else if (PyList_Check(other)) { + obj = PySequence_List(self); + } else { + obj = PySequence_Fast(self, "invalid argument, expected a vector"); + } + if (obj != NULL) { + PyObject *rich = PyObject_RichCompare(obj, other, cmp_op); + Py_DECREF(obj); + return rich; + } + return obj; +} +static PyTypeObject simd__vector_type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(VECTOR)), + .tp_basicsize = sizeof(simd_vector), + .tp_repr = (reprfunc)simd__vector_repr, + .tp_as_sequence = &simd__vector_as_sequence, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_richcompare = simd__vector_compare, + .tp_getset = simd__vector_getset +}; + +/************************************ + ** Protected Definitions + ************************************/ +static simd_vector * +simd_vector_to_obj(simd_data data, simd_data_type vtype) +{ + const simd_data_info *info = simd_data_getinfo(vtype); + assert(info->is_vector && info->nlanes > 0); + + simd_vector *vec = PyObject_New(simd_vector, &simd__vector_type); + if (vec == NULL) { + return (simd_vector*)PyErr_NoMemory(); + } + vec->type = vtype; + if (info->is_bool) { 
+ // boolean vectors are internally treated as unsigned + // vectors to add compatibility among all SIMD extensions + switch(vtype) { + case simd_data_vb8: + data.vu8 = npyv_cvt_u8_b8(data.vb8); + break; + case simd_data_vb16: + data.vu16 = npyv_cvt_u16_b16(data.vb16); + break; + case simd_data_vb32: + data.vu32 = npyv_cvt_u32_b32(data.vb32); + break; + default: + data.vu64 = npyv_cvt_u64_b64(data.vb64); + } + } + npyv_store_u8(vec->data, data.vu8); + return vec; +} + +static simd_data +simd_vector_from_obj(simd_vector *vec, simd_data_type vtype) +{ + const simd_data_info *info = simd_data_getinfo(vtype); + assert(info->is_vector && info->nlanes > 0); + + simd_data data = {.u64 = 0}; + if (!PyObject_IsInstance( + (PyObject *)vec, (PyObject *)&simd__vector_type + )) { + PyErr_Format(PyExc_TypeError, + "a vector type %s is required", info->pyname + ); + return data; + } + if (vec->type != vtype) { + PyErr_Format(PyExc_TypeError, + "a vector type %s is required, got(%s)", + info->pyname, simd_data_getinfo(vec->type)->pyname + ); + return data; + } + + data.vu8 = npyv_load_u8(vec->data); + if (info->is_bool) { + // boolean vectors are internally treated as unsigned + // vectors to add compatibility among all SIMD extensions + switch(vtype) { + case simd_data_vb8: + data.vb8 = npyv_cvt_b8_u8(data.vu8); + break; + case simd_data_vb16: + data.vb16 = npyv_cvt_b16_u16(data.vu16); + break; + case simd_data_vb32: + data.vb32 = npyv_cvt_b32_u32(data.vu32); + break; + default: + data.vb64 = npyv_cvt_b64_u64(data.vu64); + } + } + return data; +} + +static int +simd_vector_register(PyObject *module) +{ + Py_INCREF(&simd__vector_type); + if (PyType_Ready(&simd__vector_type)) { + return -1; + } + if (PyModule_AddObject( + module, "vector_type",(PyObject *)&simd__vector_type + )) { + return -1; + } + return 0; +} diff --git a/numpy/distutils/command/build.py b/numpy/distutils/command/build.py index 60ba4c917..6025586cd 100644 --- a/numpy/distutils/command/build.py +++ 
b/numpy/distutils/command/build.py @@ -22,6 +22,8 @@ class build(old_build): "specify a list of dispatched CPU optimizations"), ('disable-optimization', None, "disable CPU optimized code(dispatch,simd,fast...)"), + ('simd-test=', None, + "specify a list of CPU optimizations to be tested against NumPy SIMD interface"), ] help_options = old_build.help_options + [ @@ -36,6 +38,16 @@ class build(old_build): self.cpu_baseline = "min" self.cpu_dispatch = "max -xop -fma4" # drop AMD legacy features by default self.disable_optimization = False + """ + the '_simd' module is a very large. Adding more dispatched features + will increase binary size and compile time. By default we minimize + the targeted features to those most commonly used by the NumPy SIMD interface(NPYV), + NOTE: any specified features will be ignored if they're: + - part of the baseline(--cpu-baseline) + - not part of dispatch-able features(--cpu-dispatch) + - not supported by compiler or platform + """ + self.simd_test = "BASELINE SSE2 SSE41 SSE42 XOP (FMA3 AVX2) AVX512F AVX512_SKX VSX VSX2 VSX3 NEON ASIMD" def finalize_options(self): build_scripts = self.build_scripts diff --git a/numpy/distutils/command/build_ext.py b/numpy/distutils/command/build_ext.py index 1a881c56a..ca6f8bcd2 100644 --- a/numpy/distutils/command/build_ext.py +++ b/numpy/distutils/command/build_ext.py @@ -19,8 +19,7 @@ from numpy.distutils.misc_util import ( has_cxx_sources, has_f_sources, is_sequence ) from numpy.distutils.command.config_compiler import show_fortran_compilers -from numpy.distutils.ccompiler_opt import new_ccompiler_opt - +from numpy.distutils.ccompiler_opt import new_ccompiler_opt, CCompilerOpt class build_ext (old_build_ext): @@ -39,6 +38,8 @@ class build_ext (old_build_ext): "specify a list of dispatched CPU optimizations"), ('disable-optimization', None, "disable CPU optimized code(dispatch,simd,fast...)"), + ('simd-test=', None, + "specify a list of CPU optimizations to be tested against NumPy SIMD interface"), 
] help_options = old_build_ext.help_options + [ @@ -56,6 +57,7 @@ class build_ext (old_build_ext): self.cpu_baseline = None self.cpu_dispatch = None self.disable_optimization = None + self.simd_test = None def finalize_options(self): if self.parallel: @@ -87,7 +89,9 @@ class build_ext (old_build_ext): ('cpu_baseline', 'cpu_baseline'), ('cpu_dispatch', 'cpu_dispatch'), ('disable_optimization', 'disable_optimization'), + ('simd_test', 'simd_test') ) + CCompilerOpt.conf_target_groups["simd_test"] = self.simd_test def run(self): if not self.extensions: |
