| author | Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> | 2021-10-18 14:04:17 -0700 |
|---|---|---|
| committer | Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> | 2021-10-18 14:04:17 -0700 |
| commit | 1950fb55ce21f2e125672979872dd4c7781d18ce (patch) | |
| tree | 75a8dfdd914c6e1ae6a531f8d7a54082fca92f1c /numpy | |
| parent | dd2eaaabdb0451631e95376ae2b4d319082b438e (diff) | |
BUG: NEON min/max is slow (#17989)
This fixes numpy/numpy#17989 by adding ARM NEON implementations for min/max and fmin/fmax.
Before: Rosetta was faster than native arm64 by `1.2x - 8.6x`.
After: native arm64 is faster than Rosetta by `1.6x - 6.7x` (a `2.8x - 15.5x` improvement).
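
For a sense of what the vectorized loops buy, here is a minimal standalone sketch of an AArch64 NEON max reduction over contiguous float32 data. This is illustrative only, not the code added by this commit: the real loops live in `loops_minmax.dispatch.c.src`, are written against NumPy's `npyv_*` universal intrinsics, and also handle strides and NaN propagation, which this sketch omits.

```c
// Minimal sketch: AArch64 NEON max reduction over a contiguous float array.
// Hypothetical helper for illustration; assumes len >= 1 and ignores NaN
// and stride handling that the real NumPy loops take care of.
#include <arm_neon.h>
#include <stddef.h>

static float max_f32(const float *src, size_t len)
{
    size_t i = 0;
    float32x4_t acc = vdupq_n_f32(src[0]);
    // Compare four lanes per iteration instead of one element at a time.
    for (; i + 4 <= len; i += 4) {
        acc = vmaxq_f32(acc, vld1q_f32(src + i));
    }
    // vmaxvq_f32 folds the four-lane accumulator to a scalar in one step.
    float r = vmaxvq_f32(acc);
    // Scalar tail for leftover elements.
    for (; i < len; i++) {
        r = src[i] > r ? src[i] : r;
    }
    return r;
}
```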
**Benchmarks**
```
before after ratio
[b0e1a445] [8301ffd7]
<main> <gh-issue-17989/improve-neon-min-max>
+ 32.6±0.04μs 37.5±0.08μs 1.15 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 1, 'd')
+ 32.6±0.06μs 37.5±0.04μs 1.15 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 1, 'd')
+ 37.8±0.09μs 43.2±0.09μs 1.14 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 4, 'f')
+ 37.7±0.09μs 42.9±0.1μs 1.14 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 2, 'd')
+ 37.9±0.2μs 43.0±0.02μs 1.14 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 2, 'd')
+ 37.7±0.01μs 42.3±1μs 1.12 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 2, 2, 'd')
+ 34.2±0.07μs 38.1±0.05μs 1.12 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 2, 'f')
+ 32.6±0.03μs 35.8±0.04μs 1.10 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 1, 'f')
+ 37.1±0.1μs 40.3±0.1μs 1.09 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 1, 2, 'd')
+ 37.2±0.1μs 40.3±0.04μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 4, 'f')
+ 37.1±0.09μs 40.3±0.07μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 1, 2, 'd')
+ 68.6±0.5μs 74.2±0.3μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 4, 4, 'd')
+ 37.1±0.2μs 40.0±0.1μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 1, 2, 'd')
+ 2.42±0μs 2.61±0.05μs 1.08 bench_core.CountNonzero.time_count_nonzero_axis(3, 100, <class 'numpy.int16'>)
+ 69.1±0.7μs 73.5±0.7μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 4, 4, 'd')
+ 54.7±0.3μs 58.0±0.2μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 4, 'd')
+ 54.5±0.2μs 57.8±0.2μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 2, 4, 'd')
+ 3.78±0.04μs 4.00±0.02μs 1.06 bench_core.CountNonzero.time_count_nonzero_multi_axis(2, 100, <class 'str'>)
+ 54.8±0.2μs 57.9±0.3μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 4, 'd')
+ 3.68±0.01μs 3.87±0.02μs 1.05 bench_core.CountNonzero.time_count_nonzero_multi_axis(1, 100, <class 'object'>)
+ 69.6±0.2μs 73.1±0.2μs 1.05 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 4, 'd')
+ 229±2μs 241±0.2μs 1.05 bench_random.Bounded.time_bounded('PCG64', [<class 'numpy.uint64'>, 1535])
- 73.0±0.8μs 69.5±0.2μs 0.95 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'trunc'>, 4, 4, 'd')
- 37.6±0.1μs 35.7±0.3μs 0.95 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'trunc'>, 1, 4, 'f')
- 88.7±0.04μs 84.2±0.7μs 0.95 bench_lib.Pad.time_pad((256, 128, 1), 1, 'wrap')
- 57.9±0.2μs 54.8±0.2μs 0.95 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'trunc'>, 2, 4, 'd')
- 39.9±0.2μs 37.2±0.04μs 0.93 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'positive'>, 1, 2, 'd')
- 2.66±0.01μs 2.47±0.01μs 0.93 bench_lib.Nan.time_nanmin(200, 0)
- 2.65±0.02μs 2.46±0.04μs 0.93 bench_lib.Nan.time_nanmin(200, 50.0)
- 2.64±0.01μs 2.45±0.01μs 0.93 bench_lib.Nan.time_nanmax(200, 90.0)
- 2.64±0μs 2.44±0.02μs 0.92 bench_lib.Nan.time_nanmax(200, 0)
- 2.68±0.02μs 2.48±0μs 0.92 bench_lib.Nan.time_nanmax(200, 2.0)
- 40.2±0.01μs 37.1±0.1μs 0.92 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 4, 'f')
- 2.69±0μs 2.47±0μs 0.92 bench_lib.Nan.time_nanmin(200, 2.0)
- 2.70±0.02μs 2.48±0.02μs 0.92 bench_lib.Nan.time_nanmax(200, 0.1)
- 2.70±0μs 2.47±0μs 0.91 bench_lib.Nan.time_nanmin(200, 90.0)
- 2.70±0μs 2.46±0μs 0.91 bench_lib.Nan.time_nanmin(200, 0.1)
- 2.70±0μs 2.42±0.01μs 0.90 bench_lib.Nan.time_nanmax(200, 50.0)
- 11.8±0.6ms 10.6±0.6ms 0.89 bench_core.CountNonzero.time_count_nonzero_axis(2, 1000000, <class 'str'>)
- 42.7±0.1μs 37.8±0.02μs 0.88 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'positive'>, 2, 2, 'd')
- 42.8±0.03μs 37.8±0.2μs 0.88 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'rint'>, 2, 2, 'd')
- 43.1±0.2μs 37.7±0.09μs 0.87 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 4, 4, 'f')
- 37.5±0.07μs 32.6±0.06μs 0.87 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'rint'>, 2, 1, 'd')
- 41.7±0.03μs 36.3±0.07μs 0.87 bench_ufunc_strides.Unary.time_ufunc(<ufunc '_ones_like'>, 1, 4, 'd')
- 166±0.8μs 144±1μs 0.87 bench_ufunc.UFunc.time_ufunc_types('fmin')
- 11.6±0.8ms 10.0±0.01ms 0.87 bench_core.CountNonzero.time_count_nonzero_multi_axis(2, 1000000, <class 'str'>)
- 167±0.9μs 144±2μs 0.86 bench_ufunc.UFunc.time_ufunc_types('minimum')
- 168±4μs 143±0.5μs 0.85 bench_ufunc.UFunc.time_ufunc_types('fmax')
- 167±1μs 142±0.8μs 0.85 bench_ufunc.UFunc.time_ufunc_types('maximum')
- 7.10±0μs 4.97±0.01μs 0.70 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'd', 2)
- 7.11±0.07μs 4.96±0.01μs 0.70 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'd', 2)
- 7.05±0.07μs 4.68±0μs 0.66 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'f', 4)
- 7.13±0μs 4.68±0.01μs 0.66 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'f', 4)
- 461±0.2μs 297±7μs 0.64 bench_app.MaxesOfDots.time_it
- 7.04±0.07μs 3.95±0μs 0.56 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'f', 2)
- 7.06±0.06μs 3.95±0.01μs 0.56 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'f', 2)
- 7.09±0.06μs 3.24±0μs 0.46 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'd', 1)
- 7.12±0.07μs 3.25±0.02μs 0.46 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'd', 1)
- 14.5±0.02μs 3.98±0μs 0.27 bench_reduce.MinMax.time_max(<class 'numpy.int64'>)
- 14.6±0.1μs 4.00±0.01μs 0.27 bench_reduce.MinMax.time_min(<class 'numpy.int64'>)
- 6.88±0.06μs 1.34±0μs 0.19 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'f', 1)
- 7.00±0μs 1.33±0μs 0.19 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'f', 1)
- 39.4±0.01μs 3.95±0.01μs 0.10 bench_reduce.MinMax.time_min(<class 'numpy.float64'>)
- 39.4±0.01μs 3.95±0.02μs 0.10 bench_reduce.MinMax.time_max(<class 'numpy.float64'>)
- 254±0.02μs 22.8±0.2μs 0.09 bench_lib.Nan.time_nanmax(200000, 50.0)
- 253±0.1μs 22.7±0.1μs 0.09 bench_lib.Nan.time_nanmin(200000, 0)
- 254±0.06μs 22.7±0.09μs 0.09 bench_lib.Nan.time_nanmin(200000, 2.0)
- 254±0.01μs 22.7±0.03μs 0.09 bench_lib.Nan.time_nanmin(200000, 0.1)
- 254±0.04μs 22.7±0.02μs 0.09 bench_lib.Nan.time_nanmin(200000, 50.0)
- 253±0.1μs 22.7±0.04μs 0.09 bench_lib.Nan.time_nanmax(200000, 0.1)
- 253±0.03μs 22.7±0.04μs 0.09 bench_lib.Nan.time_nanmin(200000, 90.0)
- 253±0.02μs 22.7±0.07μs 0.09 bench_lib.Nan.time_nanmax(200000, 0)
- 254±0.03μs 22.7±0.02μs 0.09 bench_lib.Nan.time_nanmax(200000, 90.0)
- 254±0.09μs 22.7±0.04μs 0.09 bench_lib.Nan.time_nanmax(200000, 2.0)
- 39.2±0.01μs 2.51±0.01μs 0.06 bench_reduce.MinMax.time_max(<class 'numpy.float32'>)
- 39.2±0.01μs 2.50±0.01μs 0.06 bench_reduce.MinMax.time_min(<class 'numpy.float32'>)
```
Size change of _multiarray_umath.cpython-39-darwin.so:
Before: 3,890,723 bytes
After: 3,924,035 bytes
Change: +33,312 bytes (~ +0.856 %)
Diffstat (limited to 'numpy')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | numpy/core/code_generators/generate_umath.py | 4 |
| -rw-r--r-- | numpy/core/setup.py | 1 |
| -rw-r--r-- | numpy/core/setup.py.orig | 1063 |
| -rw-r--r-- | numpy/core/src/common/simd/neon/math.h | 66 |
| -rw-r--r-- | numpy/core/src/umath/loops.c.src | 9 |
| -rw-r--r-- | numpy/core/src/umath/loops.h.src | 50 |
| -rw-r--r-- | numpy/core/src/umath/loops.h.src.orig | 667 |
| -rw-r--r-- | numpy/core/src/umath/loops_minmax.dispatch.c.src | 671 |

8 files changed, 2531 insertions, 0 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 3a27a34cd..c20468a8f 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -516,6 +516,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.maximum'), 'PyUFunc_SimpleUniformOperationTypeResolver', + TD(ints+inexactvec, dispatch=[('loops_minmax', ints+inexactvec)]), TD(noobj, simd=[('avx512f', 'fd')]), TD(O, f='npy_ObjectMax') ), @@ -523,6 +524,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.minimum'), 'PyUFunc_SimpleUniformOperationTypeResolver', + TD(ints+inexactvec, dispatch=[('loops_minmax', ints+inexactvec)]), TD(noobj, simd=[('avx512f', 'fd')]), TD(O, f='npy_ObjectMin') ), @@ -537,6 +539,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmax'), 'PyUFunc_SimpleUniformOperationTypeResolver', + TD(inexactvec, dispatch=[('loops_minmax', inexactvec)]), TD(noobj), TD(O, f='npy_ObjectMax') ), @@ -544,6 +547,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmin'), 'PyUFunc_SimpleUniformOperationTypeResolver', + TD(inexactvec, dispatch=[('loops_minmax', inexactvec)]), TD(noobj), TD(O, f='npy_ObjectMin') ), diff --git a/numpy/core/setup.py b/numpy/core/setup.py index f2524ae13..2b0e33244 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -943,6 +943,7 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), + join('src', 'umath', 'loops_minmax.dispatch.c.src'), join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), join('src', 'umath', 'loops_umath_fp.dispatch.c.src'), join('src', 'umath', 'loops_exponent_log.dispatch.c.src'), diff --git a/numpy/core/setup.py.orig b/numpy/core/setup.py.orig new file mode 100644 index 000000000..f2524ae13 --- /dev/null +++ b/numpy/core/setup.py.orig @@ -0,0 +1,1063 @@ +import os +import sys +import pickle +import copy +import warnings +import platform +import textwrap +import glob +from os.path import join + +from numpy.distutils import log +from distutils.dep_util import newer +from sysconfig import get_config_var +from numpy.compat import npy_load_module +from setup_common import * # noqa: F403 + +# Set to True to enable relaxed strides checking. This (mostly) means +# that `strides[dim]` is ignored if `shape[dim] == 1` when setting flags. +NPY_RELAXED_STRIDES_CHECKING = (os.environ.get('NPY_RELAXED_STRIDES_CHECKING', "1") != "0") + +# Put NPY_RELAXED_STRIDES_DEBUG=1 in the environment if you want numpy to use a +# bogus value for affected strides in order to help smoke out bad stride usage +# when relaxed stride checking is enabled. +NPY_RELAXED_STRIDES_DEBUG = (os.environ.get('NPY_RELAXED_STRIDES_DEBUG', "0") != "0") +NPY_RELAXED_STRIDES_DEBUG = NPY_RELAXED_STRIDES_DEBUG and NPY_RELAXED_STRIDES_CHECKING + +# XXX: ugly, we use a class to avoid calling twice some expensive functions in +# config.h/numpyconfig.h. I don't see a better way because distutils force +# config.h generation inside an Extension class, and as such sharing +# configuration information between extensions is not easy. +# Using a pickled-based memoize does not work because config_cmd is an instance +# method, which cPickle does not like. 
+# +# Use pickle in all cases, as cPickle is gone in python3 and the difference +# in time is only in build. -- Charles Harris, 2013-03-30 + +class CallOnceOnly: + def __init__(self): + self._check_types = None + self._check_ieee_macros = None + self._check_complex = None + + def check_types(self, *a, **kw): + if self._check_types is None: + out = check_types(*a, **kw) + self._check_types = pickle.dumps(out) + else: + out = copy.deepcopy(pickle.loads(self._check_types)) + return out + + def check_ieee_macros(self, *a, **kw): + if self._check_ieee_macros is None: + out = check_ieee_macros(*a, **kw) + self._check_ieee_macros = pickle.dumps(out) + else: + out = copy.deepcopy(pickle.loads(self._check_ieee_macros)) + return out + + def check_complex(self, *a, **kw): + if self._check_complex is None: + out = check_complex(*a, **kw) + self._check_complex = pickle.dumps(out) + else: + out = copy.deepcopy(pickle.loads(self._check_complex)) + return out + +def can_link_svml(): + """SVML library is supported only on x86_64 architecture and currently + only on linux + """ + machine = platform.machine() + system = platform.system() + return "x86_64" in machine and system == "Linux" + +def check_svml_submodule(svmlpath): + if not os.path.exists(svmlpath + "/README.md"): + raise RuntimeError("Missing `SVML` submodule! Run `git submodule " + "update --init` to fix this.") + return True + +def pythonlib_dir(): + """return path where libpython* is.""" + if sys.platform == 'win32': + return os.path.join(sys.prefix, "libs") + else: + return get_config_var('LIBDIR') + +def is_npy_no_signal(): + """Return True if the NPY_NO_SIGNAL symbol must be defined in configuration + header.""" + return sys.platform == 'win32' + +def is_npy_no_smp(): + """Return True if the NPY_NO_SMP symbol must be defined in public + header (when SMP support cannot be reliably enabled).""" + # Perhaps a fancier check is in order here. + # so that threads are only enabled if there + # are actually multiple CPUS? -- but + # threaded code can be nice even on a single + # CPU so that long-calculating code doesn't + # block. + return 'NPY_NOSMP' in os.environ + +def win32_checks(deflist): + from numpy.distutils.misc_util import get_build_architecture + a = get_build_architecture() + + # Distutils hack on AMD64 on windows + print('BUILD_ARCHITECTURE: %r, os.name=%r, sys.platform=%r' % + (a, os.name, sys.platform)) + if a == 'AMD64': + deflist.append('DISTUTILS_USE_SDK') + + # On win32, force long double format string to be 'g', not + # 'Lg', since the MS runtime does not support long double whose + # size is > sizeof(double) + if a == "Intel" or a == "AMD64": + deflist.append('FORCE_NO_LONG_DOUBLE_FORMATTING') + +def check_math_capabilities(config, ext, moredefs, mathlibs): + def check_func(func_name): + return config.check_func(func_name, libraries=mathlibs, + decl=True, call=True) + + def check_funcs_once(funcs_name): + decl = dict([(f, True) for f in funcs_name]) + st = config.check_funcs_once(funcs_name, libraries=mathlibs, + decl=decl, call=decl) + if st: + moredefs.extend([(fname2def(f), 1) for f in funcs_name]) + return st + + def check_funcs(funcs_name): + # Use check_funcs_once first, and if it does not work, test func per + # func. 
Return success only if all the functions are available + if not check_funcs_once(funcs_name): + # Global check failed, check func per func + for f in funcs_name: + if check_func(f): + moredefs.append((fname2def(f), 1)) + return 0 + else: + return 1 + + #use_msvc = config.check_decl("_MSC_VER") + + if not check_funcs_once(MANDATORY_FUNCS): + raise SystemError("One of the required function to build numpy is not" + " available (the list is %s)." % str(MANDATORY_FUNCS)) + + # Standard functions which may not be available and for which we have a + # replacement implementation. Note that some of these are C99 functions. + + # XXX: hack to circumvent cpp pollution from python: python put its + # config.h in the public namespace, so we have a clash for the common + # functions we test. We remove every function tested by python's + # autoconf, hoping their own test are correct + for f in OPTIONAL_STDFUNCS_MAYBE: + if config.check_decl(fname2def(f), + headers=["Python.h", "math.h"]): + OPTIONAL_STDFUNCS.remove(f) + + check_funcs(OPTIONAL_STDFUNCS) + + for h in OPTIONAL_HEADERS: + if config.check_func("", decl=False, call=False, headers=[h]): + h = h.replace(".", "_").replace(os.path.sep, "_") + moredefs.append((fname2def(h), 1)) + + for tup in OPTIONAL_INTRINSICS: + headers = None + if len(tup) == 2: + f, args, m = tup[0], tup[1], fname2def(tup[0]) + elif len(tup) == 3: + f, args, headers, m = tup[0], tup[1], [tup[2]], fname2def(tup[0]) + else: + f, args, headers, m = tup[0], tup[1], [tup[2]], fname2def(tup[3]) + if config.check_func(f, decl=False, call=True, call_args=args, + headers=headers): + moredefs.append((m, 1)) + + for dec, fn in OPTIONAL_FUNCTION_ATTRIBUTES: + if config.check_gcc_function_attribute(dec, fn): + moredefs.append((fname2def(fn), 1)) + if fn == 'attribute_target_avx512f': + # GH-14787: Work around GCC<8.4 bug when compiling with AVX512 + # support on Windows-based platforms + if (sys.platform in ('win32', 'cygwin') and + config.check_compiler_gcc() and + not config.check_gcc_version_at_least(8, 4)): + ext.extra_compile_args.extend( + ['-ffixed-xmm%s' % n for n in range(16, 32)]) + + for dec, fn, code, header in OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS: + if config.check_gcc_function_attribute_with_intrinsics(dec, fn, code, + header): + moredefs.append((fname2def(fn), 1)) + + for fn in OPTIONAL_VARIABLE_ATTRIBUTES: + if config.check_gcc_variable_attribute(fn): + m = fn.replace("(", "_").replace(")", "_") + moredefs.append((fname2def(m), 1)) + + # C99 functions: float and long double versions + check_funcs(C99_FUNCS_SINGLE) + check_funcs(C99_FUNCS_EXTENDED) + +def check_complex(config, mathlibs): + priv = [] + pub = [] + + try: + if os.uname()[0] == "Interix": + warnings.warn("Disabling broken complex support. See #1365", stacklevel=2) + return priv, pub + except Exception: + # os.uname not available on all platforms. 
blanket except ugly but safe + pass + + # Check for complex support + st = config.check_header('complex.h') + if st: + priv.append(('HAVE_COMPLEX_H', 1)) + pub.append(('NPY_USE_C99_COMPLEX', 1)) + + for t in C99_COMPLEX_TYPES: + st = config.check_type(t, headers=["complex.h"]) + if st: + pub.append(('NPY_HAVE_%s' % type2def(t), 1)) + + def check_prec(prec): + flist = [f + prec for f in C99_COMPLEX_FUNCS] + decl = dict([(f, True) for f in flist]) + if not config.check_funcs_once(flist, call=decl, decl=decl, + libraries=mathlibs): + for f in flist: + if config.check_func(f, call=True, decl=True, + libraries=mathlibs): + priv.append((fname2def(f), 1)) + else: + priv.extend([(fname2def(f), 1) for f in flist]) + + check_prec('') + check_prec('f') + check_prec('l') + + return priv, pub + +def check_ieee_macros(config): + priv = [] + pub = [] + + macros = [] + + def _add_decl(f): + priv.append(fname2def("decl_%s" % f)) + pub.append('NPY_%s' % fname2def("decl_%s" % f)) + + # XXX: hack to circumvent cpp pollution from python: python put its + # config.h in the public namespace, so we have a clash for the common + # functions we test. We remove every function tested by python's + # autoconf, hoping their own test are correct + _macros = ["isnan", "isinf", "signbit", "isfinite"] + for f in _macros: + py_symbol = fname2def("decl_%s" % f) + already_declared = config.check_decl(py_symbol, + headers=["Python.h", "math.h"]) + if already_declared: + if config.check_macro_true(py_symbol, + headers=["Python.h", "math.h"]): + pub.append('NPY_%s' % fname2def("decl_%s" % f)) + else: + macros.append(f) + # Normally, isnan and isinf are macro (C99), but some platforms only have + # func, or both func and macro version. Check for macro only, and define + # replacement ones if not found. + # Note: including Python.h is necessary because it modifies some math.h + # definitions + for f in macros: + st = config.check_decl(f, headers=["Python.h", "math.h"]) + if st: + _add_decl(f) + + return priv, pub + +def check_types(config_cmd, ext, build_dir): + private_defines = [] + public_defines = [] + + # Expected size (in number of bytes) for each type. This is an + # optimization: those are only hints, and an exhaustive search for the size + # is done if the hints are wrong. + expected = {'short': [2], 'int': [4], 'long': [8, 4], + 'float': [4], 'double': [8], 'long double': [16, 12, 8], + 'Py_intptr_t': [8, 4], 'PY_LONG_LONG': [8], 'long long': [8], + 'off_t': [8, 4]} + + # Check we have the python header (-dev* packages on Linux) + result = config_cmd.check_header('Python.h') + if not result: + python = 'python' + if '__pypy__' in sys.builtin_module_names: + python = 'pypy' + raise SystemError( + "Cannot compile 'Python.h'. 
Perhaps you need to " + "install {0}-dev|{0}-devel.".format(python)) + res = config_cmd.check_header("endian.h") + if res: + private_defines.append(('HAVE_ENDIAN_H', 1)) + public_defines.append(('NPY_HAVE_ENDIAN_H', 1)) + res = config_cmd.check_header("sys/endian.h") + if res: + private_defines.append(('HAVE_SYS_ENDIAN_H', 1)) + public_defines.append(('NPY_HAVE_SYS_ENDIAN_H', 1)) + + # Check basic types sizes + for type in ('short', 'int', 'long'): + res = config_cmd.check_decl("SIZEOF_%s" % sym2def(type), headers=["Python.h"]) + if res: + public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), "SIZEOF_%s" % sym2def(type))) + else: + res = config_cmd.check_type_size(type, expected=expected[type]) + if res >= 0: + public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" % type) + + for type in ('float', 'double', 'long double'): + already_declared = config_cmd.check_decl("SIZEOF_%s" % sym2def(type), + headers=["Python.h"]) + res = config_cmd.check_type_size(type, expected=expected[type]) + if res >= 0: + public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), '%d' % res)) + if not already_declared and not type == 'long double': + private_defines.append(('SIZEOF_%s' % sym2def(type), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" % type) + + # Compute size of corresponding complex type: used to check that our + # definition is binary compatible with C99 complex type (check done at + # build time in npy_common.h) + complex_def = "struct {%s __x; %s __y;}" % (type, type) + res = config_cmd.check_type_size(complex_def, + expected=[2 * x for x in expected[type]]) + if res >= 0: + public_defines.append(('NPY_SIZEOF_COMPLEX_%s' % sym2def(type), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" % complex_def) + + for type in ('Py_intptr_t', 'off_t'): + res = config_cmd.check_type_size(type, headers=["Python.h"], + library_dirs=[pythonlib_dir()], + expected=expected[type]) + + if res >= 0: + private_defines.append(('SIZEOF_%s' % sym2def(type), '%d' % res)) + public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" % type) + + # We check declaration AND type because that's how distutils does it. + if config_cmd.check_decl('PY_LONG_LONG', headers=['Python.h']): + res = config_cmd.check_type_size('PY_LONG_LONG', headers=['Python.h'], + library_dirs=[pythonlib_dir()], + expected=expected['PY_LONG_LONG']) + if res >= 0: + private_defines.append(('SIZEOF_%s' % sym2def('PY_LONG_LONG'), '%d' % res)) + public_defines.append(('NPY_SIZEOF_%s' % sym2def('PY_LONG_LONG'), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" % 'PY_LONG_LONG') + + res = config_cmd.check_type_size('long long', + expected=expected['long long']) + if res >= 0: + #private_defines.append(('SIZEOF_%s' % sym2def('long long'), '%d' % res)) + public_defines.append(('NPY_SIZEOF_%s' % sym2def('long long'), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" 
% 'long long') + + if not config_cmd.check_decl('CHAR_BIT', headers=['Python.h']): + raise RuntimeError( + "Config wo CHAR_BIT is not supported" + ", please contact the maintainers") + + return private_defines, public_defines + +def check_mathlib(config_cmd): + # Testing the C math library + mathlibs = [] + mathlibs_choices = [[], ['m'], ['cpml']] + mathlib = os.environ.get('MATHLIB') + if mathlib: + mathlibs_choices.insert(0, mathlib.split(',')) + for libs in mathlibs_choices: + if config_cmd.check_func("exp", libraries=libs, decl=True, call=True): + mathlibs = libs + break + else: + raise RuntimeError( + "math library missing; rerun setup.py after setting the " + "MATHLIB env variable") + return mathlibs + +def visibility_define(config): + """Return the define value to use for NPY_VISIBILITY_HIDDEN (may be empty + string).""" + hide = '__attribute__((visibility("hidden")))' + if config.check_gcc_function_attribute(hide, 'hideme'): + return hide + else: + return '' + +def configuration(parent_package='',top_path=None): + from numpy.distutils.misc_util import Configuration, dot_join + from numpy.distutils.system_info import (get_info, blas_opt_info, + lapack_opt_info) + + config = Configuration('core', parent_package, top_path) + local_dir = config.local_path + codegen_dir = join(local_dir, 'code_generators') + + if is_released(config): + warnings.simplefilter('error', MismatchCAPIWarning) + + # Check whether we have a mismatch between the set C API VERSION and the + # actual C API VERSION + check_api_version(C_API_VERSION, codegen_dir) + + generate_umath_py = join(codegen_dir, 'generate_umath.py') + n = dot_join(config.name, 'generate_umath') + generate_umath = npy_load_module('_'.join(n.split('.')), + generate_umath_py, ('.py', 'U', 1)) + + header_dir = 'include/numpy' # this is relative to config.path_in_package + + cocache = CallOnceOnly() + + def generate_config_h(ext, build_dir): + target = join(build_dir, header_dir, 'config.h') + d = os.path.dirname(target) + if not os.path.exists(d): + os.makedirs(d) + + if newer(__file__, target): + config_cmd = config.get_config_cmd() + log.info('Generating %s', target) + + # Check sizeof + moredefs, ignored = cocache.check_types(config_cmd, ext, build_dir) + + # Check math library and C99 math funcs availability + mathlibs = check_mathlib(config_cmd) + moredefs.append(('MATHLIB', ','.join(mathlibs))) + + check_math_capabilities(config_cmd, ext, moredefs, mathlibs) + moredefs.extend(cocache.check_ieee_macros(config_cmd)[0]) + moredefs.extend(cocache.check_complex(config_cmd, mathlibs)[0]) + + # Signal check + if is_npy_no_signal(): + moredefs.append('__NPY_PRIVATE_NO_SIGNAL') + + # Windows checks + if sys.platform == 'win32' or os.name == 'nt': + win32_checks(moredefs) + + # C99 restrict keyword + moredefs.append(('NPY_RESTRICT', config_cmd.check_restrict())) + + # Inline check + inline = config_cmd.check_inline() + + if can_link_svml(): + moredefs.append(('NPY_CAN_LINK_SVML', 1)) + + # Use relaxed stride checking + if NPY_RELAXED_STRIDES_CHECKING: + moredefs.append(('NPY_RELAXED_STRIDES_CHECKING', 1)) + else: + moredefs.append(('NPY_RELAXED_STRIDES_CHECKING', 0)) + + # Use bogus stride debug aid when relaxed strides are enabled + if NPY_RELAXED_STRIDES_DEBUG: + moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 1)) + else: + moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 0)) + + # Get long double representation + rep = check_long_double_representation(config_cmd) + moredefs.append(('HAVE_LDOUBLE_%s' % rep, 1)) + + if 
check_for_right_shift_internal_compiler_error(config_cmd): + moredefs.append('NPY_DO_NOT_OPTIMIZE_LONG_right_shift') + moredefs.append('NPY_DO_NOT_OPTIMIZE_ULONG_right_shift') + moredefs.append('NPY_DO_NOT_OPTIMIZE_LONGLONG_right_shift') + moredefs.append('NPY_DO_NOT_OPTIMIZE_ULONGLONG_right_shift') + + # Generate the config.h file from moredefs + with open(target, 'w') as target_f: + for d in moredefs: + if isinstance(d, str): + target_f.write('#define %s\n' % (d)) + else: + target_f.write('#define %s %s\n' % (d[0], d[1])) + + # define inline to our keyword, or nothing + target_f.write('#ifndef __cplusplus\n') + if inline == 'inline': + target_f.write('/* #undef inline */\n') + else: + target_f.write('#define inline %s\n' % inline) + target_f.write('#endif\n') + + # add the guard to make sure config.h is never included directly, + # but always through npy_config.h + target_f.write(textwrap.dedent(""" + #ifndef NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_ + #error config.h should never be included directly, include npy_config.h instead + #endif + """)) + + log.info('File: %s' % target) + with open(target) as target_f: + log.info(target_f.read()) + log.info('EOF') + else: + mathlibs = [] + with open(target) as target_f: + for line in target_f: + s = '#define MATHLIB' + if line.startswith(s): + value = line[len(s):].strip() + if value: + mathlibs.extend(value.split(',')) + + # Ugly: this can be called within a library and not an extension, + # in which case there is no libraries attributes (and none is + # needed). + if hasattr(ext, 'libraries'): + ext.libraries.extend(mathlibs) + + incl_dir = os.path.dirname(target) + if incl_dir not in config.numpy_include_dirs: + config.numpy_include_dirs.append(incl_dir) + + return target + + def generate_numpyconfig_h(ext, build_dir): + """Depends on config.h: generate_config_h has to be called before !""" + # put common include directory in build_dir on search path + # allows using code generation in headers + config.add_include_dirs(join(build_dir, "src", "common")) + config.add_include_dirs(join(build_dir, "src", "npymath")) + + target = join(build_dir, header_dir, '_numpyconfig.h') + d = os.path.dirname(target) + if not os.path.exists(d): + os.makedirs(d) + if newer(__file__, target): + config_cmd = config.get_config_cmd() + log.info('Generating %s', target) + + # Check sizeof + ignored, moredefs = cocache.check_types(config_cmd, ext, build_dir) + + if is_npy_no_signal(): + moredefs.append(('NPY_NO_SIGNAL', 1)) + + if is_npy_no_smp(): + moredefs.append(('NPY_NO_SMP', 1)) + else: + moredefs.append(('NPY_NO_SMP', 0)) + + mathlibs = check_mathlib(config_cmd) + moredefs.extend(cocache.check_ieee_macros(config_cmd)[1]) + moredefs.extend(cocache.check_complex(config_cmd, mathlibs)[1]) + + if NPY_RELAXED_STRIDES_CHECKING: + moredefs.append(('NPY_RELAXED_STRIDES_CHECKING', 1)) + + if NPY_RELAXED_STRIDES_DEBUG: + moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 1)) + + # Check whether we can use inttypes (C99) formats + if config_cmd.check_decl('PRIdPTR', headers=['inttypes.h']): + moredefs.append(('NPY_USE_C99_FORMATS', 1)) + + # visibility check + hidden_visibility = visibility_define(config_cmd) + moredefs.append(('NPY_VISIBILITY_HIDDEN', hidden_visibility)) + + # Add the C API/ABI versions + moredefs.append(('NPY_ABI_VERSION', '0x%.8X' % C_ABI_VERSION)) + moredefs.append(('NPY_API_VERSION', '0x%.8X' % C_API_VERSION)) + + # Add moredefs to header + with open(target, 'w') as target_f: + for d in moredefs: + if isinstance(d, str): + target_f.write('#define %s\n' % (d)) 
+ else: + target_f.write('#define %s %s\n' % (d[0], d[1])) + + # Define __STDC_FORMAT_MACROS + target_f.write(textwrap.dedent(""" + #ifndef __STDC_FORMAT_MACROS + #define __STDC_FORMAT_MACROS 1 + #endif + """)) + + # Dump the numpyconfig.h header to stdout + log.info('File: %s' % target) + with open(target) as target_f: + log.info(target_f.read()) + log.info('EOF') + config.add_data_files((header_dir, target)) + return target + + def generate_api_func(module_name): + def generate_api(ext, build_dir): + script = join(codegen_dir, module_name + '.py') + sys.path.insert(0, codegen_dir) + try: + m = __import__(module_name) + log.info('executing %s', script) + h_file, c_file, doc_file = m.generate_api(os.path.join(build_dir, header_dir)) + finally: + del sys.path[0] + config.add_data_files((header_dir, h_file), + (header_dir, doc_file)) + return (h_file,) + return generate_api + + generate_numpy_api = generate_api_func('generate_numpy_api') + generate_ufunc_api = generate_api_func('generate_ufunc_api') + + config.add_include_dirs(join(local_dir, "src", "common")) + config.add_include_dirs(join(local_dir, "src")) + config.add_include_dirs(join(local_dir)) + + config.add_data_dir('include/numpy') + config.add_include_dirs(join('src', 'npymath')) + config.add_include_dirs(join('src', 'multiarray')) + config.add_include_dirs(join('src', 'umath')) + config.add_include_dirs(join('src', 'npysort')) + config.add_include_dirs(join('src', '_simd')) + + config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process + config.add_define_macros([("HAVE_NPY_CONFIG_H", "1")]) + if sys.platform[:3] == "aix": + config.add_define_macros([("_LARGE_FILES", None)]) + else: + config.add_define_macros([("_FILE_OFFSET_BITS", "64")]) + config.add_define_macros([('_LARGEFILE_SOURCE', '1')]) + config.add_define_macros([('_LARGEFILE64_SOURCE', '1')]) + + config.numpy_include_dirs.extend(config.paths('include')) + + deps = [join('src', 'npymath', '_signbit.c'), + join('include', 'numpy', '*object.h'), + join(codegen_dir, 'genapi.py'), + ] + + ####################################################################### + # npymath library # + ####################################################################### + + subst_dict = dict([("sep", os.path.sep), ("pkgname", "numpy.core")]) + + def get_mathlib_info(*args): + # Another ugly hack: the mathlib info is known once build_src is run, + # but we cannot use add_installed_pkg_config here either, so we only + # update the substitution dictionary during npymath build + config_cmd = config.get_config_cmd() + + # Check that the toolchain works, to fail early if it doesn't + # (avoid late errors with MATHLIB which are confusing if the + # compiler does not work). 
+ st = config_cmd.try_link('int main(void) { return 0;}') + if not st: + # rerun the failing command in verbose mode + config_cmd.compiler.verbose = True + config_cmd.try_link('int main(void) { return 0;}') + raise RuntimeError("Broken toolchain: cannot link a simple C program") + mlibs = check_mathlib(config_cmd) + + posix_mlib = ' '.join(['-l%s' % l for l in mlibs]) + msvc_mlib = ' '.join(['%s.lib' % l for l in mlibs]) + subst_dict["posix_mathlib"] = posix_mlib + subst_dict["msvc_mathlib"] = msvc_mlib + + npymath_sources = [join('src', 'npymath', 'npy_math_internal.h.src'), + join('src', 'npymath', 'npy_math.c'), + join('src', 'npymath', 'ieee754.c.src'), + join('src', 'npymath', 'npy_math_complex.c.src'), + join('src', 'npymath', 'halffloat.c') + ] + + # Must be true for CRT compilers but not MinGW/cygwin. See gh-9977. + # Intel and Clang also don't seem happy with /GL + is_msvc = (platform.platform().startswith('Windows') and + platform.python_compiler().startswith('MS')) + config.add_installed_library('npymath', + sources=npymath_sources + [get_mathlib_info], + install_dir='lib', + build_info={ + 'include_dirs' : [], # empty list required for creating npy_math_internal.h + 'extra_compiler_args' : (['/GL-'] if is_msvc else []), + }) + config.add_npy_pkg_config("npymath.ini.in", "lib/npy-pkg-config", + subst_dict) + config.add_npy_pkg_config("mlib.ini.in", "lib/npy-pkg-config", + subst_dict) + + ####################################################################### + # multiarray_tests module # + ####################################################################### + + config.add_extension('_multiarray_tests', + sources=[join('src', 'multiarray', '_multiarray_tests.c.src'), + join('src', 'common', 'mem_overlap.c'), + join('src', 'common', 'npy_argparse.c'), + join('src', 'common', 'npy_hashtable.c')], + depends=[join('src', 'common', 'mem_overlap.h'), + join('src', 'common', 'npy_argparse.h'), + join('src', 'common', 'npy_hashtable.h'), + join('src', 'common', 'npy_extint128.h')], + libraries=['npymath']) + + ####################################################################### + # _multiarray_umath module - common part # + ####################################################################### + + common_deps = [ + join('src', 'common', 'array_assign.h'), + join('src', 'common', 'binop_override.h'), + join('src', 'common', 'cblasfuncs.h'), + join('src', 'common', 'lowlevel_strided_loops.h'), + join('src', 'common', 'mem_overlap.h'), + join('src', 'common', 'npy_argparse.h'), + join('src', 'common', 'npy_cblas.h'), + join('src', 'common', 'npy_config.h'), + join('src', 'common', 'npy_ctypes.h'), + join('src', 'common', 'npy_extint128.h'), + join('src', 'common', 'npy_import.h'), + join('src', 'common', 'npy_hashtable.h'), + join('src', 'common', 'npy_longdouble.h'), + join('src', 'common', 'npy_svml.h'), + join('src', 'common', 'templ_common.h.src'), + join('src', 'common', 'ucsnarrow.h'), + join('src', 'common', 'ufunc_override.h'), + join('src', 'common', 'umathmodule.h'), + join('src', 'common', 'numpyos.h'), + join('src', 'common', 'npy_cpu_dispatch.h'), + join('src', 'common', 'simd', 'simd.h'), + ] + + common_src = [ + join('src', 'common', 'array_assign.c'), + join('src', 'common', 'mem_overlap.c'), + join('src', 'common', 'npy_argparse.c'), + join('src', 'common', 'npy_hashtable.c'), + join('src', 'common', 'npy_longdouble.c'), + join('src', 'common', 'templ_common.h.src'), + join('src', 'common', 'ucsnarrow.c'), + join('src', 'common', 'ufunc_override.c'), + join('src', 
'common', 'numpyos.c'), + join('src', 'common', 'npy_cpu_features.c.src'), + ] + + if os.environ.get('NPY_USE_BLAS_ILP64', "0") != "0": + blas_info = get_info('blas_ilp64_opt', 2) + else: + blas_info = get_info('blas_opt', 0) + + have_blas = blas_info and ('HAVE_CBLAS', None) in blas_info.get('define_macros', []) + + if have_blas: + extra_info = blas_info + # These files are also in MANIFEST.in so that they are always in + # the source distribution independently of HAVE_CBLAS. + common_src.extend([join('src', 'common', 'cblasfuncs.c'), + join('src', 'common', 'python_xerbla.c'), + ]) + else: + extra_info = {} + + ####################################################################### + # _multiarray_umath module - multiarray part # + ####################################################################### + + multiarray_deps = [ + join('src', 'multiarray', 'abstractdtypes.h'), + join('src', 'multiarray', 'arrayobject.h'), + join('src', 'multiarray', 'arraytypes.h'), + join('src', 'multiarray', 'arrayfunction_override.h'), + join('src', 'multiarray', 'array_coercion.h'), + join('src', 'multiarray', 'array_method.h'), + join('src', 'multiarray', 'npy_buffer.h'), + join('src', 'multiarray', 'calculation.h'), + join('src', 'multiarray', 'common.h'), + join('src', 'multiarray', 'common_dtype.h'), + join('src', 'multiarray', 'convert_datatype.h'), + join('src', 'multiarray', 'convert.h'), + join('src', 'multiarray', 'conversion_utils.h'), + join('src', 'multiarray', 'ctors.h'), + join('src', 'multiarray', 'descriptor.h'), + join('src', 'multiarray', 'dtypemeta.h'), + join('src', 'multiarray', 'dtype_transfer.h'), + join('src', 'multiarray', 'dragon4.h'), + join('src', 'multiarray', 'einsum_debug.h'), + join('src', 'multiarray', 'einsum_sumprod.h'), + join('src', 'multiarray', 'experimental_public_dtype_api.h'), + join('src', 'multiarray', 'getset.h'), + join('src', 'multiarray', 'hashdescr.h'), + join('src', 'multiarray', 'iterators.h'), + join('src', 'multiarray', 'legacy_dtype_implementation.h'), + join('src', 'multiarray', 'mapping.h'), + join('src', 'multiarray', 'methods.h'), + join('src', 'multiarray', 'multiarraymodule.h'), + join('src', 'multiarray', 'nditer_impl.h'), + join('src', 'multiarray', 'number.h'), + join('src', 'multiarray', 'refcount.h'), + join('src', 'multiarray', 'scalartypes.h'), + join('src', 'multiarray', 'sequence.h'), + join('src', 'multiarray', 'shape.h'), + join('src', 'multiarray', 'strfuncs.h'), + join('src', 'multiarray', 'typeinfo.h'), + join('src', 'multiarray', 'usertypes.h'), + join('src', 'multiarray', 'vdot.h'), + join('include', 'numpy', 'arrayobject.h'), + join('include', 'numpy', '_neighborhood_iterator_imp.h'), + join('include', 'numpy', 'npy_endian.h'), + join('include', 'numpy', 'arrayscalars.h'), + join('include', 'numpy', 'noprefix.h'), + join('include', 'numpy', 'npy_interrupt.h'), + join('include', 'numpy', 'npy_3kcompat.h'), + join('include', 'numpy', 'npy_math.h'), + join('include', 'numpy', 'halffloat.h'), + join('include', 'numpy', 'npy_common.h'), + join('include', 'numpy', 'npy_os.h'), + join('include', 'numpy', 'utils.h'), + join('include', 'numpy', 'ndarrayobject.h'), + join('include', 'numpy', 'npy_cpu.h'), + join('include', 'numpy', 'numpyconfig.h'), + join('include', 'numpy', 'ndarraytypes.h'), + join('include', 'numpy', 'npy_1_7_deprecated_api.h'), + # add library sources as distuils does not consider libraries + # dependencies + ] + npymath_sources + + multiarray_src = [ + join('src', 'multiarray', 'abstractdtypes.c'), + join('src', 
'multiarray', 'alloc.c'), + join('src', 'multiarray', 'arrayobject.c'), + join('src', 'multiarray', 'arraytypes.c.src'), + join('src', 'multiarray', 'array_coercion.c'), + join('src', 'multiarray', 'array_method.c'), + join('src', 'multiarray', 'array_assign_scalar.c'), + join('src', 'multiarray', 'array_assign_array.c'), + join('src', 'multiarray', 'arrayfunction_override.c'), + join('src', 'multiarray', 'buffer.c'), + join('src', 'multiarray', 'calculation.c'), + join('src', 'multiarray', 'compiled_base.c'), + join('src', 'multiarray', 'common.c'), + join('src', 'multiarray', 'common_dtype.c'), + join('src', 'multiarray', 'convert.c'), + join('src', 'multiarray', 'convert_datatype.c'), + join('src', 'multiarray', 'conversion_utils.c'), + join('src', 'multiarray', 'ctors.c'), + join('src', 'multiarray', 'datetime.c'), + join('src', 'multiarray', 'datetime_strings.c'), + join('src', 'multiarray', 'datetime_busday.c'), + join('src', 'multiarray', 'datetime_busdaycal.c'), + join('src', 'multiarray', 'descriptor.c'), + join('src', 'multiarray', 'dtypemeta.c'), + join('src', 'multiarray', 'dragon4.c'), + join('src', 'multiarray', 'dtype_transfer.c'), + join('src', 'multiarray', 'einsum.c.src'), + join('src', 'multiarray', 'einsum_sumprod.c.src'), + join('src', 'multiarray', 'experimental_public_dtype_api.c'), + join('src', 'multiarray', 'flagsobject.c'), + join('src', 'multiarray', 'getset.c'), + join('src', 'multiarray', 'hashdescr.c'), + join('src', 'multiarray', 'item_selection.c'), + join('src', 'multiarray', 'iterators.c'), + join('src', 'multiarray', 'legacy_dtype_implementation.c'), + join('src', 'multiarray', 'lowlevel_strided_loops.c.src'), + join('src', 'multiarray', 'mapping.c'), + join('src', 'multiarray', 'methods.c'), + join('src', 'multiarray', 'multiarraymodule.c'), + join('src', 'multiarray', 'nditer_templ.c.src'), + join('src', 'multiarray', 'nditer_api.c'), + join('src', 'multiarray', 'nditer_constr.c'), + join('src', 'multiarray', 'nditer_pywrap.c'), + join('src', 'multiarray', 'number.c'), + join('src', 'multiarray', 'refcount.c'), + join('src', 'multiarray', 'sequence.c'), + join('src', 'multiarray', 'shape.c'), + join('src', 'multiarray', 'scalarapi.c'), + join('src', 'multiarray', 'scalartypes.c.src'), + join('src', 'multiarray', 'strfuncs.c'), + join('src', 'multiarray', 'temp_elide.c'), + join('src', 'multiarray', 'typeinfo.c'), + join('src', 'multiarray', 'usertypes.c'), + join('src', 'multiarray', 'vdot.c'), + join('src', 'common', 'npy_sort.h.src'), + join('src', 'npysort', 'quicksort.c.src'), + join('src', 'npysort', 'mergesort.c.src'), + join('src', 'npysort', 'timsort.c.src'), + join('src', 'npysort', 'heapsort.c.src'), + join('src', 'npysort', 'radixsort.c.src'), + join('src', 'common', 'npy_partition.h.src'), + join('src', 'npysort', 'selection.c.src'), + join('src', 'common', 'npy_binsearch.h.src'), + join('src', 'npysort', 'binsearch.c.src'), + ] + + ####################################################################### + # _multiarray_umath module - umath part # + ####################################################################### + + def generate_umath_c(ext, build_dir): + target = join(build_dir, header_dir, '__umath_generated.c') + dir = os.path.dirname(target) + if not os.path.exists(dir): + os.makedirs(dir) + script = generate_umath_py + if newer(script, target): + with open(target, 'w') as f: + f.write(generate_umath.make_code(generate_umath.defdict, + generate_umath.__file__)) + return [] + + umath_src = [ + join('src', 'umath', 
'umathmodule.c'), + join('src', 'umath', 'reduction.c'), + join('src', 'umath', 'funcs.inc.src'), + join('src', 'umath', 'simd.inc.src'), + join('src', 'umath', 'loops.h.src'), + join('src', 'umath', 'loops_utils.h.src'), + join('src', 'umath', 'loops.c.src'), + join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), + join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), + join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), + join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), + join('src', 'umath', 'loops_umath_fp.dispatch.c.src'), + join('src', 'umath', 'loops_exponent_log.dispatch.c.src'), + join('src', 'umath', 'matmul.h.src'), + join('src', 'umath', 'matmul.c.src'), + join('src', 'umath', 'clip.h.src'), + join('src', 'umath', 'clip.c.src'), + join('src', 'umath', 'dispatching.c'), + join('src', 'umath', 'legacy_array_method.c'), + join('src', 'umath', 'ufunc_object.c'), + join('src', 'umath', 'extobj.c'), + join('src', 'umath', 'scalarmath.c.src'), + join('src', 'umath', 'ufunc_type_resolution.c'), + join('src', 'umath', 'override.c'), + # For testing. Eventually, should use public API and be separate: + join('src', 'umath', '_scaled_float_dtype.c'), + ] + + umath_deps = [ + generate_umath_py, + join('include', 'numpy', 'npy_math.h'), + join('include', 'numpy', 'halffloat.h'), + join('src', 'multiarray', 'common.h'), + join('src', 'multiarray', 'number.h'), + join('src', 'common', 'templ_common.h.src'), + join('src', 'umath', 'simd.inc.src'), + join('src', 'umath', 'override.h'), + join(codegen_dir, 'generate_ufunc_api.py'), + ] + + svml_path = join('numpy', 'core', 'src', 'umath', 'svml') + svml_objs = [] + if can_link_svml() and check_svml_submodule(svml_path): + svml_objs = glob.glob(svml_path + '/**/*.s', recursive=True) + + config.add_extension('_multiarray_umath', + sources=multiarray_src + umath_src + + common_src + + [generate_config_h, + generate_numpyconfig_h, + generate_numpy_api, + join(codegen_dir, 'generate_numpy_api.py'), + join('*.py'), + generate_umath_c, + generate_ufunc_api, + ], + depends=deps + multiarray_deps + umath_deps + + common_deps, + libraries=['npymath'], + extra_objects=svml_objs, + extra_info=extra_info) + + ####################################################################### + # umath_tests module # + ####################################################################### + + config.add_extension('_umath_tests', sources=[ + join('src', 'umath', '_umath_tests.c.src'), + join('src', 'umath', '_umath_tests.dispatch.c'), + join('src', 'common', 'npy_cpu_features.c.src'), + ]) + + ####################################################################### + # custom rational dtype module # + ####################################################################### + + config.add_extension('_rational_tests', + sources=[join('src', 'umath', '_rational_tests.c.src')]) + + ####################################################################### + # struct_ufunc_test module # + ####################################################################### + + config.add_extension('_struct_ufunc_tests', + sources=[join('src', 'umath', '_struct_ufunc_tests.c.src')]) + + + ####################################################################### + # operand_flag_tests module # + ####################################################################### + + config.add_extension('_operand_flag_tests', + sources=[join('src', 'umath', '_operand_flag_tests.c.src')]) + + ####################################################################### + # SIMD module # + 
####################################################################### + + config.add_extension('_simd', sources=[ + join('src', 'common', 'npy_cpu_features.c.src'), + join('src', '_simd', '_simd.c'), + join('src', '_simd', '_simd_inc.h.src'), + join('src', '_simd', '_simd_data.inc.src'), + join('src', '_simd', '_simd.dispatch.c.src'), + ], depends=[ + join('src', 'common', 'npy_cpu_dispatch.h'), + join('src', 'common', 'simd', 'simd.h'), + join('src', '_simd', '_simd.h'), + join('src', '_simd', '_simd_inc.h.src'), + join('src', '_simd', '_simd_data.inc.src'), + join('src', '_simd', '_simd_arg.inc'), + join('src', '_simd', '_simd_convert.inc'), + join('src', '_simd', '_simd_easyintrin.inc'), + join('src', '_simd', '_simd_vector.inc'), + ]) + + config.add_subpackage('tests') + config.add_data_dir('tests/data') + config.add_data_dir('tests/examples') + config.add_data_files('*.pyi') + + config.make_svn_version_py() + + return config + +if __name__ == '__main__': + from numpy.distutils.core import setup + setup(configuration=configuration) diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 19ea6f22f..32b5bc89e 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -153,4 +153,70 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return vbslq_s64(npyv_cmplt_s64(a, b), a, b); } +#define npyv_reduce_max_u8 vmaxvq_u8 +#define npyv_reduce_max_s8 vmaxvq_s8 +#define npyv_reduce_max_u16 vmaxvq_u16 +#define npyv_reduce_max_s16 vmaxvq_s16 +#define npyv_reduce_max_u32 vmaxvq_u32 +#define npyv_reduce_max_s32 vmaxvq_s32 +NPY_FINLINE npyv_lanetype_u64 npyv_reduce_max_u64(npyv_u64 v) +{ + npyv_lanetype_u64 a = vgetq_lane_u64(v, 0); + npyv_lanetype_u64 b = vgetq_lane_u64(v, 1); + npyv_lanetype_u64 result = (a > b) ? a : b; + return result; +} +NPY_FINLINE npyv_lanetype_s64 npyv_reduce_max_s64(npyv_s64 v) +{ + npyv_lanetype_s64 a = vgetq_lane_s64(v, 0); + npyv_lanetype_s64 b = vgetq_lane_s64(v, 1); + npyv_lanetype_s64 result = (a > b) ? a : b; + return result; +} +#define npyv_reduce_max_f32 vmaxvq_f32 +#if NPY_SIMD_F64 + #define npyv_reduce_max_f64 vmaxvq_f64 +#endif // NPY_SIMD_F64 + +// Minimum, supports IEEE floating-point arithmetic (IEC 60559), +// - If one of the two vectors contains NaN, the equivalent element of the other vector is set +// - Only if both corresponded elements are NaN, NaN is set. +#define npyv_reduce_maxp_f32 vmaxnmvq_f32 +#if NPY_SIMD_F64 + #define npyv_reduce_maxp_f64 vmaxnmvq_f64 +#endif // NPY_SIMD_F64 + +#define npyv_reduce_min_u8 vminvq_u8 +#define npyv_reduce_min_s8 vminvq_s8 +#define npyv_reduce_min_u16 vminvq_u16 +#define npyv_reduce_min_s16 vminvq_s16 +#define npyv_reduce_min_u32 vminvq_u32 +#define npyv_reduce_min_s32 vminvq_s32 +NPY_FINLINE npyv_lanetype_u64 npyv_reduce_min_u64(npyv_u64 v) +{ + npyv_lanetype_u64 a = vgetq_lane_u64(v, 0); + npyv_lanetype_u64 b = vgetq_lane_u64(v, 1); + npyv_lanetype_u64 result = (a < b) ? a : b; + return result; +} +NPY_FINLINE npyv_lanetype_s64 npyv_reduce_min_s64(npyv_s64 v) +{ + npyv_lanetype_s64 a = vgetq_lane_s64(v, 0); + npyv_lanetype_s64 b = vgetq_lane_s64(v, 1); + npyv_lanetype_s64 result = (a < b) ? 
a : b; + return result; +} +#define npyv_reduce_min_f32 vminvq_f32 +#if NPY_SIMD_F64 + #define npyv_reduce_min_f64 vminvq_f64 +#endif // NPY_SIMD_F64 + +// Minimum, supports IEEE floating-point arithmetic (IEC 60559), +// - If one of the two vectors contains NaN, the equivalent element of the other vector is set +// - Only if both corresponded elements are NaN, NaN is set. +#define npyv_reduce_minp_f32 vminnmvq_f32 +#if NPY_SIMD_F64 + #define npyv_reduce_minp_f64 vminnmvq_f64 +#endif // NPY_SIMD_F64 + #endif // _NPY_SIMD_NEON_MATH_H diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 7c0710819..950ea011e 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -13,6 +13,7 @@ #include "numpy/npy_math.h" #include "numpy/halffloat.h" #include "lowlevel_strided_loops.h" +#include "loops.h" #include "npy_pycompat.h" @@ -550,6 +551,7 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void * npy_double, npy_double, npy_double, npy_double# * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0# * #c = hh,uhh,h,uh,,u,l,ul,ll,ull# + * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# */ #define @TYPE@_floor_divide @TYPE@_divide @@ -724,6 +726,7 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void /**end repeat1**/ +#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ /**begin repeat1 * #kind = maximum, minimum# * #OP = >, <# @@ -749,6 +752,7 @@ NPY_NO_EXPORT void } /**end repeat1**/ +#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ NPY_NO_EXPORT void @TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) @@ -1593,6 +1597,7 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #c = f, , l# * #C = F, , L# + * #HAVE_NEON_IMPL = 1*2, HAVE_NEON_IMPL_LONGDOUBLE# */ /**begin repeat1 * #kind = equal, not_equal, less, less_equal, greater, greater_equal, @@ -1716,6 +1721,7 @@ NPY_NO_EXPORT void npy_clear_floatstatus_barrier((char*)dimensions); } +#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ NPY_NO_EXPORT void @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -1741,8 +1747,10 @@ NPY_NO_EXPORT void } npy_clear_floatstatus_barrier((char*)dimensions); } +#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ /**end repeat1**/ +#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ /**begin repeat1 * #kind = fmax, fmin# * #OP = >=, <=# @@ -1770,6 +1778,7 @@ NPY_NO_EXPORT void npy_clear_floatstatus_barrier((char*)dimensions); } /**end repeat1**/ +#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ NPY_NO_EXPORT void @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 0938cd050..aab1b5bbb 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -22,6 +22,13 @@ #define BOOL_fmax BOOL_maximum #define BOOL_fmin BOOL_minimum +/* + * The ARM NEON implementation of min/max for longdouble, longlong, and + * ulonglong assume that they're all 64-bits. If that's not true, then we'll + * use the scalar implementations instead. 
+ */ +#define HAVE_NEON_IMPL_LONGDOUBLE (NPY_BITSOF_LONGDOUBLE == 64) +#define HAVE_NEON_IMPL_LONGLONG (NPY_BITSOF_LONGLONG == 64) /* ***************************************************************************** @@ -660,6 +667,49 @@ PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, vo /* ***************************************************************************** + ** MIN/MAX LOOPS ** + ***************************************************************************** + */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_minmax.dispatch.h" +#endif + +//---------- Integers ---------- + +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# + */ +#if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**begin repeat1 + * #kind = maximum, minimum# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**end repeat**/ + +//---------- Float ---------- + + /**begin repeat + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #HAVE_NEON_IMPL = 1, 1, HAVE_NEON_IMPL_LONGDOUBLE# + */ + #if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**begin repeat1 + * #kind = maximum, minimum, fmax, fmin# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**end repeat**/ + +/* + ***************************************************************************** ** END LOOPS ** ***************************************************************************** */ diff --git a/numpy/core/src/umath/loops.h.src.orig b/numpy/core/src/umath/loops.h.src.orig new file mode 100644 index 000000000..0938cd050 --- /dev/null +++ b/numpy/core/src/umath/loops.h.src.orig @@ -0,0 +1,667 @@ +/* -*- c -*- */ +/* + * vim:syntax=c + */ + +#ifndef _NPY_UMATH_LOOPS_H_ +#define _NPY_UMATH_LOOPS_H_ + +#ifndef NPY_NO_EXPORT + #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + +#define BOOL_invert BOOL_logical_not +#define BOOL_add BOOL_logical_or +#define BOOL_bitwise_and BOOL_logical_and +#define BOOL_bitwise_or BOOL_logical_or +#define BOOL_logical_xor BOOL_not_equal +#define BOOL_bitwise_xor BOOL_logical_xor +#define BOOL_multiply BOOL_logical_and +#define BOOL_maximum BOOL_logical_or +#define BOOL_minimum BOOL_logical_and +#define BOOL_fmax BOOL_maximum +#define BOOL_fmin BOOL_minimum + + +/* + ***************************************************************************** + ** BOOLEAN LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #kind = equal, not_equal, greater, greater_equal, less, less_equal, + * logical_and, logical_or, absolute, logical_not# + **/ +NPY_NO_EXPORT void +BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + +NPY_NO_EXPORT void +BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat + * #kind = isnan, isinf, isfinite# + **/ +NPY_NO_EXPORT void +BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + +/* + ***************************************************************************** + ** 
diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src
new file mode 100644
index 000000000..06aed1bb7
--- /dev/null
+++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src
@@ -0,0 +1,671 @@
+/*@targets
+ ** $maxopt baseline
+ ** neon
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+
+//##############################################################################
+//## Maximum, Minimum
+//##############################################################################
+
+#ifdef NPY_HAVE_NEON
+//******************************************************************************
+//** NEON Min / Max
+//******************************************************************************
+
+
+//------------------------------------------------------------------------------
+//-- Integer +//------------------------------------------------------------------------------ + + +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, + * npy_long, npy_ulong, npy_longlong, npy_ulonglong# + * #sfx = s8, u8, s16, u16, s32, u32, + * s64, u64, s64, u64# + * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# + */ + +// Implementation below assumes longlong and ulonglong are 64-bit. +#if @HAVE_NEON_IMPL@ + +/**begin repeat1 +* Arithmetic +* # kind = maximum, minimum# +* # OP = >, <# +* # vop = max, min# +*/ + +#define SCALAR_OP @TYPE@_scalar_@vop@ +static inline @type@ SCALAR_OP(@type@ a, @type@ b){ + return ((a @OP@ b) ? a : b); +} + +static inline npy_intp +simd_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + @type@ *ip2 = (@type@ *)args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + + const int vectorsPerLoop = 8; + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + + // SIMD if possible + if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ + #define NPYV_CAST (const npyv_lanetype_@sfx@ *) + npyv_@sfx@ m0 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 0 * elemPerVector]); + npyv_@sfx@ m1 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 1 * elemPerVector]); + npyv_@sfx@ m2 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 2 * elemPerVector]); + npyv_@sfx@ m3 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 3 * elemPerVector]); + npyv_@sfx@ m4 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 4 * elemPerVector]); + npyv_@sfx@ m5 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 5 * elemPerVector]); + npyv_@sfx@ m6 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 6 * elemPerVector]); + npyv_@sfx@ m7 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 7 * elemPerVector]); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 0 * elemPerVector]); + npyv_@sfx@ v1 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 1 * elemPerVector]); + npyv_@sfx@ v2 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 2 * elemPerVector]); + npyv_@sfx@ v3 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 3 * elemPerVector]); + npyv_@sfx@ v4 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 4 * elemPerVector]); + npyv_@sfx@ v5 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 5 * elemPerVector]); + npyv_@sfx@ v6 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 6 * elemPerVector]); + npyv_@sfx@ v7 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 7 * elemPerVector]); + + m0 = npyv_@vop@_@sfx@(m0, v0); + m1 = npyv_@vop@_@sfx@(m1, v1); + m2 = npyv_@vop@_@sfx@(m2, v2); + m3 = npyv_@vop@_@sfx@(m3, v3); + m4 = npyv_@vop@_@sfx@(m4, v4); + m5 = npyv_@vop@_@sfx@(m5, v5); + m6 = npyv_@vop@_@sfx@(m6, v6); + m7 = npyv_@vop@_@sfx@(m7, v7); + } + + #undef NPYV_CAST + + m0 = npyv_@vop@_@sfx@(m0, m1); + m2 = npyv_@vop@_@sfx@(m2, m3); + m4 = npyv_@vop@_@sfx@(m4, m5); + m6 = npyv_@vop@_@sfx@(m6, m7); + + m0 = npyv_@vop@_@sfx@(m0, m2); + m4 = npyv_@vop@_@sfx@(m4, m6); + + m0 = npyv_@vop@_@sfx@(m0, m4); + + @type@ r = npyv_reduce_@vop@_@sfx@(m0); + + io1 = SCALAR_OP(io1, r); + } + + *((@type@ *)iop1) = io1; + + return i; +} + +static inline npy_intp +scalar_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + char *ip2 = args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + + // 8x scalar + npy_intp 
elemPerLoop = 8; + if((i+elemPerLoop) <= n){ + @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + m0 = SCALAR_OP(m0, v0); + m1 = SCALAR_OP(m1, v1); + m2 = SCALAR_OP(m2, v2); + m3 = SCALAR_OP(m3, v3); + m4 = SCALAR_OP(m4, v4); + m5 = SCALAR_OP(m5, v5); + m6 = SCALAR_OP(m6, v6); + m7 = SCALAR_OP(m7, v7); + } + + m0 = SCALAR_OP(m0, m1); + m2 = SCALAR_OP(m2, m3); + m4 = SCALAR_OP(m4, m5); + m6 = SCALAR_OP(m6, m7); + + m0 = SCALAR_OP(m0, m2); + m4 = SCALAR_OP(m4, m6); + + m0 = SCALAR_OP(m0, m4); + + io1 = SCALAR_OP(io1, m0); + } + + *((@type@ *)iop1) = io1; + + return i; +} + +static inline void +run_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + char *ip2 = args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + npy_intp i; + + const int vectorsPerLoop = 8; + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + + i = 0; + + if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ + // SIMD - do as many iterations as we can with vectors + i = simd_reduce_@TYPE@_@kind@(args, dimensions, steps, i); + } else{ + // Unrolled scalar - do as many iterations as we can with unrolled loops + i = scalar_reduce_@TYPE@_@kind@(args, dimensions, steps, i); + } + + // Scalar - finish up any remaining iterations + io1 = iop1[0]; + ip2 += i * is2; + for(; i<n; ++i, ip2 += is2){ + const @type@ in2 = *(@type@ *)ip2; + io1 = SCALAR_OP(io1, in2); + } + *((@type@ *)iop1) = io1; +} + +static inline npy_intp +simd_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp n = dimensions[0]; + + const int vectorsPerLoop = 6; + const int elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + int elemPerLoop = vectorsPerLoop * elemPerVector; + + if(IS_BINARY_STRIDE_ONE(sizeof(@type@), NPY_SIMD_WIDTH)){ + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 0 * elemPerVector) * is1)); + npyv_@sfx@ v1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 1 * elemPerVector) * is1)); + npyv_@sfx@ v2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 2 * elemPerVector) * is1)); + npyv_@sfx@ v3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 3 * elemPerVector) * is1)); + npyv_@sfx@ v4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 4 * elemPerVector) * is1)); + npyv_@sfx@ v5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 5 * elemPerVector) * is1)); + + npyv_@sfx@ u0 = 
npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 0 * elemPerVector) * is2));
+        npyv_@sfx@ u1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 1 * elemPerVector) * is2));
+        npyv_@sfx@ u2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 2 * elemPerVector) * is2));
+        npyv_@sfx@ u3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 3 * elemPerVector) * is2));
+        npyv_@sfx@ u4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 4 * elemPerVector) * is2));
+        npyv_@sfx@ u5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 5 * elemPerVector) * is2));
+
+        npyv_@sfx@ m0 = npyv_@vop@_@sfx@(v0, u0);
+        npyv_@sfx@ m1 = npyv_@vop@_@sfx@(v1, u1);
+        npyv_@sfx@ m2 = npyv_@vop@_@sfx@(v2, u2);
+        npyv_@sfx@ m3 = npyv_@vop@_@sfx@(v3, u3);
+        npyv_@sfx@ m4 = npyv_@vop@_@sfx@(v4, u4);
+        npyv_@sfx@ m5 = npyv_@vop@_@sfx@(v5, u5);
+
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 0 * elemPerVector) * os1), m0);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 1 * elemPerVector) * os1), m1);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 2 * elemPerVector) * os1), m2);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 3 * elemPerVector) * os1), m3);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 4 * elemPerVector) * os1), m4);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 5 * elemPerVector) * os1), m5);
+        }
+    }
+
+    return i;
+}
+
+static inline npy_intp
+scalar_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i)
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
+    npy_intp n = dimensions[0];
+
+    npy_intp elemPerLoop = 4;
+    for(; (i+elemPerLoop)<=n; i+=elemPerLoop){
+        /* Note: we can't just load everything, do all the ops, and then
+         * store everything here. Ufuncs are sometimes called with
+         * `accumulate`, which assumes each iteration has finished before
+         * the next one begins. For example, the output of iteration 2
+         * depends on the result of iteration 1.
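+         * (With `accumulate`, one input aliases the output shifted by one
+         * element, so an unrolled step may read a value stored by the
+         * previous step.)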
+ */ + + /**begin repeat2 + * #unroll = 0, 1, 2, 3# + */ + @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1)); + @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2)); + *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@); + /**end repeat2**/ + } + + return i; +} + +static inline void +run_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + BINARY_DEFS + + i = 0; + + if(IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH) && + nomemoverlap(ip1, is1 * n, op1, os1 * n) && + nomemoverlap(ip2, is2 * n, op1, os1 * n)) + { + // SIMD - do as many iterations as we can with vectors + i = simd_binary_@TYPE@_@kind@(args, dimensions, steps, i); + } else { + // Unrolled scalar - do as many iterations as we can with unrolled loops + i = scalar_binary_@TYPE@_@kind@(args, dimensions, steps, i); + } + + + // Scalar - finish up any remaining iterations + ip1 += is1 * i; + ip2 += is2 * i; + op1 += os1 * i; + for(; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1){ + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + + *((@type@ *)op1) = SCALAR_OP(in1, in2); + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { + run_reduce_@TYPE@_@kind@(args, dimensions, steps); + } else { + run_binary_@TYPE@_@kind@(args, dimensions, steps); + } +} + +#undef SCALAR_OP + +/**end repeat1**/ + +#endif // @HAVE_NEON_IMPL@ + +/**end repeat**/ + + +//------------------------------------------------------------------------------ +//-- Float +//------------------------------------------------------------------------------ + +/**begin repeat + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #type = npy_float, npy_double, npy_longdouble# + * #sfx = f32, f64, f64# + * #asmType = s, d, d# + * #HAVE_NEON_IMPL = 1*2, HAVE_NEON_IMPL_LONGDOUBLE# + */ + +// Implementation below assumes longdouble is 64-bit. +#if @HAVE_NEON_IMPL@ + +/**begin repeat1 +* Arithmetic +* # kind = maximum, minimum, fmax, fmin# +* # OP = >=, <=, >=, <=# +* # vop = max, min, maxp, minp# +* # asmInstr = fmax, fmin, fmaxnm, fminnm# +* # PROPAGATE_NAN = 1, 1, 0, 0# +*/ + +#define SCALAR_OP @TYPE@_scalar_@vop@ +static inline @type@ SCALAR_OP(@type@ a, @type@ b){ +#ifdef __aarch64__ + @type@ result = 0; + __asm( + "@asmInstr@ %@asmType@[result], %@asmType@[a], %@asmType@[b]" + : [result] "=w" (result) + : [a] "w" (a), [b] "w" (b) + ); + return result; +#else + +#if @PROPAGATE_NAN@ + return ((a @OP@ b || npy_isnan(a)) ? a : b); +#else + return ((a @OP@ b || npy_isnan(b)) ? 
a : b); +#endif +#endif // __aarch64__ +} + +static inline npy_intp +simd_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + @type@ *ip2 = (@type@ *)args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + + const int vectorsPerLoop = 8; + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + + // SIMD if possible + if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ + npyv_@sfx@ m0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); + npyv_@sfx@ m1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); + npyv_@sfx@ m2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); + npyv_@sfx@ m3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); + npyv_@sfx@ m4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); + npyv_@sfx@ m5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); + npyv_@sfx@ m6 = npyv_load_@sfx@(&ip2[i + 6 * elemPerVector]); + npyv_@sfx@ m7 = npyv_load_@sfx@(&ip2[i + 7 * elemPerVector]); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); + npyv_@sfx@ v3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); + npyv_@sfx@ v4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); + npyv_@sfx@ v5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); + npyv_@sfx@ v6 = npyv_load_@sfx@(&ip2[i + 6 * elemPerVector]); + npyv_@sfx@ v7 = npyv_load_@sfx@(&ip2[i + 7 * elemPerVector]); + + m0 = npyv_@vop@_@sfx@(m0, v0); + m1 = npyv_@vop@_@sfx@(m1, v1); + m2 = npyv_@vop@_@sfx@(m2, v2); + m3 = npyv_@vop@_@sfx@(m3, v3); + m4 = npyv_@vop@_@sfx@(m4, v4); + m5 = npyv_@vop@_@sfx@(m5, v5); + m6 = npyv_@vop@_@sfx@(m6, v6); + m7 = npyv_@vop@_@sfx@(m7, v7); + } + + m0 = npyv_@vop@_@sfx@(m0, m1); + m2 = npyv_@vop@_@sfx@(m2, m3); + m4 = npyv_@vop@_@sfx@(m4, m5); + m6 = npyv_@vop@_@sfx@(m6, m7); + + m0 = npyv_@vop@_@sfx@(m0, m2); + m4 = npyv_@vop@_@sfx@(m4, m6); + + m0 = npyv_@vop@_@sfx@(m0, m4); + + @type@ r = npyv_reduce_@vop@_@sfx@(m0); + + io1 = SCALAR_OP(io1, r); + } + + *((@type@ *)iop1) = io1; + + return i; +} + +static inline npy_intp +scalar_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + char *ip2 = args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + + // 8x scalar + npy_intp elemPerLoop = 8; + if((i+elemPerLoop) <= n){ + @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + m0 = SCALAR_OP(m0, v0); + m1 = SCALAR_OP(m1, v1); + m2 = 
SCALAR_OP(m2, v2); + m3 = SCALAR_OP(m3, v3); + m4 = SCALAR_OP(m4, v4); + m5 = SCALAR_OP(m5, v5); + m6 = SCALAR_OP(m6, v6); + m7 = SCALAR_OP(m7, v7); + } + + m0 = SCALAR_OP(m0, m1); + m2 = SCALAR_OP(m2, m3); + m4 = SCALAR_OP(m4, m5); + m6 = SCALAR_OP(m6, m7); + + m0 = SCALAR_OP(m0, m2); + m4 = SCALAR_OP(m4, m6); + + m0 = SCALAR_OP(m0, m4); + + io1 = SCALAR_OP(io1, m0); + } + + *((@type@ *)iop1) = io1; + + return i; +} + +static inline void +run_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + char *ip2 = args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + npy_intp i; + + const int vectorsPerLoop = 8; + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + + i = 0; + + if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ + // SIMD - do as many iterations as we can with vectors + i = simd_reduce_@TYPE@_@kind@(args, dimensions, steps, i); + } else{ + // Unrolled scalar - do as many iterations as we can with unrolled loops + i = scalar_reduce_@TYPE@_@kind@(args, dimensions, steps, i); + } + + // Scalar - finish up any remaining iterations + io1 = iop1[0]; + ip2 += i * is2; + for(; i<n; ++i, ip2 += is2){ + const @type@ in2 = *(@type@ *)ip2; + io1 = SCALAR_OP(io1, in2); + } + *((@type@ *)iop1) = io1; +} + +static inline npy_intp +simd_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp n = dimensions[0]; + + const int vectorsPerLoop = 6; + const int elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + int elemPerLoop = vectorsPerLoop * elemPerVector; + + if(IS_BINARY_STRIDE_ONE(sizeof(@type@), NPY_SIMD_WIDTH)){ + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 0 * elemPerVector) * is1)); + npyv_@sfx@ v1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 1 * elemPerVector) * is1)); + npyv_@sfx@ v2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 2 * elemPerVector) * is1)); + npyv_@sfx@ v3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 3 * elemPerVector) * is1)); + npyv_@sfx@ v4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 4 * elemPerVector) * is1)); + npyv_@sfx@ v5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 5 * elemPerVector) * is1)); + + npyv_@sfx@ u0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 0 * elemPerVector) * is2)); + npyv_@sfx@ u1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 1 * elemPerVector) * is2)); + npyv_@sfx@ u2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 2 * elemPerVector) * is2)); + npyv_@sfx@ u3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 3 * elemPerVector) * is2)); + npyv_@sfx@ u4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 4 * elemPerVector) * is2)); + npyv_@sfx@ u5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 5 * elemPerVector) * is2)); + + npyv_@sfx@ m0 = npyv_@vop@_@sfx@(v0, u0); + npyv_@sfx@ m1 = npyv_@vop@_@sfx@(v1, u1); + npyv_@sfx@ m2 = npyv_@vop@_@sfx@(v2, u2); + npyv_@sfx@ m3 = npyv_@vop@_@sfx@(v3, u3); + npyv_@sfx@ m4 = npyv_@vop@_@sfx@(v4, u4); + npyv_@sfx@ m5 = npyv_@vop@_@sfx@(v5, u5); + + npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 0 * elemPerVector) * os1), m0); + 
npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 1 * elemPerVector) * os1), m1);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 2 * elemPerVector) * os1), m2);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 3 * elemPerVector) * os1), m3);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 4 * elemPerVector) * os1), m4);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 5 * elemPerVector) * os1), m5);
+        }
+    }
+
+    return i;
+}
+
+static inline npy_intp
+scalar_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i)
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
+    npy_intp n = dimensions[0];
+
+    npy_intp elemPerLoop = 4;
+    for(; (i+elemPerLoop)<=n; i+=elemPerLoop){
+        /* Note: we can't just load everything, do all the ops, and then
+         * store everything here. Ufuncs are sometimes called with
+         * `accumulate`, which assumes each iteration has finished before
+         * the next one begins. For example, the output of iteration 2
+         * depends on the result of iteration 1.
+         */
+
+        /**begin repeat2
+         * #unroll = 0, 1, 2, 3#
+         */
+        @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1));
+        @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2));
+        *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@);
+        /**end repeat2**/
+    }
+
+    return i;
+}
+
+static inline void
+run_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+    BINARY_DEFS
+
+    i = 0;
+
+    if(IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH) &&
+       nomemoverlap(ip1, is1 * n, op1, os1 * n) &&
+       nomemoverlap(ip2, is2 * n, op1, os1 * n))
+    {
+        // SIMD - do as many iterations as we can with vectors
+        i = simd_binary_@TYPE@_@kind@(args, dimensions, steps, i);
+    } else {
+        // Unrolled scalar - do as many iterations as we can with unrolled loops
+        i = scalar_binary_@TYPE@_@kind@(args, dimensions, steps, i);
+    }
+
+    // Scalar - finish up any remaining iterations
+    ip1 += is1 * i;
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for(; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1){
+        const @type@ in1 = *(@type@ *)ip1;
+        const @type@ in2 = *(@type@ *)ip2;
+
+        *((@type@ *)op1) = SCALAR_OP(in1, in2);
+    }
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        run_reduce_@TYPE@_@kind@(args, dimensions, steps);
+    } else {
+        run_binary_@TYPE@_@kind@(args, dimensions, steps);
+    }
+
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#undef SCALAR_OP
+
+/**end repeat1**/
+
+#endif // @HAVE_NEON_IMPL@
+
+/**end repeat**/
+
+#endif // NPY_HAVE_NEON
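A quick way to sanity-check the reduction intrinsics the NEON header maps to (`vminvq_f32` vs. `vminnmvq_f32`) is a standalone program; this sketch assumes an aarch64 compiler and is illustrative only, not part of the patch:

```
#include <arm_neon.h>
#include <math.h>
#include <stdio.h>

int main(void) {
    float data[4] = {1.0f, NAN, -3.0f, 2.0f};
    float32x4_t v = vld1q_f32(data);

    /* FMINV propagates the NaN, matching np.minimum.reduce: prints nan */
    printf("vminvq_f32   = %f\n", (double)vminvq_f32(v));
    /* FMINNMV ignores the quiet NaN, matching np.fmin.reduce: prints -3.0 */
    printf("vminnmvq_f32 = %f\n", (double)vminnmvq_f32(v));
    return 0;
}
```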