| author | Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> | 2021-10-18 14:04:17 -0700 |
|---|---|---|
| committer | Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com> | 2021-10-18 14:04:17 -0700 |
| commit | 1950fb55ce21f2e125672979872dd4c7781d18ce (patch) | |
| tree | 75a8dfdd914c6e1ae6a531f8d7a54082fca92f1c /numpy | |
| parent | dd2eaaabdb0451631e95376ae2b4d319082b438e (diff) | |
BUG: NEON min/max is slow (#17989)
This fixes numpy/numpy#17989 by adding ARM NEON implementations for min/max and fmin/fmax.
Before: Rosetta was faster than native arm64 by `1.2x - 8.6x`.
After: native arm64 is faster than Rosetta by `1.6x - 6.7x` (a `2.8x - 15.5x` improvement).
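
For a sense of what the vectorized loops buy, here is a minimal standalone sketch of an AArch64 NEON max reduction over contiguous float32 data. This is illustrative only, not the code added by this commit: the real loops live in `loops_minmax.dispatch.c.src`, are written against NumPy's `npyv_*` universal intrinsics, and also handle strides and NaN propagation, which this sketch omits.

```c
// Minimal sketch: AArch64 NEON max reduction over a contiguous float array.
// Hypothetical helper for illustration; assumes len >= 1 and ignores NaN
// and stride handling that the real NumPy loops take care of.
#include <arm_neon.h>
#include <stddef.h>

static float max_f32(const float *src, size_t len)
{
    size_t i = 0;
    float32x4_t acc = vdupq_n_f32(src[0]);
    // Compare four lanes per iteration instead of one element at a time.
    for (; i + 4 <= len; i += 4) {
        acc = vmaxq_f32(acc, vld1q_f32(src + i));
    }
    // vmaxvq_f32 folds the four-lane accumulator to a scalar in one step.
    float r = vmaxvq_f32(acc);
    // Scalar tail for leftover elements.
    for (; i < len; i++) {
        r = src[i] > r ? src[i] : r;
    }
    return r;
}
```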
**Benchmarks**
```
before after ratio
[b0e1a445] [8301ffd7]
<main> <gh-issue-17989/improve-neon-min-max>
+ 32.6±0.04μs 37.5±0.08μs 1.15 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 1, 'd')
+ 32.6±0.06μs 37.5±0.04μs 1.15 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 1, 'd')
+ 37.8±0.09μs 43.2±0.09μs 1.14 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 4, 'f')
+ 37.7±0.09μs 42.9±0.1μs 1.14 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 2, 'd')
+ 37.9±0.2μs 43.0±0.02μs 1.14 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 2, 'd')
+ 37.7±0.01μs 42.3±1μs 1.12 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 2, 2, 'd')
+ 34.2±0.07μs 38.1±0.05μs 1.12 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 2, 'f')
+ 32.6±0.03μs 35.8±0.04μs 1.10 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 1, 'f')
+ 37.1±0.1μs 40.3±0.1μs 1.09 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 1, 2, 'd')
+ 37.2±0.1μs 40.3±0.04μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 4, 'f')
+ 37.1±0.09μs 40.3±0.07μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 1, 2, 'd')
+ 68.6±0.5μs 74.2±0.3μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 4, 4, 'd')
+ 37.1±0.2μs 40.0±0.1μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 1, 2, 'd')
+ 2.42±0μs 2.61±0.05μs 1.08 bench_core.CountNonzero.time_count_nonzero_axis(3, 100, <class 'numpy.int16'>)
+ 69.1±0.7μs 73.5±0.7μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 4, 4, 'd')
+ 54.7±0.3μs 58.0±0.2μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 4, 'd')
+ 54.5±0.2μs 57.8±0.2μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'conjugate'>, 2, 4, 'd')
+ 3.78±0.04μs 4.00±0.02μs 1.06 bench_core.CountNonzero.time_count_nonzero_multi_axis(2, 100, <class 'str'>)
+ 54.8±0.2μs 57.9±0.3μs 1.06 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 2, 4, 'd')
+ 3.68±0.01μs 3.87±0.02μs 1.05 bench_core.CountNonzero.time_count_nonzero_multi_axis(1, 100, <class 'object'>)
+ 69.6±0.2μs 73.1±0.2μs 1.05 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'floor'>, 4, 4, 'd')
+ 229±2μs 241±0.2μs 1.05 bench_random.Bounded.time_bounded('PCG64', [<class 'numpy.uint64'>, 1535])
- 73.0±0.8μs 69.5±0.2μs 0.95 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'trunc'>, 4, 4, 'd')
- 37.6±0.1μs 35.7±0.3μs 0.95 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'trunc'>, 1, 4, 'f')
- 88.7±0.04μs 84.2±0.7μs 0.95 bench_lib.Pad.time_pad((256, 128, 1), 1, 'wrap')
- 57.9±0.2μs 54.8±0.2μs 0.95 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'trunc'>, 2, 4, 'd')
- 39.9±0.2μs 37.2±0.04μs 0.93 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'positive'>, 1, 2, 'd')
- 2.66±0.01μs 2.47±0.01μs 0.93 bench_lib.Nan.time_nanmin(200, 0)
- 2.65±0.02μs 2.46±0.04μs 0.93 bench_lib.Nan.time_nanmin(200, 50.0)
- 2.64±0.01μs 2.45±0.01μs 0.93 bench_lib.Nan.time_nanmax(200, 90.0)
- 2.64±0μs 2.44±0.02μs 0.92 bench_lib.Nan.time_nanmax(200, 0)
- 2.68±0.02μs 2.48±0μs 0.92 bench_lib.Nan.time_nanmax(200, 2.0)
- 40.2±0.01μs 37.1±0.1μs 0.92 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 2, 4, 'f')
- 2.69±0μs 2.47±0μs 0.92 bench_lib.Nan.time_nanmin(200, 2.0)
- 2.70±0.02μs 2.48±0.02μs 0.92 bench_lib.Nan.time_nanmax(200, 0.1)
- 2.70±0μs 2.47±0μs 0.91 bench_lib.Nan.time_nanmin(200, 90.0)
- 2.70±0μs 2.46±0μs 0.91 bench_lib.Nan.time_nanmin(200, 0.1)
- 2.70±0μs 2.42±0.01μs 0.90 bench_lib.Nan.time_nanmax(200, 50.0)
- 11.8±0.6ms 10.6±0.6ms 0.89 bench_core.CountNonzero.time_count_nonzero_axis(2, 1000000, <class 'str'>)
- 42.7±0.1μs 37.8±0.02μs 0.88 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'positive'>, 2, 2, 'd')
- 42.8±0.03μs 37.8±0.2μs 0.88 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'rint'>, 2, 2, 'd')
- 43.1±0.2μs 37.7±0.09μs 0.87 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'ceil'>, 4, 4, 'f')
- 37.5±0.07μs 32.6±0.06μs 0.87 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'rint'>, 2, 1, 'd')
- 41.7±0.03μs 36.3±0.07μs 0.87 bench_ufunc_strides.Unary.time_ufunc(<ufunc '_ones_like'>, 1, 4, 'd')
- 166±0.8μs 144±1μs 0.87 bench_ufunc.UFunc.time_ufunc_types('fmin')
- 11.6±0.8ms 10.0±0.01ms 0.87 bench_core.CountNonzero.time_count_nonzero_multi_axis(2, 1000000, <class 'str'>)
- 167±0.9μs 144±2μs 0.86 bench_ufunc.UFunc.time_ufunc_types('minimum')
- 168±4μs 143±0.5μs 0.85 bench_ufunc.UFunc.time_ufunc_types('fmax')
- 167±1μs 142±0.8μs 0.85 bench_ufunc.UFunc.time_ufunc_types('maximum')
- 7.10±0μs 4.97±0.01μs 0.70 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'd', 2)
- 7.11±0.07μs 4.96±0.01μs 0.70 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'd', 2)
- 7.05±0.07μs 4.68±0μs 0.66 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'f', 4)
- 7.13±0μs 4.68±0.01μs 0.66 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'f', 4)
- 461±0.2μs 297±7μs 0.64 bench_app.MaxesOfDots.time_it
- 7.04±0.07μs 3.95±0μs 0.56 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'f', 2)
- 7.06±0.06μs 3.95±0.01μs 0.56 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'f', 2)
- 7.09±0.06μs 3.24±0μs 0.46 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'd', 1)
- 7.12±0.07μs 3.25±0.02μs 0.46 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'd', 1)
- 14.5±0.02μs 3.98±0μs 0.27 bench_reduce.MinMax.time_max(<class 'numpy.int64'>)
- 14.6±0.1μs 4.00±0.01μs 0.27 bench_reduce.MinMax.time_min(<class 'numpy.int64'>)
- 6.88±0.06μs 1.34±0μs 0.19 bench_ufunc_strides.AVX_BFunc.time_ufunc('maximum', 'f', 1)
- 7.00±0μs 1.33±0μs 0.19 bench_ufunc_strides.AVX_BFunc.time_ufunc('minimum', 'f', 1)
- 39.4±0.01μs 3.95±0.01μs 0.10 bench_reduce.MinMax.time_min(<class 'numpy.float64'>)
- 39.4±0.01μs 3.95±0.02μs 0.10 bench_reduce.MinMax.time_max(<class 'numpy.float64'>)
- 254±0.02μs 22.8±0.2μs 0.09 bench_lib.Nan.time_nanmax(200000, 50.0)
- 253±0.1μs 22.7±0.1μs 0.09 bench_lib.Nan.time_nanmin(200000, 0)
- 254±0.06μs 22.7±0.09μs 0.09 bench_lib.Nan.time_nanmin(200000, 2.0)
- 254±0.01μs 22.7±0.03μs 0.09 bench_lib.Nan.time_nanmin(200000, 0.1)
- 254±0.04μs 22.7±0.02μs 0.09 bench_lib.Nan.time_nanmin(200000, 50.0)
- 253±0.1μs 22.7±0.04μs 0.09 bench_lib.Nan.time_nanmax(200000, 0.1)
- 253±0.03μs 22.7±0.04μs 0.09 bench_lib.Nan.time_nanmin(200000, 90.0)
- 253±0.02μs 22.7±0.07μs 0.09 bench_lib.Nan.time_nanmax(200000, 0)
- 254±0.03μs 22.7±0.02μs 0.09 bench_lib.Nan.time_nanmax(200000, 90.0)
- 254±0.09μs 22.7±0.04μs 0.09 bench_lib.Nan.time_nanmax(200000, 2.0)
- 39.2±0.01μs 2.51±0.01μs 0.06 bench_reduce.MinMax.time_max(<class 'numpy.float32'>)
- 39.2±0.01μs 2.50±0.01μs 0.06 bench_reduce.MinMax.time_min(<class 'numpy.float32'>)
```
Size change of _multiarray_umath.cpython-39-darwin.so:
Before: 3,890,723 bytes
After: 3,924,035 bytes
Change: +33,312 bytes (~ +0.856 %)
Diffstat (limited to 'numpy')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | numpy/core/code_generators/generate_umath.py | 4 |
| -rw-r--r-- | numpy/core/setup.py | 1 |
| -rw-r--r-- | numpy/core/setup.py.orig | 1063 |
| -rw-r--r-- | numpy/core/src/common/simd/neon/math.h | 66 |
| -rw-r--r-- | numpy/core/src/umath/loops.c.src | 9 |
| -rw-r--r-- | numpy/core/src/umath/loops.h.src | 50 |
| -rw-r--r-- | numpy/core/src/umath/loops.h.src.orig | 667 |
| -rw-r--r-- | numpy/core/src/umath/loops_minmax.dispatch.c.src | 671 |

8 files changed, 2531 insertions, 0 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 3a27a34cd..c20468a8f 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -516,6 +516,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.maximum'), 'PyUFunc_SimpleUniformOperationTypeResolver', + TD(ints+inexactvec, dispatch=[('loops_minmax', ints+inexactvec)]), TD(noobj, simd=[('avx512f', 'fd')]), TD(O, f='npy_ObjectMax') ), @@ -523,6 +524,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.minimum'), 'PyUFunc_SimpleUniformOperationTypeResolver', + TD(ints+inexactvec, dispatch=[('loops_minmax', ints+inexactvec)]), TD(noobj, simd=[('avx512f', 'fd')]), TD(O, f='npy_ObjectMin') ), @@ -537,6 +539,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmax'), 'PyUFunc_SimpleUniformOperationTypeResolver', + TD(inexactvec, dispatch=[('loops_minmax', inexactvec)]), TD(noobj), TD(O, f='npy_ObjectMax') ), @@ -544,6 +547,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmin'), 'PyUFunc_SimpleUniformOperationTypeResolver', + TD(inexactvec, dispatch=[('loops_minmax', inexactvec)]), TD(noobj), TD(O, f='npy_ObjectMin') ), diff --git a/numpy/core/setup.py b/numpy/core/setup.py index f2524ae13..2b0e33244 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -943,6 +943,7 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), + join('src', 'umath', 'loops_minmax.dispatch.c.src'), join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), join('src', 'umath', 'loops_umath_fp.dispatch.c.src'), join('src', 'umath', 'loops_exponent_log.dispatch.c.src'), diff --git a/numpy/core/setup.py.orig b/numpy/core/setup.py.orig new file mode 100644 index 000000000..f2524ae13 --- /dev/null +++ b/numpy/core/setup.py.orig @@ -0,0 +1,1063 @@ +import os +import sys +import pickle +import copy +import warnings +import platform +import textwrap +import glob +from os.path import join + +from numpy.distutils import log +from distutils.dep_util import newer +from sysconfig import get_config_var +from numpy.compat import npy_load_module +from setup_common import * # noqa: F403 + +# Set to True to enable relaxed strides checking. This (mostly) means +# that `strides[dim]` is ignored if `shape[dim] == 1` when setting flags. +NPY_RELAXED_STRIDES_CHECKING = (os.environ.get('NPY_RELAXED_STRIDES_CHECKING', "1") != "0") + +# Put NPY_RELAXED_STRIDES_DEBUG=1 in the environment if you want numpy to use a +# bogus value for affected strides in order to help smoke out bad stride usage +# when relaxed stride checking is enabled. +NPY_RELAXED_STRIDES_DEBUG = (os.environ.get('NPY_RELAXED_STRIDES_DEBUG', "0") != "0") +NPY_RELAXED_STRIDES_DEBUG = NPY_RELAXED_STRIDES_DEBUG and NPY_RELAXED_STRIDES_CHECKING + +# XXX: ugly, we use a class to avoid calling twice some expensive functions in +# config.h/numpyconfig.h. I don't see a better way because distutils force +# config.h generation inside an Extension class, and as such sharing +# configuration information between extensions is not easy. +# Using a pickled-based memoize does not work because config_cmd is an instance +# method, which cPickle does not like. 
+# +# Use pickle in all cases, as cPickle is gone in python3 and the difference +# in time is only in build. -- Charles Harris, 2013-03-30 + +class CallOnceOnly: + def __init__(self): + self._check_types = None + self._check_ieee_macros = None + self._check_complex = None + + def check_types(self, *a, **kw): + if self._check_types is None: + out = check_types(*a, **kw) + self._check_types = pickle.dumps(out) + else: + out = copy.deepcopy(pickle.loads(self._check_types)) + return out + + def check_ieee_macros(self, *a, **kw): + if self._check_ieee_macros is None: + out = check_ieee_macros(*a, **kw) + self._check_ieee_macros = pickle.dumps(out) + else: + out = copy.deepcopy(pickle.loads(self._check_ieee_macros)) + return out + + def check_complex(self, *a, **kw): + if self._check_complex is None: + out = check_complex(*a, **kw) + self._check_complex = pickle.dumps(out) + else: + out = copy.deepcopy(pickle.loads(self._check_complex)) + return out + +def can_link_svml(): + """SVML library is supported only on x86_64 architecture and currently + only on linux + """ + machine = platform.machine() + system = platform.system() + return "x86_64" in machine and system == "Linux" + +def check_svml_submodule(svmlpath): + if not os.path.exists(svmlpath + "/README.md"): + raise RuntimeError("Missing `SVML` submodule! Run `git submodule " + "update --init` to fix this.") + return True + +def pythonlib_dir(): + """return path where libpython* is.""" + if sys.platform == 'win32': + return os.path.join(sys.prefix, "libs") + else: + return get_config_var('LIBDIR') + +def is_npy_no_signal(): + """Return True if the NPY_NO_SIGNAL symbol must be defined in configuration + header.""" + return sys.platform == 'win32' + +def is_npy_no_smp(): + """Return True if the NPY_NO_SMP symbol must be defined in public + header (when SMP support cannot be reliably enabled).""" + # Perhaps a fancier check is in order here. + # so that threads are only enabled if there + # are actually multiple CPUS? -- but + # threaded code can be nice even on a single + # CPU so that long-calculating code doesn't + # block. + return 'NPY_NOSMP' in os.environ + +def win32_checks(deflist): + from numpy.distutils.misc_util import get_build_architecture + a = get_build_architecture() + + # Distutils hack on AMD64 on windows + print('BUILD_ARCHITECTURE: %r, os.name=%r, sys.platform=%r' % + (a, os.name, sys.platform)) + if a == 'AMD64': + deflist.append('DISTUTILS_USE_SDK') + + # On win32, force long double format string to be 'g', not + # 'Lg', since the MS runtime does not support long double whose + # size is > sizeof(double) + if a == "Intel" or a == "AMD64": + deflist.append('FORCE_NO_LONG_DOUBLE_FORMATTING') + +def check_math_capabilities(config, ext, moredefs, mathlibs): + def check_func(func_name): + return config.check_func(func_name, libraries=mathlibs, + decl=True, call=True) + + def check_funcs_once(funcs_name): + decl = dict([(f, True) for f in funcs_name]) + st = config.check_funcs_once(funcs_name, libraries=mathlibs, + decl=decl, call=decl) + if st: + moredefs.extend([(fname2def(f), 1) for f in funcs_name]) + return st + + def check_funcs(funcs_name): + # Use check_funcs_once first, and if it does not work, test func per + # func. 
Return success only if all the functions are available + if not check_funcs_once(funcs_name): + # Global check failed, check func per func + for f in funcs_name: + if check_func(f): + moredefs.append((fname2def(f), 1)) + return 0 + else: + return 1 + + #use_msvc = config.check_decl("_MSC_VER") + + if not check_funcs_once(MANDATORY_FUNCS): + raise SystemError("One of the required function to build numpy is not" + " available (the list is %s)." % str(MANDATORY_FUNCS)) + + # Standard functions which may not be available and for which we have a + # replacement implementation. Note that some of these are C99 functions. + + # XXX: hack to circumvent cpp pollution from python: python put its + # config.h in the public namespace, so we have a clash for the common + # functions we test. We remove every function tested by python's + # autoconf, hoping their own test are correct + for f in OPTIONAL_STDFUNCS_MAYBE: + if config.check_decl(fname2def(f), + headers=["Python.h", "math.h"]): + OPTIONAL_STDFUNCS.remove(f) + + check_funcs(OPTIONAL_STDFUNCS) + + for h in OPTIONAL_HEADERS: + if config.check_func("", decl=False, call=False, headers=[h]): + h = h.replace(".", "_").replace(os.path.sep, "_") + moredefs.append((fname2def(h), 1)) + + for tup in OPTIONAL_INTRINSICS: + headers = None + if len(tup) == 2: + f, args, m = tup[0], tup[1], fname2def(tup[0]) + elif len(tup) == 3: + f, args, headers, m = tup[0], tup[1], [tup[2]], fname2def(tup[0]) + else: + f, args, headers, m = tup[0], tup[1], [tup[2]], fname2def(tup[3]) + if config.check_func(f, decl=False, call=True, call_args=args, + headers=headers): + moredefs.append((m, 1)) + + for dec, fn in OPTIONAL_FUNCTION_ATTRIBUTES: + if config.check_gcc_function_attribute(dec, fn): + moredefs.append((fname2def(fn), 1)) + if fn == 'attribute_target_avx512f': + # GH-14787: Work around GCC<8.4 bug when compiling with AVX512 + # support on Windows-based platforms + if (sys.platform in ('win32', 'cygwin') and + config.check_compiler_gcc() and + not config.check_gcc_version_at_least(8, 4)): + ext.extra_compile_args.extend( + ['-ffixed-xmm%s' % n for n in range(16, 32)]) + + for dec, fn, code, header in OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS: + if config.check_gcc_function_attribute_with_intrinsics(dec, fn, code, + header): + moredefs.append((fname2def(fn), 1)) + + for fn in OPTIONAL_VARIABLE_ATTRIBUTES: + if config.check_gcc_variable_attribute(fn): + m = fn.replace("(", "_").replace(")", "_") + moredefs.append((fname2def(m), 1)) + + # C99 functions: float and long double versions + check_funcs(C99_FUNCS_SINGLE) + check_funcs(C99_FUNCS_EXTENDED) + +def check_complex(config, mathlibs): + priv = [] + pub = [] + + try: + if os.uname()[0] == "Interix": + warnings.warn("Disabling broken complex support. See #1365", stacklevel=2) + return priv, pub + except Exception: + # os.uname not available on all platforms. 
blanket except ugly but safe + pass + + # Check for complex support + st = config.check_header('complex.h') + if st: + priv.append(('HAVE_COMPLEX_H', 1)) + pub.append(('NPY_USE_C99_COMPLEX', 1)) + + for t in C99_COMPLEX_TYPES: + st = config.check_type(t, headers=["complex.h"]) + if st: + pub.append(('NPY_HAVE_%s' % type2def(t), 1)) + + def check_prec(prec): + flist = [f + prec for f in C99_COMPLEX_FUNCS] + decl = dict([(f, True) for f in flist]) + if not config.check_funcs_once(flist, call=decl, decl=decl, + libraries=mathlibs): + for f in flist: + if config.check_func(f, call=True, decl=True, + libraries=mathlibs): + priv.append((fname2def(f), 1)) + else: + priv.extend([(fname2def(f), 1) for f in flist]) + + check_prec('') + check_prec('f') + check_prec('l') + + return priv, pub + +def check_ieee_macros(config): + priv = [] + pub = [] + + macros = [] + + def _add_decl(f): + priv.append(fname2def("decl_%s" % f)) + pub.append('NPY_%s' % fname2def("decl_%s" % f)) + + # XXX: hack to circumvent cpp pollution from python: python put its + # config.h in the public namespace, so we have a clash for the common + # functions we test. We remove every function tested by python's + # autoconf, hoping their own test are correct + _macros = ["isnan", "isinf", "signbit", "isfinite"] + for f in _macros: + py_symbol = fname2def("decl_%s" % f) + already_declared = config.check_decl(py_symbol, + headers=["Python.h", "math.h"]) + if already_declared: + if config.check_macro_true(py_symbol, + headers=["Python.h", "math.h"]): + pub.append('NPY_%s' % fname2def("decl_%s" % f)) + else: + macros.append(f) + # Normally, isnan and isinf are macro (C99), but some platforms only have + # func, or both func and macro version. Check for macro only, and define + # replacement ones if not found. + # Note: including Python.h is necessary because it modifies some math.h + # definitions + for f in macros: + st = config.check_decl(f, headers=["Python.h", "math.h"]) + if st: + _add_decl(f) + + return priv, pub + +def check_types(config_cmd, ext, build_dir): + private_defines = [] + public_defines = [] + + # Expected size (in number of bytes) for each type. This is an + # optimization: those are only hints, and an exhaustive search for the size + # is done if the hints are wrong. + expected = {'short': [2], 'int': [4], 'long': [8, 4], + 'float': [4], 'double': [8], 'long double': [16, 12, 8], + 'Py_intptr_t': [8, 4], 'PY_LONG_LONG': [8], 'long long': [8], + 'off_t': [8, 4]} + + # Check we have the python header (-dev* packages on Linux) + result = config_cmd.check_header('Python.h') + if not result: + python = 'python' + if '__pypy__' in sys.builtin_module_names: + python = 'pypy' + raise SystemError( + "Cannot compile 'Python.h'. 
Perhaps you need to " + "install {0}-dev|{0}-devel.".format(python)) + res = config_cmd.check_header("endian.h") + if res: + private_defines.append(('HAVE_ENDIAN_H', 1)) + public_defines.append(('NPY_HAVE_ENDIAN_H', 1)) + res = config_cmd.check_header("sys/endian.h") + if res: + private_defines.append(('HAVE_SYS_ENDIAN_H', 1)) + public_defines.append(('NPY_HAVE_SYS_ENDIAN_H', 1)) + + # Check basic types sizes + for type in ('short', 'int', 'long'): + res = config_cmd.check_decl("SIZEOF_%s" % sym2def(type), headers=["Python.h"]) + if res: + public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), "SIZEOF_%s" % sym2def(type))) + else: + res = config_cmd.check_type_size(type, expected=expected[type]) + if res >= 0: + public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" % type) + + for type in ('float', 'double', 'long double'): + already_declared = config_cmd.check_decl("SIZEOF_%s" % sym2def(type), + headers=["Python.h"]) + res = config_cmd.check_type_size(type, expected=expected[type]) + if res >= 0: + public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), '%d' % res)) + if not already_declared and not type == 'long double': + private_defines.append(('SIZEOF_%s' % sym2def(type), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" % type) + + # Compute size of corresponding complex type: used to check that our + # definition is binary compatible with C99 complex type (check done at + # build time in npy_common.h) + complex_def = "struct {%s __x; %s __y;}" % (type, type) + res = config_cmd.check_type_size(complex_def, + expected=[2 * x for x in expected[type]]) + if res >= 0: + public_defines.append(('NPY_SIZEOF_COMPLEX_%s' % sym2def(type), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" % complex_def) + + for type in ('Py_intptr_t', 'off_t'): + res = config_cmd.check_type_size(type, headers=["Python.h"], + library_dirs=[pythonlib_dir()], + expected=expected[type]) + + if res >= 0: + private_defines.append(('SIZEOF_%s' % sym2def(type), '%d' % res)) + public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" % type) + + # We check declaration AND type because that's how distutils does it. + if config_cmd.check_decl('PY_LONG_LONG', headers=['Python.h']): + res = config_cmd.check_type_size('PY_LONG_LONG', headers=['Python.h'], + library_dirs=[pythonlib_dir()], + expected=expected['PY_LONG_LONG']) + if res >= 0: + private_defines.append(('SIZEOF_%s' % sym2def('PY_LONG_LONG'), '%d' % res)) + public_defines.append(('NPY_SIZEOF_%s' % sym2def('PY_LONG_LONG'), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" % 'PY_LONG_LONG') + + res = config_cmd.check_type_size('long long', + expected=expected['long long']) + if res >= 0: + #private_defines.append(('SIZEOF_%s' % sym2def('long long'), '%d' % res)) + public_defines.append(('NPY_SIZEOF_%s' % sym2def('long long'), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" 
% 'long long') + + if not config_cmd.check_decl('CHAR_BIT', headers=['Python.h']): + raise RuntimeError( + "Config wo CHAR_BIT is not supported" + ", please contact the maintainers") + + return private_defines, public_defines + +def check_mathlib(config_cmd): + # Testing the C math library + mathlibs = [] + mathlibs_choices = [[], ['m'], ['cpml']] + mathlib = os.environ.get('MATHLIB') + if mathlib: + mathlibs_choices.insert(0, mathlib.split(',')) + for libs in mathlibs_choices: + if config_cmd.check_func("exp", libraries=libs, decl=True, call=True): + mathlibs = libs + break + else: + raise RuntimeError( + "math library missing; rerun setup.py after setting the " + "MATHLIB env variable") + return mathlibs + +def visibility_define(config): + """Return the define value to use for NPY_VISIBILITY_HIDDEN (may be empty + string).""" + hide = '__attribute__((visibility("hidden")))' + if config.check_gcc_function_attribute(hide, 'hideme'): + return hide + else: + return '' + +def configuration(parent_package='',top_path=None): + from numpy.distutils.misc_util import Configuration, dot_join + from numpy.distutils.system_info import (get_info, blas_opt_info, + lapack_opt_info) + + config = Configuration('core', parent_package, top_path) + local_dir = config.local_path + codegen_dir = join(local_dir, 'code_generators') + + if is_released(config): + warnings.simplefilter('error', MismatchCAPIWarning) + + # Check whether we have a mismatch between the set C API VERSION and the + # actual C API VERSION + check_api_version(C_API_VERSION, codegen_dir) + + generate_umath_py = join(codegen_dir, 'generate_umath.py') + n = dot_join(config.name, 'generate_umath') + generate_umath = npy_load_module('_'.join(n.split('.')), + generate_umath_py, ('.py', 'U', 1)) + + header_dir = 'include/numpy' # this is relative to config.path_in_package + + cocache = CallOnceOnly() + + def generate_config_h(ext, build_dir): + target = join(build_dir, header_dir, 'config.h') + d = os.path.dirname(target) + if not os.path.exists(d): + os.makedirs(d) + + if newer(__file__, target): + config_cmd = config.get_config_cmd() + log.info('Generating %s', target) + + # Check sizeof + moredefs, ignored = cocache.check_types(config_cmd, ext, build_dir) + + # Check math library and C99 math funcs availability + mathlibs = check_mathlib(config_cmd) + moredefs.append(('MATHLIB', ','.join(mathlibs))) + + check_math_capabilities(config_cmd, ext, moredefs, mathlibs) + moredefs.extend(cocache.check_ieee_macros(config_cmd)[0]) + moredefs.extend(cocache.check_complex(config_cmd, mathlibs)[0]) + + # Signal check + if is_npy_no_signal(): + moredefs.append('__NPY_PRIVATE_NO_SIGNAL') + + # Windows checks + if sys.platform == 'win32' or os.name == 'nt': + win32_checks(moredefs) + + # C99 restrict keyword + moredefs.append(('NPY_RESTRICT', config_cmd.check_restrict())) + + # Inline check + inline = config_cmd.check_inline() + + if can_link_svml(): + moredefs.append(('NPY_CAN_LINK_SVML', 1)) + + # Use relaxed stride checking + if NPY_RELAXED_STRIDES_CHECKING: + moredefs.append(('NPY_RELAXED_STRIDES_CHECKING', 1)) + else: + moredefs.append(('NPY_RELAXED_STRIDES_CHECKING', 0)) + + # Use bogus stride debug aid when relaxed strides are enabled + if NPY_RELAXED_STRIDES_DEBUG: + moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 1)) + else: + moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 0)) + + # Get long double representation + rep = check_long_double_representation(config_cmd) + moredefs.append(('HAVE_LDOUBLE_%s' % rep, 1)) + + if 
check_for_right_shift_internal_compiler_error(config_cmd): + moredefs.append('NPY_DO_NOT_OPTIMIZE_LONG_right_shift') + moredefs.append('NPY_DO_NOT_OPTIMIZE_ULONG_right_shift') + moredefs.append('NPY_DO_NOT_OPTIMIZE_LONGLONG_right_shift') + moredefs.append('NPY_DO_NOT_OPTIMIZE_ULONGLONG_right_shift') + + # Generate the config.h file from moredefs + with open(target, 'w') as target_f: + for d in moredefs: + if isinstance(d, str): + target_f.write('#define %s\n' % (d)) + else: + target_f.write('#define %s %s\n' % (d[0], d[1])) + + # define inline to our keyword, or nothing + target_f.write('#ifndef __cplusplus\n') + if inline == 'inline': + target_f.write('/* #undef inline */\n') + else: + target_f.write('#define inline %s\n' % inline) + target_f.write('#endif\n') + + # add the guard to make sure config.h is never included directly, + # but always through npy_config.h + target_f.write(textwrap.dedent(""" + #ifndef NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_ + #error config.h should never be included directly, include npy_config.h instead + #endif + """)) + + log.info('File: %s' % target) + with open(target) as target_f: + log.info(target_f.read()) + log.info('EOF') + else: + mathlibs = [] + with open(target) as target_f: + for line in target_f: + s = '#define MATHLIB' + if line.startswith(s): + value = line[len(s):].strip() + if value: + mathlibs.extend(value.split(',')) + + # Ugly: this can be called within a library and not an extension, + # in which case there is no libraries attributes (and none is + # needed). + if hasattr(ext, 'libraries'): + ext.libraries.extend(mathlibs) + + incl_dir = os.path.dirname(target) + if incl_dir not in config.numpy_include_dirs: + config.numpy_include_dirs.append(incl_dir) + + return target + + def generate_numpyconfig_h(ext, build_dir): + """Depends on config.h: generate_config_h has to be called before !""" + # put common include directory in build_dir on search path + # allows using code generation in headers + config.add_include_dirs(join(build_dir, "src", "common")) + config.add_include_dirs(join(build_dir, "src", "npymath")) + + target = join(build_dir, header_dir, '_numpyconfig.h') + d = os.path.dirname(target) + if not os.path.exists(d): + os.makedirs(d) + if newer(__file__, target): + config_cmd = config.get_config_cmd() + log.info('Generating %s', target) + + # Check sizeof + ignored, moredefs = cocache.check_types(config_cmd, ext, build_dir) + + if is_npy_no_signal(): + moredefs.append(('NPY_NO_SIGNAL', 1)) + + if is_npy_no_smp(): + moredefs.append(('NPY_NO_SMP', 1)) + else: + moredefs.append(('NPY_NO_SMP', 0)) + + mathlibs = check_mathlib(config_cmd) + moredefs.extend(cocache.check_ieee_macros(config_cmd)[1]) + moredefs.extend(cocache.check_complex(config_cmd, mathlibs)[1]) + + if NPY_RELAXED_STRIDES_CHECKING: + moredefs.append(('NPY_RELAXED_STRIDES_CHECKING', 1)) + + if NPY_RELAXED_STRIDES_DEBUG: + moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 1)) + + # Check whether we can use inttypes (C99) formats + if config_cmd.check_decl('PRIdPTR', headers=['inttypes.h']): + moredefs.append(('NPY_USE_C99_FORMATS', 1)) + + # visibility check + hidden_visibility = visibility_define(config_cmd) + moredefs.append(('NPY_VISIBILITY_HIDDEN', hidden_visibility)) + + # Add the C API/ABI versions + moredefs.append(('NPY_ABI_VERSION', '0x%.8X' % C_ABI_VERSION)) + moredefs.append(('NPY_API_VERSION', '0x%.8X' % C_API_VERSION)) + + # Add moredefs to header + with open(target, 'w') as target_f: + for d in moredefs: + if isinstance(d, str): + target_f.write('#define %s\n' % (d)) 
+ else: + target_f.write('#define %s %s\n' % (d[0], d[1])) + + # Define __STDC_FORMAT_MACROS + target_f.write(textwrap.dedent(""" + #ifndef __STDC_FORMAT_MACROS + #define __STDC_FORMAT_MACROS 1 + #endif + """)) + + # Dump the numpyconfig.h header to stdout + log.info('File: %s' % target) + with open(target) as target_f: + log.info(target_f.read()) + log.info('EOF') + config.add_data_files((header_dir, target)) + return target + + def generate_api_func(module_name): + def generate_api(ext, build_dir): + script = join(codegen_dir, module_name + '.py') + sys.path.insert(0, codegen_dir) + try: + m = __import__(module_name) + log.info('executing %s', script) + h_file, c_file, doc_file = m.generate_api(os.path.join(build_dir, header_dir)) + finally: + del sys.path[0] + config.add_data_files((header_dir, h_file), + (header_dir, doc_file)) + return (h_file,) + return generate_api + + generate_numpy_api = generate_api_func('generate_numpy_api') + generate_ufunc_api = generate_api_func('generate_ufunc_api') + + config.add_include_dirs(join(local_dir, "src", "common")) + config.add_include_dirs(join(local_dir, "src")) + config.add_include_dirs(join(local_dir)) + + config.add_data_dir('include/numpy') + config.add_include_dirs(join('src', 'npymath')) + config.add_include_dirs(join('src', 'multiarray')) + config.add_include_dirs(join('src', 'umath')) + config.add_include_dirs(join('src', 'npysort')) + config.add_include_dirs(join('src', '_simd')) + + config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process + config.add_define_macros([("HAVE_NPY_CONFIG_H", "1")]) + if sys.platform[:3] == "aix": + config.add_define_macros([("_LARGE_FILES", None)]) + else: + config.add_define_macros([("_FILE_OFFSET_BITS", "64")]) + config.add_define_macros([('_LARGEFILE_SOURCE', '1')]) + config.add_define_macros([('_LARGEFILE64_SOURCE', '1')]) + + config.numpy_include_dirs.extend(config.paths('include')) + + deps = [join('src', 'npymath', '_signbit.c'), + join('include', 'numpy', '*object.h'), + join(codegen_dir, 'genapi.py'), + ] + + ####################################################################### + # npymath library # + ####################################################################### + + subst_dict = dict([("sep", os.path.sep), ("pkgname", "numpy.core")]) + + def get_mathlib_info(*args): + # Another ugly hack: the mathlib info is known once build_src is run, + # but we cannot use add_installed_pkg_config here either, so we only + # update the substitution dictionary during npymath build + config_cmd = config.get_config_cmd() + + # Check that the toolchain works, to fail early if it doesn't + # (avoid late errors with MATHLIB which are confusing if the + # compiler does not work). 
+ st = config_cmd.try_link('int main(void) { return 0;}') + if not st: + # rerun the failing command in verbose mode + config_cmd.compiler.verbose = True + config_cmd.try_link('int main(void) { return 0;}') + raise RuntimeError("Broken toolchain: cannot link a simple C program") + mlibs = check_mathlib(config_cmd) + + posix_mlib = ' '.join(['-l%s' % l for l in mlibs]) + msvc_mlib = ' '.join(['%s.lib' % l for l in mlibs]) + subst_dict["posix_mathlib"] = posix_mlib + subst_dict["msvc_mathlib"] = msvc_mlib + + npymath_sources = [join('src', 'npymath', 'npy_math_internal.h.src'), + join('src', 'npymath', 'npy_math.c'), + join('src', 'npymath', 'ieee754.c.src'), + join('src', 'npymath', 'npy_math_complex.c.src'), + join('src', 'npymath', 'halffloat.c') + ] + + # Must be true for CRT compilers but not MinGW/cygwin. See gh-9977. + # Intel and Clang also don't seem happy with /GL + is_msvc = (platform.platform().startswith('Windows') and + platform.python_compiler().startswith('MS')) + config.add_installed_library('npymath', + sources=npymath_sources + [get_mathlib_info], + install_dir='lib', + build_info={ + 'include_dirs' : [], # empty list required for creating npy_math_internal.h + 'extra_compiler_args' : (['/GL-'] if is_msvc else []), + }) + config.add_npy_pkg_config("npymath.ini.in", "lib/npy-pkg-config", + subst_dict) + config.add_npy_pkg_config("mlib.ini.in", "lib/npy-pkg-config", + subst_dict) + + ####################################################################### + # multiarray_tests module # + ####################################################################### + + config.add_extension('_multiarray_tests', + sources=[join('src', 'multiarray', '_multiarray_tests.c.src'), + join('src', 'common', 'mem_overlap.c'), + join('src', 'common', 'npy_argparse.c'), + join('src', 'common', 'npy_hashtable.c')], + depends=[join('src', 'common', 'mem_overlap.h'), + join('src', 'common', 'npy_argparse.h'), + join('src', 'common', 'npy_hashtable.h'), + join('src', 'common', 'npy_extint128.h')], + libraries=['npymath']) + + ####################################################################### + # _multiarray_umath module - common part # + ####################################################################### + + common_deps = [ + join('src', 'common', 'array_assign.h'), + join('src', 'common', 'binop_override.h'), + join('src', 'common', 'cblasfuncs.h'), + join('src', 'common', 'lowlevel_strided_loops.h'), + join('src', 'common', 'mem_overlap.h'), + join('src', 'common', 'npy_argparse.h'), + join('src', 'common', 'npy_cblas.h'), + join('src', 'common', 'npy_config.h'), + join('src', 'common', 'npy_ctypes.h'), + join('src', 'common', 'npy_extint128.h'), + join('src', 'common', 'npy_import.h'), + join('src', 'common', 'npy_hashtable.h'), + join('src', 'common', 'npy_longdouble.h'), + join('src', 'common', 'npy_svml.h'), + join('src', 'common', 'templ_common.h.src'), + join('src', 'common', 'ucsnarrow.h'), + join('src', 'common', 'ufunc_override.h'), + join('src', 'common', 'umathmodule.h'), + join('src', 'common', 'numpyos.h'), + join('src', 'common', 'npy_cpu_dispatch.h'), + join('src', 'common', 'simd', 'simd.h'), + ] + + common_src = [ + join('src', 'common', 'array_assign.c'), + join('src', 'common', 'mem_overlap.c'), + join('src', 'common', 'npy_argparse.c'), + join('src', 'common', 'npy_hashtable.c'), + join('src', 'common', 'npy_longdouble.c'), + join('src', 'common', 'templ_common.h.src'), + join('src', 'common', 'ucsnarrow.c'), + join('src', 'common', 'ufunc_override.c'), + join('src', 
'common', 'numpyos.c'), + join('src', 'common', 'npy_cpu_features.c.src'), + ] + + if os.environ.get('NPY_USE_BLAS_ILP64', "0") != "0": + blas_info = get_info('blas_ilp64_opt', 2) + else: + blas_info = get_info('blas_opt', 0) + + have_blas = blas_info and ('HAVE_CBLAS', None) in blas_info.get('define_macros', []) + + if have_blas: + extra_info = blas_info + # These files are also in MANIFEST.in so that they are always in + # the source distribution independently of HAVE_CBLAS. + common_src.extend([join('src', 'common', 'cblasfuncs.c'), + join('src', 'common', 'python_xerbla.c'), + ]) + else: + extra_info = {} + + ####################################################################### + # _multiarray_umath module - multiarray part # + ####################################################################### + + multiarray_deps = [ + join('src', 'multiarray', 'abstractdtypes.h'), + join('src', 'multiarray', 'arrayobject.h'), + join('src', 'multiarray', 'arraytypes.h'), + join('src', 'multiarray', 'arrayfunction_override.h'), + join('src', 'multiarray', 'array_coercion.h'), + join('src', 'multiarray', 'array_method.h'), + join('src', 'multiarray', 'npy_buffer.h'), + join('src', 'multiarray', 'calculation.h'), + join('src', 'multiarray', 'common.h'), + join('src', 'multiarray', 'common_dtype.h'), + join('src', 'multiarray', 'convert_datatype.h'), + join('src', 'multiarray', 'convert.h'), + join('src', 'multiarray', 'conversion_utils.h'), + join('src', 'multiarray', 'ctors.h'), + join('src', 'multiarray', 'descriptor.h'), + join('src', 'multiarray', 'dtypemeta.h'), + join('src', 'multiarray', 'dtype_transfer.h'), + join('src', 'multiarray', 'dragon4.h'), + join('src', 'multiarray', 'einsum_debug.h'), + join('src', 'multiarray', 'einsum_sumprod.h'), + join('src', 'multiarray', 'experimental_public_dtype_api.h'), + join('src', 'multiarray', 'getset.h'), + join('src', 'multiarray', 'hashdescr.h'), + join('src', 'multiarray', 'iterators.h'), + join('src', 'multiarray', 'legacy_dtype_implementation.h'), + join('src', 'multiarray', 'mapping.h'), + join('src', 'multiarray', 'methods.h'), + join('src', 'multiarray', 'multiarraymodule.h'), + join('src', 'multiarray', 'nditer_impl.h'), + join('src', 'multiarray', 'number.h'), + join('src', 'multiarray', 'refcount.h'), + join('src', 'multiarray', 'scalartypes.h'), + join('src', 'multiarray', 'sequence.h'), + join('src', 'multiarray', 'shape.h'), + join('src', 'multiarray', 'strfuncs.h'), + join('src', 'multiarray', 'typeinfo.h'), + join('src', 'multiarray', 'usertypes.h'), + join('src', 'multiarray', 'vdot.h'), + join('include', 'numpy', 'arrayobject.h'), + join('include', 'numpy', '_neighborhood_iterator_imp.h'), + join('include', 'numpy', 'npy_endian.h'), + join('include', 'numpy', 'arrayscalars.h'), + join('include', 'numpy', 'noprefix.h'), + join('include', 'numpy', 'npy_interrupt.h'), + join('include', 'numpy', 'npy_3kcompat.h'), + join('include', 'numpy', 'npy_math.h'), + join('include', 'numpy', 'halffloat.h'), + join('include', 'numpy', 'npy_common.h'), + join('include', 'numpy', 'npy_os.h'), + join('include', 'numpy', 'utils.h'), + join('include', 'numpy', 'ndarrayobject.h'), + join('include', 'numpy', 'npy_cpu.h'), + join('include', 'numpy', 'numpyconfig.h'), + join('include', 'numpy', 'ndarraytypes.h'), + join('include', 'numpy', 'npy_1_7_deprecated_api.h'), + # add library sources as distuils does not consider libraries + # dependencies + ] + npymath_sources + + multiarray_src = [ + join('src', 'multiarray', 'abstractdtypes.c'), + join('src', 
'multiarray', 'alloc.c'), + join('src', 'multiarray', 'arrayobject.c'), + join('src', 'multiarray', 'arraytypes.c.src'), + join('src', 'multiarray', 'array_coercion.c'), + join('src', 'multiarray', 'array_method.c'), + join('src', 'multiarray', 'array_assign_scalar.c'), + join('src', 'multiarray', 'array_assign_array.c'), + join('src', 'multiarray', 'arrayfunction_override.c'), + join('src', 'multiarray', 'buffer.c'), + join('src', 'multiarray', 'calculation.c'), + join('src', 'multiarray', 'compiled_base.c'), + join('src', 'multiarray', 'common.c'), + join('src', 'multiarray', 'common_dtype.c'), + join('src', 'multiarray', 'convert.c'), + join('src', 'multiarray', 'convert_datatype.c'), + join('src', 'multiarray', 'conversion_utils.c'), + join('src', 'multiarray', 'ctors.c'), + join('src', 'multiarray', 'datetime.c'), + join('src', 'multiarray', 'datetime_strings.c'), + join('src', 'multiarray', 'datetime_busday.c'), + join('src', 'multiarray', 'datetime_busdaycal.c'), + join('src', 'multiarray', 'descriptor.c'), + join('src', 'multiarray', 'dtypemeta.c'), + join('src', 'multiarray', 'dragon4.c'), + join('src', 'multiarray', 'dtype_transfer.c'), + join('src', 'multiarray', 'einsum.c.src'), + join('src', 'multiarray', 'einsum_sumprod.c.src'), + join('src', 'multiarray', 'experimental_public_dtype_api.c'), + join('src', 'multiarray', 'flagsobject.c'), + join('src', 'multiarray', 'getset.c'), + join('src', 'multiarray', 'hashdescr.c'), + join('src', 'multiarray', 'item_selection.c'), + join('src', 'multiarray', 'iterators.c'), + join('src', 'multiarray', 'legacy_dtype_implementation.c'), + join('src', 'multiarray', 'lowlevel_strided_loops.c.src'), + join('src', 'multiarray', 'mapping.c'), + join('src', 'multiarray', 'methods.c'), + join('src', 'multiarray', 'multiarraymodule.c'), + join('src', 'multiarray', 'nditer_templ.c.src'), + join('src', 'multiarray', 'nditer_api.c'), + join('src', 'multiarray', 'nditer_constr.c'), + join('src', 'multiarray', 'nditer_pywrap.c'), + join('src', 'multiarray', 'number.c'), + join('src', 'multiarray', 'refcount.c'), + join('src', 'multiarray', 'sequence.c'), + join('src', 'multiarray', 'shape.c'), + join('src', 'multiarray', 'scalarapi.c'), + join('src', 'multiarray', 'scalartypes.c.src'), + join('src', 'multiarray', 'strfuncs.c'), + join('src', 'multiarray', 'temp_elide.c'), + join('src', 'multiarray', 'typeinfo.c'), + join('src', 'multiarray', 'usertypes.c'), + join('src', 'multiarray', 'vdot.c'), + join('src', 'common', 'npy_sort.h.src'), + join('src', 'npysort', 'quicksort.c.src'), + join('src', 'npysort', 'mergesort.c.src'), + join('src', 'npysort', 'timsort.c.src'), + join('src', 'npysort', 'heapsort.c.src'), + join('src', 'npysort', 'radixsort.c.src'), + join('src', 'common', 'npy_partition.h.src'), + join('src', 'npysort', 'selection.c.src'), + join('src', 'common', 'npy_binsearch.h.src'), + join('src', 'npysort', 'binsearch.c.src'), + ] + + ####################################################################### + # _multiarray_umath module - umath part # + ####################################################################### + + def generate_umath_c(ext, build_dir): + target = join(build_dir, header_dir, '__umath_generated.c') + dir = os.path.dirname(target) + if not os.path.exists(dir): + os.makedirs(dir) + script = generate_umath_py + if newer(script, target): + with open(target, 'w') as f: + f.write(generate_umath.make_code(generate_umath.defdict, + generate_umath.__file__)) + return [] + + umath_src = [ + join('src', 'umath', 
'umathmodule.c'), + join('src', 'umath', 'reduction.c'), + join('src', 'umath', 'funcs.inc.src'), + join('src', 'umath', 'simd.inc.src'), + join('src', 'umath', 'loops.h.src'), + join('src', 'umath', 'loops_utils.h.src'), + join('src', 'umath', 'loops.c.src'), + join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), + join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), + join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), + join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), + join('src', 'umath', 'loops_umath_fp.dispatch.c.src'), + join('src', 'umath', 'loops_exponent_log.dispatch.c.src'), + join('src', 'umath', 'matmul.h.src'), + join('src', 'umath', 'matmul.c.src'), + join('src', 'umath', 'clip.h.src'), + join('src', 'umath', 'clip.c.src'), + join('src', 'umath', 'dispatching.c'), + join('src', 'umath', 'legacy_array_method.c'), + join('src', 'umath', 'ufunc_object.c'), + join('src', 'umath', 'extobj.c'), + join('src', 'umath', 'scalarmath.c.src'), + join('src', 'umath', 'ufunc_type_resolution.c'), + join('src', 'umath', 'override.c'), + # For testing. Eventually, should use public API and be separate: + join('src', 'umath', '_scaled_float_dtype.c'), + ] + + umath_deps = [ + generate_umath_py, + join('include', 'numpy', 'npy_math.h'), + join('include', 'numpy', 'halffloat.h'), + join('src', 'multiarray', 'common.h'), + join('src', 'multiarray', 'number.h'), + join('src', 'common', 'templ_common.h.src'), + join('src', 'umath', 'simd.inc.src'), + join('src', 'umath', 'override.h'), + join(codegen_dir, 'generate_ufunc_api.py'), + ] + + svml_path = join('numpy', 'core', 'src', 'umath', 'svml') + svml_objs = [] + if can_link_svml() and check_svml_submodule(svml_path): + svml_objs = glob.glob(svml_path + '/**/*.s', recursive=True) + + config.add_extension('_multiarray_umath', + sources=multiarray_src + umath_src + + common_src + + [generate_config_h, + generate_numpyconfig_h, + generate_numpy_api, + join(codegen_dir, 'generate_numpy_api.py'), + join('*.py'), + generate_umath_c, + generate_ufunc_api, + ], + depends=deps + multiarray_deps + umath_deps + + common_deps, + libraries=['npymath'], + extra_objects=svml_objs, + extra_info=extra_info) + + ####################################################################### + # umath_tests module # + ####################################################################### + + config.add_extension('_umath_tests', sources=[ + join('src', 'umath', '_umath_tests.c.src'), + join('src', 'umath', '_umath_tests.dispatch.c'), + join('src', 'common', 'npy_cpu_features.c.src'), + ]) + + ####################################################################### + # custom rational dtype module # + ####################################################################### + + config.add_extension('_rational_tests', + sources=[join('src', 'umath', '_rational_tests.c.src')]) + + ####################################################################### + # struct_ufunc_test module # + ####################################################################### + + config.add_extension('_struct_ufunc_tests', + sources=[join('src', 'umath', '_struct_ufunc_tests.c.src')]) + + + ####################################################################### + # operand_flag_tests module # + ####################################################################### + + config.add_extension('_operand_flag_tests', + sources=[join('src', 'umath', '_operand_flag_tests.c.src')]) + + ####################################################################### + # SIMD module # + 
####################################################################### + + config.add_extension('_simd', sources=[ + join('src', 'common', 'npy_cpu_features.c.src'), + join('src', '_simd', '_simd.c'), + join('src', '_simd', '_simd_inc.h.src'), + join('src', '_simd', '_simd_data.inc.src'), + join('src', '_simd', '_simd.dispatch.c.src'), + ], depends=[ + join('src', 'common', 'npy_cpu_dispatch.h'), + join('src', 'common', 'simd', 'simd.h'), + join('src', '_simd', '_simd.h'), + join('src', '_simd', '_simd_inc.h.src'), + join('src', '_simd', '_simd_data.inc.src'), + join('src', '_simd', '_simd_arg.inc'), + join('src', '_simd', '_simd_convert.inc'), + join('src', '_simd', '_simd_easyintrin.inc'), + join('src', '_simd', '_simd_vector.inc'), + ]) + + config.add_subpackage('tests') + config.add_data_dir('tests/data') + config.add_data_dir('tests/examples') + config.add_data_files('*.pyi') + + config.make_svn_version_py() + + return config + +if __name__ == '__main__': + from numpy.distutils.core import setup + setup(configuration=configuration) diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 19ea6f22f..32b5bc89e 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -153,4 +153,70 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return vbslq_s64(npyv_cmplt_s64(a, b), a, b); } +#define npyv_reduce_max_u8 vmaxvq_u8 +#define npyv_reduce_max_s8 vmaxvq_s8 +#define npyv_reduce_max_u16 vmaxvq_u16 +#define npyv_reduce_max_s16 vmaxvq_s16 +#define npyv_reduce_max_u32 vmaxvq_u32 +#define npyv_reduce_max_s32 vmaxvq_s32 +NPY_FINLINE npyv_lanetype_u64 npyv_reduce_max_u64(npyv_u64 v) +{ + npyv_lanetype_u64 a = vgetq_lane_u64(v, 0); + npyv_lanetype_u64 b = vgetq_lane_u64(v, 1); + npyv_lanetype_u64 result = (a > b) ? a : b; + return result; +} +NPY_FINLINE npyv_lanetype_s64 npyv_reduce_max_s64(npyv_s64 v) +{ + npyv_lanetype_s64 a = vgetq_lane_s64(v, 0); + npyv_lanetype_s64 b = vgetq_lane_s64(v, 1); + npyv_lanetype_s64 result = (a > b) ? a : b; + return result; +} +#define npyv_reduce_max_f32 vmaxvq_f32 +#if NPY_SIMD_F64 + #define npyv_reduce_max_f64 vmaxvq_f64 +#endif // NPY_SIMD_F64 + +// Minimum, supports IEEE floating-point arithmetic (IEC 60559), +// - If one of the two vectors contains NaN, the equivalent element of the other vector is set +// - Only if both corresponded elements are NaN, NaN is set. +#define npyv_reduce_maxp_f32 vmaxnmvq_f32 +#if NPY_SIMD_F64 + #define npyv_reduce_maxp_f64 vmaxnmvq_f64 +#endif // NPY_SIMD_F64 + +#define npyv_reduce_min_u8 vminvq_u8 +#define npyv_reduce_min_s8 vminvq_s8 +#define npyv_reduce_min_u16 vminvq_u16 +#define npyv_reduce_min_s16 vminvq_s16 +#define npyv_reduce_min_u32 vminvq_u32 +#define npyv_reduce_min_s32 vminvq_s32 +NPY_FINLINE npyv_lanetype_u64 npyv_reduce_min_u64(npyv_u64 v) +{ + npyv_lanetype_u64 a = vgetq_lane_u64(v, 0); + npyv_lanetype_u64 b = vgetq_lane_u64(v, 1); + npyv_lanetype_u64 result = (a < b) ? a : b; + return result; +} +NPY_FINLINE npyv_lanetype_s64 npyv_reduce_min_s64(npyv_s64 v) +{ + npyv_lanetype_s64 a = vgetq_lane_s64(v, 0); + npyv_lanetype_s64 b = vgetq_lane_s64(v, 1); + npyv_lanetype_s64 result = (a < b) ? 
a : b; + return result; +} +#define npyv_reduce_min_f32 vminvq_f32 +#if NPY_SIMD_F64 + #define npyv_reduce_min_f64 vminvq_f64 +#endif // NPY_SIMD_F64 + +// Minimum, supports IEEE floating-point arithmetic (IEC 60559), +// - If one of the two vectors contains NaN, the equivalent element of the other vector is set +// - Only if both corresponded elements are NaN, NaN is set. +#define npyv_reduce_minp_f32 vminnmvq_f32 +#if NPY_SIMD_F64 + #define npyv_reduce_minp_f64 vminnmvq_f64 +#endif // NPY_SIMD_F64 + #endif // _NPY_SIMD_NEON_MATH_H diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 7c0710819..950ea011e 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -13,6 +13,7 @@ #include "numpy/npy_math.h" #include "numpy/halffloat.h" #include "lowlevel_strided_loops.h" +#include "loops.h" #include "npy_pycompat.h" @@ -550,6 +551,7 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void * npy_double, npy_double, npy_double, npy_double# * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0# * #c = hh,uhh,h,uh,,u,l,ul,ll,ull# + * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# */ #define @TYPE@_floor_divide @TYPE@_divide @@ -724,6 +726,7 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void /**end repeat1**/ +#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ /**begin repeat1 * #kind = maximum, minimum# * #OP = >, <# @@ -749,6 +752,7 @@ NPY_NO_EXPORT void } /**end repeat1**/ +#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ NPY_NO_EXPORT void @TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) @@ -1593,6 +1597,7 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #c = f, , l# * #C = F, , L# + * #HAVE_NEON_IMPL = 1*2, HAVE_NEON_IMPL_LONGDOUBLE# */ /**begin repeat1 * #kind = equal, not_equal, less, less_equal, greater, greater_equal, @@ -1716,6 +1721,7 @@ NPY_NO_EXPORT void npy_clear_floatstatus_barrier((char*)dimensions); } +#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ NPY_NO_EXPORT void @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -1741,8 +1747,10 @@ NPY_NO_EXPORT void } npy_clear_floatstatus_barrier((char*)dimensions); } +#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ /**end repeat1**/ +#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ /**begin repeat1 * #kind = fmax, fmin# * #OP = >=, <=# @@ -1770,6 +1778,7 @@ NPY_NO_EXPORT void npy_clear_floatstatus_barrier((char*)dimensions); } /**end repeat1**/ +#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ NPY_NO_EXPORT void @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 0938cd050..aab1b5bbb 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -22,6 +22,13 @@ #define BOOL_fmax BOOL_maximum #define BOOL_fmin BOOL_minimum +/* + * The ARM NEON implementation of min/max for longdouble, longlong, and + * ulonglong assume that they're all 64-bits. If that's not true, then we'll + * use the scalar implementations instead. 
+ */ +#define HAVE_NEON_IMPL_LONGDOUBLE (NPY_BITSOF_LONGDOUBLE == 64) +#define HAVE_NEON_IMPL_LONGLONG (NPY_BITSOF_LONGLONG == 64) /* ***************************************************************************** @@ -660,6 +667,49 @@ PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, vo /* ***************************************************************************** + ** MIN/MAX LOOPS ** + ***************************************************************************** + */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_minmax.dispatch.h" +#endif + +//---------- Integers ---------- + +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# + */ +#if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**begin repeat1 + * #kind = maximum, minimum# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**end repeat**/ + +//---------- Float ---------- + + /**begin repeat + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #HAVE_NEON_IMPL = 1, 1, HAVE_NEON_IMPL_LONGDOUBLE# + */ + #if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**begin repeat1 + * #kind = maximum, minimum, fmax, fmin# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**end repeat**/ + +/* + ***************************************************************************** ** END LOOPS ** ***************************************************************************** */ diff --git a/numpy/core/src/umath/loops.h.src.orig b/numpy/core/src/umath/loops.h.src.orig new file mode 100644 index 000000000..0938cd050 --- /dev/null +++ b/numpy/core/src/umath/loops.h.src.orig @@ -0,0 +1,667 @@ +/* -*- c -*- */ +/* + * vim:syntax=c + */ + +#ifndef _NPY_UMATH_LOOPS_H_ +#define _NPY_UMATH_LOOPS_H_ + +#ifndef NPY_NO_EXPORT + #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + +#define BOOL_invert BOOL_logical_not +#define BOOL_add BOOL_logical_or +#define BOOL_bitwise_and BOOL_logical_and +#define BOOL_bitwise_or BOOL_logical_or +#define BOOL_logical_xor BOOL_not_equal +#define BOOL_bitwise_xor BOOL_logical_xor +#define BOOL_multiply BOOL_logical_and +#define BOOL_maximum BOOL_logical_or +#define BOOL_minimum BOOL_logical_and +#define BOOL_fmax BOOL_maximum +#define BOOL_fmin BOOL_minimum + + +/* + ***************************************************************************** + ** BOOLEAN LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #kind = equal, not_equal, greater, greater_equal, less, less_equal, + * logical_and, logical_or, absolute, logical_not# + **/ +NPY_NO_EXPORT void +BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + +NPY_NO_EXPORT void +BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat + * #kind = isnan, isinf, isfinite# + **/ +NPY_NO_EXPORT void +BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + +/* + ***************************************************************************** + ** 
diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src
new file mode 100644
index 000000000..06aed1bb7
--- /dev/null
+++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src
@@ -0,0 +1,671 @@
+/*@targets
+ ** $maxopt baseline
+ ** neon
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+
+//##############################################################################
+//## Maximum, Minimum
+//##############################################################################
+
+#ifdef NPY_HAVE_NEON
+//******************************************************************************
+//** NEON Min / Max
+//******************************************************************************
+
+
+//------------------------------------------------------------------------------
+//-- Integer +//------------------------------------------------------------------------------ + + +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, + * npy_long, npy_ulong, npy_longlong, npy_ulonglong# + * #sfx = s8, u8, s16, u16, s32, u32, + * s64, u64, s64, u64# + * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# + */ + +// Implementation below assumes longlong and ulonglong are 64-bit. +#if @HAVE_NEON_IMPL@ + +/**begin repeat1 +* Arithmetic +* # kind = maximum, minimum# +* # OP = >, <# +* # vop = max, min# +*/ + +#define SCALAR_OP @TYPE@_scalar_@vop@ +static inline @type@ SCALAR_OP(@type@ a, @type@ b){ + return ((a @OP@ b) ? a : b); +} + +static inline npy_intp +simd_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + @type@ *ip2 = (@type@ *)args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + + const int vectorsPerLoop = 8; + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + + // SIMD if possible + if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ + #define NPYV_CAST (const npyv_lanetype_@sfx@ *) + npyv_@sfx@ m0 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 0 * elemPerVector]); + npyv_@sfx@ m1 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 1 * elemPerVector]); + npyv_@sfx@ m2 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 2 * elemPerVector]); + npyv_@sfx@ m3 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 3 * elemPerVector]); + npyv_@sfx@ m4 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 4 * elemPerVector]); + npyv_@sfx@ m5 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 5 * elemPerVector]); + npyv_@sfx@ m6 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 6 * elemPerVector]); + npyv_@sfx@ m7 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 7 * elemPerVector]); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 0 * elemPerVector]); + npyv_@sfx@ v1 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 1 * elemPerVector]); + npyv_@sfx@ v2 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 2 * elemPerVector]); + npyv_@sfx@ v3 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 3 * elemPerVector]); + npyv_@sfx@ v4 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 4 * elemPerVector]); + npyv_@sfx@ v5 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 5 * elemPerVector]); + npyv_@sfx@ v6 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 6 * elemPerVector]); + npyv_@sfx@ v7 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 7 * elemPerVector]); + + m0 = npyv_@vop@_@sfx@(m0, v0); + m1 = npyv_@vop@_@sfx@(m1, v1); + m2 = npyv_@vop@_@sfx@(m2, v2); + m3 = npyv_@vop@_@sfx@(m3, v3); + m4 = npyv_@vop@_@sfx@(m4, v4); + m5 = npyv_@vop@_@sfx@(m5, v5); + m6 = npyv_@vop@_@sfx@(m6, v6); + m7 = npyv_@vop@_@sfx@(m7, v7); + } + + #undef NPYV_CAST + + m0 = npyv_@vop@_@sfx@(m0, m1); + m2 = npyv_@vop@_@sfx@(m2, m3); + m4 = npyv_@vop@_@sfx@(m4, m5); + m6 = npyv_@vop@_@sfx@(m6, m7); + + m0 = npyv_@vop@_@sfx@(m0, m2); + m4 = npyv_@vop@_@sfx@(m4, m6); + + m0 = npyv_@vop@_@sfx@(m0, m4); + + @type@ r = npyv_reduce_@vop@_@sfx@(m0); + + io1 = SCALAR_OP(io1, r); + } + + *((@type@ *)iop1) = io1; + + return i; +} + +static inline npy_intp +scalar_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + char *ip2 = args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + + // 8x scalar + npy_intp 
elemPerLoop = 8; + if((i+elemPerLoop) <= n){ + @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + m0 = SCALAR_OP(m0, v0); + m1 = SCALAR_OP(m1, v1); + m2 = SCALAR_OP(m2, v2); + m3 = SCALAR_OP(m3, v3); + m4 = SCALAR_OP(m4, v4); + m5 = SCALAR_OP(m5, v5); + m6 = SCALAR_OP(m6, v6); + m7 = SCALAR_OP(m7, v7); + } + + m0 = SCALAR_OP(m0, m1); + m2 = SCALAR_OP(m2, m3); + m4 = SCALAR_OP(m4, m5); + m6 = SCALAR_OP(m6, m7); + + m0 = SCALAR_OP(m0, m2); + m4 = SCALAR_OP(m4, m6); + + m0 = SCALAR_OP(m0, m4); + + io1 = SCALAR_OP(io1, m0); + } + + *((@type@ *)iop1) = io1; + + return i; +} + +static inline void +run_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + char *ip2 = args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + npy_intp i; + + const int vectorsPerLoop = 8; + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + + i = 0; + + if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ + // SIMD - do as many iterations as we can with vectors + i = simd_reduce_@TYPE@_@kind@(args, dimensions, steps, i); + } else{ + // Unrolled scalar - do as many iterations as we can with unrolled loops + i = scalar_reduce_@TYPE@_@kind@(args, dimensions, steps, i); + } + + // Scalar - finish up any remaining iterations + io1 = iop1[0]; + ip2 += i * is2; + for(; i<n; ++i, ip2 += is2){ + const @type@ in2 = *(@type@ *)ip2; + io1 = SCALAR_OP(io1, in2); + } + *((@type@ *)iop1) = io1; +} + +static inline npy_intp +simd_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp n = dimensions[0]; + + const int vectorsPerLoop = 6; + const int elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + int elemPerLoop = vectorsPerLoop * elemPerVector; + + if(IS_BINARY_STRIDE_ONE(sizeof(@type@), NPY_SIMD_WIDTH)){ + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 0 * elemPerVector) * is1)); + npyv_@sfx@ v1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 1 * elemPerVector) * is1)); + npyv_@sfx@ v2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 2 * elemPerVector) * is1)); + npyv_@sfx@ v3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 3 * elemPerVector) * is1)); + npyv_@sfx@ v4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 4 * elemPerVector) * is1)); + npyv_@sfx@ v5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 5 * elemPerVector) * is1)); + + npyv_@sfx@ u0 = 
npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 0 * elemPerVector) * is2));
+        npyv_@sfx@ u1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 1 * elemPerVector) * is2));
+        npyv_@sfx@ u2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 2 * elemPerVector) * is2));
+        npyv_@sfx@ u3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 3 * elemPerVector) * is2));
+        npyv_@sfx@ u4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 4 * elemPerVector) * is2));
+        npyv_@sfx@ u5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 5 * elemPerVector) * is2));
+
+        npyv_@sfx@ m0 = npyv_@vop@_@sfx@(v0, u0);
+        npyv_@sfx@ m1 = npyv_@vop@_@sfx@(v1, u1);
+        npyv_@sfx@ m2 = npyv_@vop@_@sfx@(v2, u2);
+        npyv_@sfx@ m3 = npyv_@vop@_@sfx@(v3, u3);
+        npyv_@sfx@ m4 = npyv_@vop@_@sfx@(v4, u4);
+        npyv_@sfx@ m5 = npyv_@vop@_@sfx@(v5, u5);
+
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 0 * elemPerVector) * os1), m0);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 1 * elemPerVector) * os1), m1);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 2 * elemPerVector) * os1), m2);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 3 * elemPerVector) * os1), m3);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 4 * elemPerVector) * os1), m4);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 5 * elemPerVector) * os1), m5);
+        }
+    }
+
+    return i;
+}
+
+static inline npy_intp
+scalar_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i)
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
+    npy_intp n = dimensions[0];
+
+    npy_intp elemPerLoop = 4;
+    for(; (i+elemPerLoop)<=n; i+=elemPerLoop){
+        /* Note: we can't just load everything, do all the ops, and then
+         * store everything here. Ufuncs are sometimes called with
+         * `accumulate`, which assumes each iteration has finished before
+         * the next one begins. For example, the output of iteration 2
+         * depends on the result of iteration 1.
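+         * (With `accumulate`, one input aliases the output shifted by one
+         * element, so an unrolled step may read a value stored by the
+         * previous step.)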
+ */ + + /**begin repeat2 + * #unroll = 0, 1, 2, 3# + */ + @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1)); + @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2)); + *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@); + /**end repeat2**/ + } + + return i; +} + +static inline void +run_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + BINARY_DEFS + + i = 0; + + if(IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH) && + nomemoverlap(ip1, is1 * n, op1, os1 * n) && + nomemoverlap(ip2, is2 * n, op1, os1 * n)) + { + // SIMD - do as many iterations as we can with vectors + i = simd_binary_@TYPE@_@kind@(args, dimensions, steps, i); + } else { + // Unrolled scalar - do as many iterations as we can with unrolled loops + i = scalar_binary_@TYPE@_@kind@(args, dimensions, steps, i); + } + + + // Scalar - finish up any remaining iterations + ip1 += is1 * i; + ip2 += is2 * i; + op1 += os1 * i; + for(; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1){ + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + + *((@type@ *)op1) = SCALAR_OP(in1, in2); + } +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { + run_reduce_@TYPE@_@kind@(args, dimensions, steps); + } else { + run_binary_@TYPE@_@kind@(args, dimensions, steps); + } +} + +#undef SCALAR_OP + +/**end repeat1**/ + +#endif // @HAVE_NEON_IMPL@ + +/**end repeat**/ + + +//------------------------------------------------------------------------------ +//-- Float +//------------------------------------------------------------------------------ + +/**begin repeat + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #type = npy_float, npy_double, npy_longdouble# + * #sfx = f32, f64, f64# + * #asmType = s, d, d# + * #HAVE_NEON_IMPL = 1*2, HAVE_NEON_IMPL_LONGDOUBLE# + */ + +// Implementation below assumes longdouble is 64-bit. +#if @HAVE_NEON_IMPL@ + +/**begin repeat1 +* Arithmetic +* # kind = maximum, minimum, fmax, fmin# +* # OP = >=, <=, >=, <=# +* # vop = max, min, maxp, minp# +* # asmInstr = fmax, fmin, fmaxnm, fminnm# +* # PROPAGATE_NAN = 1, 1, 0, 0# +*/ + +#define SCALAR_OP @TYPE@_scalar_@vop@ +static inline @type@ SCALAR_OP(@type@ a, @type@ b){ +#ifdef __aarch64__ + @type@ result = 0; + __asm( + "@asmInstr@ %@asmType@[result], %@asmType@[a], %@asmType@[b]" + : [result] "=w" (result) + : [a] "w" (a), [b] "w" (b) + ); + return result; +#else + +#if @PROPAGATE_NAN@ + return ((a @OP@ b || npy_isnan(a)) ? a : b); +#else + return ((a @OP@ b || npy_isnan(b)) ? 
a : b); +#endif +#endif // __aarch64__ +} + +static inline npy_intp +simd_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + @type@ *ip2 = (@type@ *)args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + + const int vectorsPerLoop = 8; + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + + // SIMD if possible + if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ + npyv_@sfx@ m0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); + npyv_@sfx@ m1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); + npyv_@sfx@ m2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); + npyv_@sfx@ m3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); + npyv_@sfx@ m4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); + npyv_@sfx@ m5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); + npyv_@sfx@ m6 = npyv_load_@sfx@(&ip2[i + 6 * elemPerVector]); + npyv_@sfx@ m7 = npyv_load_@sfx@(&ip2[i + 7 * elemPerVector]); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); + npyv_@sfx@ v3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); + npyv_@sfx@ v4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); + npyv_@sfx@ v5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); + npyv_@sfx@ v6 = npyv_load_@sfx@(&ip2[i + 6 * elemPerVector]); + npyv_@sfx@ v7 = npyv_load_@sfx@(&ip2[i + 7 * elemPerVector]); + + m0 = npyv_@vop@_@sfx@(m0, v0); + m1 = npyv_@vop@_@sfx@(m1, v1); + m2 = npyv_@vop@_@sfx@(m2, v2); + m3 = npyv_@vop@_@sfx@(m3, v3); + m4 = npyv_@vop@_@sfx@(m4, v4); + m5 = npyv_@vop@_@sfx@(m5, v5); + m6 = npyv_@vop@_@sfx@(m6, v6); + m7 = npyv_@vop@_@sfx@(m7, v7); + } + + m0 = npyv_@vop@_@sfx@(m0, m1); + m2 = npyv_@vop@_@sfx@(m2, m3); + m4 = npyv_@vop@_@sfx@(m4, m5); + m6 = npyv_@vop@_@sfx@(m6, m7); + + m0 = npyv_@vop@_@sfx@(m0, m2); + m4 = npyv_@vop@_@sfx@(m4, m6); + + m0 = npyv_@vop@_@sfx@(m0, m4); + + @type@ r = npyv_reduce_@vop@_@sfx@(m0); + + io1 = SCALAR_OP(io1, r); + } + + *((@type@ *)iop1) = io1; + + return i; +} + +static inline npy_intp +scalar_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + char *ip2 = args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + + // 8x scalar + npy_intp elemPerLoop = 8; + if((i+elemPerLoop) <= n){ + @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + m0 = SCALAR_OP(m0, v0); + m1 = SCALAR_OP(m1, v1); + m2 = 
SCALAR_OP(m2, v2); + m3 = SCALAR_OP(m3, v3); + m4 = SCALAR_OP(m4, v4); + m5 = SCALAR_OP(m5, v5); + m6 = SCALAR_OP(m6, v6); + m7 = SCALAR_OP(m7, v7); + } + + m0 = SCALAR_OP(m0, m1); + m2 = SCALAR_OP(m2, m3); + m4 = SCALAR_OP(m4, m5); + m6 = SCALAR_OP(m6, m7); + + m0 = SCALAR_OP(m0, m2); + m4 = SCALAR_OP(m4, m6); + + m0 = SCALAR_OP(m0, m4); + + io1 = SCALAR_OP(io1, m0); + } + + *((@type@ *)iop1) = io1; + + return i; +} + +static inline void +run_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + @type@ *iop1 = (@type@ *)args[0]; + @type@ io1 = iop1[0]; + char *ip2 = args[1]; + npy_intp is2 = steps[1]; + npy_intp n = dimensions[0]; + npy_intp i; + + const int vectorsPerLoop = 8; + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + + i = 0; + + if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ + // SIMD - do as many iterations as we can with vectors + i = simd_reduce_@TYPE@_@kind@(args, dimensions, steps, i); + } else{ + // Unrolled scalar - do as many iterations as we can with unrolled loops + i = scalar_reduce_@TYPE@_@kind@(args, dimensions, steps, i); + } + + // Scalar - finish up any remaining iterations + io1 = iop1[0]; + ip2 += i * is2; + for(; i<n; ++i, ip2 += is2){ + const @type@ in2 = *(@type@ *)ip2; + io1 = SCALAR_OP(io1, in2); + } + *((@type@ *)iop1) = io1; +} + +static inline npy_intp +simd_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +{ + char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; + npy_intp n = dimensions[0]; + + const int vectorsPerLoop = 6; + const int elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + int elemPerLoop = vectorsPerLoop * elemPerVector; + + if(IS_BINARY_STRIDE_ONE(sizeof(@type@), NPY_SIMD_WIDTH)){ + for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 0 * elemPerVector) * is1)); + npyv_@sfx@ v1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 1 * elemPerVector) * is1)); + npyv_@sfx@ v2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 2 * elemPerVector) * is1)); + npyv_@sfx@ v3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 3 * elemPerVector) * is1)); + npyv_@sfx@ v4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 4 * elemPerVector) * is1)); + npyv_@sfx@ v5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 5 * elemPerVector) * is1)); + + npyv_@sfx@ u0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 0 * elemPerVector) * is2)); + npyv_@sfx@ u1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 1 * elemPerVector) * is2)); + npyv_@sfx@ u2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 2 * elemPerVector) * is2)); + npyv_@sfx@ u3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 3 * elemPerVector) * is2)); + npyv_@sfx@ u4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 4 * elemPerVector) * is2)); + npyv_@sfx@ u5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 5 * elemPerVector) * is2)); + + npyv_@sfx@ m0 = npyv_@vop@_@sfx@(v0, u0); + npyv_@sfx@ m1 = npyv_@vop@_@sfx@(v1, u1); + npyv_@sfx@ m2 = npyv_@vop@_@sfx@(v2, u2); + npyv_@sfx@ m3 = npyv_@vop@_@sfx@(v3, u3); + npyv_@sfx@ m4 = npyv_@vop@_@sfx@(v4, u4); + npyv_@sfx@ m5 = npyv_@vop@_@sfx@(v5, u5); + + npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 0 * elemPerVector) * os1), m0); + 
npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 1 * elemPerVector) * os1), m1);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 2 * elemPerVector) * os1), m2);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 3 * elemPerVector) * os1), m3);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 4 * elemPerVector) * os1), m4);
+        npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 5 * elemPerVector) * os1), m5);
+        }
+    }
+
+    return i;
+}
+
+static inline npy_intp
+scalar_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i)
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
+    npy_intp n = dimensions[0];
+
+    npy_intp elemPerLoop = 4;
+    for(; (i+elemPerLoop)<=n; i+=elemPerLoop){
+        /* Note: we can't just load everything, do all the ops, and then
+         * store everything here. Ufuncs are sometimes called with
+         * `accumulate`, which assumes each iteration has finished before
+         * the next one begins. For example, the output of iteration 2
+         * depends on the result of iteration 1.
+         */
+
+        /**begin repeat2
+         * #unroll = 0, 1, 2, 3#
+         */
+        @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1));
+        @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2));
+        *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@);
+        /**end repeat2**/
+    }
+
+    return i;
+}
+
+static inline void
+run_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+    BINARY_DEFS
+
+    i = 0;
+
+    if(IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH) &&
+       nomemoverlap(ip1, is1 * n, op1, os1 * n) &&
+       nomemoverlap(ip2, is2 * n, op1, os1 * n))
+    {
+        // SIMD - do as many iterations as we can with vectors
+        i = simd_binary_@TYPE@_@kind@(args, dimensions, steps, i);
+    } else {
+        // Unrolled scalar - do as many iterations as we can with unrolled loops
+        i = scalar_binary_@TYPE@_@kind@(args, dimensions, steps, i);
+    }
+
+    // Scalar - finish up any remaining iterations
+    ip1 += is1 * i;
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for(; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1){
+        const @type@ in1 = *(@type@ *)ip1;
+        const @type@ in2 = *(@type@ *)ip2;
+
+        *((@type@ *)op1) = SCALAR_OP(in1, in2);
+    }
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        run_reduce_@TYPE@_@kind@(args, dimensions, steps);
+    } else {
+        run_binary_@TYPE@_@kind@(args, dimensions, steps);
+    }
+
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#undef SCALAR_OP
+
+/**end repeat1**/
+
+#endif // @HAVE_NEON_IMPL@
+
+/**end repeat**/
+
+#endif // NPY_HAVE_NEON
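A quick way to sanity-check the reduction intrinsics the NEON header maps to (`vminvq_f32` vs. `vminnmvq_f32`) is a standalone program; this sketch assumes an aarch64 compiler and is illustrative only, not part of the patch:

```
#include <arm_neon.h>
#include <math.h>
#include <stdio.h>

int main(void) {
    float data[4] = {1.0f, NAN, -3.0f, 2.0f};
    float32x4_t v = vld1q_f32(data);

    /* FMINV propagates the NaN, matching np.minimum.reduce: prints nan */
    printf("vminvq_f32   = %f\n", (double)vminvq_f32(v));
    /* FMINNMV ignores the quiet NaN, matching np.fmin.reduce: prints -3.0 */
    printf("vminnmvq_f32 = %f\n", (double)vminnmvq_f32(v));
    return 0;
}
```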