author:    Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>  2022-08-23 12:44:35 -0700
committer: Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>  2022-12-06 17:22:07 -0800
commit:    93cbbbef2318eb9a5478c846a60a55b382e4c60d (patch)
tree:      bb148a7670aa9672b01627a014d1075a80991902
parent:    28c81bc4b287255c4570a58e6d6bcd86600cd0b5 (diff)
download:  numpy-93cbbbef2318eb9a5478c846a60a55b382e4c60d.tar.gz
ENH: Add SIMD versions of bool logical_and, logical_or, logical_not and absolute
NumPy has SIMD versions of the BOOL `logical_and`, `logical_or`, `logical_not`, and `absolute` loops, but only for SSE2. The changes here replace that implementation with one built on NumPy's universal intrinsics, so other architectures get SIMD versions of these functions as well.
BOOL `logical_and` and `logical_or` are particularly important for NumPy because they are how `np.any()` and `np.all()` are implemented.
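As a quick illustration of that last point (a minimal sketch; the `reduce` equivalence is standard NumPy behavior, not something added by this patch):

```python
import numpy as np

a = np.array([True, True, False])

# np.all()/np.any() on bool arrays go through the BOOL logical_and /
# logical_or reduce loops that this patch vectorizes.
assert np.all(a) == np.logical_and.reduce(a)   # -> False
assert np.any(a) == np.logical_or.reduce(a)    # -> True
```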
-rw-r--r--  .gitignore                                         |    1
-rw-r--r--  numpy/core/code_generators/generate_umath.py       |    4
-rw-r--r--  numpy/core/setup.py                                |    1
-rw-r--r--  numpy/core/setup.py.orig                           | 1173
-rw-r--r--  numpy/core/src/umath/loops.c.src                   |   92
-rw-r--r--  numpy/core/src/umath/loops.h.src                   |   12
-rw-r--r--  numpy/core/src/umath/loops_logical.dispatch.c.src  |  427
-rw-r--r--  numpy/core/src/umath/simd.inc.src                  |  211

8 files changed, 1614 insertions, 307 deletions
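One detail worth calling out before the diff: every new kernel funnels its output through a `byte_to_true` helper that normalizes any non-zero byte to exactly 1, keeping vectorized and scalar paths consistent. A pure-NumPy model of what that helper computes per byte (an illustrative sketch, not code from the patch):

```python
import numpy as np

def byte_to_true(v):
    """Model of the byte_to_true SIMD helper in the diff below:
    0x00 stays 0x00, any non-zero byte becomes exactly 0x01."""
    v = np.asarray(v, dtype=np.uint8)
    # cmpeq(v, 0): 0x00 -> 0xFF, non-zero -> 0x00
    tmp = np.where(v == 0, 0xFF, 0x00).astype(np.uint8)
    # andc(truemask, tmp): truemask & ~tmp
    return np.uint8(1) & ~tmp

assert list(byte_to_true([0, 1, 2, 255])) == [0, 1, 1, 1]
```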
diff --git a/.gitignore b/.gitignore index 9851fcc77..c0d370bc2 100644 --- a/.gitignore +++ b/.gitignore @@ -220,6 +220,7 @@ numpy/core/src/umath/loops_unary.dispatch.c numpy/core/src/umath/loops_unary_fp.dispatch.c numpy/core/src/umath/loops_arithm_fp.dispatch.c numpy/core/src/umath/loops_arithmetic.dispatch.c +numpy/core/src/umath/loops_logical.dispatch.c numpy/core/src/umath/loops_minmax.dispatch.c numpy/core/src/umath/loops_trigonometric.dispatch.c numpy/core/src/umath/loops_exponent_log.dispatch.c diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 768c8deee..114c743e2 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -413,6 +413,7 @@ defdict = { docstrings.get('numpy.core.umath.absolute'), 'PyUFunc_AbsoluteTypeResolver', TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd')]), + TD('?', dispatch=[('loops_logical', '?')]), TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')), TD(O, f='PyNumber_Absolute'), ), @@ -496,6 +497,7 @@ defdict = { Ufunc(2, 1, True_, docstrings.get('numpy.core.umath.logical_and'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', + TD('?', dispatch=[('loops_logical', '?')]), TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]), TD(O, f='npy_ObjectLogicalAnd'), ), @@ -503,6 +505,7 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.logical_not'), None, + TD('?', dispatch=[('loops_logical', '?')]), TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]), TD(O, f='npy_ObjectLogicalNot'), ), @@ -510,6 +513,7 @@ defdict = { Ufunc(2, 1, False_, docstrings.get('numpy.core.umath.logical_or'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', + TD('?', dispatch=[('loops_logical', '?')]), TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]), TD(O, f='npy_ObjectLogicalOr'), ), diff --git a/numpy/core/setup.py b/numpy/core/setup.py index da5bc64c0..1c42e99c0 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -1009,6 +1009,7 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), + join('src', 'umath', 'loops_logical.dispatch.c.src'), join('src', 'umath', 'loops_minmax.dispatch.c.src'), join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), join('src', 'umath', 'loops_umath_fp.dispatch.c.src'), diff --git a/numpy/core/setup.py.orig b/numpy/core/setup.py.orig new file mode 100644 index 000000000..65aacfdad --- /dev/null +++ b/numpy/core/setup.py.orig @@ -0,0 +1,1173 @@ +import os +import sys +import sysconfig +import pickle +import copy +import warnings +import textwrap +import glob +from os.path import join + +from numpy.distutils import log +from numpy.distutils.msvccompiler import lib_opts_if_msvc +from distutils.dep_util import newer +from sysconfig import get_config_var +from numpy.compat import npy_load_module +from setup_common import * # noqa: F403 + +# Set to True to enable relaxed strides checking. This (mostly) means +# that `strides[dim]` is ignored if `shape[dim] == 1` when setting flags. +NPY_RELAXED_STRIDES_CHECKING = (os.environ.get('NPY_RELAXED_STRIDES_CHECKING', "1") != "0") +if not NPY_RELAXED_STRIDES_CHECKING: + raise SystemError( + "Support for NPY_RELAXED_STRIDES_CHECKING=0 has been remove as of " + "NumPy 1.23. 
This error will eventually be removed entirely.") + +# Put NPY_RELAXED_STRIDES_DEBUG=1 in the environment if you want numpy to use a +# bogus value for affected strides in order to help smoke out bad stride usage +# when relaxed stride checking is enabled. +NPY_RELAXED_STRIDES_DEBUG = (os.environ.get('NPY_RELAXED_STRIDES_DEBUG', "0") != "0") +NPY_RELAXED_STRIDES_DEBUG = NPY_RELAXED_STRIDES_DEBUG and NPY_RELAXED_STRIDES_CHECKING + +# Set NPY_DISABLE_SVML=1 in the environment to disable the vendored SVML +# library. This option only has significance on a Linux x86_64 host and is most +# useful to avoid improperly requiring SVML when cross compiling. +NPY_DISABLE_SVML = (os.environ.get('NPY_DISABLE_SVML', "0") == "1") + +# XXX: ugly, we use a class to avoid calling twice some expensive functions in +# config.h/numpyconfig.h. I don't see a better way because distutils force +# config.h generation inside an Extension class, and as such sharing +# configuration information between extensions is not easy. +# Using a pickled-based memoize does not work because config_cmd is an instance +# method, which cPickle does not like. +# +# Use pickle in all cases, as cPickle is gone in python3 and the difference +# in time is only in build. -- Charles Harris, 2013-03-30 + +class CallOnceOnly: + def __init__(self): + self._check_types = None + self._check_ieee_macros = None + self._check_complex = None + + def check_types(self, *a, **kw): + if self._check_types is None: + out = check_types(*a, **kw) + self._check_types = pickle.dumps(out) + else: + out = copy.deepcopy(pickle.loads(self._check_types)) + return out + + def check_ieee_macros(self, *a, **kw): + if self._check_ieee_macros is None: + out = check_ieee_macros(*a, **kw) + self._check_ieee_macros = pickle.dumps(out) + else: + out = copy.deepcopy(pickle.loads(self._check_ieee_macros)) + return out + + def check_complex(self, *a, **kw): + if self._check_complex is None: + out = check_complex(*a, **kw) + self._check_complex = pickle.dumps(out) + else: + out = copy.deepcopy(pickle.loads(self._check_complex)) + return out + +def can_link_svml(): + """SVML library is supported only on x86_64 architecture and currently + only on linux + """ + if NPY_DISABLE_SVML: + return False + platform = sysconfig.get_platform() + return ("x86_64" in platform + and "linux" in platform + and sys.maxsize > 2**31) + +def check_svml_submodule(svmlpath): + if not os.path.exists(svmlpath + "/README.md"): + raise RuntimeError("Missing `SVML` submodule! Run `git submodule " + "update --init` to fix this.") + return True + +def pythonlib_dir(): + """return path where libpython* is.""" + if sys.platform == 'win32': + return os.path.join(sys.prefix, "libs") + else: + return get_config_var('LIBDIR') + +def is_npy_no_signal(): + """Return True if the NPY_NO_SIGNAL symbol must be defined in configuration + header.""" + return sys.platform == 'win32' + +def is_npy_no_smp(): + """Return True if the NPY_NO_SMP symbol must be defined in public + header (when SMP support cannot be reliably enabled).""" + # Perhaps a fancier check is in order here. + # so that threads are only enabled if there + # are actually multiple CPUS? -- but + # threaded code can be nice even on a single + # CPU so that long-calculating code doesn't + # block. 
+ return 'NPY_NOSMP' in os.environ + +def win32_checks(deflist): + from numpy.distutils.misc_util import get_build_architecture + a = get_build_architecture() + + # Distutils hack on AMD64 on windows + print('BUILD_ARCHITECTURE: %r, os.name=%r, sys.platform=%r' % + (a, os.name, sys.platform)) + if a == 'AMD64': + deflist.append('DISTUTILS_USE_SDK') + + # On win32, force long double format string to be 'g', not + # 'Lg', since the MS runtime does not support long double whose + # size is > sizeof(double) + if a == "Intel" or a == "AMD64": + deflist.append('FORCE_NO_LONG_DOUBLE_FORMATTING') + +def check_math_capabilities(config, ext, moredefs, mathlibs): + def check_func( + func_name, + decl=False, + headers=["feature_detection_math.h"], + ): + return config.check_func( + func_name, + libraries=mathlibs, + decl=decl, + call=True, + call_args=FUNC_CALL_ARGS[func_name], + headers=headers, + ) + + def check_funcs_once(funcs_name, headers=["feature_detection_math.h"], + add_to_moredefs=True): + call = dict([(f, True) for f in funcs_name]) + call_args = dict([(f, FUNC_CALL_ARGS[f]) for f in funcs_name]) + st = config.check_funcs_once( + funcs_name, + libraries=mathlibs, + decl=False, + call=call, + call_args=call_args, + headers=headers, + ) + if st and add_to_moredefs: + moredefs.extend([(fname2def(f), 1) for f in funcs_name]) + return st + + def check_funcs(funcs_name, headers=["feature_detection_math.h"]): + # Use check_funcs_once first, and if it does not work, test func per + # func. Return success only if all the functions are available + if not check_funcs_once(funcs_name, headers=headers): + # Global check failed, check func per func + for f in funcs_name: + if check_func(f, headers=headers): + moredefs.append((fname2def(f), 1)) + return 0 + else: + return 1 + + #use_msvc = config.check_decl("_MSC_VER") + if not check_funcs_once(MANDATORY_FUNCS, add_to_moredefs=False): + raise SystemError("One of the required function to build numpy is not" + " available (the list is %s)." % str(MANDATORY_FUNCS)) + + # Standard functions which may not be available and for which we have a + # replacement implementation. Note that some of these are C99 functions. + + # XXX: hack to circumvent cpp pollution from python: python put its + # config.h in the public namespace, so we have a clash for the common + # functions we test. We remove every function tested by python's + # autoconf, hoping their own test are correct + for f in OPTIONAL_FUNCS_MAYBE: + if config.check_decl(fname2def(f), headers=["Python.h"]): + OPTIONAL_FILE_FUNCS.remove(f) + + check_funcs(OPTIONAL_FILE_FUNCS, headers=["feature_detection_stdio.h"]) + check_funcs(OPTIONAL_MISC_FUNCS, headers=["feature_detection_misc.h"]) + + for h in OPTIONAL_HEADERS: + if config.check_func("", decl=False, call=False, headers=[h]): + h = h.replace(".", "_").replace(os.path.sep, "_") + moredefs.append((fname2def(h), 1)) + + # Try with both "locale.h" and "xlocale.h" + locale_headers = [ + "stdlib.h", + "xlocale.h", + "feature_detection_locale.h", + ] + if not check_funcs(OPTIONAL_LOCALE_FUNCS, headers=locale_headers): + # It didn't work with xlocale.h, maybe it will work with locale.h? 
+ locale_headers[1] = "locale.h" + check_funcs(OPTIONAL_LOCALE_FUNCS, headers=locale_headers) + + for tup in OPTIONAL_INTRINSICS: + headers = None + if len(tup) == 2: + f, args, m = tup[0], tup[1], fname2def(tup[0]) + elif len(tup) == 3: + f, args, headers, m = tup[0], tup[1], [tup[2]], fname2def(tup[0]) + else: + f, args, headers, m = tup[0], tup[1], [tup[2]], fname2def(tup[3]) + if config.check_func(f, decl=False, call=True, call_args=args, + headers=headers): + moredefs.append((m, 1)) + + for dec, fn in OPTIONAL_FUNCTION_ATTRIBUTES: + if config.check_gcc_function_attribute(dec, fn): + moredefs.append((fname2def(fn), 1)) + if fn == 'attribute_target_avx512f': + # GH-14787: Work around GCC<8.4 bug when compiling with AVX512 + # support on Windows-based platforms + if (sys.platform in ('win32', 'cygwin') and + config.check_compiler_gcc() and + not config.check_gcc_version_at_least(8, 4)): + ext.extra_compile_args.extend( + ['-ffixed-xmm%s' % n for n in range(16, 32)]) + + for dec, fn, code, header in OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS: + if config.check_gcc_function_attribute_with_intrinsics(dec, fn, code, + header): + moredefs.append((fname2def(fn), 1)) + + for fn in OPTIONAL_VARIABLE_ATTRIBUTES: + if config.check_gcc_variable_attribute(fn): + m = fn.replace("(", "_").replace(")", "_") + moredefs.append((fname2def(m), 1)) + +def check_complex(config, mathlibs): + priv = [] + pub = [] + + # Check for complex support + st = config.check_header('complex.h') + if st: + priv.append(('HAVE_COMPLEX_H', 1)) + pub.append(('NPY_USE_C99_COMPLEX', 1)) + + for t in C99_COMPLEX_TYPES: + st = config.check_type(t, headers=["complex.h"]) + if st: + pub.append(('NPY_HAVE_%s' % type2def(t), 1)) + + def check_prec(prec): + flist = [f + prec for f in C99_COMPLEX_FUNCS] + decl = dict([(f, True) for f in flist]) + if not config.check_funcs_once(flist, call=decl, decl=decl, + libraries=mathlibs): + for f in flist: + if config.check_func(f, call=True, decl=True, + libraries=mathlibs): + priv.append((fname2def(f), 1)) + else: + priv.extend([(fname2def(f), 1) for f in flist]) + + check_prec('') + check_prec('f') + check_prec('l') + + return priv, pub + +def check_ieee_macros(config): + priv = [] + pub = [] + + macros = [] + + def _add_decl(f): + priv.append(fname2def("decl_%s" % f)) + pub.append('NPY_%s' % fname2def("decl_%s" % f)) + + # XXX: hack to circumvent cpp pollution from python: python put its + # config.h in the public namespace, so we have a clash for the common + # functions we test. We remove every function tested by python's + # autoconf, hoping their own test are correct + _macros = ["isnan", "isinf", "signbit", "isfinite"] + for f in _macros: + py_symbol = fname2def("decl_%s" % f) + already_declared = config.check_decl(py_symbol, + headers=["Python.h", "math.h"]) + if already_declared: + if config.check_macro_true(py_symbol, + headers=["Python.h", "math.h"]): + pub.append('NPY_%s' % fname2def("decl_%s" % f)) + else: + macros.append(f) + # Normally, isnan and isinf are macro (C99), but some platforms only have + # func, or both func and macro version. Check for macro only, and define + # replacement ones if not found. + # Note: including Python.h is necessary because it modifies some math.h + # definitions + for f in macros: + st = config.check_decl(f, headers=["Python.h", "math.h"]) + if st: + _add_decl(f) + + return priv, pub + +def check_types(config_cmd, ext, build_dir): + private_defines = [] + public_defines = [] + + # Expected size (in number of bytes) for each type. 
This is an + # optimization: those are only hints, and an exhaustive search for the size + # is done if the hints are wrong. + expected = {'short': [2], 'int': [4], 'long': [8, 4], + 'float': [4], 'double': [8], 'long double': [16, 12, 8], + 'Py_intptr_t': [8, 4], 'PY_LONG_LONG': [8], 'long long': [8], + 'off_t': [8, 4]} + + # Check we have the python header (-dev* packages on Linux) + result = config_cmd.check_header('Python.h') + if not result: + python = 'python' + if '__pypy__' in sys.builtin_module_names: + python = 'pypy' + raise SystemError( + "Cannot compile 'Python.h'. Perhaps you need to " + "install {0}-dev|{0}-devel.".format(python)) + res = config_cmd.check_header("endian.h") + if res: + private_defines.append(('HAVE_ENDIAN_H', 1)) + public_defines.append(('NPY_HAVE_ENDIAN_H', 1)) + res = config_cmd.check_header("sys/endian.h") + if res: + private_defines.append(('HAVE_SYS_ENDIAN_H', 1)) + public_defines.append(('NPY_HAVE_SYS_ENDIAN_H', 1)) + + # Check basic types sizes + for type in ('short', 'int', 'long'): + res = config_cmd.check_decl("SIZEOF_%s" % sym2def(type), headers=["Python.h"]) + if res: + public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), "SIZEOF_%s" % sym2def(type))) + else: + res = config_cmd.check_type_size(type, expected=expected[type]) + if res >= 0: + public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" % type) + + for type in ('float', 'double', 'long double'): + already_declared = config_cmd.check_decl("SIZEOF_%s" % sym2def(type), + headers=["Python.h"]) + res = config_cmd.check_type_size(type, expected=expected[type]) + if res >= 0: + public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), '%d' % res)) + if not already_declared and not type == 'long double': + private_defines.append(('SIZEOF_%s' % sym2def(type), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" % type) + + # Compute size of corresponding complex type: used to check that our + # definition is binary compatible with C99 complex type (check done at + # build time in npy_common.h) + complex_def = "struct {%s __x; %s __y;}" % (type, type) + res = config_cmd.check_type_size(complex_def, + expected=[2 * x for x in expected[type]]) + if res >= 0: + public_defines.append(('NPY_SIZEOF_COMPLEX_%s' % sym2def(type), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" % complex_def) + + for type in ('Py_intptr_t', 'off_t'): + res = config_cmd.check_type_size(type, headers=["Python.h"], + library_dirs=[pythonlib_dir()], + expected=expected[type]) + + if res >= 0: + private_defines.append(('SIZEOF_%s' % sym2def(type), '%d' % res)) + public_defines.append(('NPY_SIZEOF_%s' % sym2def(type), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" % type) + + # We check declaration AND type because that's how distutils does it. + if config_cmd.check_decl('PY_LONG_LONG', headers=['Python.h']): + res = config_cmd.check_type_size('PY_LONG_LONG', headers=['Python.h'], + library_dirs=[pythonlib_dir()], + expected=expected['PY_LONG_LONG']) + if res >= 0: + private_defines.append(('SIZEOF_%s' % sym2def('PY_LONG_LONG'), '%d' % res)) + public_defines.append(('NPY_SIZEOF_%s' % sym2def('PY_LONG_LONG'), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" 
% 'PY_LONG_LONG') + + res = config_cmd.check_type_size('long long', + expected=expected['long long']) + if res >= 0: + #private_defines.append(('SIZEOF_%s' % sym2def('long long'), '%d' % res)) + public_defines.append(('NPY_SIZEOF_%s' % sym2def('long long'), '%d' % res)) + else: + raise SystemError("Checking sizeof (%s) failed !" % 'long long') + + if not config_cmd.check_decl('CHAR_BIT', headers=['Python.h']): + raise RuntimeError( + "Config wo CHAR_BIT is not supported" + ", please contact the maintainers") + + return private_defines, public_defines + +def check_mathlib(config_cmd): + # Testing the C math library + mathlibs = [] + mathlibs_choices = [[], ["m"], ["cpml"]] + mathlib = os.environ.get("MATHLIB") + if mathlib: + mathlibs_choices.insert(0, mathlib.split(",")) + for libs in mathlibs_choices: + if config_cmd.check_func( + "log", + libraries=libs, + call_args="0", + decl="double log(double);", + call=True + ): + mathlibs = libs + break + else: + raise RuntimeError( + "math library missing; rerun setup.py after setting the " + "MATHLIB env variable" + ) + return mathlibs + + +def visibility_define(config): + """Return the define value to use for NPY_VISIBILITY_HIDDEN (may be empty + string).""" + hide = '__attribute__((visibility("hidden")))' + if config.check_gcc_function_attribute(hide, 'hideme'): + return hide + else: + return '' + +def configuration(parent_package='',top_path=None): + from numpy.distutils.misc_util import (Configuration, dot_join, + exec_mod_from_location) + from numpy.distutils.system_info import (get_info, blas_opt_info, + lapack_opt_info) + from numpy.distutils.ccompiler_opt import NPY_CXX_FLAGS + from numpy.version import release as is_released + + config = Configuration('core', parent_package, top_path) + local_dir = config.local_path + codegen_dir = join(local_dir, 'code_generators') + + # Check whether we have a mismatch between the set C API VERSION and the + # actual C API VERSION. Will raise a MismatchCAPIError if so. 
+ check_api_version(C_API_VERSION, codegen_dir) + + generate_umath_py = join(codegen_dir, 'generate_umath.py') + n = dot_join(config.name, 'generate_umath') + generate_umath = exec_mod_from_location('_'.join(n.split('.')), + generate_umath_py) + + header_dir = 'include/numpy' # this is relative to config.path_in_package + + cocache = CallOnceOnly() + + def generate_config_h(ext, build_dir): + target = join(build_dir, header_dir, 'config.h') + d = os.path.dirname(target) + if not os.path.exists(d): + os.makedirs(d) + + if newer(__file__, target): + config_cmd = config.get_config_cmd() + log.info('Generating %s', target) + + # Check sizeof + moredefs, ignored = cocache.check_types(config_cmd, ext, build_dir) + + # Check math library and C99 math funcs availability + mathlibs = check_mathlib(config_cmd) + moredefs.append(('MATHLIB', ','.join(mathlibs))) + + check_math_capabilities(config_cmd, ext, moredefs, mathlibs) + moredefs.extend(cocache.check_ieee_macros(config_cmd)[0]) + moredefs.extend(cocache.check_complex(config_cmd, mathlibs)[0]) + + # Signal check + if is_npy_no_signal(): + moredefs.append('__NPY_PRIVATE_NO_SIGNAL') + + # Windows checks + if sys.platform == 'win32' or os.name == 'nt': + win32_checks(moredefs) + + # C99 restrict keyword + moredefs.append(('NPY_RESTRICT', config_cmd.check_restrict())) + + # Inline check + inline = config_cmd.check_inline() + + if can_link_svml(): + moredefs.append(('NPY_CAN_LINK_SVML', 1)) + + # Use bogus stride debug aid to flush out bugs where users use + # strides of dimensions with length 1 to index a full contiguous + # array. + if NPY_RELAXED_STRIDES_DEBUG: + moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 1)) + else: + moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 0)) + + # Get long double representation + rep = check_long_double_representation(config_cmd) + moredefs.append(('HAVE_LDOUBLE_%s' % rep, 1)) + + if check_for_right_shift_internal_compiler_error(config_cmd): + moredefs.append('NPY_DO_NOT_OPTIMIZE_LONG_right_shift') + moredefs.append('NPY_DO_NOT_OPTIMIZE_ULONG_right_shift') + moredefs.append('NPY_DO_NOT_OPTIMIZE_LONGLONG_right_shift') + moredefs.append('NPY_DO_NOT_OPTIMIZE_ULONGLONG_right_shift') + + # Generate the config.h file from moredefs + with open(target, 'w') as target_f: + for d in moredefs: + if isinstance(d, str): + target_f.write('#define %s\n' % (d)) + else: + target_f.write('#define %s %s\n' % (d[0], d[1])) + + # define inline to our keyword, or nothing + target_f.write('#ifndef __cplusplus\n') + if inline == 'inline': + target_f.write('/* #undef inline */\n') + else: + target_f.write('#define inline %s\n' % inline) + target_f.write('#endif\n') + + # add the guard to make sure config.h is never included directly, + # but always through npy_config.h + target_f.write(textwrap.dedent(""" + #ifndef NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_ + #error config.h should never be included directly, include npy_config.h instead + #endif + """)) + + log.info('File: %s' % target) + with open(target) as target_f: + log.info(target_f.read()) + log.info('EOF') + else: + mathlibs = [] + with open(target) as target_f: + for line in target_f: + s = '#define MATHLIB' + if line.startswith(s): + value = line[len(s):].strip() + if value: + mathlibs.extend(value.split(',')) + + # Ugly: this can be called within a library and not an extension, + # in which case there is no libraries attributes (and none is + # needed). 
+ if hasattr(ext, 'libraries'): + ext.libraries.extend(mathlibs) + + incl_dir = os.path.dirname(target) + if incl_dir not in config.numpy_include_dirs: + config.numpy_include_dirs.append(incl_dir) + + return target + + def generate_numpyconfig_h(ext, build_dir): + """Depends on config.h: generate_config_h has to be called before !""" + # put common include directory in build_dir on search path + # allows using code generation in headers + config.add_include_dirs(join(build_dir, "src", "common")) + config.add_include_dirs(join(build_dir, "src", "npymath")) + + target = join(build_dir, header_dir, '_numpyconfig.h') + d = os.path.dirname(target) + if not os.path.exists(d): + os.makedirs(d) + if newer(__file__, target): + config_cmd = config.get_config_cmd() + log.info('Generating %s', target) + + # Check sizeof + ignored, moredefs = cocache.check_types(config_cmd, ext, build_dir) + + if is_npy_no_signal(): + moredefs.append(('NPY_NO_SIGNAL', 1)) + + if is_npy_no_smp(): + moredefs.append(('NPY_NO_SMP', 1)) + else: + moredefs.append(('NPY_NO_SMP', 0)) + + mathlibs = check_mathlib(config_cmd) + moredefs.extend(cocache.check_ieee_macros(config_cmd)[1]) + moredefs.extend(cocache.check_complex(config_cmd, mathlibs)[1]) + + if NPY_RELAXED_STRIDES_DEBUG: + moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 1)) + + # Check whether we can use inttypes (C99) formats + if config_cmd.check_decl('PRIdPTR', headers=['inttypes.h']): + moredefs.append(('NPY_USE_C99_FORMATS', 1)) + + # visibility check + hidden_visibility = visibility_define(config_cmd) + moredefs.append(('NPY_VISIBILITY_HIDDEN', hidden_visibility)) + + # Add the C API/ABI versions + moredefs.append(('NPY_ABI_VERSION', '0x%.8X' % C_ABI_VERSION)) + moredefs.append(('NPY_API_VERSION', '0x%.8X' % C_API_VERSION)) + + # Add moredefs to header + with open(target, 'w') as target_f: + for d in moredefs: + if isinstance(d, str): + target_f.write('#define %s\n' % (d)) + else: + target_f.write('#define %s %s\n' % (d[0], d[1])) + + # Define __STDC_FORMAT_MACROS + target_f.write(textwrap.dedent(""" + #ifndef __STDC_FORMAT_MACROS + #define __STDC_FORMAT_MACROS 1 + #endif + """)) + + # Dump the numpyconfig.h header to stdout + log.info('File: %s' % target) + with open(target) as target_f: + log.info(target_f.read()) + log.info('EOF') + config.add_data_files((header_dir, target)) + return target + + def generate_api_func(module_name): + def generate_api(ext, build_dir): + script = join(codegen_dir, module_name + '.py') + sys.path.insert(0, codegen_dir) + try: + m = __import__(module_name) + log.info('executing %s', script) + h_file, c_file, doc_file = m.generate_api(os.path.join(build_dir, header_dir)) + finally: + del sys.path[0] + config.add_data_files((header_dir, h_file), + (header_dir, doc_file)) + return (h_file,) + return generate_api + + generate_numpy_api = generate_api_func('generate_numpy_api') + generate_ufunc_api = generate_api_func('generate_ufunc_api') + + config.add_include_dirs(join(local_dir, "src", "common")) + config.add_include_dirs(join(local_dir, "src")) + config.add_include_dirs(join(local_dir)) + + config.add_data_dir('include/numpy') + config.add_include_dirs(join('src', 'npymath')) + config.add_include_dirs(join('src', 'multiarray')) + config.add_include_dirs(join('src', 'umath')) + config.add_include_dirs(join('src', 'npysort')) + config.add_include_dirs(join('src', '_simd')) + + config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process + config.add_define_macros([("HAVE_NPY_CONFIG_H", 
"1")]) + if sys.platform[:3] == "aix": + config.add_define_macros([("_LARGE_FILES", None)]) + else: + config.add_define_macros([("_FILE_OFFSET_BITS", "64")]) + config.add_define_macros([('_LARGEFILE_SOURCE', '1')]) + config.add_define_macros([('_LARGEFILE64_SOURCE', '1')]) + + config.numpy_include_dirs.extend(config.paths('include')) + + deps = [join('src', 'npymath', '_signbit.c'), + join('include', 'numpy', '*object.h'), + join(codegen_dir, 'genapi.py'), + ] + + ####################################################################### + # npymath library # + ####################################################################### + + subst_dict = dict([("sep", os.path.sep), ("pkgname", "numpy.core")]) + + def get_mathlib_info(*args): + # Another ugly hack: the mathlib info is known once build_src is run, + # but we cannot use add_installed_pkg_config here either, so we only + # update the substitution dictionary during npymath build + config_cmd = config.get_config_cmd() + # Check that the toolchain works, to fail early if it doesn't + # (avoid late errors with MATHLIB which are confusing if the + # compiler does not work). + for lang, test_code, note in ( + ('c', 'int main(void) { return 0;}', ''), + ('c++', ( + 'int main(void)' + '{ auto x = 0.0; return static_cast<int>(x); }' + ), ( + 'note: A compiler with support for C++11 language ' + 'features is required.' + ) + ), + ): + is_cpp = lang == 'c++' + if is_cpp: + # this a workaround to get rid of invalid c++ flags + # without doing big changes to config. + # c tested first, compiler should be here + bk_c = config_cmd.compiler + config_cmd.compiler = bk_c.cxx_compiler() + + # Check that Linux compiler actually support the default flags + if hasattr(config_cmd.compiler, 'compiler'): + config_cmd.compiler.compiler.extend(NPY_CXX_FLAGS) + config_cmd.compiler.compiler_so.extend(NPY_CXX_FLAGS) + + st = config_cmd.try_link(test_code, lang=lang) + if not st: + # rerun the failing command in verbose mode + config_cmd.compiler.verbose = True + config_cmd.try_link(test_code, lang=lang) + raise RuntimeError( + f"Broken toolchain: cannot link a simple {lang.upper()} " + f"program. 
{note}" + ) + if is_cpp: + config_cmd.compiler = bk_c + mlibs = check_mathlib(config_cmd) + + posix_mlib = ' '.join(['-l%s' % l for l in mlibs]) + msvc_mlib = ' '.join(['%s.lib' % l for l in mlibs]) + subst_dict["posix_mathlib"] = posix_mlib + subst_dict["msvc_mathlib"] = msvc_mlib + + npymath_sources = [join('src', 'npymath', 'npy_math_internal.h.src'), + join('src', 'npymath', 'npy_math.c'), + # join('src', 'npymath', 'ieee754.cpp'), + join('src', 'npymath', 'ieee754.c.src'), + join('src', 'npymath', 'npy_math_complex.c.src'), + join('src', 'npymath', 'halffloat.c') + ] + + config.add_installed_library('npymath', + sources=npymath_sources + [get_mathlib_info], + install_dir='lib', + build_info={ + 'include_dirs' : [], # empty list required for creating npy_math_internal.h + 'extra_compiler_args': [lib_opts_if_msvc], + }) + config.add_npy_pkg_config("npymath.ini.in", "lib/npy-pkg-config", + subst_dict) + config.add_npy_pkg_config("mlib.ini.in", "lib/npy-pkg-config", + subst_dict) + + ####################################################################### + # multiarray_tests module # + ####################################################################### + + config.add_extension('_multiarray_tests', + sources=[join('src', 'multiarray', '_multiarray_tests.c.src'), + join('src', 'common', 'mem_overlap.c'), + join('src', 'common', 'npy_argparse.c'), + join('src', 'common', 'npy_hashtable.c')], + depends=[join('src', 'common', 'mem_overlap.h'), + join('src', 'common', 'npy_argparse.h'), + join('src', 'common', 'npy_hashtable.h'), + join('src', 'common', 'npy_extint128.h')], + libraries=['npymath']) + + ####################################################################### + # _multiarray_umath module - common part # + ####################################################################### + + common_deps = [ + join('src', 'common', 'dlpack', 'dlpack.h'), + join('src', 'common', 'array_assign.h'), + join('src', 'common', 'binop_override.h'), + join('src', 'common', 'cblasfuncs.h'), + join('src', 'common', 'lowlevel_strided_loops.h'), + join('src', 'common', 'mem_overlap.h'), + join('src', 'common', 'npy_argparse.h'), + join('src', 'common', 'npy_cblas.h'), + join('src', 'common', 'npy_config.h'), + join('src', 'common', 'npy_ctypes.h'), + join('src', 'common', 'npy_dlpack.h'), + join('src', 'common', 'npy_extint128.h'), + join('src', 'common', 'npy_import.h'), + join('src', 'common', 'npy_hashtable.h'), + join('src', 'common', 'npy_longdouble.h'), + join('src', 'common', 'npy_svml.h'), + join('src', 'common', 'templ_common.h.src'), + join('src', 'common', 'ucsnarrow.h'), + join('src', 'common', 'ufunc_override.h'), + join('src', 'common', 'umathmodule.h'), + join('src', 'common', 'numpyos.h'), + join('src', 'common', 'npy_cpu_dispatch.h'), + join('src', 'common', 'simd', 'simd.h'), + ] + + common_src = [ + join('src', 'common', 'array_assign.c'), + join('src', 'common', 'mem_overlap.c'), + join('src', 'common', 'npy_argparse.c'), + join('src', 'common', 'npy_hashtable.c'), + join('src', 'common', 'npy_longdouble.c'), + join('src', 'common', 'templ_common.h.src'), + join('src', 'common', 'ucsnarrow.c'), + join('src', 'common', 'ufunc_override.c'), + join('src', 'common', 'numpyos.c'), + join('src', 'common', 'npy_cpu_features.c'), + ] + + if os.environ.get('NPY_USE_BLAS_ILP64', "0") != "0": + blas_info = get_info('blas_ilp64_opt', 2) + else: + blas_info = get_info('blas_opt', 0) + + have_blas = blas_info and ('HAVE_CBLAS', None) in blas_info.get('define_macros', []) + + if have_blas: + 
extra_info = blas_info + # These files are also in MANIFEST.in so that they are always in + # the source distribution independently of HAVE_CBLAS. + common_src.extend([join('src', 'common', 'cblasfuncs.c'), + join('src', 'common', 'python_xerbla.c'), + ]) + else: + extra_info = {} + + ####################################################################### + # _multiarray_umath module - multiarray part # + ####################################################################### + + multiarray_deps = [ + join('src', 'multiarray', 'abstractdtypes.h'), + join('src', 'multiarray', 'arrayobject.h'), + join('src', 'multiarray', 'arraytypes.h.src'), + join('src', 'multiarray', 'arrayfunction_override.h'), + join('src', 'multiarray', 'array_coercion.h'), + join('src', 'multiarray', 'array_method.h'), + join('src', 'multiarray', 'npy_buffer.h'), + join('src', 'multiarray', 'calculation.h'), + join('src', 'multiarray', 'common.h'), + join('src', 'multiarray', 'common_dtype.h'), + join('src', 'multiarray', 'convert_datatype.h'), + join('src', 'multiarray', 'convert.h'), + join('src', 'multiarray', 'conversion_utils.h'), + join('src', 'multiarray', 'ctors.h'), + join('src', 'multiarray', 'descriptor.h'), + join('src', 'multiarray', 'dtypemeta.h'), + join('src', 'multiarray', 'dtype_transfer.h'), + join('src', 'multiarray', 'dragon4.h'), + join('src', 'multiarray', 'einsum_debug.h'), + join('src', 'multiarray', 'einsum_sumprod.h'), + join('src', 'multiarray', 'experimental_public_dtype_api.h'), + join('src', 'multiarray', 'getset.h'), + join('src', 'multiarray', 'hashdescr.h'), + join('src', 'multiarray', 'iterators.h'), + join('src', 'multiarray', 'legacy_dtype_implementation.h'), + join('src', 'multiarray', 'mapping.h'), + join('src', 'multiarray', 'methods.h'), + join('src', 'multiarray', 'multiarraymodule.h'), + join('src', 'multiarray', 'nditer_impl.h'), + join('src', 'multiarray', 'number.h'), + join('src', 'multiarray', 'refcount.h'), + join('src', 'multiarray', 'scalartypes.h'), + join('src', 'multiarray', 'sequence.h'), + join('src', 'multiarray', 'shape.h'), + join('src', 'multiarray', 'strfuncs.h'), + join('src', 'multiarray', 'typeinfo.h'), + join('src', 'multiarray', 'usertypes.h'), + join('src', 'multiarray', 'vdot.h'), + join('src', 'multiarray', 'textreading', 'readtext.h'), + join('include', 'numpy', 'arrayobject.h'), + join('include', 'numpy', '_neighborhood_iterator_imp.h'), + join('include', 'numpy', 'npy_endian.h'), + join('include', 'numpy', 'arrayscalars.h'), + join('include', 'numpy', 'noprefix.h'), + join('include', 'numpy', 'npy_interrupt.h'), + join('include', 'numpy', 'npy_3kcompat.h'), + join('include', 'numpy', 'npy_math.h'), + join('include', 'numpy', 'halffloat.h'), + join('include', 'numpy', 'npy_common.h'), + join('include', 'numpy', 'npy_os.h'), + join('include', 'numpy', 'utils.h'), + join('include', 'numpy', 'ndarrayobject.h'), + join('include', 'numpy', 'npy_cpu.h'), + join('include', 'numpy', 'numpyconfig.h'), + join('include', 'numpy', 'ndarraytypes.h'), + join('include', 'numpy', 'npy_1_7_deprecated_api.h'), + # add library sources as distuils does not consider libraries + # dependencies + ] + npymath_sources + + multiarray_src = [ + join('src', 'multiarray', 'abstractdtypes.c'), + join('src', 'multiarray', 'alloc.c'), + join('src', 'multiarray', 'arrayobject.c'), + join('src', 'multiarray', 'arraytypes.h.src'), + join('src', 'multiarray', 'arraytypes.c.src'), + join('src', 'multiarray', 'argfunc.dispatch.c.src'), + join('src', 'multiarray', 'array_coercion.c'), + 
join('src', 'multiarray', 'array_method.c'), + join('src', 'multiarray', 'array_assign_scalar.c'), + join('src', 'multiarray', 'array_assign_array.c'), + join('src', 'multiarray', 'arrayfunction_override.c'), + join('src', 'multiarray', 'buffer.c'), + join('src', 'multiarray', 'calculation.c'), + join('src', 'multiarray', 'compiled_base.c'), + join('src', 'multiarray', 'common.c'), + join('src', 'multiarray', 'common_dtype.c'), + join('src', 'multiarray', 'convert.c'), + join('src', 'multiarray', 'convert_datatype.c'), + join('src', 'multiarray', 'conversion_utils.c'), + join('src', 'multiarray', 'ctors.c'), + join('src', 'multiarray', 'datetime.c'), + join('src', 'multiarray', 'datetime_strings.c'), + join('src', 'multiarray', 'datetime_busday.c'), + join('src', 'multiarray', 'datetime_busdaycal.c'), + join('src', 'multiarray', 'descriptor.c'), + join('src', 'multiarray', 'dlpack.c'), + join('src', 'multiarray', 'dtypemeta.c'), + join('src', 'multiarray', 'dragon4.c'), + join('src', 'multiarray', 'dtype_transfer.c'), + join('src', 'multiarray', 'einsum.c.src'), + join('src', 'multiarray', 'einsum_sumprod.c.src'), + join('src', 'multiarray', 'experimental_public_dtype_api.c'), + join('src', 'multiarray', 'flagsobject.c'), + join('src', 'multiarray', 'getset.c'), + join('src', 'multiarray', 'hashdescr.c'), + join('src', 'multiarray', 'item_selection.c'), + join('src', 'multiarray', 'iterators.c'), + join('src', 'multiarray', 'legacy_dtype_implementation.c'), + join('src', 'multiarray', 'lowlevel_strided_loops.c.src'), + join('src', 'multiarray', 'mapping.c'), + join('src', 'multiarray', 'methods.c'), + join('src', 'multiarray', 'multiarraymodule.c'), + join('src', 'multiarray', 'nditer_templ.c.src'), + join('src', 'multiarray', 'nditer_api.c'), + join('src', 'multiarray', 'nditer_constr.c'), + join('src', 'multiarray', 'nditer_pywrap.c'), + join('src', 'multiarray', 'number.c'), + join('src', 'multiarray', 'refcount.c'), + join('src', 'multiarray', 'sequence.c'), + join('src', 'multiarray', 'shape.c'), + join('src', 'multiarray', 'scalarapi.c'), + join('src', 'multiarray', 'scalartypes.c.src'), + join('src', 'multiarray', 'strfuncs.c'), + join('src', 'multiarray', 'temp_elide.c'), + join('src', 'multiarray', 'typeinfo.c'), + join('src', 'multiarray', 'usertypes.c'), + join('src', 'multiarray', 'vdot.c'), + join('src', 'common', 'npy_sort.h.src'), + join('src', 'npysort', 'x86-qsort.dispatch.cpp'), + join('src', 'npysort', 'quicksort.cpp'), + join('src', 'npysort', 'mergesort.cpp'), + join('src', 'npysort', 'timsort.cpp'), + join('src', 'npysort', 'heapsort.cpp'), + join('src', 'npysort', 'radixsort.cpp'), + join('src', 'common', 'npy_partition.h'), + join('src', 'npysort', 'selection.cpp'), + join('src', 'common', 'npy_binsearch.h'), + join('src', 'npysort', 'binsearch.cpp'), + join('src', 'multiarray', 'textreading', 'conversions.c'), + join('src', 'multiarray', 'textreading', 'field_types.c'), + join('src', 'multiarray', 'textreading', 'growth.c'), + join('src', 'multiarray', 'textreading', 'readtext.c'), + join('src', 'multiarray', 'textreading', 'rows.c'), + join('src', 'multiarray', 'textreading', 'stream_pyobject.c'), + join('src', 'multiarray', 'textreading', 'str_to_int.c'), + join('src', 'multiarray', 'textreading', 'tokenize.cpp'), + ] + + ####################################################################### + # _multiarray_umath module - umath part # + ####################################################################### + + def generate_umath_c(ext, build_dir): + target = 
join(build_dir, header_dir, '__umath_generated.c') + dir = os.path.dirname(target) + if not os.path.exists(dir): + os.makedirs(dir) + script = generate_umath_py + if newer(script, target): + with open(target, 'w') as f: + f.write(generate_umath.make_code(generate_umath.defdict, + generate_umath.__file__)) + return [] + + def generate_umath_doc_header(ext, build_dir): + from numpy.distutils.misc_util import exec_mod_from_location + + target = join(build_dir, header_dir, '_umath_doc_generated.h') + dir = os.path.dirname(target) + if not os.path.exists(dir): + os.makedirs(dir) + + generate_umath_doc_py = join(codegen_dir, 'generate_umath_doc.py') + if newer(generate_umath_doc_py, target): + n = dot_join(config.name, 'generate_umath_doc') + generate_umath_doc = exec_mod_from_location( + '_'.join(n.split('.')), generate_umath_doc_py) + generate_umath_doc.write_code(target) + + umath_src = [ + join('src', 'umath', 'umathmodule.c'), + join('src', 'umath', 'reduction.c'), + join('src', 'umath', 'funcs.inc.src'), + join('src', 'umath', 'simd.inc.src'), + join('src', 'umath', 'loops.h.src'), + join('src', 'umath', 'loops_utils.h.src'), + join('src', 'umath', 'loops.c.src'), + join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), + join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), + join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), + join('src', 'umath', 'loops_minmax.dispatch.c.src'), + join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), + join('src', 'umath', 'loops_umath_fp.dispatch.c.src'), + join('src', 'umath', 'loops_exponent_log.dispatch.c.src'), + join('src', 'umath', 'loops_hyperbolic.dispatch.c.src'), + join('src', 'umath', 'loops_modulo.dispatch.c.src'), + join('src', 'umath', 'loops_comparison.dispatch.c.src'), + join('src', 'umath', 'matmul.h.src'), + join('src', 'umath', 'matmul.c.src'), + join('src', 'umath', 'clip.h'), + join('src', 'umath', 'clip.cpp'), + join('src', 'umath', 'dispatching.c'), + join('src', 'umath', 'legacy_array_method.c'), + join('src', 'umath', 'wrapping_array_method.c'), + join('src', 'umath', 'ufunc_object.c'), + join('src', 'umath', 'extobj.c'), + join('src', 'umath', 'scalarmath.c.src'), + join('src', 'umath', 'ufunc_type_resolution.c'), + join('src', 'umath', 'override.c'), + join('src', 'umath', 'string_ufuncs.cpp'), + # For testing. Eventually, should use public API and be separate: + join('src', 'umath', '_scaled_float_dtype.c'), + ] + + umath_deps = [ + generate_umath_py, + join('include', 'numpy', 'npy_math.h'), + join('include', 'numpy', 'halffloat.h'), + join('src', 'multiarray', 'common.h'), + join('src', 'multiarray', 'number.h'), + join('src', 'common', 'templ_common.h.src'), + join('src', 'umath', 'simd.inc.src'), + join('src', 'umath', 'override.h'), + join(codegen_dir, 'generate_ufunc_api.py'), + join(codegen_dir, 'ufunc_docstrings.py'), + ] + + svml_path = join('numpy', 'core', 'src', 'umath', 'svml') + svml_objs = [] + # we have converted the following into universal intrinsics + # so we can bring the benefits of performance for all platforms + # not just for avx512 on linux without performance/accuracy regression, + # actually the other way around, better performance and + # after all maintainable code. 
+ svml_filter = ( + 'svml_z0_tanh_d_la.s', 'svml_z0_tanh_s_la.s' + ) + if can_link_svml() and check_svml_submodule(svml_path): + svml_objs = glob.glob(svml_path + '/**/*.s', recursive=True) + svml_objs = [o for o in svml_objs if not o.endswith(svml_filter)] + + # The ordering of names returned by glob is undefined, so we sort + # to make builds reproducible. + svml_objs.sort() + + config.add_extension('_multiarray_umath', + # Forcing C language even though we have C++ sources. + # It forces the C linker and don't link C++ runtime. + language = 'c', + sources=multiarray_src + umath_src + + common_src + + [generate_config_h, + generate_numpyconfig_h, + generate_numpy_api, + join(codegen_dir, 'generate_numpy_api.py'), + join('*.py'), + generate_umath_c, + generate_umath_doc_header, + generate_ufunc_api, + ], + depends=deps + multiarray_deps + umath_deps + + common_deps, + libraries=['npymath'], + extra_objects=svml_objs, + extra_info=extra_info, + extra_cxx_compile_args=NPY_CXX_FLAGS) + + ####################################################################### + # umath_tests module # + ####################################################################### + + config.add_extension('_umath_tests', sources=[ + join('src', 'umath', '_umath_tests.c.src'), + join('src', 'umath', '_umath_tests.dispatch.c'), + join('src', 'common', 'npy_cpu_features.c'), + ]) + + ####################################################################### + # custom rational dtype module # + ####################################################################### + + config.add_extension('_rational_tests', + sources=[join('src', 'umath', '_rational_tests.c')]) + + ####################################################################### + # struct_ufunc_test module # + ####################################################################### + + config.add_extension('_struct_ufunc_tests', + sources=[join('src', 'umath', '_struct_ufunc_tests.c')]) + + + ####################################################################### + # operand_flag_tests module # + ####################################################################### + + config.add_extension('_operand_flag_tests', + sources=[join('src', 'umath', '_operand_flag_tests.c')]) + + ####################################################################### + # SIMD module # + ####################################################################### + + config.add_extension('_simd', sources=[ + join('src', 'common', 'npy_cpu_features.c'), + join('src', '_simd', '_simd.c'), + join('src', '_simd', '_simd_inc.h.src'), + join('src', '_simd', '_simd_data.inc.src'), + join('src', '_simd', '_simd.dispatch.c.src'), + ], depends=[ + join('src', 'common', 'npy_cpu_dispatch.h'), + join('src', 'common', 'simd', 'simd.h'), + join('src', '_simd', '_simd.h'), + join('src', '_simd', '_simd_inc.h.src'), + join('src', '_simd', '_simd_data.inc.src'), + join('src', '_simd', '_simd_arg.inc'), + join('src', '_simd', '_simd_convert.inc'), + join('src', '_simd', '_simd_easyintrin.inc'), + join('src', '_simd', '_simd_vector.inc'), + ]) + + config.add_subpackage('tests') + config.add_data_dir('tests/data') + config.add_data_dir('tests/examples') + config.add_data_files('*.pyi') + + config.make_svn_version_py() + + return config + +if __name__ == '__main__': + from numpy.distutils.core import setup + setup(configuration=configuration) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 0b4856847..7b070a084 100644 --- a/numpy/core/src/umath/loops.c.src +++ 
b/numpy/core/src/umath/loops.c.src @@ -416,98 +416,6 @@ PyUFunc_On_Om(char **args, npy_intp const *dimensions, npy_intp const *steps, vo ***************************************************************************** */ -/**begin repeat - * #kind = logical_and, logical_or# - * #OP = &&, ||# - * #SC = ==, !=# - * #and = 1, 0# - **/ - -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if(IS_BINARY_REDUCE) { -#ifdef NPY_HAVE_SSE2_INTRINSICS - /* - * stick with our variant for more reliable performance, only known - * platform which outperforms it by ~20% is an i7 with glibc 2.17 - */ - if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) { - return; - } -#else - /* for now only use libc on 32-bit/non-x86 */ - if (steps[1] == 1) { - npy_bool * op = (npy_bool *)args[0]; -#if @and@ - /* np.all(), search for a zero (false) */ - if (*op) { - *op = memchr(args[1], 0, dimensions[0]) == NULL; - } -#else - /* - * np.any(), search for a non-zero (true) via comparing against - * zero blocks, memcmp is faster than memchr on SSE4 machines - * with glibc >= 2.12 and memchr can only check for equal 1 - */ - static const npy_bool zero[4096]; /* zero by C standard */ - npy_uintp i, n = dimensions[0]; - - for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) { - *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0; - } - if (!*op && n - i > 0) { - *op = memcmp(&args[1][i], zero, n - i) != 0; - } -#endif - return; - } -#endif - else { - BINARY_REDUCE_LOOP(npy_bool) { - const npy_bool in2 = *(npy_bool *)ip2; - io1 = io1 @OP@ in2; - if (io1 @SC@ 0) { - break; - } - } - *((npy_bool *)iop1) = io1; - } - } - else { - if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) { - return; - } - else { - BINARY_LOOP { - const npy_bool in1 = *(npy_bool *)ip1; - const npy_bool in2 = *(npy_bool *)ip2; - *((npy_bool *)op1) = in1 @OP@ in2; - } - } - } -} -/**end repeat**/ - -/**begin repeat - * #kind = absolute, logical_not# - * #OP = !=, ==# - **/ -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) { - return; - } - else { - UNARY_LOOP { - npy_bool in1 = *(npy_bool *)ip1; - *((npy_bool *)op1) = in1 @OP@ 0; - } - } -} -/**end repeat**/ - NPY_NO_EXPORT void BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index e3a410968..2e0ba9c40 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -39,11 +39,15 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_logical.dispatch.h" +#endif + /**begin repeat - * #kind = logical_and, logical_or, absolute, logical_not# - **/ -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + * #kind = logical_and, logical_or, logical_not, absolute# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) /**end repeat**/ NPY_NO_EXPORT void diff --git a/numpy/core/src/umath/loops_logical.dispatch.c.src b/numpy/core/src/umath/loops_logical.dispatch.c.src new file mode 100644 index 
000000000..8c863edd9 --- /dev/null +++ b/numpy/core/src/umath/loops_logical.dispatch.c.src @@ -0,0 +1,427 @@ +/*@targets + ** $maxopt baseline + ** neon asimd + ** sse2 + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/******************************************************************************* + ** Extra SIMD intrinsics + ******************************************************************************/ + +#if NPY_SIMD +#if !defined(NPY_HAVE_SSE2) +#define USE_NPYV_REDUCE_MINMAX +#endif + +#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__) + #define npyv_reduce_min_u8 vminvq_u8 + #define npyv_reduce_max_u8 vmaxvq_u8 +#elif defined(USE_NPYV_REDUCE_MINMAX) + // Scalar intrinsics + #define scalar_max_i(A, B) ((A > B) ? A : B) + #define scalar_min_i(A, B) ((A < B) ? A : B) + + /**begin repeat + * #intrin = min, max# + */ + NPY_FINLINE npyv_lanetype_u8 npyv_reduce_@intrin@_u8(npyv_u8 v) + { + npyv_lanetype_u8 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_u8]; + npyv_storea_u8(s, v); + npyv_lanetype_u8 result = s[0]; + for(int i=1; i<npyv_nlanes_u8; ++i){ + result = scalar_@intrin@_i(result, s[i]); + } + return result; + } + /**end repeat**/ + #undef scalar_max_i + #undef scalar_min_i +#endif +#endif // NPY_SIMD + +/******************************************************************************* + ** Defining the SIMD kernels + ******************************************************************************/ + +#if NPY_SIMD +/* + * convert any bit set to boolean true so vectorized and normal operations are + * consistent, should not be required if bool is used correctly everywhere but + * you never know + */ +NPY_FINLINE npyv_u8 byte_to_true(npyv_u8 v) +{ + const npyv_u8 zero = npyv_zero_u8(); + const npyv_u8 truemask = npyv_setall_u8(1 == 1); + // cmpeq(v, 0) turns 0x00 -> 0xff and non-zero -> 0x00 + npyv_u8 tmp = npyv_cvt_u8_b8(npyv_cmpeq_u8(v, zero)); + // tmp is filled with 0xff/0x00, negate and mask to boolean true + return npyv_andc_u8(truemask, tmp); +} + +/**begin repeat + * #kind = logical_and, logical_or# + * #and = 1, 0# + * #scalar_op = &&, ||# + * #intrin = and, or# + * #reduce = min, max# + * #scalar_cmp = ==, !=# + */ +static void +simd_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp len) +{ + #define UNROLL 16 + + const int vstep = npyv_nlanes_u8; + const int wstep = vstep * UNROLL; + + // Unrolled vectors loop + for (; len >= wstep; len -= wstep, ip1 += wstep, ip2 += wstep, op += wstep) { + /**begin repeat1 + * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @unroll@ + npyv_u8 a@unroll@ = npyv_load_u8(ip1 + vstep * @unroll@); + npyv_u8 b@unroll@ = npyv_load_u8(ip2 + vstep * @unroll@); + npyv_u8 r@unroll@ = npyv_@intrin@_u8(a@unroll@, b@unroll@); + npyv_store_u8(op + vstep * @unroll@, byte_to_true(r@unroll@)); + #endif + /**end repeat1**/ + } + #undef UNROLL + + // Single vectors loop + for (; len >= vstep; len -= vstep, ip1 += vstep, ip2 += vstep, op += vstep) { + npyv_u8 a = npyv_load_u8(ip1); + npyv_u8 b = npyv_load_u8(ip2); + npyv_u8 r = npyv_@intrin@_u8(a, b); + npyv_store_u8(op, byte_to_true(r)); + } + + // Scalar loop to finish off + for (; len > 0; len--, ip1++, ip2++, op++) { + *op = *ip1 @scalar_op@ *ip2; + } +} + +static void +simd_reduce_@kind@_BOOL(npy_bool * op, npy_bool * 
ip, npy_intp len)
+{
+    /* There are two separate implementations here to accommodate better
+     * performance on Intel SSE2 and Arm NEON.  SSE2 does not have
+     * a horizontal min/max and the scalar version is slower than
+     * what was previously in simd.inc.src.
+     * 1. Both use min/max vertical reduction
+     * 2. For horizontal reduction:
+     *    a. NEON / ASIMD uses min/max with an early exit clause.
+     *    b. SSE2 uses previous implementation (cmpeq/tobits) tweaked a bit.
+     */
+
+    #define UNROLL 8
+
+    const int vstep = npyv_nlanes_u8;
+    const int wstep = vstep * UNROLL;
+
+    #if !defined(USE_NPYV_REDUCE_MINMAX)
+    const npyv_u8 zero = npyv_zero_u8();
+    #endif
+
+    // Unrolled vectors loop
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+        #if defined(NPY_HAVE_SSE2)
+        NPY_PREFETCH(ip + wstep, 0, 3);
+        #endif
+        npyv_u8 v0 = npyv_load_u8(ip + vstep * 0);
+        npyv_u8 v1 = npyv_load_u8(ip + vstep * 1);
+        npyv_u8 v2 = npyv_load_u8(ip + vstep * 2);
+        npyv_u8 v3 = npyv_load_u8(ip + vstep * 3);
+        npyv_u8 v4 = npyv_load_u8(ip + vstep * 4);
+        npyv_u8 v5 = npyv_load_u8(ip + vstep * 5);
+        npyv_u8 v6 = npyv_load_u8(ip + vstep * 6);
+        npyv_u8 v7 = npyv_load_u8(ip + vstep * 7);
+
+        npyv_u8 m01 = npyv_@reduce@_u8(v0, v1);
+        npyv_u8 m23 = npyv_@reduce@_u8(v2, v3);
+        npyv_u8 m45 = npyv_@reduce@_u8(v4, v5);
+        npyv_u8 m67 = npyv_@reduce@_u8(v6, v7);
+
+        npyv_u8 m0123 = npyv_@reduce@_u8(m01, m23);
+        npyv_u8 m4567 = npyv_@reduce@_u8(m45, m67);
+
+        npyv_u8 mv = npyv_@reduce@_u8(m0123, m4567);
+
+#if defined(USE_NPYV_REDUCE_MINMAX)
+        npy_uint8 r = npyv_reduce_@reduce@_u8(mv);
+        if (r @scalar_cmp@ 0) {
+            *op = !@and@;
+            return;
+        }
+#else
+        mv = npyv_cvt_u8_b8(npyv_cmpeq_u8(mv, zero));
+        npy_uint64 zmask = npyv_tobits_b8(npyv_cvt_b8_u8(mv));
+        #if @and@
+        if (zmask != 0) {
+            *op = 0;
+            return;
+        }
+        #else
+        if (zmask != 0xffff) {
+            *op = 1;
+            return;
+        }
+        #endif
+#endif
+    }
+
+    // Single vectors loop
+    for (; len >= vstep; len -= vstep, ip += vstep) {
+        npyv_u8 v0 = npyv_load_u8(ip);
+#if defined(USE_NPYV_REDUCE_MINMAX)
+        npy_uint8 r = npyv_reduce_@reduce@_u8(v0);
+        if (r @scalar_cmp@ 0) {
+            *op = !@and@;
+            return;
+        }
+#else
+        // cmpeq(v, zero): 0x00 --> 0xff, non-zero --> 0x00
+        v0 = npyv_cvt_u8_b8(npyv_cmpeq_u8(v0, zero));
+        npy_uint64 zmask = npyv_tobits_b8(npyv_cvt_b8_u8(v0));
+        #if @and@
+        if (zmask != 0) {
+            *op = 0;
+            return;
+        }
+        #else
+        if (zmask != 0xffff) {
+            *op = 1;
+            return;
+        }
+        #endif
+#endif
+    }
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ++ip) {
+        *op = *op @scalar_op@ *ip;
+        if (*op @scalar_cmp@ 0) {
+            return;
+        }
+    }
+#undef UNROLL
+}
+/**end repeat**/
+
+/**begin repeat
+ * #kind = logical_not, absolute#
+ * #op = ==, !=#
+ * #not = 1, 0#
+ */
+static void
+simd_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
+{
+    #define UNROLL 16
+
+    const int vstep = npyv_nlanes_u8;
+    const int wstep = vstep * UNROLL;
+
+    #if @not@
+    const npyv_u8 zero = npyv_zero_u8();
+    #endif
+
+    // Unrolled vectors loop
+    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+        /**begin repeat1
+         * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+         */
+        #if UNROLL > @unroll@
+        npyv_u8 v@unroll@ = npyv_load_u8(ip + vstep * @unroll@);
+#if @not@
+        npyv_u8 r@unroll@ = npyv_cvt_u8_b8(npyv_cmpeq_u8(v@unroll@, zero));
+#else
+        // abs is a no-op on bools; the value is kept so byte_to_true below
+        // can normalize any non-zero byte to boolean true
+        npyv_u8 r@unroll@ = v@unroll@;
+#endif
+        npyv_store_u8(op + vstep * @unroll@, byte_to_true(r@unroll@));
+        #endif
+        /**end repeat1**/
+    }
+    #undef UNROLL
+
+    // Single vectors loop
+    for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) {
+        npyv_u8 v = npyv_load_u8(ip);
+#if @not@
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_cmpeq_u8(v, zero));
+#else
+        // abs is a no-op on bools; the value is kept so byte_to_true below
+        // can normalize any non-zero byte to boolean true
+        npyv_u8 r = v;
+#endif
+        npyv_store_u8(op, byte_to_true(r));
+    }
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ++ip, ++op) {
+        *op = (*ip @op@ 0);
+    }
+}
+
+static NPY_INLINE int
+run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    if (sizeof(npy_bool) == 1 &&
+            IS_BLOCKABLE_UNARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
+        simd_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+/**end repeat**/
+
+#undef npyv_reduce_min_u8
+#undef npyv_reduce_max_u8
+#undef USE_NPYV_REDUCE_MINMAX
+
+#endif // NPY_SIMD
+
+/*******************************************************************************
+ ** Defining ufunc inner functions
+ ******************************************************************************/
+
+/**begin repeat
+ * # kind = logical_or, logical_and#
+ */
+static NPY_INLINE int
+run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    if (sizeof(npy_bool) == 1 &&
+            IS_BLOCKABLE_BINARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
+        simd_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
+                                (npy_bool*)args[1], dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+
+
+static NPY_INLINE int
+run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    if (sizeof(npy_bool) == 1 &&
+            IS_BLOCKABLE_REDUCE(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
+        simd_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
+                                dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+/**end repeat**/
+
+/**begin repeat
+ * #kind = logical_and, logical_or#
+ * #OP = &&, ||#
+ * #SC = ==, !=#
+ * #and = 1, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+#ifdef NPY_SIMD
+        /*
+         * stick with our variant for more reliable performance; the only
+         * known platform that outperforms it (by ~20%) is an i7 with
+         * glibc 2.17
+         */
+        if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
+            return;
+        }
+#else
+        /* for now only use libc on 32-bit/non-x86 */
+        if (steps[1] == 1) {
+            npy_bool * op = (npy_bool *)args[0];
+#if @and@
+            /* np.all(), search for a zero (false) */
+            if (*op) {
+                *op = memchr(args[1], 0, dimensions[0]) == NULL;
+            }
+#else
+            /*
+             * np.any(), search for a non-zero (true) by comparing against
+             * blocks of zeros; memcmp is faster than memchr on SSE4 machines
+             * with glibc >= 2.12, and memchr can only search for a single
+             * byte value
+             */
+            static const npy_bool zero[4096]; /* zero-initialized by the C standard */
+            npy_uintp i, n = dimensions[0];
+
+            for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
+                *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
+            }
+            if (!*op && n - i > 0) {
+                *op = memcmp(&args[1][i], zero, n - i) != 0;
+            }
+#endif
+            return;
+        }
+#endif
+        else {
+            BINARY_REDUCE_LOOP(npy_bool) {
+                const npy_bool in2 = *(npy_bool *)ip2;
+                io1 = io1 @OP@ in2;
+                if (io1 @SC@ 0) {
+                    break;
+                }
+            }
+            *((npy_bool *)iop1) = io1;
+        }
+    }
+    else {
+        if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
+            return;
+        }
+        else {
+            BINARY_LOOP {
+                const npy_bool in1 = *(npy_bool *)ip1;
+                const npy_bool in2 = *(npy_bool *)ip2;
+                *((npy_bool *)op1) = in1 @OP@
in2; + } + } + } +} +/**end repeat**/ + +/**begin repeat + * #kind = logical_not, absolute# + * #OP = ==, !=# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) { + return; + } + else { + UNARY_LOOP { + npy_bool in1 = *(npy_bool *)ip1; + *((npy_bool *)op1) = in1 @OP@ 0; + } + } +} +/**end repeat**/ + diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 6fc1501c9..10c44ce30 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -154,80 +154,6 @@ run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const * /**end repeat1**/ /**end repeat**/ -/* - ***************************************************************************** - ** BOOL DISPATCHERS - ***************************************************************************** - */ - -/**begin repeat - * # kind = logical_or, logical_and# - */ - -#if defined NPY_HAVE_SSE2_INTRINSICS -static void -sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, - npy_intp n); - -static void -sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n); -#endif - -static inline int -run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined NPY_HAVE_SSE2_INTRINSICS - if (sizeof(npy_bool) == 1 && - IS_BLOCKABLE_BINARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) { - sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0], - (npy_bool*)args[1], dimensions[0]); - return 1; - } -#endif - return 0; -} - - -static inline int -run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined NPY_HAVE_SSE2_INTRINSICS - if (sizeof(npy_bool) == 1 && - IS_BLOCKABLE_REDUCE(sizeof(npy_bool), VECTOR_SIZE_BYTES)) { - sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1], - dimensions[0]); - return 1; - } -#endif - return 0; -} - -/**end repeat**/ - -/**begin repeat - * # kind = absolute, logical_not# - */ - -#if defined NPY_HAVE_SSE2_INTRINSICS -static void -sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n); -#endif - -static inline int -run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined NPY_HAVE_SSE2_INTRINSICS - if (sizeof(npy_bool) == 1 && - IS_BLOCKABLE_UNARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) { - sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]); - return 1; - } -#endif - return 0; -} - -/**end repeat**/ - #ifdef NPY_HAVE_SSE2_INTRINSICS /* @@ -1005,143 +931,6 @@ AVX512F_absolute_@TYPE@(@type@ * op, #endif /**end repeat**/ -/* - ***************************************************************************** - ** BOOL LOOPS - ***************************************************************************** - */ - -/**begin repeat - * # kind = logical_or, logical_and# - * # and = 0, 1# - * # op = ||, &&# - * # sc = !=, ==# - * # vpre = _mm*2# - * # vsuf = si128*2# - * # vtype = __m128i*2# - * # type = npy_bool*2# - * # vload = _mm_load_si128*2# - * # vloadu = _mm_loadu_si128*2# - * # vstore = _mm_store_si128*2# - */ - -/* - * convert any bit set to boolean true so vectorized and normal operations are - * consistent, should not be required if bool is used correctly everywhere but - * you never know - */ -#if !@and@ -NPY_FINLINE @vtype@ byte_to_true(@vtype@ v) -{ - const @vtype@ zero = @vpre@_setzero_@vsuf@(); - const 
@vtype@ truemask = @vpre@_set1_epi8(1 == 1); - /* get 0xFF for zeros */ - @vtype@ tmp = @vpre@_cmpeq_epi8(v, zero); - /* filled with 0xFF/0x00, negate and mask to boolean true */ - return @vpre@_andnot_@vsuf@(tmp, truemask); -} -#endif - -static void -sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) - op[i] = ip1[i] @op@ ip2[i]; - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vloadu@((@vtype@*)&ip1[i]); - @vtype@ b = @vloadu@((@vtype@*)&ip2[i]); -#if @and@ - const @vtype@ zero = @vpre@_setzero_@vsuf@(); - /* get 0xFF for non zeros*/ - @vtype@ tmp = @vpre@_cmpeq_epi8(a, zero); - /* andnot -> 0x00 for zeros xFF for non zeros, & with ip2 */ - tmp = @vpre@_andnot_@vsuf@(tmp, b); -#else - @vtype@ tmp = @vpre@_or_@vsuf@(a, b); -#endif - - @vstore@((@vtype@*)&op[i], byte_to_true(tmp)); - } - LOOP_BLOCKED_END { - op[i] = (ip1[i] @op@ ip2[i]); - } -} - - -static void -sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n) -{ - const @vtype@ zero = @vpre@_setzero_@vsuf@(); - LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, VECTOR_SIZE_BYTES) { - *op = *op @op@ ip[i]; - if (*op @sc@ 0) { - return; - } - } - /* unrolled once to replace a slow movmsk with a fast pmaxb */ - LOOP_BLOCKED(npy_bool, 2 * VECTOR_SIZE_BYTES) { - @vtype@ v = @vload@((@vtype@*)&ip[i]); - @vtype@ v2 = @vload@((@vtype@*)&ip[i + VECTOR_SIZE_BYTES]); - v = @vpre@_cmpeq_epi8(v, zero); - v2 = @vpre@_cmpeq_epi8(v2, zero); -#if @and@ - if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) { - *op = 0; -#else - if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) { - *op = 1; -#endif - return; - } - } - LOOP_BLOCKED_END { - *op = *op @op@ ip[i]; - if (*op @sc@ 0) { - return; - } - } -} - -/**end repeat**/ - -/**begin repeat - * # kind = absolute, logical_not# - * # op = !=, ==# - * # not = 0, 1# - * # vpre = _mm*2# - * # vsuf = si128*2# - * # vtype = __m128i*2# - * # type = npy_bool*2# - * # vloadu = _mm_loadu_si128*2# - * # vstore = _mm_store_si128*2# - */ - -static void -sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) - op[i] = (ip[i] @op@ 0); - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vloadu@((@vtype@*)&ip[i]); -#if @not@ - const @vtype@ zero = @vpre@_setzero_@vsuf@(); - const @vtype@ truemask = @vpre@_set1_epi8(1 == 1); - /* equivalent to byte_to_true but can skip the negation */ - a = @vpre@_cmpeq_epi8(a, zero); - a = @vpre@_and_@vsuf@(a, truemask); -#else - /* abs is kind of pointless but maybe its used for byte_to_true */ - a = byte_to_true(a); -#endif - @vstore@((@vtype@*)&op[i], a); - } - LOOP_BLOCKED_END { - op[i] = (ip[i] @op@ 0); - } -} - -/**end repeat**/ - #undef VECTOR_SIZE_BYTES #endif /* NPY_HAVE_SSE2_INTRINSICS */ #endif |
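
Note on the reduction kernels above: they compute exactly what a short-circuiting scalar loop would, which is also why BOOL logical_and / logical_or matter for np.all() / np.any(). The sketch below is not part of the patch; it is a minimal scalar model under hypothetical names (reduce_logical_and_scalar, reduce_logical_or_scalar) of what simd_reduce_@kind@_BOOL returns.

    #include <stddef.h>

    /* Scalar model of the AND-reduction (np.all): stop at the first false.
     * Mirrors the template's early exit `*op = !@and@; return;`. */
    static unsigned char
    reduce_logical_and_scalar(const unsigned char *ip, size_t len)
    {
        for (size_t i = 0; i < len; i++) {
            if (ip[i] == 0) {
                return 0;
            }
        }
        return 1;
    }

    /* Scalar model of the OR-reduction (np.any): stop at the first true. */
    static unsigned char
    reduce_logical_or_scalar(const unsigned char *ip, size_t len)
    {
        for (size_t i = 0; i < len; i++) {
            if (ip[i] != 0) {
                return 1;
            }
        }
        return 0;
    }

The vector code does the same work a block at a time: it min-reduces (AND) or max-reduces (OR) UNROLL vectors vertically, then tests one horizontal result per block, so the early-exit branch is paid once per block instead of once per byte.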
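The byte_to_true() idiom that both the new universal-intrinsics loops and the removed SSE2 code rely on is worth restating: bools are stored as bytes, and any non-zero bit pattern must normalize to exactly 1 so vectorized and scalar paths agree. A one-byte scalar sketch of the cmpeq-then-andnot trick follows; byte_to_true_scalar is an illustrative name, not code from the patch.

    /* 0x00 -> 0x00, any non-zero byte -> 0x01.
     * is_zero plays the role of cmpeq_epi8(v, zero): all-ones for a zero
     * byte, all-zeros otherwise.  The final ~is_zero & 0x01 matches
     * andnot(is_zero, truemask) with truemask = 0x01 per lane. */
    static unsigned char
    byte_to_true_scalar(unsigned char v)
    {
        unsigned char is_zero = (v == 0) ? 0xFF : 0x00;
        return (unsigned char)(~is_zero & 0x01);
    }

This is also why the absolute loop, even though abs is an identity on bools, still routes its input through byte_to_true: it canonicalizes any "dirty" boolean buffer to clean 0/1 values.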