author     Charles Harris <charlesr.harris@gmail.com>    2013-06-23 06:14:04 -0700
committer  Charles Harris <charlesr.harris@gmail.com>    2013-06-23 06:14:04 -0700
commit     f361c6b073316f3ee59ac23413155145b27aed90
tree       6422a8aa8c6a12e59248a437eea1215196006e87
parent     6dd20ff5fa5b937b86fa4a1404ceeabafceaa589
parent     75df68b5ca238eabc2de144dce08d481c59ffcaf
Merge pull request #3429 from juliantaylor/copy-unroll
ENH: tell gcc to unroll strided copy loops
 numpy/core/include/numpy/npy_common.h                  | 14
 numpy/core/setup.py                                    |  7
 numpy/core/setup_common.py                             |  7
 numpy/core/src/multiarray/lowlevel_strided_loops.c.src | 14
 4 files changed, 40 insertions(+), 2 deletions(-)
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 30829f929..2dccc575e 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -3,6 +3,20 @@
 
 /* numpconfig.h is auto-generated */
 #include "numpyconfig.h"
+#ifdef HAVE_NPY_CONFIG_H
+#include <npy_config.h>
+#endif
+
+/*
+ * gcc does not unroll even with -O3
+ * use with care, unrolling on modern cpus rarely speeds things up
+ */
+#ifdef HAVE_ATTRIBUTE_OPTIMIZE_UNROLL_LOOPS
+#define NPY_GCC_UNROLL_LOOPS \
+        __attribute__((optimize("unroll-loops")))
+#else
+#define NPY_GCC_UNROLL_LOOPS
+#endif
 
 #if defined(_MSC_VER)
         #define NPY_INLINE __inline
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 926142b55..b48414c2d 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -161,7 +161,6 @@ def check_math_capabilities(config, moredefs, mathlibs):
     check_funcs(OPTIONAL_STDFUNCS)
 
-
     for h in OPTIONAL_HEADERS:
         if config.check_func("", decl=False, call=False, headers=[h]):
             moredefs.append((fname2def(h).replace(".", "_"), 1))
 
@@ -170,6 +169,12 @@ def check_math_capabilities(config, moredefs, mathlibs):
         if config.check_func(f, decl=False, call=True, call_args=args):
             moredefs.append((fname2def(f), 1))
 
+    for dec, fn in OPTIONAL_GCC_ATTRIBUTES:
+        if config.check_funcs_once([fn],
+                                   decl=dict((('%s %s' % (dec, fn), True),)),
+                                   call=False):
+            moredefs.append((fname2def(fn), 1))
+
     # C99 functions: float and long double versions
     check_funcs(C99_FUNCS_SINGLE)
     check_funcs(C99_FUNCS_EXTENDED)
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 284acfe21..cb30c83c9 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -113,6 +113,13 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
                        ("__builtin_bswap64", '5u'),
                        ]
 
+# gcc function attributes
+# (attribute as understood by gcc, function name),
+# function name will be converted to HAVE_<upper-case-name> preprocessor macro
+OPTIONAL_GCC_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
+                            'attribute_optimize_unroll_loops'),
+                           ]
+
 # Subset of OPTIONAL_STDFUNCS which may alreay have HAVE_* defined by Python.h
 OPTIONAL_STDFUNCS_MAYBE = ["expm1", "log1p", "acosh", "atanh", "asinh", "hypot",
                            "copysign"]
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index 0adb24c8d..875656394 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -104,7 +104,15 @@
 
 #if @is_swap@ || @src_contig@ == 0 || @dst_contig@ == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
 static void
+#if @is_aligned@ && @is_swap@ == 0 && @elsize@ <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
 @prefix@_@oper@_size@elsize@(char *dst, npy_intp dst_stride,
                         char *src, npy_intp src_stride,
                         npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
@@ -161,7 +169,11 @@ static void
 
 #endif
 
-/* specialized copy and swap for source stride 0 */
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly unrolling here is like above is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ */
 #if (@src_contig@ == 0) && @is_aligned@
 static void
 @prefix@_@oper@_size@elsize@_srcstride0(char *dst,
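For context, the pattern this patch introduces can be shown standalone. The sketch below is illustrative rather than NumPy code: the macro name UNROLL_LOOPS, the function strided_copy8, and the plain __GNUC__ guard are all stand-ins, since the real build defines HAVE_ATTRIBUTE_OPTIMIZE_UNROLL_LOOPS only after the configure-time compile check added in setup.py succeeds.

#include <stddef.h>
#include <string.h>

/*
 * Stand-in for the patch's NPY_GCC_UNROLL_LOOPS: the real macro is
 * gated on HAVE_ATTRIBUTE_OPTIMIZE_UNROLL_LOOPS, which the build
 * defines via a compile test; a __GNUC__ check approximates that
 * here (clang only ignores the optimize attribute with a warning).
 */
#if defined(__GNUC__) && !defined(__clang__)
#define UNROLL_LOOPS __attribute__((optimize("unroll-loops")))
#else
#define UNROLL_LOOPS
#endif

/*
 * Strided copy of N 8-byte elements. With the attribute, gcc unrolls
 * this loop even though the file is not built with -funroll-loops.
 */
static void UNROLL_LOOPS
strided_copy8(char *dst, ptrdiff_t dst_stride,
              const char *src, ptrdiff_t src_stride, size_t N)
{
    while (N-- > 0) {
        memcpy(dst, src, 8);   /* collapses to one 8-byte mov */
        dst += dst_stride;
        src += src_stride;
    }
}

int main(void)
{
    double a[16] = {0}, b[8];
    /* gather every other double from a into contiguous b */
    strided_copy8((char *)b, sizeof(double),
                  (const char *)a, 2 * sizeof(double), 8);
    return (int)b[0];
}

Note the guard in the lowlevel_strided_loops.c.src hunk above: the attribute is applied only when the element is aligned, needs no byte swap, and fits in a register (@elsize@ <= NPY_SIZEOF_INTP). Outside that case the copy no longer reduces to a single mov, and the commit's own comments report that unrolling is then only marginally profitable or outright detrimental.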