author     Charles Harris <charlesr.harris@gmail.com>  2013-06-23 06:14:04 -0700
committer  Charles Harris <charlesr.harris@gmail.com>  2013-06-23 06:14:04 -0700
commit     f361c6b073316f3ee59ac23413155145b27aed90 (patch)
tree       6422a8aa8c6a12e59248a437eea1215196006e87
parent     6dd20ff5fa5b937b86fa4a1404ceeabafceaa589 (diff)
parent     75df68b5ca238eabc2de144dce08d481c59ffcaf (diff)
Merge pull request #3429 from juliantaylor/copy-unroll
ENH: tell gcc to unroll strided copy loops
 numpy/core/include/numpy/npy_common.h                  | 14
 numpy/core/setup.py                                    |  7
 numpy/core/setup_common.py                             |  7
 numpy/core/src/multiarray/lowlevel_strided_loops.c.src | 14
 4 files changed, 40 insertions(+), 2 deletions(-)
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 30829f929..2dccc575e 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -3,6 +3,20 @@
/* numpconfig.h is auto-generated */
#include "numpyconfig.h"
+#ifdef HAVE_NPY_CONFIG_H
+#include <npy_config.h>
+#endif
+
+/*
+ * gcc does not unroll even with -O3
+ * use with care, unrolling on modern cpus rarely speeds things up
+ */
+#ifdef HAVE_ATTRIBUTE_OPTIMIZE_UNROLL_LOOPS
+#define NPY_GCC_UNROLL_LOOPS \
+ __attribute__((optimize("unroll-loops")))
+#else
+#define NPY_GCC_UNROLL_LOOPS
+#endif
#if defined(_MSC_VER)
#define NPY_INLINE __inline
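For context, the new macro is meant to sit between the storage class/return type and the function name, so gcc applies loop unrolling to just that one function and the annotation compiles away to nothing on other compilers. A minimal, self-contained sketch (the function name and the use of C99 fixed-width types are illustrative, not part of the patch):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #ifdef HAVE_ATTRIBUTE_OPTIMIZE_UNROLL_LOOPS
    #define NPY_GCC_UNROLL_LOOPS __attribute__((optimize("unroll-loops")))
    #else
    #define NPY_GCC_UNROLL_LOOPS   /* expands to nothing elsewhere */
    #endif

    /* hypothetical strided copy where each element is a single 4-byte move */
    static void NPY_GCC_UNROLL_LOOPS
    copy_int32_strided(char *dst, ptrdiff_t dst_stride,
                       const char *src, ptrdiff_t src_stride, size_t n)
    {
        while (n--) {
            memcpy(dst, src, sizeof(int32_t));   /* compiles to one mov */
            dst += dst_stride;
            src += src_stride;
        }
    }

    int main(void)
    {
        int32_t a[4] = {1, 2, 3, 4}, b[4] = {0, 0, 0, 0};
        copy_int32_strided((char *)b, sizeof(int32_t),
                           (const char *)a, sizeof(int32_t), 4);
        return b[3] == 4 ? 0 : 1;
    }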
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 926142b55..b48414c2d 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -161,7 +161,6 @@ def check_math_capabilities(config, moredefs, mathlibs):
check_funcs(OPTIONAL_STDFUNCS)
-
for h in OPTIONAL_HEADERS:
if config.check_func("", decl=False, call=False, headers=[h]):
moredefs.append((fname2def(h).replace(".", "_"), 1))
@@ -170,6 +169,12 @@ def check_math_capabilities(config, moredefs, mathlibs):
if config.check_func(f, decl=False, call=True, call_args=args):
moredefs.append((fname2def(f), 1))
+ for dec, fn in OPTIONAL_GCC_ATTRIBUTES:
+ if config.check_funcs_once([fn],
+ decl=dict((('%s %s' % (dec, fn), True),)),
+ call=False):
+ moredefs.append((fname2def(fn), 1))
+
# C99 functions: float and long double versions
check_funcs(C99_FUNCS_SINGLE)
check_funcs(C99_FUNCS_EXTENDED)
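In effect, the new loop asks the compiler to build a tiny test program for each entry in OPTIONAL_GCC_ATTRIBUTES. A rough sketch of the kind of translation unit the probe boils down to (the exact source distutils generates differs); if it compiles, fname2def turns the function name into the HAVE_ macro used in npy_common.h:

    /* probe sketch: does the compiler accept the attribute syntax? */
    __attribute__((optimize("unroll-loops"))) int
    attribute_optimize_unroll_loops(void)
    {
        return 0;
    }

    int main(void)
    {
        return attribute_optimize_unroll_loops();
    }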
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 284acfe21..cb30c83c9 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -113,6 +113,13 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
("__builtin_bswap64", '5u'),
]
+# gcc function attributes
+# (attribute as understood by gcc, function name),
+# function name will be converted to HAVE_<upper-case-name> preprocessor macro
+OPTIONAL_GCC_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
+ 'attribute_optimize_unroll_loops'),
+ ]
+
# Subset of OPTIONAL_STDFUNCS which may alreay have HAVE_* defined by Python.h
OPTIONAL_STDFUNCS_MAYBE = ["expm1", "log1p", "acosh", "atanh", "asinh", "hypot",
"copysign"]
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index 0adb24c8d..875656394 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -104,7 +104,15 @@
#if @is_swap@ || @src_contig@ == 0 || @dst_contig@ == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
static void
+#if @is_aligned@ && @is_swap@ == 0 && @elsize@ <= NPY_SIZEOF_INTP
+ NPY_GCC_UNROLL_LOOPS
+#endif
@prefix@_@oper@_size@elsize@(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
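After template substitution the guard becomes a constant expression, so only aligned, non-byte-swapping specializations whose element size fits in a pointer-sized register get the attribute. A sketch assuming gcc and a 64-bit build (NPY_SIZEOF_INTP hard-coded to 8 here purely for illustration):

    #include <string.h>

    #define NPY_SIZEOF_INTP 8   /* assumption: 64-bit build */
    #define NPY_GCC_UNROLL_LOOPS __attribute__((optimize("unroll-loops")))

    static void
    #if 1 && 0 == 0 && 4 <= NPY_SIZEOF_INTP    /* aligned, no swap, elsize 4: unrolled */
        NPY_GCC_UNROLL_LOOPS
    #endif
    copy_size4(char *dst, const char *src) { memcpy(dst, src, 4); }

    static void
    #if 1 && 0 == 0 && 16 <= NPY_SIZEOF_INTP   /* elsize 16: guard false, no attribute */
        NPY_GCC_UNROLL_LOOPS
    #endif
    copy_size16(char *dst, const char *src) { memcpy(dst, src, 16); }

    int main(void)
    {
        char a[16] = {1}, b[16] = {0};
        copy_size4(b, a);
        copy_size16(b, a);
        return b[0] == 1 ? 0 : 1;
    }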
@@ -161,7 +169,11 @@ static void
#endif
-/* specialized copy and swap for source stride 0 */
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here, as above, is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ */
#if (@src_contig@ == 0) && @is_aligned@
static void
@prefix@_@oper@_size@elsize@_srcstride0(char *dst,
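For completeness, a source-stride-0 loop is the broadcast case: one source element is written to every destination slot. A sketch of a 4-byte instance (hypothetical name, standard C types standing in for npy_intp), which per the comment above gains little from unrolling:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* hypothetical srcstride0 specialization: read once, store N times */
    static void
    fill_size4_srcstride0(char *dst, ptrdiff_t dst_stride,
                          const char *src, size_t n)
    {
        uint32_t v;
        memcpy(&v, src, sizeof v);       /* load the single source element */
        while (n--) {
            memcpy(dst, &v, sizeof v);   /* store it to each strided slot */
            dst += dst_stride;
        }
    }

    int main(void)
    {
        uint32_t x = 7, out[3] = {0, 0, 0};
        fill_size4_srcstride0((char *)out, sizeof(uint32_t),
                              (const char *)&x, 3);
        return (out[0] == 7 && out[2] == 7) ? 0 : 1;
    }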