author     Charles Harris <charlesr.harris@gmail.com>  2013-06-23 06:14:04 -0700
committer  Charles Harris <charlesr.harris@gmail.com>  2013-06-23 06:14:04 -0700
commit     f361c6b073316f3ee59ac23413155145b27aed90 (patch)
tree       6422a8aa8c6a12e59248a437eea1215196006e87
parent     6dd20ff5fa5b937b86fa4a1404ceeabafceaa589 (diff)
parent     75df68b5ca238eabc2de144dce08d481c59ffcaf (diff)
Merge pull request #3429 from juliantaylor/copy-unroll
ENH: tell gcc to unroll strided copy loops
 numpy/core/include/numpy/npy_common.h                  | 14
 numpy/core/setup.py                                    |  7
 numpy/core/setup_common.py                             |  7
 numpy/core/src/multiarray/lowlevel_strided_loops.c.src | 14
 4 files changed, 40 insertions(+), 2 deletions(-)
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 30829f929..2dccc575e 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -3,6 +3,20 @@
/* numpconfig.h is auto-generated */
#include "numpyconfig.h"
+#ifdef HAVE_NPY_CONFIG_H
+#include <npy_config.h>
+#endif
+
+/*
+ * gcc does not unroll even with -O3
+ * use with care, unrolling on modern cpus rarely speeds things up
+ */
+#ifdef HAVE_ATTRIBUTE_OPTIMIZE_UNROLL_LOOPS
+#define NPY_GCC_UNROLL_LOOPS \
+ __attribute__((optimize("unroll-loops")))
+#else
+#define NPY_GCC_UNROLL_LOOPS
+#endif
#if defined(_MSC_VER)
#define NPY_INLINE __inline
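For context, the new macro is meant to sit between the storage class/return type and the function name, so gcc applies loop unrolling to just that one function and the annotation compiles away to nothing on other compilers. A minimal, self-contained sketch (the function name and the use of C99 fixed-width types are illustrative, not part of the patch):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #ifdef HAVE_ATTRIBUTE_OPTIMIZE_UNROLL_LOOPS
    #define NPY_GCC_UNROLL_LOOPS __attribute__((optimize("unroll-loops")))
    #else
    #define NPY_GCC_UNROLL_LOOPS   /* expands to nothing elsewhere */
    #endif

    /* hypothetical strided copy where each element is a single 4-byte move */
    static void NPY_GCC_UNROLL_LOOPS
    copy_int32_strided(char *dst, ptrdiff_t dst_stride,
                       const char *src, ptrdiff_t src_stride, size_t n)
    {
        while (n--) {
            memcpy(dst, src, sizeof(int32_t));   /* compiles to one mov */
            dst += dst_stride;
            src += src_stride;
        }
    }

    int main(void)
    {
        int32_t a[4] = {1, 2, 3, 4}, b[4] = {0, 0, 0, 0};
        copy_int32_strided((char *)b, sizeof(int32_t),
                           (const char *)a, sizeof(int32_t), 4);
        return b[3] == 4 ? 0 : 1;
    }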
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 926142b55..b48414c2d 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -161,7 +161,6 @@ def check_math_capabilities(config, moredefs, mathlibs):
check_funcs(OPTIONAL_STDFUNCS)
-
for h in OPTIONAL_HEADERS:
if config.check_func("", decl=False, call=False, headers=[h]):
moredefs.append((fname2def(h).replace(".", "_"), 1))
@@ -170,6 +169,12 @@ def check_math_capabilities(config, moredefs, mathlibs):
if config.check_func(f, decl=False, call=True, call_args=args):
moredefs.append((fname2def(f), 1))
+ for dec, fn in OPTIONAL_GCC_ATTRIBUTES:
+ if config.check_funcs_once([fn],
+ decl=dict((('%s %s' % (dec, fn), True),)),
+ call=False):
+ moredefs.append((fname2def(fn), 1))
+
# C99 functions: float and long double versions
check_funcs(C99_FUNCS_SINGLE)
check_funcs(C99_FUNCS_EXTENDED)
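In effect, the new loop asks the compiler to build a tiny test program for each entry in OPTIONAL_GCC_ATTRIBUTES. A rough sketch of the kind of translation unit the probe boils down to (the exact source distutils generates differs); if it compiles, fname2def turns the function name into the HAVE_ macro used in npy_common.h:

    /* probe sketch: does the compiler accept the attribute syntax? */
    __attribute__((optimize("unroll-loops"))) int
    attribute_optimize_unroll_loops(void)
    {
        return 0;
    }

    int main(void)
    {
        return attribute_optimize_unroll_loops();
    }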
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 284acfe21..cb30c83c9 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -113,6 +113,13 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
("__builtin_bswap64", '5u'),
]
+# gcc function attributes
+# (attribute as understood by gcc, function name),
+# function name will be converted to HAVE_<upper-case-name> preprocessor macro
+OPTIONAL_GCC_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
+ 'attribute_optimize_unroll_loops'),
+ ]
+
# Subset of OPTIONAL_STDFUNCS which may alreay have HAVE_* defined by Python.h
OPTIONAL_STDFUNCS_MAYBE = ["expm1", "log1p", "acosh", "atanh", "asinh", "hypot",
"copysign"]
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index 0adb24c8d..875656394 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -104,7 +104,15 @@
#if @is_swap@ || @src_contig@ == 0 || @dst_contig@ == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
static void
+#if @is_aligned@ && @is_swap@ == 0 && @elsize@ <= NPY_SIZEOF_INTP
+ NPY_GCC_UNROLL_LOOPS
+#endif
@prefix@_@oper@_size@elsize@(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
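After template substitution the guard becomes a constant expression, so only aligned, non-byte-swapping specializations whose element size fits in a pointer-sized register get the attribute. A sketch assuming gcc and a 64-bit build (NPY_SIZEOF_INTP hard-coded to 8 here purely for illustration):

    #include <string.h>

    #define NPY_SIZEOF_INTP 8   /* assumption: 64-bit build */
    #define NPY_GCC_UNROLL_LOOPS __attribute__((optimize("unroll-loops")))

    static void
    #if 1 && 0 == 0 && 4 <= NPY_SIZEOF_INTP    /* aligned, no swap, elsize 4: unrolled */
        NPY_GCC_UNROLL_LOOPS
    #endif
    copy_size4(char *dst, const char *src) { memcpy(dst, src, 4); }

    static void
    #if 1 && 0 == 0 && 16 <= NPY_SIZEOF_INTP   /* elsize 16: guard false, no attribute */
        NPY_GCC_UNROLL_LOOPS
    #endif
    copy_size16(char *dst, const char *src) { memcpy(dst, src, 16); }

    int main(void)
    {
        char a[16] = {1}, b[16] = {0};
        copy_size4(b, a);
        copy_size16(b, a);
        return b[0] == 1 ? 0 : 1;
    }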
@@ -161,7 +169,11 @@ static void
#endif
-/* specialized copy and swap for source stride 0 */
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here, as above, is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ */
#if (@src_contig@ == 0) && @is_aligned@
static void
@prefix@_@oper@_size@elsize@_srcstride0(char *dst,
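For completeness, a source-stride-0 loop is the broadcast case: one source element is written to every destination slot. A sketch of a 4-byte instance (hypothetical name, standard C types standing in for npy_intp), which per the comment above gains little from unrolling:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* hypothetical srcstride0 specialization: read once, store N times */
    static void
    fill_size4_srcstride0(char *dst, ptrdiff_t dst_stride,
                          const char *src, size_t n)
    {
        uint32_t v;
        memcpy(&v, src, sizeof v);       /* load the single source element */
        while (n--) {
            memcpy(dst, &v, sizeof v);   /* store it to each strided slot */
            dst += dst_stride;
        }
    }

    int main(void)
    {
        uint32_t x = 7, out[3] = {0, 0, 0};
        fill_size4_srcstride0((char *)out, sizeof(uint32_t),
                              (const char *)&x, 3);
        return (out[0] == 7 && out[2] == 7) ? 0 : 1;
    }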