summaryrefslogtreecommitdiff
path: root/numpy/core
diff options
context:
space:
mode:
Diffstat (limited to 'numpy/core')
-rw-r--r--numpy/core/include/numpy/npy_common.h15
-rw-r--r--numpy/core/setup_common.py3
-rw-r--r--numpy/core/src/umath/loops.c.src4
3 files changed, 22 insertions, 0 deletions
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index eff5dd339..47ef94c92 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -61,6 +61,21 @@
#define NPY_UNLIKELY(x) (x)
#endif
+#ifdef HAVE___BUILTIN_PREFETCH
+/* unlike _mm_prefetch, this also works on non-x86 */
+#define NPY_PREFETCH(x, rw, loc) __builtin_prefetch((x), (rw), (loc))
+#else
+#ifdef HAVE__MM_PREFETCH
+/* _MM_HINT_ET[01] (rw = 1) unsupported, as they are only available in gcc >= 4.9 */
+#define NPY_PREFETCH(x, rw, loc) _mm_prefetch((x), loc == 0 ? _MM_HINT_NTA : \
+ (loc == 1 ? _MM_HINT_T2 : \
+ (loc == 2 ? _MM_HINT_T1 : \
+ (loc == 3 ? _MM_HINT_T0 : -1))))
+#else
+#define NPY_PREFETCH(x, rw,loc)
+#endif
+#endif
+
#if defined(_MSC_VER)
#define NPY_INLINE __inline
#elif defined(__GNUC__)
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 68efd1791..d93e475e3 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -125,7 +125,10 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
("__builtin_expect", '5, 0'),
("__builtin_mul_overflow", '5, 5, (int*)5'),
("_mm_load_ps", '(float*)0', "xmmintrin.h"), # SSE
+ ("_mm_prefetch", '(float*)0, _MM_HINT_NTA',
+ "xmmintrin.h"), # SSE
("_mm_load_pd", '(double*)0', "emmintrin.h"), # SSE2
+ ("__builtin_prefetch", "(float*)0, 0, 3"),
]
# function attributes
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 854c1e17a..aff6180c7 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1444,6 +1444,8 @@ pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
r[7] = @trf@(a[7 * stride]);
for (i = 8; i < n - (n % 8); i += 8) {
+ /* small blocksizes seem to mess with hardware prefetch */
+ NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
r[0] += @trf@(a[(i + 0) * stride]);
r[1] += @trf@(a[(i + 1) * stride]);
r[2] += @trf@(a[(i + 2) * stride]);
@@ -2190,6 +2192,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
r[7] = a[6 * stride + 1];
for (i = 8; i < n - (n % 8); i += 8) {
+ /* small blocksizes seem to mess with hardware prefetch */
+ NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
r[0] += a[(i + 0) * stride];
r[1] += a[(i + 0) * stride + 1];
r[2] += a[(i + 2) * stride];