-rw-r--r--  numpy/core/bscript                  6
-rw-r--r--  numpy/core/setup.py                 7
-rw-r--r--  numpy/core/src/umath/loops.c.src  143
-rw-r--r--  numpy/core/src/umath/simd.inc.src 173
4 files changed, 205 insertions, 124 deletions
diff --git a/numpy/core/bscript b/numpy/core/bscript
index 44f7f14f0..ee5543354 100644
--- a/numpy/core/bscript
+++ b/numpy/core/bscript
@@ -492,8 +492,10 @@ def pre_build(context):
pattern="ufunc_api",
name="ufunc_api")
- ufunc_templates = ["src/umath/loops.c.src",
- "src/umath/funcs.inc.src"]
+ ufunc_templates = [
+ "src/umath/loops.c.src",
+ "src/umath/funcs.inc.src",
+ "src/umath/simd.inc.src"]
bld(target="ufunc_templates", source=ufunc_templates)
bld(features="umath_gen",
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index a6ddfb843..926142b55 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -837,7 +837,9 @@ def configuration(parent_package='',top_path=None):
subpath = join('src', 'umath')
# NOTE: For manual template conversion of loops.h.src, read the note
# in that file.
- sources = [join(local_dir, subpath, 'loops.c.src')]
+ sources = [
+ join(local_dir, subpath, 'loops.c.src'),
+ join(local_dir, subpath, 'simd.inc.src')]
# numpy.distutils generate .c from .c.src in weird directories, we have
# to add them there as they depend on the build_dir
@@ -864,12 +866,14 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'umathmodule.c'),
join('src', 'umath', 'reduction.c'),
join('src', 'umath', 'funcs.inc.src'),
+ join('src', 'umath', 'simd.inc.src'),
join('src', 'umath', 'loops.c.src'),
join('src', 'umath', 'ufunc_object.c'),
join('src', 'umath', 'ufunc_type_resolution.c')]
umath_deps = [
generate_umath_py,
+ join('src', 'umath', 'simd.inc.src'),
join(codegen_dir,'generate_ufunc_api.py')]
if not ENABLE_SEPARATE_COMPILATION:
@@ -877,6 +881,7 @@ def configuration(parent_package='',top_path=None):
umath_src = [join('src', 'umath', 'umathmodule_onefile.c')]
umath_src.append(generate_umath_templated_sources)
umath_src.append(join('src', 'umath', 'funcs.inc.src'))
+ umath_src.append(join('src', 'umath', 'simd.inc.src'))
config.add_extension('umath',
sources = umath_src +
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index c8ce8fb2a..c0287b8c8 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -19,11 +19,15 @@
#include "npy_pycompat.h"
#include "ufunc_object.h"
-#include <assert.h>
-#ifdef HAVE_EMMINTRIN_H
-#include <emmintrin.h>
-#endif
+
+/*
+ * include vectorized functions and dispatchers
+ * this file is also safe to include for generic builds;
+ * platform-specific instructions are either masked via the
+ * preprocessor or detected at runtime
+ */
+#include "simd.inc"
/*
@@ -37,15 +41,6 @@
&& (steps[0] == steps[2])\
&& (steps[0] == 0))
-/*
- * stride is equal to element size and input and destination are equal or
- * don't overlap within one register
- */
-#define IS_BLOCKABLE_UNARY(esize, vsize) \
- (steps[0] == (esize) && steps[0] == steps[1] && \
- (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
- ((abs(args[1] - args[0]) >= (vsize)) || ((abs(args[1] - args[0]) == 0))))
-
#define OUTPUT_LOOP\
char *op1 = args[1];\
npy_intp os1 = steps[1];\
@@ -67,19 +62,6 @@
npy_intp i;\
for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)
-/* align var to alignment */
-#define UNARY_LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
- npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
- alignment, n);\
- for(i = 0; i < peel; i++)
-
-#define UNARY_LOOP_BLOCKED(type, vsize)\
- for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
- i += (vsize / sizeof(type)))
-
-#define UNARY_LOOP_BLOCKED_END\
- for (; i < n; i++)
-
#define BINARY_LOOP\
char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
@@ -1294,101 +1276,25 @@ TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *
*****************************************************************************
*/
-
/**begin repeat
+ * Float types
* #type = npy_float, npy_double#
* #TYPE = FLOAT, DOUBLE#
* #scalarf = npy_sqrtf, npy_sqrt#
- * #c = f, #
- * #vtype = __m128, __m128d#
- * #vpre = _mm, _mm#
- * #vsuf = ps, pd#
*/
-#ifdef HAVE_EMMINTRIN_H
-
-#define NPY_HAVE_SIMD_@TYPE@
-
-static void
-sse2_sqrt_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
-{
- /* align output to 16 bytes */
- UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
- op[i] = @scalarf@(ip[i]);
- }
- assert(npy_is_aligned(&op[i], 16));
- if (npy_is_aligned(&ip[i], 16)) {
- UNARY_LOOP_BLOCKED(@type@, 16) {
- @vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
- @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
- }
- }
- else {
- UNARY_LOOP_BLOCKED(@type@, 16) {
- @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
- @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
- }
- }
- UNARY_LOOP_BLOCKED_END {
- op[i] = @scalarf@(ip[i]);
- }
-}
-
-
-static void
-sse2_absolute_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
-{
- /*
- * get 0x7FFFFFFF mask (everything but signbit set)
- * float & ~mask will remove the sign
- * this is equivalent to how the compiler implements fabs on amd64
- */
- const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
-
- /* align output to 16 bytes */
- UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
- const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
- /* add 0 to clear -0.0 */
- op[i] = tmp + 0;
- }
- if (npy_is_aligned(&ip[i], 16)) {
- UNARY_LOOP_BLOCKED(@type@, 16) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
- @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
- }
- }
- else {
- UNARY_LOOP_BLOCKED(@type@, 16) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
- @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
- }
- }
- UNARY_LOOP_BLOCKED_END {
- const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
- /* add 0 to clear -0.0 */
- op[i] = tmp + 0;
- }
-}
-
-#endif
-
-
NPY_NO_EXPORT void
@TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
-#ifdef HAVE_EMMINTRIN_H
- if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
- sse2_sqrt_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
+ if (run_unary_simd_sqrt_@TYPE@(args, dimensions, steps)) {
+ return;
}
- else
-#endif
- {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = @scalarf@(in1);
- }
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *(@type@ *)op1 = @scalarf@(in1);
}
}
+
/**end repeat**/
@@ -1617,19 +1523,14 @@ NPY_NO_EXPORT void
NPY_NO_EXPORT void
@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
-#if defined HAVE_EMMINTRIN_H && defined NPY_HAVE_SIMD_@TYPE@
- if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
- sse2_absolute_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
+ if (run_unary_simd_absolute_@TYPE@(args, dimensions, steps)) {
+ return;
}
- else
-#endif
- {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- const @type@ tmp = in1 > 0 ? in1 : -in1;
- /* add 0 to clear -0.0 */
- *((@type@ *)op1) = tmp + 0;
- }
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ tmp = in1 > 0 ? in1 : -in1;
+ /* add 0 to clear -0.0 */
+ *((@type@ *)op1) = tmp + 0;
}
}
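
For reference, the new dispatch pattern expands to roughly the following for the @TYPE@ = DOUBLE case. This is a hand-expanded sketch, not generated output: it assumes the headers already included by loops.c.src, and the UNARY_LOOP macro body is written out by hand.

NPY_NO_EXPORT void
DOUBLE_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
    /* try the vectorized path; it returns nonzero if it handled the whole loop */
    if (run_unary_simd_sqrt_DOUBLE(args, dimensions, steps)) {
        return;
    }
    /* generic strided scalar fallback (the UNARY_LOOP body written out) */
    char *ip1 = args[0], *op1 = args[1];
    npy_intp is1 = steps[0], os1 = steps[1];
    npy_intp n = dimensions[0];
    npy_intp i;
    for (i = 0; i < n; i++, ip1 += is1, op1 += os1) {
        const npy_double in1 = *(npy_double *)ip1;
        *(npy_double *)op1 = npy_sqrt(in1);
    }
}

The early return replaces the previous #ifdef HAVE_EMMINTRIN_H / else structure, so loops.c.src no longer needs to know which instruction set, if any, backs the dispatcher.
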
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
new file mode 100644
index 000000000..916473a0b
--- /dev/null
+++ b/numpy/core/src/umath/simd.inc.src
@@ -0,0 +1,173 @@
+/* -*- c -*- */
+
+/*
+ * This file contains the definitions of the simd vectorized operations.
+ *
+ * Currently it contains SSE2 functions that are built on amd64, x32, or
+ * non-generic builds (CFLAGS=-march=...).
+ * In the future it may contain other instruction sets, such as AVX or NEON,
+ * detected at runtime, in which case it needs to be included indirectly via
+ * a file compiled with special options (or via gcc target attributes) so
+ * the binary stays portable.
+ */
+
+
+#ifndef __NPY_SIMD_INC
+#define __NPY_SIMD_INC
+
+#include "lowlevel_strided_loops.h"
+#include "npy_config.h"
+#include <assert.h>
+#include <stdlib.h>
+
+/*
+ * stride is equal to the element size and the input and destination
+ * either are equal or don't overlap within one register
+ */
+#define IS_BLOCKABLE_UNARY(esize, vsize) \
+ (steps[0] == (esize) && steps[0] == steps[1] && \
+ (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
+ ((abs(args[1] - args[0]) >= (vsize)) || ((abs(args[1] - args[0]) == 0))))
+
+
+/* align var to alignment */
+#define UNARY_LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
+ npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
+ alignment, n);\
+ for(i = 0; i < peel; i++)
+
+#define UNARY_LOOP_BLOCKED(type, vsize)\
+ for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
+ i += (vsize / sizeof(type)))
+
+#define UNARY_LOOP_BLOCKED_END\
+ for (; i < n; i++)
+
+
+/*
+ * Dispatcher functions
+ * decide whether the operation can be vectorized and run it;
+ * they return true if the operation was run, false if nothing was done
+ */
+
+/**begin repeat
+ * Float types
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ * #vector = 1, 1, 0#
+ */
+
+/**begin repeat1
+ * #func = sqrt, absolute#
+ */
+
+#if @vector@
+
+/* prototypes */
+static void
+sse2_@func@_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n);
+
+#endif
+
+static NPY_INLINE int
+run_unary_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if @vector@ && defined HAVE_EMMINTRIN_H
+ if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
+ sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
+ return 1;
+ }
+#endif
+ return 0;
+}
+/**end repeat1**/
+
+/**end repeat**/
+
+
+/*
+ * Vectorized operations
+ */
+
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #scalarf = npy_sqrtf, npy_sqrt#
+ * #c = f, #
+ * #vtype = __m128, __m128d#
+ * #vpre = _mm, _mm#
+ * #vsuf = ps, pd#
+ */
+
+#ifdef HAVE_EMMINTRIN_H
+#include <emmintrin.h>
+
+
+static void
+sse2_sqrt_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
+{
+ /* align output to 16 bytes */
+ UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
+ op[i] = @scalarf@(ip[i]);
+ }
+ assert(npy_is_aligned(&op[i], 16));
+ if (npy_is_aligned(&ip[i], 16)) {
+ UNARY_LOOP_BLOCKED(@type@, 16) {
+ @vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
+ @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
+ }
+ }
+ else {
+ UNARY_LOOP_BLOCKED(@type@, 16) {
+ @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
+ @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
+ }
+ }
+ UNARY_LOOP_BLOCKED_END {
+ op[i] = @scalarf@(ip[i]);
+ }
+}
+
+
+static void
+sse2_absolute_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
+{
+    /*
+     * -0.0 has only the sign bit set; andnot(mask, a) computes a & ~mask,
+     * i.e. a & 0x7FF...F, which clears the sign
+     * this is equivalent to how the compiler implements fabs on amd64
+     */
+    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
+
+ /* align output to 16 bytes */
+ UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
+ const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
+ /* add 0 to clear -0.0 */
+ op[i] = tmp + 0;
+ }
+ assert(npy_is_aligned(&op[i], 16));
+ if (npy_is_aligned(&ip[i], 16)) {
+ UNARY_LOOP_BLOCKED(@type@, 16) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
+ @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
+ }
+ }
+ else {
+ UNARY_LOOP_BLOCKED(@type@, 16) {
+ @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
+ @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
+ }
+ }
+ UNARY_LOOP_BLOCKED_END {
+ const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
+ /* add 0 to clear -0.0 */
+ op[i] = tmp + 0;
+ }
+}
+
+#endif
+
+
+/**end repeat**/
+
+#endif
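
The blocking macros above split a unary loop into three phases: a scalar peel until the output pointer is 16 byte aligned, a main loop that processes one vector register per iteration, and a scalar tail for the remainder. Below is a hand-expanded sketch for npy_double with SSE2, illustrative only: the name sketch_sse2_absolute_double is hypothetical, NumPy's internal headers (npy_intp, npy_double, lowlevel_strided_loops.h) are assumed to be on the include path, and the aligned-input load branch of the real sse2_absolute_@TYPE@ is omitted.

#include <emmintrin.h>               /* SSE2 intrinsics */
#include "lowlevel_strided_loops.h"  /* npy_aligned_block_offset, npy_blocked_end */

static void
sketch_sse2_absolute_double(npy_double * op, const npy_double * ip, const npy_intp n)
{
    /* -0.0 has only the sign bit set; andnot(mask, a) == a & ~mask clears it */
    const __m128d mask = _mm_set1_pd(-0.0);

    /* peel: scalar elements until op is 16 byte aligned */
    npy_intp i, peel = npy_aligned_block_offset(op, sizeof(npy_double), 16, n);
    for (i = 0; i < peel; i++) {
        const npy_double tmp = ip[i] > 0 ? ip[i] : -ip[i];
        op[i] = tmp + 0;    /* add 0 to clear -0.0 */
    }
    /* blocked: two doubles per 16 byte register, aligned stores to op */
    for (; i < npy_blocked_end(peel, sizeof(npy_double), 16, n);
           i += 16 / sizeof(npy_double)) {
        __m128d a = _mm_loadu_pd(&ip[i]);
        _mm_store_pd(&op[i], _mm_andnot_pd(mask, a));
    }
    /* tail: scalar remainder that does not fill a full vector */
    for (; i < n; i++) {
        const npy_double tmp = ip[i] > 0 ? ip[i] : -ip[i];
        op[i] = tmp + 0;
    }
}

The real template additionally checks npy_is_aligned(&ip[i], 16) and uses _mm_load_pd when the input happens to be aligned as well.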