author     Mark Wiebe <mwwiebe@gmail.com>    2010-12-20 13:49:55 -0800
committer  Mark Wiebe <mwwiebe@gmail.com>    2011-01-09 01:54:59 -0800
commit     6d41baf0f07b67aa4cfdd275b3dd562ceed9c3de (patch)
tree       222b67a0b9529071a4541449e5f07093f36c654a /numpy
parent     3f5de70b2d7841c32547b4a87c16dc6e10658335 (diff)
download   numpy-6d41baf0f07b67aa4cfdd275b3dd562ceed9c3de.tar.gz
ENH: Implemented basic buffering
Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/setup.py                                    |    7
-rw-r--r--  numpy/core/src/multiarray/lowlevel_strided_loops.c.src | 1024
-rw-r--r--  numpy/core/src/multiarray/lowlevel_strided_loops.h     |  153
-rw-r--r--  numpy/core/src/multiarray/new_iterator.c.src           |  841
-rw-r--r--  numpy/core/src/multiarray/new_iterator.h               |    8
-rw-r--r--  numpy/core/src/multiarray/new_iterator_pywrap.c        |   19
6 files changed, 1964 insertions(+), 88 deletions(-)
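
Editor's note: a minimal sketch (not part of this commit) of the call
pattern the transfer-function API added below is designed for: look up a
specialized function once, then apply it to many strided elements. All
names except the PyArray_* ones are hypothetical placeholders.

    static int
    example_transfer(char *dst, npy_intp dst_stride,
                     char *src, npy_intp src_stride,
                     npy_intp N, PyArray_Descr *from, PyArray_Descr *to)
    {
        PyArray_StridedTransferFn stransfer = NULL;
        void *transferdata = NULL;

        /* Get a copy/swap/cast function specialized for these strides */
        if (PyArray_GetTransferFunction(1 /* aligned */,
                                        src_stride, dst_stride, from, to,
                                        &stransfer,
                                        &transferdata) != NPY_SUCCEED) {
            return -1;
        }
        /* Transfer all N elements with a single call */
        stransfer(dst, dst_stride, src, src_stride, N,
                  from->elsize, transferdata);
        /* The transfer data carries its own deallocation routine */
        PyArray_FreeStridedTransferData(transferdata);
        return 0;
    }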
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index acf2726a7..442bcee57 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -601,7 +601,8 @@ def configuration(parent_package='',top_path=None):
subpath = join('src', 'multiarray')
sources = [join(local_dir, subpath, 'scalartypes.c.src'),
join(local_dir, subpath, 'arraytypes.c.src'),
- join(local_dir, subpath, 'new_iterator.c.src')]
+ join(local_dir, subpath, 'new_iterator.c.src'),
+ join(local_dir, subpath, 'lowlevel_strided_loops.c.src')]
# numpy.distutils generate .c from .c.src in weird directories, we have
# to add them there as they depend on the build_dir
@@ -737,7 +738,8 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'shape.h'),
join('src', 'multiarray', 'ucsnarrow.h'),
join('src', 'multiarray', 'usertypes.h'),
- join('src', 'multiarray', 'new_iterator.h')]
+ join('src', 'multiarray', 'new_iterator.h'),
+ join('src', 'multiarray', 'lowlevel_strided_loops.h')]
multiarray_src = [join('src', 'multiarray', 'multiarraymodule.c'),
join('src', 'multiarray', 'hashdescr.c'),
@@ -768,6 +770,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'arraytypes.c.src'),
join('src', 'multiarray', 'scalartypes.c.src'),
join('src', 'multiarray', 'new_iterator.c.src'),
+ join('src', 'multiarray', 'lowlevel_strided_loops.c.src'),
join('src', 'multiarray', 'new_iterator_pywrap.c')]
if PYTHON_HAS_UNICODE_WIDE:
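
Editor's note: the ".c.src" files added by this commit are numpy.distutils
templates. The conv_template preprocessor expands each repeat block once
per entry in its #name = ...# lists, substituting @name@ in the body. A
minimal sketch of the mechanism (not from this commit):

    /**begin repeat
     * #type = npy_uint16, npy_uint32#
     * #elsize = 2, 4#
     */
    static void example_swap_size@elsize@(@type@ *x);
    /**end repeat**/

which expands to:

    static void example_swap_size2(npy_uint16 *x);
    static void example_swap_size4(npy_uint32 *x);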
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
new file mode 100644
index 000000000..b51f1e680
--- /dev/null
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -0,0 +1,1024 @@
+/*
+ * This file contains low-level loops for copying and byte-swapping
+ * strided data.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#include "structmember.h"
+
+#define _MULTIARRAYMODULE
+#include <numpy/ndarrayobject.h>
+#include <numpy/ufuncobject.h>
+#include <numpy/npy_cpu.h>
+
+#include "lowlevel_strided_loops.h"
+
+/*
+ * x86 and amd64 handle unaligned memory accesses natively, so the
+ * specialized "unaligned" loops could be skipped there. Left disabled
+ * for now.
+ */
+#if defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
+/* #define NPY_USE_UNALIGNED_ACCESS */
+#endif
+
+#define _NPY_NOP1(x) (x)
+#define _NPY_NOP2(x) (x)
+#define _NPY_NOP4(x) (x)
+#define _NPY_NOP8(x) (x)
+
+#define _NPY_SWAP2(x) (((((npy_uint16)x)&0xffu) << 8) | \
+ (((npy_uint16)x) >> 8))
+
+#define _NPY_SWAP4(x) (((((npy_uint32)x)&0xffu) << 24) | \
+ ((((npy_uint32)x)&0xff00u) << 8) | \
+ ((((npy_uint32)x)&0xff0000u) >> 8) | \
+ (((npy_uint32)x) >> 24))
+
+#define _NPY_SWAP_PAIR4(x) (((((npy_uint32)x)&0xffu) << 8) | \
+ ((((npy_uint32)x)&0xff00u) >> 8) | \
+ ((((npy_uint32)x)&0xff0000u) << 8) | \
+ ((((npy_uint32)x)&0xff000000u) >> 8))
+
+#define _NPY_SWAP8(x) (((((npy_uint64)x)&0xffu) << 56) | \
+ ((((npy_uint64)x)&0xff00u) << 40) | \
+ ((((npy_uint64)x)&0xff0000u) << 24) | \
+ ((((npy_uint64)x)&0xff000000u) << 8) | \
+ ((((npy_uint64)x)&0xff00000000u) >> 8) | \
+ ((((npy_uint64)x)&0xff0000000000u) >> 24) | \
+ ((((npy_uint64)x)&0xff000000000000u) >> 40) | \
+ (((npy_uint64)x) >> 56))
+
+#define _NPY_SWAP_PAIR8(x) (((((npy_uint64)x)&0xffu) << 24) | \
+ ((((npy_uint64)x)&0xff00u) << 8) | \
+ ((((npy_uint64)x)&0xff0000u) >> 8) | \
+ ((((npy_uint64)x)&0xff000000u) >> 24) | \
+ ((((npy_uint64)x)&0xff00000000u) << 24) | \
+ ((((npy_uint64)x)&0xff0000000000u) << 8) | \
+ ((((npy_uint64)x)&0xff000000000000u) >> 8) | \
+ ((((npy_uint64)x)&0xff00000000000000u) >> 24))
+
+#define _NPY_SWAP_INPLACE2(x) { \
+ char a = (x)[0]; (x)[0] = (x)[1]; (x)[1] = a; \
+ }
+
+#define _NPY_SWAP_INPLACE4(x) { \
+ char a = (x)[0]; (x)[0] = (x)[3]; (x)[3] = a; \
+ a = (x)[1]; (x)[1] = (x)[2]; (x)[2] = a; \
+ }
+
+#define _NPY_SWAP_INPLACE8(x) { \
+ char a = (x)[0]; (x)[0] = (x)[7]; (x)[7] = a; \
+ a = (x)[1]; (x)[1] = (x)[6]; (x)[6] = a; \
+ a = (x)[2]; (x)[2] = (x)[5]; (x)[5] = a; \
+ a = (x)[3]; (x)[3] = (x)[4]; (x)[4] = a; \
+ }
+
+#define _NPY_SWAP_INPLACE16(x) { \
+ char a = (x)[0]; (x)[0] = (x)[15]; (x)[15] = a; \
+ a = (x)[1]; (x)[1] = (x)[14]; (x)[14] = a; \
+ a = (x)[2]; (x)[2] = (x)[13]; (x)[13] = a; \
+ a = (x)[3]; (x)[3] = (x)[12]; (x)[12] = a; \
+ a = (x)[4]; (x)[4] = (x)[11]; (x)[11] = a; \
+ a = (x)[5]; (x)[5] = (x)[10]; (x)[10] = a; \
+ a = (x)[6]; (x)[6] = (x)[9]; (x)[9] = a; \
+ a = (x)[7]; (x)[7] = (x)[8]; (x)[8] = a; \
+ }
+
+/************* STRIDED COPYING/SWAPPING SPECIALIZED FUNCTIONS *************/
+
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ * #elsize_half = 0, 1, 2, 4, 8#
+ * #type = npy_uint8, npy_uint16, npy_uint32, npy_uint64, npy_uint128#
+ */
+/**begin repeat1
+ * #oper = strided_to_strided, strided_to_contig,
+ * contig_to_strided, contig_to_contig#
+ * #src_contig = 0, 0, 1, 1#
+ * #dst_contig = 0, 1, 0, 1#
+ */
+/**begin repeat2
+ * #swap = _NPY_NOP, _NPY_NOP, _NPY_SWAP_INPLACE, _NPY_SWAP,
+ * _NPY_SWAP_INPLACE, _NPY_SWAP_PAIR#
+ * #prefix = , _aligned, _swap, _aligned_swap, _swap_pair, _aligned_swap_pair#
+ * #is_aligned = 0, 1, 0, 1, 0, 1#
+ * #minelsize = 1, 1, 2, 2, 4, 4#
+ * #is_swap = 0, 0, 1, 1, 2, 2#
+ */
+
+#if (@elsize@ >= @minelsize@) && \
+ (@elsize@ > 1 || @is_aligned@) && \
+ (!defined(NPY_USE_UNALIGNED_ACCESS) || @is_aligned@)
+
+
+#if @is_swap@ || @src_contig@ == 0 || @dst_contig@ == 0
+static void
+@prefix@_@oper@_size@elsize@(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp NPY_UNUSED(itemsize),
+ void *NPY_UNUSED(data))
+{
+ while (N > 0) {
+#if @is_aligned@
+
+ /* aligned copy and swap */
+# if @elsize@ != 16
+ (*((@type@ *)dst)) = @swap@@elsize@(*((@type@ *)src));
+# else
+# if @is_swap@ == 0
+ (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+ (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+# elif @is_swap@ == 1
+ (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+ (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+# elif @is_swap@ == 2
+ (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+ (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+# endif
+# endif
+
+#else
+
+ /* unaligned copy and swap */
+ memcpy(dst, src, @elsize@);
+# if @is_swap@ == 1
+ @swap@@elsize@(dst);
+# elif @is_swap@ == 2
+ @swap@@elsize_half@(dst);
+ @swap@@elsize_half@(dst + @elsize_half@);
+# endif
+
+#endif
+
+#if @dst_contig@
+ dst += @elsize@;
+#else
+ dst += dst_stride;
+#endif
+
+#if @src_contig@
+ src += @elsize@;
+#else
+ src += src_stride;
+#endif
+
+ --N;
+ }
+}
+#endif
+
+
+/* specialized copy and swap for source stride 0 */
+#if (@src_contig@ == 0) && @is_aligned@
+static void
+@prefix@_@oper@_size@elsize@_srcstride0(char *dst,
+ npy_intp dst_stride,
+ char *src, npy_intp NPY_UNUSED(src_stride),
+ npy_intp N, npy_intp NPY_UNUSED(itemsize),
+ void *NPY_UNUSED(data))
+{
+#if @elsize@ != 16
+ @type@ temp = @swap@@elsize@(*((@type@ *)src));
+#else
+ npy_uint64 temp0, temp1;
+# if @is_swap@ == 0
+ temp0 = (*((npy_uint64 *)src));
+ temp1 = (*((npy_uint64 *)src + 1));
+# elif @is_swap@ == 1
+ temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+ temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+# elif @is_swap@ == 2
+ temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+ temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+# endif
+#endif
+ while (N > 0) {
+#if @elsize@ != 16
+ *((@type@ *)dst) = temp;
+#else
+ *((npy_uint64 *)dst) = temp0;
+ *((npy_uint64 *)dst + 1) = temp1;
+#endif
+#if @dst_contig@
+ dst += @elsize@;
+#else
+ dst += dst_stride;
+#endif
+ --N;
+ }
+}
+#endif
+
+#endif/* @elsize@ >= @minelsize@ */
+
+/**end repeat2**/
+/**end repeat1**/
+/**end repeat**/
+
+static void
+_strided_to_strided(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp itemsize,
+ void *NPY_UNUSED(data))
+{
+ while (N > 0) {
+ memcpy(dst, src, itemsize);
+ dst += dst_stride;
+ src += src_stride;
+ --N;
+ }
+}
+
+static void
+_swap_strided_to_strided(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp itemsize,
+ void *NPY_UNUSED(data))
+{
+ char *a, *b, c;
+
+ while (N > 0) {
+ memcpy(dst, src, itemsize);
+ /* general in-place swap */
+ a = dst;
+ b = dst + itemsize - 1;
+ while (a < b) {
+ c = *a;
+ *a = *b;
+ *b = c;
+ ++a; --b;
+ }
+ dst += dst_stride;
+ src += src_stride;
+ --N;
+ }
+}
+
+static void
+_swap_pair_strided_to_strided(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp itemsize,
+ void *NPY_UNUSED(data))
+{
+ char *a, *b, c;
+ npy_intp itemsize_half = itemsize / 2;
+
+ while (N > 0) {
+ memcpy(dst, src, itemsize);
+ /* general in-place swap */
+ a = dst;
+ b = dst + itemsize_half - 1;
+ while (a < b) {
+ c = *a;
+ *a = *b;
+ *b = c;
+ ++a; --b;
+ }
+ /* general in-place swap */
+ a = dst + itemsize_half;
+ b = dst + 2*itemsize_half - 1;
+ while (a < b) {
+ c = *a;
+ *a = *b;
+ *b = c;
+ ++a; --b;
+ }
+ dst += dst_stride;
+ src += src_stride;
+ --N;
+ }
+}
+
+static void
+_strided_to_contig(char *dst, npy_intp NPY_UNUSED(dst_stride),
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp itemsize,
+ void *NPY_UNUSED(data))
+{
+ while (N > 0) {
+ memcpy(dst, src, itemsize);
+ dst += itemsize;
+ src += src_stride;
+ --N;
+ }
+}
+
+static void
+_contig_to_strided(char *dst, npy_intp dst_stride,
+ char *src, npy_intp NPY_UNUSED(src_stride),
+ npy_intp N, npy_intp itemsize,
+ void *NPY_UNUSED(data))
+{
+ while (N > 0) {
+ memcpy(dst, src, itemsize);
+ dst += dst_stride;
+ src += itemsize;
+ --N;
+ }
+}
+
+static void
+_contig_to_contig(char *dst, npy_intp NPY_UNUSED(dst_stride),
+ char *src, npy_intp NPY_UNUSED(src_stride),
+ npy_intp N, npy_intp itemsize,
+ void *NPY_UNUSED(data))
+{
+ memcpy(dst, src, itemsize*N);
+}
+
+
+NPY_NO_EXPORT PyArray_StridedTransferFn
+PyArray_GetStridedCopyFn(npy_intp aligned, npy_intp src_stride,
+ npy_intp dst_stride, npy_intp itemsize)
+{
+/*
+ * Skip the "unaligned" versions on CPUs which support unaligned
+ * memory accesses.
+ */
+#ifndef NPY_USE_UNALIGNED_ACCESS
+ if (aligned) {
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+ /* contiguous dst */
+ if (itemsize != 0 && dst_stride == itemsize) {
+ /* constant src */
+ if (src_stride == 0) {
+ switch (itemsize) {
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return
+ &_aligned_strided_to_contig_size@elsize@_srcstride0;
+/**end repeat**/
+ }
+ }
+ /* contiguous src */
+ else if (src_stride == itemsize) {
+ return &_contig_to_contig;
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_aligned_strided_to_contig_size@elsize@;
+/**end repeat**/
+ }
+ }
+
+ return &_strided_to_contig;
+ }
+ /* general dst */
+ else {
+ /* constant src */
+ if (src_stride == 0) {
+ switch (itemsize) {
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return
+ &_aligned_strided_to_strided_size@elsize@_srcstride0;
+/**end repeat**/
+ }
+ }
+ /* contiguous src */
+ else if (src_stride == itemsize) {
+ switch (itemsize) {
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_aligned_contig_to_strided_size@elsize@;
+/**end repeat**/
+ }
+
+ return &_contig_to_strided;
+ }
+ else {
+ switch (itemsize) {
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_aligned_strided_to_strided_size@elsize@;
+/**end repeat**/
+ }
+ }
+ }
+
+#ifndef NPY_USE_UNALIGNED_ACCESS
+ }
+ else {
+ /* contiguous dst */
+ if (itemsize != 0 && dst_stride == itemsize) {
+ /* contiguous src */
+ if (itemsize != 0 && src_stride == itemsize) {
+ return &_contig_to_contig;
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+ case 1:
+ return &_aligned_strided_to_contig_size1;
+/**begin repeat
+ * #elsize = 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_strided_to_contig_size@elsize@;
+/**end repeat**/
+ }
+ }
+
+ return &_strided_to_contig;
+ }
+ /* general dst */
+ else {
+ /* contiguous src */
+ if (itemsize != 0 && src_stride == itemsize) {
+ switch (itemsize) {
+ case 1:
+ return &_aligned_contig_to_strided_size1;
+/**begin repeat
+ * #elsize = 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_contig_to_strided_size@elsize@;
+/**end repeat**/
+ }
+
+ return &_contig_to_strided;
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+ case 1:
+ return &_aligned_strided_to_strided_size1;
+/**begin repeat
+ * #elsize = 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_strided_to_strided_size@elsize@;
+/**end repeat**/
+ }
+ }
+ }
+ }
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+ return &_strided_to_strided;
+}
+
+/*
+ * PyArray_GetStridedCopySwapFn and PyArray_GetStridedCopySwapPairFn are
+ * nearly identical, so a repeat block generates both of them.
+ */
+/**begin repeat
+ * #function = PyArray_GetStridedCopySwapFn, PyArray_GetStridedCopySwapPairFn#
+ * #tag = , _pair#
+ * #not_pair = 1, 0#
+ */
+
+NPY_NO_EXPORT PyArray_StridedTransferFn
+@function@(npy_intp aligned, npy_intp src_stride,
+ npy_intp dst_stride, npy_intp itemsize)
+{
+/*
+ * Skip the "unaligned" versions on CPUs which support unaligned
+ * memory accesses.
+ */
+#ifndef NPY_USE_UNALIGNED_ACCESS
+ if (aligned) {
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+ /* contiguous dst */
+ if (itemsize != 0 && dst_stride == itemsize) {
+ /* constant src */
+ if (src_stride == 0) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return
+ &_aligned_swap@tag@_strided_to_contig_size@elsize@_srcstride0;
+#endif
+/**end repeat1**/
+ }
+ }
+ /* contiguous src */
+ else if (src_stride == itemsize) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_aligned_swap@tag@_contig_to_contig_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_aligned_swap@tag@_strided_to_contig_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+ }
+ /* general dst */
+ else {
+ /* constant src */
+ if (src_stride == 0) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return
+ &_aligned_swap@tag@_strided_to_strided_size@elsize@_srcstride0;
+#endif
+/**end repeat1**/
+ }
+ }
+ /* contiguous src */
+ else if (src_stride == itemsize) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_aligned_swap@tag@_contig_to_strided_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+
+ return &_contig_to_strided;
+ }
+ else {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_aligned_swap@tag@_strided_to_strided_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+ }
+
+#ifndef NPY_USE_UNALIGNED_ACCESS
+ }
+ else {
+ /* contiguous dst */
+ if (itemsize != 0 && dst_stride == itemsize) {
+ /* contiguous src */
+ if (itemsize != 0 && src_stride == itemsize) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_swap@tag@_contig_to_contig_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_swap@tag@_strided_to_contig_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+
+ return &_strided_to_contig;
+ }
+ /* general dst */
+ else {
+ /* contiguous src */
+ if (itemsize != 0 && src_stride == itemsize) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_swap@tag@_contig_to_strided_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+
+ return &_contig_to_strided;
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_swap@tag@_strided_to_strided_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+ }
+ }
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+ return &_swap@tag@_strided_to_strided;
+}
+
+/**end repeat**/
+
+/* Does a simple aligned cast */
+typedef struct {
+ void *freefunc;
+ PyArray_VectorUnaryFunc *castfunc;
+} _strided_cast_data;
+
+static void
+_aligned_strided_to_strided_cast(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp itemsize,
+ void *data)
+{
+ PyArray_VectorUnaryFunc *castfunc = ((_strided_cast_data *)data)->castfunc;
+
+ while (N > 0) {
+ castfunc(src, dst, 1, NULL, NULL);
+ dst += dst_stride;
+ src += src_stride;
+ --N;
+ }
+}
+
+static void
+_aligned_contig_to_contig_cast(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp itemsize,
+ void *data)
+{
+ PyArray_VectorUnaryFunc *castfunc = ((_strided_cast_data *)data)->castfunc;
+
+ castfunc(src, dst, N, NULL, NULL);
+}
+
+
+
+NPY_NO_EXPORT int
+PyArray_GetTransferFunction(int aligned,
+ npy_intp src_stride, npy_intp dst_stride,
+ PyArray_Descr *from, PyArray_Descr *to,
+ PyArray_StridedTransferFn *outstransfer,
+ void **outtransferdata)
+{
+ /* First look at the possibilities of just a copy or swap */
+ if (from->elsize == to->elsize && from->type_num < NPY_OBJECT &&
+ to->type_num < NPY_OBJECT &&
+ from->kind == to->kind) {
+ /* This is a straight copy */
+ if (from->elsize == 1 || PyArray_ISNBO(from->byteorder) ==
+ PyArray_ISNBO(to->byteorder)) {
+ *outstransfer = PyArray_GetStridedCopyFn(aligned,
+ src_stride, dst_stride,
+ from->elsize);
+ *outtransferdata = NULL;
+ return (*outstransfer == NULL) ? NPY_FAIL : NPY_SUCCEED;
+ }
+ /* This is a straight copy + byte swap */
+ else if (!PyTypeNum_ISCOMPLEX(from->type_num)) {
+ *outstransfer = PyArray_GetStridedCopySwapFn(aligned,
+ src_stride, dst_stride,
+ from->elsize);
+ *outtransferdata = NULL;
+ return (*outstransfer == NULL) ? NPY_FAIL : NPY_SUCCEED;
+ }
+ /* This is a straight copy + element pair byte swap */
+ else {
+ *outstransfer = PyArray_GetStridedCopySwapPairFn(aligned,
+ src_stride, dst_stride,
+ from->elsize);
+ *outtransferdata = NULL;
+ return (*outstransfer == NULL) ? NPY_FAIL : NPY_SUCCEED;
+ }
+ }
+
+ /* TODO check for fields & subarrays */
+
+ /* Check whether a simple cast will suffice */
+ if (from->type_num < NPY_OBJECT && to->type_num < NPY_OBJECT &&
+ PyArray_ISNBO(from->byteorder) && PyArray_ISNBO(to->byteorder)) {
+ PyArray_VectorUnaryFunc *castfunc =
+ PyArray_GetCastFunc(from, to->type_num);
+ if (!castfunc) {
+ *outstransfer = NULL;
+ *outtransferdata = NULL;
+ return NPY_FAIL;
+ }
+
+ if (aligned) {
+ /* Allocate the data that describes the cast */
+ _strided_cast_data *data =
+ PyArray_malloc(sizeof(_strided_cast_data));
+ if (data == NULL) {
+ PyErr_NoMemory();
+ *outstransfer = NULL;
+ *outtransferdata = NULL;
+ return NPY_FAIL;
+ }
+ data->freefunc = (void*)&(PyArray_free);
+ data->castfunc = castfunc;
+
+ /* Choose the contiguous cast if we can */
+ if (src_stride == from->elsize && dst_stride == to->elsize) {
+ *outstransfer = _aligned_contig_to_contig_cast;
+ }
+ else {
+ *outstransfer = _aligned_strided_to_strided_cast;
+ }
+ *outtransferdata = data;
+
+ return NPY_SUCCEED;
+ }
+
+ /* TODO wrap the cast in an alignment operation */
+ }
+
+ /* TODO: write more complicated transfer code! */
+ *outstransfer = NULL;
+ *outtransferdata = NULL;
+ PyErr_SetString(PyExc_RuntimeError,
+ "General transfer function support has not been written yet");
+ return NPY_FAIL;
+}
+
+typedef void (*_npy_stridedtransfer_dealloc)(void *);
+NPY_NO_EXPORT void
+PyArray_FreeStridedTransferData(void *transferdata)
+{
+ if (transferdata != NULL) {
+ _npy_stridedtransfer_dealloc dealloc =
+ *((_npy_stridedtransfer_dealloc *)transferdata);
+ dealloc(transferdata);
+ }
+}
+
+
+
+NPY_NO_EXPORT npy_intp
+PyArray_TransferNDimToStrided(npy_intp ndim,
+ char *dst, npy_intp dst_stride,
+ char *src, npy_intp *src_strides, npy_intp src_strides_inc,
+ npy_intp *coords, npy_intp coords_inc,
+ npy_intp *shape, npy_intp shape_inc,
+ npy_intp count, npy_intp itemsize,
+ PyArray_StridedTransferFn stransfer,
+ void *data)
+{
+ npy_intp i, M, N, coord0, shape0, src_stride0, coord1, shape1, src_stride1;
+
+ /* Finish off dimension 0 */
+ coord0 = coords[0];
+ shape0 = shape[0];
+ src_stride0 = src_strides[0];
+ N = shape0 - coord0;
+ if (N >= count) {
+ stransfer(dst, dst_stride, src, src_stride0, count, itemsize, data);
+ return 0;
+ }
+ stransfer(dst, dst_stride, src, src_stride0, N, itemsize, data);
+ count -= N;
+
+ /* If it's 1-dimensional, there's no more to copy */
+ if (ndim == 1) {
+ return count;
+ }
+
+ /* Adjust the src and dst pointers */
+ coord1 = (coords + coords_inc)[0];
+ shape1 = (shape + shape_inc)[0];
+ src_stride1 = (src_strides + src_strides_inc)[0];
+ src = src - coord0*src_stride0 + src_stride1;
+ dst += N*dst_stride;
+
+ /* Finish off dimension 1 */
+ M = (shape1 - coord1 - 1);
+ N = shape0*M;
+ for (i = 0; i < M; ++i) {
+ if (shape0 >= count) {
+ stransfer(dst, dst_stride, src, src_stride0,
+ count, itemsize, data);
+ return 0;
+ }
+ else {
+ stransfer(dst, dst_stride, src, src_stride0,
+ shape0, itemsize, data);
+ }
+ count -= shape0;
+ src += src_stride1;
+ dst += shape0*dst_stride;
+ }
+
+ /* If it's 2-dimensional, there's no more to copy */
+ if (ndim == 2) {
+ return count;
+ }
+
+ /* General-case loop for everything else */
+ else {
+ /* Iteration structure for dimensions 2 and up */
+ struct {
+ npy_intp coord, shape, src_stride;
+ } it[NPY_MAXDIMS];
+
+ /* Copy the coordinates and shape */
+ coords += 2*coords_inc;
+ shape += 2*shape_inc;
+ src_strides += 2*src_strides_inc;
+ for (i = 0; i < ndim-2; ++i) {
+ it[i].coord = coords[0];
+ it[i].shape = shape[0];
+ it[i].src_stride = src_strides[0];
+ coords += coords_inc;
+ shape += shape_inc;
+ src_strides += src_strides_inc;
+ }
+
+ for (;;) {
+ /* Adjust the src pointer from the dimension 0 and 1 loop */
+ src = src - shape1*src_stride1;
+
+ /* Increment to the next coordinate */
+ for (i = 0; i < ndim-2; ++i) {
+ src += it[i].src_stride;
+ if (++it[i].coord >= it[i].shape) {
+ it[i].coord = 0;
+ src -= it[i].src_stride*it[i].shape;
+ }
+ else {
+ break;
+ }
+ }
+ /* If the last dimension rolled over, we're done */
+ if (i == ndim-2) {
+ return count;
+ }
+
+ /* A loop for dimensions 0 and 1 */
+ for (i = 0; i < shape1; ++i) {
+ if (shape0 >= count) {
+ stransfer(dst, dst_stride, src, src_stride0,
+ count, itemsize, data);
+ return 0;
+ }
+ else {
+ stransfer(dst, dst_stride, src, src_stride0,
+ shape0, itemsize, data);
+ }
+ count -= shape0;
+ src += src_stride1;
+ dst += shape0*dst_stride;
+ }
+ }
+ }
+}
+
+NPY_NO_EXPORT npy_intp
+PyArray_TransferStridedToNDim(npy_intp ndim,
+ char *dst, npy_intp *dst_strides, npy_intp dst_strides_inc,
+ char *src, npy_intp src_stride,
+ npy_intp *coords, npy_intp coords_inc,
+ npy_intp *shape, npy_intp shape_inc,
+ npy_intp count, npy_intp itemsize,
+ PyArray_StridedTransferFn stransfer,
+ void *data)
+{
+ npy_intp i, M, N, coord0, shape0, dst_stride0, coord1, shape1, dst_stride1;
+
+ /* Finish off dimension 0 */
+ coord0 = coords[0];
+ shape0 = shape[0];
+ dst_stride0 = dst_strides[0];
+ N = shape0 - coord0;
+ if (N >= count) {
+ stransfer(dst, dst_stride0, src, src_stride, count, itemsize, data);
+ return 0;
+ }
+ stransfer(dst, dst_stride0, src, src_stride, N, itemsize, data);
+ count -= N;
+
+ /* If it's 1-dimensional, there's no more to copy */
+ if (ndim == 1) {
+ return count;
+ }
+
+ /* Adjust the src and dst pointers */
+ coord1 = (coords + coords_inc)[0];
+ shape1 = (shape + shape_inc)[0];
+ dst_stride1 = (dst_strides + dst_strides_inc)[0];
+ dst = dst - coord0*dst_stride0 + dst_stride1;
+ src += N*src_stride;
+
+ /* Finish off dimension 1 */
+ M = (shape1 - coord1 - 1);
+ N = shape0*M;
+ for (i = 0; i < M; ++i) {
+ if (shape0 >= count) {
+ stransfer(dst, dst_stride0, src, src_stride,
+ count, itemsize, data);
+ return 0;
+ }
+ else {
+ stransfer(dst, dst_stride0, src, src_stride,
+ shape0, itemsize, data);
+ }
+ count -= shape0;
+ dst += dst_stride1;
+ src += shape0*src_stride;
+ }
+
+ /* If it's 2-dimensional, there's no more to copy */
+ if (ndim == 2) {
+ return count;
+ }
+
+ /* General-case loop for everything else */
+ else {
+ /* Iteration structure for dimensions 2 and up */
+ struct {
+ npy_intp coord, shape, dst_stride;
+ } it[NPY_MAXDIMS];
+
+ /* Copy the coordinates and shape */
+ coords += 2*coords_inc;
+ shape += 2*shape_inc;
+ dst_strides += 2*dst_strides_inc;
+ for (i = 0; i < ndim-2; ++i) {
+ it[i].coord = coords[0];
+ it[i].shape = shape[0];
+ it[i].dst_stride = dst_strides[0];
+ coords += coords_inc;
+ shape += shape_inc;
+ dst_strides += dst_strides_inc;
+ }
+
+ for (;;) {
+ /* Adjust the dst pointer from the dimension 0 and 1 loop */
+ dst = dst - shape1*dst_stride1;
+
+ /* Increment to the next coordinate */
+ for (i = 0; i < ndim-2; ++i) {
+ dst += it[i].dst_stride;
+ if (++it[i].coord >= it[i].shape) {
+ it[i].coord = 0;
+ dst -= it[i].dst_stride*it[i].shape;
+ }
+ else {
+ break;
+ }
+ }
+ /* If the last dimension rolled over, we're done */
+ if (i == ndim-2) {
+ return count;
+ }
+
+ /* A loop for dimensions 0 and 1 */
+ for (i = 0; i < shape1; ++i) {
+ if (shape0 >= count) {
+ stransfer(dst, dst_stride0, src, src_stride,
+ count, itemsize, data);
+ return 0;
+ }
+ else {
+ stransfer(dst, dst_stride0, src, src_stride,
+ shape0, itemsize, data);
+ }
+ count -= shape0;
+ dst += dst_stride1;
+ src += shape0*src_stride;
+ }
+ }
+ }
+}
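
Editor's note: a worked example (not part of the commit) of the byte-swap
macros defined at the top of the file above. _NPY_SWAP4 reverses all four
bytes of an element, while the _NPY_SWAP_PAIR variants reverse the bytes
within each half of the element independently, which is what complex types
need (the real and imaginary parts are swapped separately):

    npy_uint32 v = 0x11223344u;
    /* full 4-byte reversal, e.g. fixing the endianness of an int32 */
    assert(_NPY_SWAP4(v) == 0x44332211u);
    /* bytes reversed within each 2-byte half, halves stay in place */
    assert(_NPY_SWAP_PAIR4(v) == 0x22114433u);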
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.h b/numpy/core/src/multiarray/lowlevel_strided_loops.h
new file mode 100644
index 000000000..b3d9c9de0
--- /dev/null
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.h
@@ -0,0 +1,153 @@
+#ifndef __LOWLEVEL_STRIDED_LOOPS_H
+#define __LOWLEVEL_STRIDED_LOOPS_H
+
+/*
+ * This function pointer is for functions that transfer an arbitrarily
+ * strided input to an arbitrarily strided output. It may be a fully
+ * general function, or a specialized function when the strides or item
+ * size have special values.
+ *
+ * Examples of transfer functions are a straight copy, a byte-swap,
+ * and a casting operation.
+ *
+ * The 'transferdata' parameter is slightly special, and must always contain
+ * a pointer to a deallocation routine at its beginning. The function
+ * PyArray_FreeStridedTransferData should be used to deallocate such
+ * data, and it calls that deallocation routine.
+ */
+typedef void (*PyArray_StridedTransferFn)(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp itemsize,
+ void *transferdata);
+
+/*
+ * Deallocates the transfer data of a PyArray_StridedTransferFn. See
+ * the comment with the function pointer typedef for more details.
+ */
+NPY_NO_EXPORT void
+PyArray_FreeStridedTransferData(void *transferdata);
+
+/*
+ * Gives back a function pointer to a specialized function for copying
+ * strided memory. Returns NULL if there is a problem with the inputs.
+ *
+ * aligned:
+ * Should be 1 if the src and dst pointers are always aligned,
+ * 0 otherwise.
+ * src_stride:
+ * Should be the src stride if it will always be the same,
+ * NPY_MAX_INTP otherwise.
+ * dst_stride:
+ * Should be the dst stride if it will always be the same,
+ * NPY_MAX_INTP otherwise.
+ * itemsize:
+ * Should be the item size if it will always be the same, 0 otherwise.
+ *
+ */
+NPY_NO_EXPORT PyArray_StridedTransferFn
+PyArray_GetStridedCopyFn(npy_intp aligned, npy_intp src_stride,
+ npy_intp dst_stride, npy_intp itemsize);
+
+/*
+ * Gives back a function pointer to a specialized function for copying
+ * and swapping strided memory. This assumes each element is a single
+ * value to be swapped.
+ *
+ * Parameters are as for PyArray_GetStridedCopyFn.
+ */
+NPY_NO_EXPORT PyArray_StridedTransferFn
+PyArray_GetStridedCopySwapFn(npy_intp aligned, npy_intp src_stride,
+ npy_intp dst_stride, npy_intp itemsize);
+
+/*
+ * Gives back a function pointer to a specialized function for copying
+ * and swapping strided memory. This assumes each element is a pair
+ * of values, each of which needs to be swapped.
+ *
+ * Parameters are as for PyArray_GetStridedCopyFn.
+ */
+NPY_NO_EXPORT PyArray_StridedTransferFn
+PyArray_GetStridedCopySwapPairFn(npy_intp aligned, npy_intp src_stride,
+ npy_intp dst_stride, npy_intp itemsize);
+
+/*
+ * If possible, gives back a transfer function which casts and/or
+ * byte swaps data with the dtype 'from' into data with the dtype 'to'.
+ * If 'outtransferdata' is populated with a non-NULL value, it
+ * must be deallocated with PyArray_FreeStridedTransferData when the
+ * transfer function is no longer required.
+ *
+ * Returns NPY_SUCCEED or NPY_FAIL.
+ */
+NPY_NO_EXPORT int
+PyArray_GetTransferFunction(int aligned,
+ npy_intp src_stride, npy_intp dst_stride,
+ PyArray_Descr *from, PyArray_Descr *to,
+ PyArray_StridedTransferFn *outstransfer,
+ void **outtransferdata);
+
+/*
+ * These two functions copy or convert the data of an n-dimensional array
+ * to/from a 1-dimensional strided buffer. These functions will only call
+ * 'stransfer' with the provided dst_stride/src_stride and
+ * dst_strides[0]/src_strides[0], so the caller can use those values to
+ * specialize the function.
+ *
+ * The return value is the number of elements it couldn't copy. A return value
+ * of 0 means all elements were copied, a larger value means the end of
+ * the n-dimensional array was reached before 'count' elements were copied.
+ *
+ * ndim:
+ * The number of dimensions of the n-dimensional array.
+ * dst/src:
+ * The destination or source starting pointer.
+ * dst_stride/src_stride:
+ * The stride of the 1-dimensional strided buffer.
+ * dst_strides/src_strides:
+ * The strides of the n-dimensional array.
+ * dst_strides_inc/src_strides_inc:
+ * How much to add to the ..._strides pointer to get to the next stride.
+ * coords:
+ * The starting coordinates in the n-dimensional array.
+ * coords_inc:
+ * How much to add to the coords pointer to get to the next coordinate.
+ * shape:
+ * The shape of the n-dimensional array.
+ * shape_inc:
+ * How much to add to the shape pointer to get to the next shape entry.
+ * count:
+ * How many elements to transfer.
+ * itemsize:
+ * How big each element is. If transferring between elements of different
+ * sizes, for example in a casting operation, the 'stransfer' function
+ * should be specialized for that, in which case it will ignore
+ * this parameter.
+ * stransfer:
+ * The strided transfer function.
+ * transferdata:
+ * An auxiliary data pointer passed to the strided transfer function.
+ * If a non-NULL value is returned, it must be deallocated with the
+ * function PyArray_FreeStridedTransferData.
+ */
+NPY_NO_EXPORT npy_intp
+PyArray_TransferNDimToStrided(npy_intp ndim,
+ char *dst, npy_intp dst_stride,
+ char *src, npy_intp *src_strides, npy_intp src_strides_inc,
+ npy_intp *coords, npy_intp coords_inc,
+ npy_intp *shape, npy_intp shape_inc,
+ npy_intp count, npy_intp itemsize,
+ PyArray_StridedTransferFn stransfer,
+ void *transferdata);
+
+NPY_NO_EXPORT npy_intp
+PyArray_TransferStridedToNDim(npy_intp ndim,
+ char *dst, npy_intp *dst_strides, npy_intp dst_strides_inc,
+ char *src, npy_intp src_stride,
+ npy_intp *coords, npy_intp coords_inc,
+ npy_intp *shape, npy_intp shape_inc,
+ npy_intp count, npy_intp itemsize,
+ PyArray_StridedTransferFn stransfer,
+ void *transferdata);
+
+#endif
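
Editor's note: a sketch (not part of the commit) of a custom transfer
function and its transfer data, following the contract documented above:
the data must begin with a pointer to a deallocation routine so that
PyArray_FreeStridedTransferData can release it. The names and the scaling
payload are hypothetical.

    typedef struct {
        void (*freefunc)(void *);   /* must be the first member */
        double scale;               /* example payload: scale on copy */
    } example_scale_data;

    static void
    example_scale_transfer(char *dst, npy_intp dst_stride,
                           char *src, npy_intp src_stride,
                           npy_intp N, npy_intp NPY_UNUSED(itemsize),
                           void *transferdata)
    {
        double scale = ((example_scale_data *)transferdata)->scale;

        while (N > 0) {
            *(double *)dst = scale * *(double *)src;
            dst += dst_stride;
            src += src_stride;
            --N;
        }
    }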
diff --git a/numpy/core/src/multiarray/new_iterator.c.src b/numpy/core/src/multiarray/new_iterator.c.src
index 62c5929ca..f4a1d0b9a 100644
--- a/numpy/core/src/multiarray/new_iterator.c.src
+++ b/numpy/core/src/multiarray/new_iterator.c.src
@@ -6,6 +6,7 @@
#include <numpy/ndarrayobject.h>
#include "new_iterator.h"
+#include "lowlevel_strided_loops.h"
/* Adjust ptr to be the next 16-byte aligned position after adding 'amount' */
#define NPYITER_INC_BUFFERPTR(ptr, amount) \
@@ -34,21 +35,29 @@
#define NPY_ITFLAG_FORCEDORDER 0x020
/* The inner loop is handled outside the iterator */
#define NPY_ITFLAG_NOINNER 0x040
+/* The iterator is buffered */
+#define NPY_ITFLAG_BUFFER 0x080
+/* The inner loop size may grow when the iterator is buffered */
+#define NPY_ITFLAG_GROWINNER 0x100
/* Internal iterator per-operand iterator flags */
/* The operand will be written to */
-#define NPY_OP_ITFLAG_WRITE 0x001
+#define NPY_OP_ITFLAG_WRITE 0x01
/* The operand will be read from */
-#define NPY_OP_ITFLAG_READ 0x002
+#define NPY_OP_ITFLAG_READ 0x02
/* The operand may have a temporary copy made */
-#define NPY_OP_ITFLAG_COPY 0x004
+#define NPY_OP_ITFLAG_COPY 0x04
/* The operand needs casting */
-#define NPY_OP_ITFLAG_CAST 0x008
+#define NPY_OP_ITFLAG_CAST 0x08
/* The operand needs a byte swap and/or alignment operation */
-#define NPY_OP_ITFLAG_COPYSWAP 0x010
-/* The operand should not be allocated with a subtype */
-#define NPY_OP_ITFLAG_NOSUBTYPE 0x020
+#define NPY_OP_ITFLAG_COPYSWAP 0x10
+/* The operand never needs buffering */
+#define NPY_OP_ITFLAG_BUFNEVER 0x20
+/* The operand needs to be byte-swapped */
+#define NPY_OP_ITFLAG_BUFSWAP 0x40
+/* The operand is aligned */
+#define NPY_OP_ITFLAG_ALIGNED 0x80
/* Internal flag, for the type of operands */
#define NPY_ITER_OP_ARRAY 0
@@ -79,6 +88,8 @@
((NPY_SIZEOF_INTP)*(niter))
#define NIT_OPITFLAGS_SIZEOF(itflags, ndim, niter) \
(NPY_INTP_ALIGNED(niter))
+#define NIT_BUFFERDATA_SIZEOF(itflags, ndim, niter) \
+ ((itflags&NPY_ITFLAG_BUFFER) ? ((NPY_SIZEOF_INTP)*(3 + 7*niter)) : 0)
/* Byte offsets of the iterator members */
#define NIT_ITERSIZE_OFFSET() \
@@ -98,9 +109,12 @@
#define NIT_OPITFLAGS_OFFSET(itflags, ndim, niter) \
(NIT_OBJECTS_OFFSET(itflags, ndim, niter) + \
NIT_OBJECTS_SIZEOF(itflags, ndim, niter))
-#define NIT_AXISDATA_OFFSET(itflags, ndim, niter) \
+#define NIT_BUFFERDATA_OFFSET(itflags, ndim, niter) \
(NIT_OPITFLAGS_OFFSET(itflags, ndim, niter) + \
NIT_OPITFLAGS_SIZEOF(itflags, ndim, niter))
+#define NIT_AXISDATA_OFFSET(itflags, ndim, niter) \
+ (NIT_BUFFERDATA_OFFSET(itflags, ndim, niter) + \
+ NIT_BUFFERDATA_SIZEOF(itflags, ndim, niter))
/* Internal-only ITERATOR DATA MEMBER ACCESS */
#define NIT_ITFLAGS(iter) \
@@ -121,9 +135,27 @@
(char*)(iter) + NIT_OBJECTS_OFFSET(itflags, ndim, niter)))
#define NIT_OPITFLAGS(iter) ( \
(char*)(iter) + NIT_OPITFLAGS_OFFSET(itflags, ndim, niter))
+#define NIT_BUFFERDATA(iter) \
+ ((char*)(iter) + NIT_BUFFERDATA_OFFSET(itflags, ndim, niter))
#define NIT_AXISDATA(iter) \
((char*)(iter) + NIT_AXISDATA_OFFSET(itflags, ndim, niter))
+/* Internal-only BUFFERDATA MEMBER ACCESS */
+#define NBF_BUFFERSIZE(bufferdata) (*((npy_intp *)(bufferdata)))
+#define NBF_SIZE(bufferdata) (*((npy_intp *)(bufferdata) + 1))
+#define NBF_POS(bufferdata) (*((npy_intp *)(bufferdata) + 2))
+#define NBF_STRIDES(bufferdata) ((npy_intp *)(bufferdata) + 3)
+#define NBF_PTRS(bufferdata) ((char **)(bufferdata) + 3 + 1*(niter))
+#define NBF_READTRANSFERFN(bufferdata) \
+ ((PyArray_StridedTransferFn *)(bufferdata) + 3 + 2*(niter))
+#define NBF_READTRANSFERDATA(bufferdata) \
+ ((void **)(bufferdata) + 3 + 3*(niter))
+#define NBF_WRITETRANSFERFN(bufferdata) \
+ ((PyArray_StridedTransferFn *)(bufferdata) + 3 + 4*(niter))
+#define NBF_WRITETRANSFERDATA(bufferdata) \
+ ((void **)(bufferdata) + 3 + 5*(niter))
+#define NBF_BUFFERS(bufferdata) ((char **)(bufferdata) + 3 + 6*(niter))
+
/* Internal-only AXISDATA MEMBER ACCESS. */
#define NAD_SHAPE(axisdata) (*((npy_intp *)(axisdata)))
#define NAD_COORD(axisdata) (*((npy_intp *)(axisdata) + 1))
@@ -202,11 +234,20 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype,
npy_intp op_ndim, npy_intp *shape,
PyArray_Descr *op_dtype, npy_intp *op_axes);
+static int
+npyiter_allocate_buffers(NpyIter *iter);
+static int
+npyiter_jumpforward(NpyIter *iter, npy_intp count);
+static void
+npyiter_copy_from_buffers(NpyIter *iter);
+static void
+npyiter_copy_to_buffers(NpyIter *iter);
+
/* The constructor for an iterator over multiple objects */
NpyIter*
NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
npy_uint32 *op_flags, PyArray_Descr **op_request_dtypes,
- npy_intp oa_ndim, npy_intp **op_axes)
+ npy_intp oa_ndim, npy_intp **op_axes, npy_intp buffersize)
{
npy_uint32 itflags = NPY_ITFLAG_IDENTPERM;
npy_intp idim, ndim = 0;
@@ -215,10 +256,6 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
/* The iterator being constructed */
NpyIter *iter;
- /* For tracking buffer space at the end of the iterator memory */
- npy_intp bufferspace = 0;
- char *bufferptr = 0;
-
/* Per-operand values */
PyArrayObject *op[NPY_MAXARGS];
PyArray_Descr *op_dtype[NPY_MAXARGS];
@@ -228,6 +265,7 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
char **op_dataptr;
npy_intp *perm;
+ char *bufferdata = NULL;
char axes_dupcheck[NPY_MAXDIMS];
int any_allocate_if_null = 0, any_missing_dtypes = 0,
allocate_output_scalars = 0;
@@ -289,12 +327,12 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
}
}
- /* Checks the flags (C|F)_ORDER_INDEX, COORDS, and NO_INNER_ITERATION */
+ /* Checks the global iterator flags */
if (!pyiter_check_global_flags(flags, &itflags)) {
return NULL;
}
- /* If offsets were requested, make sure copying/buffering is disabled */
+ /* If offsets were requested, make sure copying is disabled */
if (itflags&NPY_ITFLAG_HASOFFSETS) {
for (iiter = 0; iiter < niter; ++iiter) {
if (op_flags[iiter]&(NPY_ITER_COPY|
@@ -307,6 +345,15 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
}
}
+ /*
+ * If buffering is enabled, and no buffersize was given, use a default
+ * chosen to be big enough to get some amortization benefits, but
+ * small enough to be cache-friendly.
+ */
+ if (itflags&NPY_ITFLAG_BUFFER && buffersize <= 0) {
+ buffersize = 256;
+ }
+
/* Prepare all the operands */
for (iiter = 0; iiter < niter; ++iiter) {
/*
@@ -386,18 +433,8 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
}
/* Allocate memory for the iterator */
- if (bufferspace > 0) {
- npy_intp itsize = NIT_SIZEOF_ITERATOR(itflags, ndim, niter);
- iter = (NpyIter*)
- PyArray_malloc(itsize + bufferspace);
- /* Initialize the buffer pointer to just after the iterator memory */
- bufferptr = (char*)iter;
- NPYITER_INC_BUFFERPTR(bufferptr, itsize);
- }
- else {
- iter = (NpyIter*)
- PyArray_malloc(NIT_SIZEOF_ITERATOR(itflags, ndim, niter));
- }
+ iter = (NpyIter*)
+ PyArray_malloc(NIT_SIZEOF_ITERATOR(itflags, ndim, niter));
/* Fill in the base data */
NIT_ITFLAGS(iter) = itflags;
@@ -453,12 +490,33 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
/* Set resetindex to zero as well (it's just after the resetdataptr) */
op_dataptr[niter] = 0;
+ /*
+ * Initialize buffer data (must set the buffers and transferdata
+ * to NULL before we might deallocate the iterator).
+ */
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ bufferdata = NIT_BUFFERDATA(iter);
+ memset(NBF_BUFFERS(bufferdata), 0, niter*NPY_SIZEOF_INTP);
+ for (iiter = 0; iiter < niter; ++iiter) {
+ NBF_READTRANSFERDATA(bufferdata)[iiter] = NULL;
+ NBF_WRITETRANSFERDATA(bufferdata)[iiter] = NULL;
+ }
+ }
+
/* Fill in the AXISDATA arrays and set the ITERSIZE field */
if (!npyiter_fill_axisdata(iter, op, op_ndim, op_dataptr, op_axes)) {
NpyIter_Deallocate(iter);
return NULL;
}
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ /* No point in a buffer bigger than the iteration size */
+ if (buffersize > NIT_ITERSIZE(iter)) {
+ buffersize = NIT_ITERSIZE(iter);
+ }
+ NBF_BUFFERSIZE(bufferdata) = buffersize;
+ }
+
/*
* If an index was requested, compute the strides for it.
* Note that we must do this before changing the order of the
@@ -574,7 +632,7 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
PyTypeObject *op_subtype;
/* Check whether the subtype was disabled */
- if (op_itflags[iiter]&NPY_OP_ITFLAG_NOSUBTYPE) {
+ if (op_flags[iiter]&NPY_ITER_NO_SUBTYPE) {
op_subtype = &PyArray_Type;
}
else {
@@ -600,20 +658,16 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
*/
npyiter_replace_axisdata(iter, iiter, op[iiter], op_ndim[iiter],
PyArray_DATA(op[iiter]), op_axes ? op_axes[iiter] : NULL);
+
+ /* Newly allocated arrays need no buffering */
+ op_itflags[iiter] |= NPY_OP_ITFLAG_BUFNEVER;
+ op_itflags[iiter] &= ~(NPY_OP_ITFLAG_COPYSWAP|NPY_OP_ITFLAG_CAST);
}
- else if (op_itflags[iiter]&
- (NPY_OP_ITFLAG_CAST|NPY_OP_ITFLAG_COPYSWAP)) {
+ else if ((op_itflags[iiter]&
+ (NPY_OP_ITFLAG_CAST|NPY_OP_ITFLAG_COPYSWAP)) &&
+ (op_itflags[iiter]&NPY_OP_ITFLAG_COPY)) {
PyArrayObject *temp;
- /* Copying or buffering must be enabled for casting/conversion */
- if (!(op_itflags[iiter]&NPY_OP_ITFLAG_COPY)) {
- PyErr_SetString(PyExc_TypeError,
- "Iterator input required copying or buffering, "
- "but neither copying nor buffering was enabled");
- NpyIter_Deallocate(iter);
- return NULL;
- }
-
/* Allocate the temporary array, if possible */
temp = npyiter_new_temp_array(iter, &PyArray_Type,
PyArray_NDIM(op[iiter]),
@@ -651,7 +705,44 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
*/
npyiter_replace_axisdata(iter, iiter, op[iiter], op_ndim[iiter],
PyArray_DATA(op[iiter]), op_axes ? op_axes[iiter] : NULL);
+
+ /* Copied arrays need no buffering */
+ op_itflags[iiter] |= NPY_OP_ITFLAG_BUFNEVER;
+ op_itflags[iiter] &= ~(NPY_OP_ITFLAG_COPYSWAP|NPY_OP_ITFLAG_CAST);
+ }
+ else {
+ /*
+ * Buffering must be enabled for casting/conversion if copy
+ * wasn't specified.
+ */
+ if (op_itflags[iiter]&
+ (NPY_OP_ITFLAG_CAST|NPY_OP_ITFLAG_COPYSWAP) &&
+ !(itflags&NPY_ITFLAG_BUFFER)) {
+ PyErr_SetString(PyExc_TypeError,
+ "Iterator input required copying or buffering, "
+ "but neither copying nor buffering was enabled");
+ NpyIter_Deallocate(iter);
+ return NULL;
+ }
+
+ /*
+ * If the operand is aligned, any buffering can use aligned
+ * optimizations.
+ */
+ if (PyArray_ISALIGNED(op[iiter])) {
+ op_itflags[iiter] |= NPY_OP_ITFLAG_ALIGNED;
+ }
+
+ /*
+ * If the operand has a different byte order than the
+ * desired data type, need a swap
+ */
+ if (PyArray_ISNBO(PyArray_DESCR(op[iiter])->byteorder) !=
+ PyArray_ISNBO(op_dtype[iiter]->byteorder)) {
+ op_itflags[iiter] |= NPY_OP_ITFLAG_BUFSWAP;
+ }
}
+
}
/*
@@ -665,7 +756,7 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
}
/* Now that the axes are finished, adjust ITERSIZE if necessary */
- if (itflags&NPY_ITFLAG_NOINNER) {
+ if ((itflags&NPY_ITFLAG_NOINNER) && !(itflags&NPY_ITFLAG_BUFFER)) {
char *axisdata = NIT_AXISDATA(iter);
NIT_ITERSIZE(iter) /= NAD_SHAPE(axisdata);
}
@@ -673,13 +764,29 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
/* Copy the per-op itflags into the iterator */
memcpy(NIT_OPITFLAGS(iter), op_itflags, niter);
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ /* Allocate the buffers */
+ if (!npyiter_allocate_buffers(iter)) {
+ NpyIter_Deallocate(iter);
+ return NULL;
+ }
+
+ /* BUFFERED + NOINNER may not have a predictable itersize */
+ if (itflags&NPY_ITFLAG_NOINNER) {
+ NIT_ITERSIZE(iter) = 0;
+ }
+
+ /* Prepare the next buffers and set pos/size */
+ npyiter_copy_to_buffers(iter);
+ }
+
return iter;
}
/* The constructor for an iterator over one object */
NpyIter*
NpyIter_New(PyArrayObject *op, npy_uint32 flags, PyArray_Descr* dtype,
- npy_intp a_ndim, npy_intp *axes)
+ npy_intp a_ndim, npy_intp *axes, npy_intp buffersize)
{
/* Split the flags into separate global and op flags */
npy_uint32 op_flags = flags&NPY_ITER_PER_OP_FLAGS;
@@ -687,22 +794,52 @@ NpyIter_New(PyArrayObject *op, npy_uint32 flags, PyArray_Descr* dtype,
if (a_ndim > 0) {
return NpyIter_MultiNew(1, &op, flags, &op_flags, &dtype,
- a_ndim, &axes);
+ a_ndim, &axes, buffersize);
}
else {
- return NpyIter_MultiNew(1, &op, flags, &op_flags, &dtype, 0, NULL);
+ return NpyIter_MultiNew(1, &op, flags, &op_flags, &dtype,
+ 0, NULL, buffersize);
}
}
int NpyIter_Deallocate(NpyIter *iter)
{
- /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
npy_intp ndim = NIT_NDIM(iter);
npy_intp iiter, niter = NIT_NITER(iter);
PyArray_Descr **dtype = NIT_DTYPES(iter);
PyArrayObject **object = NIT_OBJECTS(iter);
+ /* Deallocate any buffers and buffering data */
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ char *bufferdata = NIT_BUFFERDATA(iter);
+ char **buffers;
+ void **transferdata;
+
+ /* buffers */
+ buffers = NBF_BUFFERS(bufferdata);
+ for(iiter = 0; iiter < niter; ++iiter, ++buffers) {
+ if (*buffers) {
+ PyArray_free(*buffers);
+ }
+ }
+ /* read bufferdata */
+ transferdata = NBF_READTRANSFERDATA(bufferdata);
+ for(iiter = 0; iiter < niter; ++iiter, ++transferdata) {
+ if (*transferdata) {
+ PyArray_FreeStridedTransferData(*transferdata);
+ }
+ }
+ /* write bufferdata */
+ transferdata = NBF_WRITETRANSFERDATA(bufferdata);
+ for(iiter = 0; iiter < niter; ++iiter, ++transferdata) {
+ if (*transferdata) {
+ PyArray_FreeStridedTransferData(*transferdata);
+ }
+ }
+ }
+
/* Deallocate all the dtypes and objects that were iterated */
for(iiter = 0; iiter < niter; ++iiter, ++dtype, ++object) {
Py_XDECREF(*dtype);
@@ -739,8 +876,6 @@ int NpyIter_RemoveInnerLoop(NpyIter *iter)
npy_intp ndim = NIT_NDIM(iter);
npy_intp niter = NIT_NITER(iter);
- char *axisdata;
-
/* Check conditions under which this can be done */
if (itflags&(NPY_ITFLAG_HASINDEX|NPY_ITFLAG_HASCOORDS)) {
PyErr_SetString(PyExc_ValueError,
@@ -754,8 +889,15 @@ int NpyIter_RemoveInnerLoop(NpyIter *iter)
NIT_ITFLAGS(iter) = itflags;
/* Adjust ITERSIZE */
- axisdata = NIT_AXISDATA(iter);
- NIT_ITERSIZE(iter) /= NAD_SHAPE(axisdata);
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ /* BUFFERED + NOINNER may not have a predictable itersize */
+ NIT_ITERSIZE(iter) = 0;
+
+ }
+ else {
+ char *axisdata = NIT_AXISDATA(iter);
+ NIT_ITERSIZE(iter) /= NAD_SHAPE(axisdata);
+ }
}
/* Reset the iterator */
@@ -789,6 +931,11 @@ void NpyIter_Reset(NpyIter *iter)
ptrs[istrides] = resetdataptr[istrides];
}
}
+
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ /* Prepare the next buffers and set pos/size */
+ npyiter_copy_to_buffers(iter);
+ }
}
/*
@@ -818,6 +965,13 @@ int NpyIter_GotoCoords(NpyIter *iter, npy_intp *coords)
return NPY_FAIL;
}
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ PyErr_SetString(PyExc_ValueError,
+ "Cannot call GotoCoords on an iterator which "
+ "is buffered");
+ return NPY_FAIL;
+ }
+
perm = NIT_PERM(iter);
dataptr = NIT_RESETDATAPTR(iter);
axisdata = NIT_AXISDATA(iter);
@@ -897,6 +1051,13 @@ int NpyIter_GotoIndex(NpyIter *iter, npy_intp index)
return NPY_FAIL;
}
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ PyErr_SetString(PyExc_ValueError,
+ "Cannot call GotoIndex on an iterator which "
+ "is buffered");
+ return NPY_FAIL;
+ }
+
if (index < 0 || index >= NIT_ITERSIZE(iter)) {
PyErr_SetString(PyExc_IndexError,
"Iterator GotoIndex called with out-of-bounds "
@@ -947,14 +1108,15 @@ int NpyIter_GotoIndex(NpyIter *iter, npy_intp index)
return NPY_SUCCEED;
}
-/* SPECIALIZED iternext functions */
+/* SPECIALIZED iternext functions that handle the non-buffering part */
- /* The combination HASINDEX | NOINNER is excluded in the New functions */
/**begin repeat
* #const_itflags = 0,
* NPY_ITFLAG_HASINDEX,
- * NPY_ITFLAG_NOINNER#
- * #tag_itflags = 0, IND, NOINN#
+ * NPY_ITFLAG_NOINNER,
+ * NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_BUFFER#
+ * #tag_itflags = 0, IND, NOINN, BUF, INDuBUF#
*/
/**begin repeat1
* #const_ndim = 1, 2, 100#
@@ -965,7 +1127,7 @@ int NpyIter_GotoIndex(NpyIter *iter, npy_intp index)
* #tag_niter = 1, 2, ANY#
*/
-/* Specialized iternext (@const_itflags@,@const_ndim@,@const_niter@) */
+/* Specialized iternext (@const_itflags@,@tag_ndim@,@tag_niter@) */
NPY_NO_EXPORT int
npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_niter@(
NpyIter *iter)
@@ -1107,6 +1269,53 @@ npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_niter@(
/**end repeat1**/
/**end repeat**/
+/* iternext function that handles the buffering part */
+NPY_NO_EXPORT int
+npyiter_buffered_iternext(NpyIter *iter)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ npy_intp ndim = NIT_NDIM(iter);
+ npy_intp niter = NIT_NITER(iter);
+
+ char *bufferdata = NIT_BUFFERDATA(iter);
+
+ /*
+ * If the iterator handles the inner loop, need to increment all
+ * the coordinates and pointers
+ */
+ if (!(itflags&NPY_ITFLAG_NOINNER)) {
+ /* Increment within the buffer */
+ if (++NBF_POS(bufferdata) < NBF_SIZE(bufferdata)) {
+ npy_intp iiter, *strides;
+ char **ptrs;
+
+ strides = NBF_STRIDES(bufferdata);
+ ptrs = NBF_PTRS(bufferdata);
+ for (iiter = 0; iiter < niter; ++iiter) {
+ ptrs[iiter] += strides[iiter];
+ }
+ return 1;
+ }
+ }
+
+ /* Write back to the arrays */
+ npyiter_copy_from_buffers(iter);
+
+ /* Increment to the next buffer */
+ if (!npyiter_jumpforward(iter, NBF_SIZE(bufferdata))) {
+ return 0;
+ }
+
+ /* Prepare the next buffers and set pos/size */
+ npyiter_copy_to_buffers(iter);
+
+ return 1;
+}
+
+/**end repeat2**/
+/**end repeat1**/
+/**end repeat**/
+
/* Specialization of iternext for when the iteration size is 1 */
NPY_NO_EXPORT int
npyiter_iternext_sizeone(NpyIter *iter)
@@ -1130,11 +1339,18 @@ NpyIter_IterNext_Fn NpyIter_GetIterNext(NpyIter *iter)
}
/*
+ * If buffering is enabled, don't specialize further.
+ */
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ return &npyiter_buffered_iternext;
+ }
+
+ /*
* Ignore all the flags that don't affect the iterator memory
- * layout or the iternext function. Currently only HASINDEX
- * and NOINNER affect them.
+ * layout or the iternext function. Currently only HASINDEX,
+ * NOINNER, and BUFFER affect them.
*/
- itflags &= (NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NOINNER);
+ itflags &= (NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NOINNER|NPY_ITFLAG_BUFFER);
/* Switch statements let the compiler optimize this most effectively */
switch (itflags) {
@@ -1199,8 +1415,15 @@ NpyIter_IterNext_Fn NpyIter_GetIterNext(NpyIter *iter)
* NPY_ITFLAG_IDENTPERM,
* NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM,
* NPY_ITFLAG_NEGPERM,
- * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM#
- * #tag_itflags = 0, IND, IDP, INDuIDP, NEGP, INDuNEGP#
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM,
+ * NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER#
+ * #tag_itflags = 0, IND, IDP, INDuIDP, NEGP, INDuNEGP,
+ * BUF, INDuBUF, IDPuBUF, INDuIDPuBUF, NEGPuBUF, INDuNEGPuBUF#
*/
NPY_NO_EXPORT void
npyiter_getcoord_itflags@tag_itflags@(NpyIter *iter, npy_intp *outcoord)
@@ -1264,7 +1487,8 @@ NpyIter_GetCoords_Fn NpyIter_GetGetCoords(NpyIter *iter)
*/
itflags &= (NPY_ITFLAG_HASINDEX |
NPY_ITFLAG_IDENTPERM |
- NPY_ITFLAG_NEGPERM);
+ NPY_ITFLAG_NEGPERM |
+ NPY_ITFLAG_BUFFER);
switch (itflags) {
/**begin repeat
@@ -1273,8 +1497,15 @@ NpyIter_GetCoords_Fn NpyIter_GetGetCoords(NpyIter *iter)
* NPY_ITFLAG_IDENTPERM,
* NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM,
* NPY_ITFLAG_NEGPERM,
- * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM#
- * #tag_itflags = 0, IND, IDP, INDuIDP, NEGP, INDuNEGP#
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM,
+ * NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER#
+ * #tag_itflags = 0, IND, IDP, INDuIDP, NEGP, INDuNEGP,
+ * BUF, INDuBUF, IDPuBUF, INDuIDPuBUF, NEGPuBUF, INDuNEGPuBUF#
*/
case @const_itflags@:
return npyiter_getcoord_itflags@tag_itflags@;
@@ -1326,7 +1557,7 @@ npy_intp NpyIter_GetIterSize(NpyIter *iter)
int NpyIter_GetShape(NpyIter *iter, npy_intp *outshape)
{
- const npy_uint32 itflags = NIT_ITFLAGS(iter);
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
npy_intp ndim = NIT_NDIM(iter);
npy_intp niter = NIT_NITER(iter);
@@ -1363,9 +1594,16 @@ char **NpyIter_GetDataPtrArray(NpyIter *iter)
npy_intp ndim = NIT_NDIM(iter);
npy_intp niter = NIT_NITER(iter);
- char* axisdata = NIT_AXISDATA(iter);
+ char* data;
- return NAD_PTRS(axisdata);
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ data = NIT_BUFFERDATA(iter);
+ return NBF_PTRS(data);
+ }
+ else {
+ data = NIT_AXISDATA(iter);
+ return NAD_PTRS(data);
+ }
}
PyArray_Descr **NpyIter_GetDescrArray(NpyIter *iter)
@@ -1488,22 +1726,38 @@ void NpyIter_GetWriteFlags(NpyIter *iter, char *outwriteflags)
npy_intp *NpyIter_GetInnerStrideArray(NpyIter *iter)
{
- /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
npy_intp ndim = NIT_NDIM(iter);
npy_intp niter = NIT_NITER(iter);
- char* axisdata = NIT_AXISDATA(iter);
- return NAD_STRIDES(axisdata);
+ char* data;
+
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ data = NIT_BUFFERDATA(iter);
+ return NBF_STRIDES(data);
+ }
+ else {
+ data = NIT_AXISDATA(iter);
+ return NAD_STRIDES(data);
+ }
}
npy_intp* NpyIter_GetInnerLoopSizePtr(NpyIter *iter)
{
- /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
npy_intp ndim = NIT_NDIM(iter);
npy_intp niter = NIT_NITER(iter);
- char* axisdata = NIT_AXISDATA(iter);
- return &NAD_SHAPE(axisdata);
+ char* data;
+
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ data = NIT_BUFFERDATA(iter);
+ return &NBF_SIZE(data);
+ }
+ else {
+ data = NIT_AXISDATA(iter);
+ return &NAD_SHAPE(data);
+ }
}
/* Checks 'flags' for (C|F)_ORDER_INDEX, COORDS, and NO_INNER_ITERATION,
@@ -1560,6 +1814,13 @@ pyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags)
}
/* Check if offsets were requested instead of pointers */
if (flags&NPY_ITER_OFFSETS) {
+ /* Buffering is incompatible with offsets */
+ if (flags&(NPY_ITER_BUFFERED|NPY_ITER_BUFFERED_GROWINNER)) {
+ PyErr_SetString(PyExc_ValueError,
+ "Iterator flag BUFFERED cannot be used "
+ "with the flag OFFSETS");
+ return 0;
+ }
(*itflags) |= NPY_ITFLAG_HASOFFSETS;
}
/* Check if the caller wants to handle inner iteration */
@@ -1572,6 +1833,13 @@ pyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags)
}
(*itflags) |= NPY_ITFLAG_NOINNER;
}
+ /* Buffering */
+ if (flags&(NPY_ITER_BUFFERED|NPY_ITER_BUFFERED_GROWINNER)) {
+ (*itflags) |= NPY_ITFLAG_BUFFER;
+ if (flags&NPY_ITER_BUFFERED_GROWINNER) {
+ (*itflags) |= NPY_ITFLAG_GROWINNER;
+ }
+ }
return 1;
}
@@ -1717,11 +1985,7 @@ pyiter_prepare_operand(PyArrayObject **op, PyArray_Descr *op_request_dtype,
/* NULL operands must be automatically allocated outputs */
if (*op == NULL) {
/* ALLOCATE should be enabled */
- if (op_flags&NPY_ITER_ALLOCATE) {
- if (op_flags&NPY_ITER_NO_SUBTYPE) {
- *op_itflags |= NPY_OP_ITFLAG_NOSUBTYPE;
- }
- } else {
+ if (!(op_flags&NPY_ITER_ALLOCATE)) {
PyErr_SetString(PyExc_ValueError,
"Iterator input was NULL, but automatic allocation as an "
"output wasn't requested");
@@ -2944,6 +3208,380 @@ npyiter_promote_types(int type1, int type2)
return NPY_NOTYPE;
}
+static int
+npyiter_allocate_buffers(NpyIter *iter)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ npy_intp ndim = NIT_NDIM(iter);
+ npy_intp iiter, niter = NIT_NITER(iter);
+
+ char *op_itflags = NIT_OPITFLAGS(iter);
+ char *bufferdata = NIT_BUFFERDATA(iter), *axisdata = NIT_AXISDATA(iter);
+ PyArrayObject **op = NIT_OBJECTS(iter);
+ PyArray_Descr **op_dtype = NIT_DTYPES(iter);
+ npy_intp *strides = NAD_STRIDES(axisdata), op_stride;
+ npy_intp buffersize = NBF_BUFFERSIZE(bufferdata);
+ char *buffer;
+
+ PyArray_StridedTransferFn stransfer = NULL;
+ void *transferdata = NULL;
+
+ for (iiter = 0; iiter < niter; ++iiter) {
+ char flags = op_itflags[iiter];
+ op_stride = strides[iiter];
+
+ /*
+ * If we have determined that a buffer may be needed,
+ * allocate one.
+ */
+ if (!(flags&NPY_OP_ITFLAG_BUFNEVER)) {
+ npy_intp itemsize = op_dtype[iiter]->elsize;
+ buffer = PyArray_malloc(itemsize*buffersize);
+ if (buffer == NULL) {
+ PyErr_NoMemory();
+ return 0;
+ }
+ NBF_BUFFERS(bufferdata)[iiter] = buffer;
+
+ /* Also need to get an appropriate transfer functions */
+ if (flags&NPY_OP_ITFLAG_READ) {
+ if (PyArray_GetTransferFunction(
+ (flags&NPY_OP_ITFLAG_ALIGNED) != 0,
+ op_stride,
+ op_dtype[iiter]->elsize,
+ PyArray_DESCR(op[iiter]),
+ op_dtype[iiter],
+ &stransfer,
+ &transferdata) != NPY_SUCCEED) {
+ return 0;
+ }
+ NBF_READTRANSFERFN(bufferdata)[iiter] = stransfer;
+ NBF_READTRANSFERDATA(bufferdata)[iiter] = transferdata;
+ }
+ else {
+ NBF_READTRANSFERFN(bufferdata)[iiter] = NULL;
+ }
+ if (flags&NPY_OP_ITFLAG_WRITE) {
+ if (PyArray_GetTransferFunction(
+ (flags&NPY_OP_ITFLAG_ALIGNED) != 0,
+ op_dtype[iiter]->elsize,
+ op_stride,
+ op_dtype[iiter],
+ PyArray_DESCR(op[iiter]),
+ &stransfer,
+ &transferdata) != NPY_SUCCEED) {
+ return 0;
+ }
+ NBF_WRITETRANSFERFN(bufferdata)[iiter] = stransfer;
+ NBF_WRITETRANSFERDATA(bufferdata)[iiter] = transferdata;
+ }
+ else {
+ NBF_WRITETRANSFERFN(bufferdata)[iiter] = NULL;
+ }
+ }
+ }
+
+ return 1;
+}
+
+/*
+ * This function jumps the iterator forward by 'count' steps.
+ * It is used by the buffering code, and only updates the non-buffered
+ * data (the iterator's coordinates and array pointers).
+ *
+ * Returns 1 if it landed on a valid element, 0 if it went
+ * past the end.
+ */
+static int
+npyiter_jumpforward(NpyIter *iter, npy_intp count)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ npy_intp idim, ndim = NIT_NDIM(iter);
+ npy_intp niter = NIT_NITER(iter);
+
+ char *axisdata, **dataptr;
+ npy_intp sizeof_axisdata;
+ npy_intp delta, coord, shape;
+ npy_intp istrides, nstrides;
+
+ sizeof_axisdata = NIT_SIZEOF_AXISDATA(itflags, ndim, niter);
+ axisdata = NIT_AXISDATA(iter);
+
+ /* Go forward through axisdata, incrementing the coordinates */
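+    /*
+     * This is mixed-radix addition, with axisdata[0] as the fastest-
+     * varying digit. Worked example (illustrative numbers only):
+     * shapes (3, 4), coords (1, 0), count == 5. Dim 0: delta = 5%3 = 2,
+     * coord = 3 wraps to 0, count = (5-2+3)/3 = 2. Dim 1: delta = 2,
+     * coord = 2, count = 0. The final coords (0, 2) match linear
+     * index 1 + 5 = 6 = 0 + 2*3.
+     */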
+ for (idim = 0; idim < ndim; ++idim, axisdata += sizeof_axisdata) {
+ shape = NAD_SHAPE(axisdata);
+ delta = count % shape;
+ coord = NAD_COORD(axisdata) + delta;
+ count -= delta;
+ if (coord >= shape) {
+ coord -= shape;
+ count += shape;
+ }
+ NAD_COORD(axisdata) = coord;
+ if (count == 0) {
+ break;
+ }
+ count /= shape;
+ }
+
+ /* If it incremented past the end, set to the last coordinate */
+ if (count > 0) {
+ axisdata = NIT_AXISDATA(iter);
+ for (idim = 0; idim < ndim; ++idim, axisdata += sizeof_axisdata) {
+ NAD_COORD(axisdata) = NAD_SHAPE(axisdata)-1;
+ }
+ return 0;
+ }
+
+ /* Go backwards through axisdata, applying the new coordinates */
+ dataptr = NIT_RESETDATAPTR(iter);
+ nstrides = NAD_NSTRIDES();
+
+ /*
+ * Set the pointers, from the slowest-changing to the
+ * fastest-changing. The successive pointers accumulate
+ * the offsets, starting from the original data pointers.
+ */
+ axisdata = NIT_AXISDATA(iter) + (ndim-1)*sizeof_axisdata;
+ for (idim = 0; idim < ndim; ++idim, axisdata -= sizeof_axisdata) {
+ npy_intp *strides;
+ char **ptrs;
+
+ coord = NAD_COORD(axisdata);
+ strides = NAD_STRIDES(axisdata);
+ ptrs = NAD_PTRS(axisdata);
+
+ for (istrides = 0; istrides < nstrides; ++istrides) {
+ ptrs[istrides] = dataptr[istrides] + coord*strides[istrides];
+ }
+
+ dataptr = ptrs;
+ }
+
+ return 1;
+}
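+
+/*
+ * Note on expected usage (an assumption; the call sites live in the
+ * buffered iternext paths): when a buffer is exhausted, the cycle is
+ * roughly npyiter_copy_from_buffers() to write back any written data,
+ * then npyiter_jumpforward() by the transfer size to advance the
+ * coordinates, then npyiter_copy_to_buffers() to refill.
+ */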
+
+/*
+ * This checks how much space is left before we reach the end of
+ * the iterator, and whether the whole transfer can be done with a
+ * single stride.
+ */
+static npy_intp
+npyiter_checkspaceleft(NpyIter *iter, npy_intp count,
+ int *out_is_onestride)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ npy_intp idim, ndim = NIT_NDIM(iter);
+ npy_intp niter = NIT_NITER(iter);
+
+ char *axisdata;
+ npy_intp sizeof_axisdata;
+ npy_intp coord, shape;
+ npy_intp spaceleft = 1, factor = 1;
+
+ sizeof_axisdata = NIT_SIZEOF_AXISDATA(itflags, ndim, niter);
+ axisdata = NIT_AXISDATA(iter);
+
+ /* Go forward through axisdata, calculating the space left */
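+    /*
+     * Worked example (illustrative numbers only): with shapes (3, 4)
+     * and coords (1, 2), running over both dims gives
+     * spaceleft = 1 + (3-1-1)*1 + (4-2-1)*3 = 5, which is the 12
+     * total elements minus the 7 preceding the current position.
+     */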
+ for (idim = 0; idim < ndim && spaceleft < count;
+ ++idim, axisdata += sizeof_axisdata) {
+ shape = NAD_SHAPE(axisdata);
+ coord = NAD_COORD(axisdata);
+ spaceleft += (shape-coord-1) * factor;
+ factor *= shape;
+ }
+
+ /* If we broke out after dimension 0, it works with one stride */
+ if (idim == 1) {
+ *out_is_onestride = 1;
+ }
+ else {
+ *out_is_onestride = 0;
+ }
+
+ /* Return either count or how much space is left */
+ if (spaceleft < count) {
+ return spaceleft;
+ }
+ else {
+ return count;
+ }
+}
+
+/*
+ * This gets called after the buffers have been exhausted, and
+ * their data needs to be written back to the arrays. The coordinates
+ * must be positioned for the beginning of the buffer.
+ */
+static void
+npyiter_copy_from_buffers(NpyIter *iter)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ npy_intp ndim = NIT_NDIM(iter);
+ npy_intp iiter, niter = NIT_NITER(iter);
+
+ char *op_itflags = NIT_OPITFLAGS(iter);
+ char *bufferdata = NIT_BUFFERDATA(iter),
+ *axisdata = NIT_AXISDATA(iter);
+
+ PyArray_Descr **dtypes = NIT_DTYPES(iter);
+ npy_intp transfersize = NBF_SIZE(bufferdata);
+ npy_intp *strides = NBF_STRIDES(bufferdata),
+ *ad_strides = NAD_STRIDES(axisdata);
+ char **ptrs = NBF_PTRS(bufferdata), **ad_ptrs = NAD_PTRS(axisdata);
+ char **buffers = NBF_BUFFERS(bufferdata);
+
+ PyArray_StridedTransferFn stransfer = NULL;
+ void *transferdata = NULL;
+
+ npy_intp axisdata_incr = NIT_SIZEOF_AXISDATA(itflags, ndim, niter) /
+ NPY_SIZEOF_INTP;
+
+ for (iiter = 0; iiter < niter; ++iiter) {
+ stransfer = NBF_WRITETRANSFERFN(bufferdata)[iiter];
+ transferdata = NBF_WRITETRANSFERDATA(bufferdata)[iiter];
+ if ((stransfer != NULL) && (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE)) {
+            /*
+             * Copy back only if the pointer was pointing into the buffer;
+             * when buffering was skipped for this operand, it points
+             * into the original array instead.
+             */
+ npy_intp delta = (ptrs[iiter] - buffers[iiter]);
+ if (0 <= delta && delta <= transfersize*dtypes[iiter]->elsize) {
+ /*printf("transfer %p <- %p\n", ad_ptrs[iiter], buffers[iiter]);*/
+ PyArray_TransferStridedToNDim(ndim,
+ ad_ptrs[iiter], &ad_strides[iiter], axisdata_incr,
+ buffers[iiter], strides[iiter],
+ &NAD_COORD(axisdata), axisdata_incr,
+ &NAD_SHAPE(axisdata), axisdata_incr,
+ transfersize, dtypes[iiter]->elsize,
+ stransfer,
+ transferdata);
+ }
+ }
+ }
+}
+
+/*
+ * This gets called after the iterator has been positioned at the
+ * coordinates for the start of a buffer. It decides which operands
+ * need buffering, and copies the data into the buffers.
+ */
+static void
+npyiter_copy_to_buffers(NpyIter *iter)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ npy_intp ndim = NIT_NDIM(iter);
+ npy_intp iiter, niter = NIT_NITER(iter);
+
+ char *op_itflags = NIT_OPITFLAGS(iter);
+ char *bufferdata = NIT_BUFFERDATA(iter),
+ *axisdata = NIT_AXISDATA(iter);
+
+ PyArray_Descr **dtypes = NIT_DTYPES(iter);
+ npy_intp *strides = NBF_STRIDES(bufferdata),
+ *ad_strides = NAD_STRIDES(axisdata);
+ char **ptrs = NBF_PTRS(bufferdata), **ad_ptrs = NAD_PTRS(axisdata);
+ char **buffers = NBF_BUFFERS(bufferdata);
+ npy_intp buffersize = NBF_BUFFERSIZE(bufferdata), transfersize;
+ int is_onestride = 0, any_buffered = 0;
+
+ PyArray_StridedTransferFn stransfer = NULL;
+ void *transferdata = NULL;
+
+ npy_intp axisdata_incr = NIT_SIZEOF_AXISDATA(itflags, ndim, niter) /
+ NPY_SIZEOF_INTP;
+
+ transfersize = npyiter_checkspaceleft(iter, buffersize, &is_onestride);
+ NBF_SIZE(bufferdata) = transfersize;
+ NBF_POS(bufferdata) = 0;
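+
+    /*
+     * The switch below either points an operand at the array data
+     * directly (BUFNEVER, or a view that is already effectively
+     * single-strided and needs no fixup), or at its buffer, in which
+     * case the read transfer function fills the buffer, casting and
+     * byte-swapping as configured in npyiter_allocate_buffers.
+     */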
+
+ for (iiter = 0; iiter < niter; ++iiter) {
+ /*
+         * If the operand is write-only, these two are NULL, and the
+         * buffer pointers will be set up, but the read copy won't be done.
+ */
+ stransfer = NBF_READTRANSFERFN(bufferdata)[iiter];
+ transferdata = NBF_READTRANSFERDATA(bufferdata)[iiter];
+ switch (op_itflags[iiter]&
+ (NPY_OP_ITFLAG_BUFNEVER|
+ NPY_OP_ITFLAG_BUFSWAP|
+ NPY_OP_ITFLAG_ALIGNED|
+ NPY_OP_ITFLAG_CAST)) {
+ /* never need to buffer this operand */
+ case NPY_OP_ITFLAG_BUFNEVER:
+ ptrs[iiter] = ad_ptrs[iiter];
+ strides[iiter] = ad_strides[iiter];
+ stransfer = NULL;
+ break;
+            /* aligned copy */
+ case NPY_OP_ITFLAG_ALIGNED:
+ /*
+                 * All we're doing is copying the data to make it work
+                 * with one stride, so if it already does, there's no
+                 * need to copy.
+ */
+ if (is_onestride) {
+ ptrs[iiter] = ad_ptrs[iiter];
+ strides[iiter] = ad_strides[iiter];
+ stransfer = NULL;
+ }
+ else {
+ /* In this case, the buffer is being used */
+ ptrs[iiter] = buffers[iiter];
+ strides[iiter] = dtypes[iiter]->elsize;
+ }
+ break;
+ /* unaligned copy */
+ case 0:
+ /*
+                 * If an aligned copy wasn't requested (seen by the
+                 * COPYSWAP flag not being set), all we're doing is
+                 * copying the data to make it work with one stride,
+                 * so if it already does, there's no need to copy.
+ */
+ if (is_onestride &&
+ !(op_itflags[iiter]&NPY_OP_ITFLAG_COPYSWAP)) {
+ ptrs[iiter] = ad_ptrs[iiter];
+ strides[iiter] = ad_strides[iiter];
+ stransfer = NULL;
+ }
+ else {
+ /* In this case, the buffer is being used */
+ ptrs[iiter] = buffers[iiter];
+ strides[iiter] = dtypes[iiter]->elsize;
+ }
+ break;
+ default:
+ /* In this case, the buffer is being used */
+ ptrs[iiter] = buffers[iiter];
+ strides[iiter] = dtypes[iiter]->elsize;
+ break;
+ }
+
+ if (stransfer != NULL) {
+ /*printf("transfer %p -> %p\n", ad_ptrs[iiter], ptrs[iiter]);*/
+ any_buffered = 1;
+ PyArray_TransferNDimToStrided(ndim,
+ ptrs[iiter], strides[iiter],
+ ad_ptrs[iiter], &ad_strides[iiter], axisdata_incr,
+ &NAD_COORD(axisdata), axisdata_incr,
+ &NAD_SHAPE(axisdata), axisdata_incr,
+ transfersize, dtypes[iiter]->elsize,
+ stransfer,
+ transferdata);
+ }
+
+ }
+
+ /*
+     * If no buffering was needed, we can grow the inner
+     * loop to be as large as possible.
+ */
+ if (!any_buffered && (itflags&NPY_ITFLAG_GROWINNER)) {
+ npy_intp maxsize = NAD_SHAPE(axisdata)-NAD_COORD(axisdata);
+ if (maxsize > transfersize) {
+ NBF_SIZE(bufferdata) = maxsize;
+ }
+ }
+}
+
/* For debugging */
NPY_NO_EXPORT void
NpyIter_DebugPrint(NpyIter *iter)
@@ -2972,6 +3610,10 @@ NpyIter_DebugPrint(NpyIter *iter)
printf("FORCEDORDER ");
if (itflags&NPY_ITFLAG_NOINNER)
printf("NOINNER ");
+ if (itflags&NPY_ITFLAG_BUFFER)
+ printf("BUFFER ");
+ if (itflags&NPY_ITFLAG_GROWINNER)
+ printf("GROWINNER ");
printf("\n");
printf("NDim: %d\n", (int)ndim);
printf("NIter: %d\n", (int)niter);
@@ -3018,22 +3660,63 @@ NpyIter_DebugPrint(NpyIter *iter)
printf("OpItFlags:\n");
for (iiter = 0; iiter < niter; ++iiter) {
printf(" Flags[%d]: ", (int)iiter);
- if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_WRITE)
- printf("WRITE ");
if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_READ)
printf("READ ");
+ if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_WRITE)
+ printf("WRITE ");
if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_COPY)
printf("COPY ");
if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_CAST)
printf("CAST ");
if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_COPYSWAP)
printf("COPYSWAP ");
- if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_NOSUBTYPE)
- printf("NOSUBTYPE ");
+ if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_BUFNEVER)
+ printf("BUFNEVER ");
+ if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_BUFSWAP)
+ printf("BUFSWAP ");
+ if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_ALIGNED)
+ printf("ALIGNED ");
printf("\n");
}
printf("\n");
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ char *bufferdata = NIT_BUFFERDATA(iter);
+ printf("BufferData:\n");
+ printf(" BufferSize: %d\n", (int)NBF_BUFFERSIZE(bufferdata));
+ printf(" Size: %d\n", (int)NBF_SIZE(bufferdata));
+ printf(" Pos: %d\n", (int)NBF_POS(bufferdata));
+ printf(" Strides: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%d ", (int)NBF_STRIDES(bufferdata)[iiter]);
+ printf("\n");
+ printf(" Ptrs: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%p ", NBF_PTRS(bufferdata)[iiter]);
+ printf("\n");
+ printf(" ReadTransferFn: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%p ", NBF_READTRANSFERFN(bufferdata)[iiter]);
+ printf("\n");
+ printf(" ReadTransferData: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%p ", NBF_READTRANSFERDATA(bufferdata)[iiter]);
+ printf("\n");
+ printf(" WriteTransferFn: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%p ", NBF_WRITETRANSFERFN(bufferdata)[iiter]);
+ printf("\n");
+ printf(" WriteTransferData: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%p ", NBF_WRITETRANSFERDATA(bufferdata)[iiter]);
+ printf("\n");
+ printf(" Buffers: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%p ", NBF_BUFFERS(bufferdata)[iiter]);
+ printf("\n");
+ printf("\n");
+ }
+
axisdata = NIT_AXISDATA(iter);
sizeof_axisdata = NIT_SIZEOF_AXISDATA(itflags, ndim, niter);
for (idim = 0; idim < ndim; ++idim, axisdata += sizeof_axisdata) {
diff --git a/numpy/core/src/multiarray/new_iterator.h b/numpy/core/src/multiarray/new_iterator.h
index 63351471a..905642d79 100644
--- a/numpy/core/src/multiarray/new_iterator.h
+++ b/numpy/core/src/multiarray/new_iterator.h
@@ -16,13 +16,13 @@ typedef void (*NpyIter_GetCoords_Fn )(NpyIter *iter,
/* Allocate a new iterator over one array object */
NpyIter*
NpyIter_New(PyArrayObject* op, npy_uint32 flags, PyArray_Descr* dtype,
- npy_intp a_ndim, npy_intp *axes);
+ npy_intp a_ndim, npy_intp *axes, npy_intp buffersize);
/* Allocate a new iterator over multiple array objects */
NpyIter*
NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
npy_uint32 *op_flags, PyArray_Descr **op_request_dtypes,
- npy_intp oa_ndim, npy_intp **op_axes);
+ npy_intp oa_ndim, npy_intp **op_axes, npy_intp buffersize);
/* Removes coords support from an iterator */
int NpyIter_RemoveCoords(NpyIter *iter);
@@ -107,6 +107,10 @@ NPY_NO_EXPORT void NpyIter_DebugPrint(NpyIter *iter);
#define NPY_ITER_COMMON_DATA_TYPE 0x00000080
/* Produce offsets instead of pointers into the data */
#define NPY_ITER_OFFSETS 0x00000100
+/* Enables buffering */
+#define NPY_ITER_BUFFERED 0x00000200
+/* Enables buffering, and grows the inner loop when possible */
+#define NPY_ITER_BUFFERED_GROWINNER 0x00000400
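+/*
+ * Illustrative use of the buffering flags (an assumption added for
+ * clarity, not part of this patch; buffersize == 0 is taken to request
+ * a default size, matching the Python wrapper's default):
+ *
+ *     iter = NpyIter_MultiNew(2, op, NPY_ITER_BUFFERED,
+ *                             op_flags, NULL, 0, NULL, 0);
+ */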
/*** Per-operand flags that may be passed to the iterator constructors ***/
diff --git a/numpy/core/src/multiarray/new_iterator_pywrap.c b/numpy/core/src/multiarray/new_iterator_pywrap.c
index 3ec9008fb..0695d7f4c 100644
--- a/numpy/core/src/multiarray/new_iterator_pywrap.c
+++ b/numpy/core/src/multiarray/new_iterator_pywrap.c
@@ -79,12 +79,12 @@ static int
npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds)
{
static char *kwlist[] = {"op", "flags", "op_flags", "op_dtypes",
- "op_axes"};
+                        "op_axes", "buffersize", NULL};
PyObject *op_in, *flags_in = NULL, *op_flags_in = NULL,
*op_dtypes_in = NULL, *op_axes_in = NULL;
- npy_intp iiter;
+    npy_intp iiter;
+    int buffersize = 0; /* a C int, matching the "i" format below */
npy_intp niter = 0;
PyArrayObject *op[NPY_MAXARGS];
@@ -101,9 +101,9 @@ npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds)
return -1;
}
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OOOO", kwlist,
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OOOOi", kwlist,
&op_in, &flags_in, &op_flags_in, &op_dtypes_in,
- &op_axes_in)) {
+ &op_axes_in, &buffersize)) {
return -1;
}
@@ -170,6 +170,14 @@ npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds)
}
/* Use switch statements to quickly isolate the right flag */
switch (str[0]) {
+ case 'b':
+ if (strcmp(str, "buffered") == 0) {
+ flag = NPY_ITER_BUFFERED;
+ }
+ else if (strcmp(str, "buffered_growinner") == 0) {
+ flag = NPY_ITER_BUFFERED_GROWINNER;
+ }
+ break;
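+        /*
+         * The flags above arrive from Python as strings, e.g.
+         * (illustrative; the name the wrapper is exposed under is
+         * assumed): it = newiter(a, flags=['buffered'], buffersize=256)
+         */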
case 'c':
if (length >= 6) switch (str[5]) {
case 'e':
@@ -531,7 +539,8 @@ npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds)
self->iter = NpyIter_MultiNew(niter, op, flags, op_flags,
(PyArray_Descr**)op_request_dtypes,
- oa_ndim, oa_ndim > 0 ? op_axes : NULL);
+ oa_ndim, oa_ndim > 0 ? op_axes : NULL,
+ buffersize);
if (self->iter == NULL) {
goto fail;