Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/setup.py                                      |    7
-rw-r--r--  numpy/core/src/multiarray/lowlevel_strided_loops.c.src   | 1024
-rw-r--r--  numpy/core/src/multiarray/lowlevel_strided_loops.h       |  153
-rw-r--r--  numpy/core/src/multiarray/new_iterator.c.src             |  841
-rw-r--r--  numpy/core/src/multiarray/new_iterator.h                 |    8
-rw-r--r--  numpy/core/src/multiarray/new_iterator_pywrap.c          |   19
6 files changed, 1964 insertions, 88 deletions
diff --git a/numpy/core/setup.py b/numpy/core/setup.py index acf2726a7..442bcee57 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -601,7 +601,8 @@ def configuration(parent_package='',top_path=None): subpath = join('src', 'multiarray') sources = [join(local_dir, subpath, 'scalartypes.c.src'), join(local_dir, subpath, 'arraytypes.c.src'), - join(local_dir, subpath, 'new_iterator.c.src')] + join(local_dir, subpath, 'new_iterator.c.src'), + join(local_dir, subpath, 'lowlevel_strided_loops.c.src')] # numpy.distutils generate .c from .c.src in weird directories, we have # to add them there as they depend on the build_dir @@ -737,7 +738,8 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'shape.h'), join('src', 'multiarray', 'ucsnarrow.h'), join('src', 'multiarray', 'usertypes.h'), - join('src', 'multiarray', 'new_iterator.h')] + join('src', 'multiarray', 'new_iterator.h'), + join('src', 'multiarray', 'lowlevel_strided_loops.h')] multiarray_src = [join('src', 'multiarray', 'multiarraymodule.c'), join('src', 'multiarray', 'hashdescr.c'), @@ -768,6 +770,7 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'arraytypes.c.src'), join('src', 'multiarray', 'scalartypes.c.src'), join('src', 'multiarray', 'new_iterator.c.src'), + join('src', 'multiarray', 'lowlevel_strided_loops.c.src'), join('src', 'multiarray', 'new_iterator_pywrap.c')] if PYTHON_HAS_UNICODE_WIDE: diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src new file mode 100644 index 000000000..b51f1e680 --- /dev/null +++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src @@ -0,0 +1,1024 @@ +/* + * This file contains low-level loops for copying and byte-swapping + * strided data. + */ + +#define PY_SSIZE_T_CLEAN +#include "Python.h" +#include "structmember.h" + +#define _MULTIARRAYMODULE +#include <numpy/ndarrayobject.h> +#include <numpy/ufuncobject.h> +#include <numpy/npy_cpu.h> + +#include "lowlevel_strided_loops.h" + +#if ! 
(defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)) +//# define NPY_USE_UNALIGNED_ACCESS +#endif + +#define _NPY_NOP1(x) (x) +#define _NPY_NOP2(x) (x) +#define _NPY_NOP4(x) (x) +#define _NPY_NOP8(x) (x) + +#define _NPY_SWAP2(x) (((((npy_uint16)x)&0xffu) << 8) | \ + (((npy_uint16)x) >> 8)) + +#define _NPY_SWAP4(x) (((((npy_uint32)x)&0xffu) << 24) | \ + ((((npy_uint32)x)&0xff00u) << 8) | \ + ((((npy_uint32)x)&0xff0000u) >> 8) | \ + (((npy_uint32)x) >> 24)) + +#define _NPY_SWAP_PAIR4(x) (((((npy_uint32)x)&0xffu) << 8) | \ + ((((npy_uint32)x)&0xff00u) >> 8) | \ + ((((npy_uint32)x)&0xff0000u) << 8) | \ + (((npy_uint32)x) >> 8)) + +#define _NPY_SWAP8(x) (((((npy_uint64)x)&0xffu) << 56) | \ + ((((npy_uint64)x)&0xff00u) << 40) | \ + ((((npy_uint64)x)&0xff0000u) << 24) | \ + ((((npy_uint64)x)&0xff000000u) << 8) | \ + ((((npy_uint64)x)&0xff00000000u) >> 8) | \ + ((((npy_uint64)x)&0xff0000000000u) >> 24) | \ + ((((npy_uint64)x)&0xff000000000000u) >> 40) | \ + (((npy_uint64)x) >> 56)) + +#define _NPY_SWAP_PAIR8(x) (((((npy_uint64)x)&0xffu) << 24) | \ + ((((npy_uint64)x)&0xff00u) << 8) | \ + ((((npy_uint64)x)&0xff0000u) >> 8) | \ + ((((npy_uint64)x)&0xff000000u) >> 24) | \ + ((((npy_uint64)x)&0xff00000000u) << 24) | \ + ((((npy_uint64)x)&0xff0000000000u) << 8) | \ + ((((npy_uint64)x)&0xff000000000000u) >> 8) | \ + (((npy_uint64)x) >> 24)) + +#define _NPY_SWAP_INPLACE2(x) { \ + char a = (x)[0]; (x)[0] = (x)[1]; (x)[1] = a; \ + } + +#define _NPY_SWAP_INPLACE4(x) { \ + char a = (x)[0]; (x)[0] = (x)[3]; (x)[3] = a; \ + a = (x)[1]; (x)[1] = (x)[2]; (x)[2] = a; \ + } + +#define _NPY_SWAP_INPLACE8(x) { \ + char a = (x)[0]; (x)[0] = (x)[7]; (x)[7] = a; \ + a = (x)[1]; (x)[1] = (x)[6]; (x)[6] = a; \ + a = (x)[2]; (x)[2] = (x)[5]; (x)[5] = a; \ + a = (x)[3]; (x)[3] = (x)[4]; (x)[4] = a; \ + } + +#define _NPY_SWAP_INPLACE16(x) { \ + char a = (x)[0]; (x)[0] = (x)[15]; (x)[15] = a; \ + a = (x)[1]; (x)[1] = (x)[14]; (x)[14] = a; \ + a = (x)[2]; (x)[2] = (x)[13]; (x)[13] = a; \ + a = (x)[3]; (x)[3] = (x)[12]; (x)[12] = a; \ + a = (x)[4]; (x)[4] = (x)[11]; (x)[11] = a; \ + a = (x)[5]; (x)[5] = (x)[10]; (x)[10] = a; \ + a = (x)[6]; (x)[6] = (x)[9]; (x)[9] = a; \ + a = (x)[7]; (x)[7] = (x)[8]; (x)[8] = a; \ + } + +/************* STRIDED COPYING/SWAPPING SPECIALIZED FUNCTIONS *************/ + +/**begin repeat + * #elsize = 1, 2, 4, 8, 16# + * #elsize_half = 0, 1, 2, 4, 8# + * #type = npy_uint8, npy_uint16, npy_uint32, npy_uint64, npy_uint128# + */ +/**begin repeat1 + * #oper = strided_to_strided, strided_to_contig, + * contig_to_strided, contig_to_contig# + * #src_contig = 0, 0, 1 ,1# + * #dst_contig = 0, 1, 0 ,1# + */ +/**begin repeat2 + * #swap = _NPY_NOP, _NPY_NOP, _NPY_SWAP_INPLACE, _NPY_SWAP, + * _NPY_SWAP_INPLACE, _NPY_SWAP_PAIR# + * #prefix = , _aligned, _swap, _aligned_swap, _swap_pair, _aligned_swap_pair# + * #is_aligned = 0, 1, 0, 1, 0, 1# + * #minelsize = 1, 1, 2, 2, 4, 4# + * #is_swap = 0, 0, 1, 1, 2, 2# + */ + +#if (@elsize@ >= @minelsize@) && \ + (@elsize@ > 1 || @is_aligned@) && \ + (!defined(NPY_USE_UNALIGNED_ACCESS) || @is_aligned@) + + +#if @is_swap@ || @src_contig@ == 0 || @dst_contig@ == 0 +static void +@prefix@_@oper@_size@elsize@(char *dst, npy_intp dst_stride, + char *src, npy_intp src_stride, + npy_intp N, npy_intp NPY_UNUSED(itemsize), + void *NPY_UNUSED(data)) +{ + while (N > 0) { +#if @is_aligned@ + + /* aligned copy and swap */ +# if @elsize@ != 16 + (*((@type@ *)dst)) = @swap@@elsize@(*((@type@ *)src)); +# else +# if @is_swap@ == 0 + (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src)); + 
(*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1)); +# elif @is_swap@ == 1 + (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1)); + (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src)); +# elif @is_swap@ == 2 + (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src)); + (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1)); +# endif +# endif + +#else + + /* unaligned copy and swap */ + memcpy(dst, src, @elsize@); +# if @is_swap@ == 1 + @swap@@elsize@(dst); +# elif @is_swap@ == 2 + @swap@@elsize_half@(dst); + @swap@@elsize_half@(dst + @elsize_half@); +# endif + +#endif + +#if @dst_contig@ + dst += @elsize@; +#else + dst += dst_stride; +#endif + +#if @src_contig@ + src += @elsize@; +#else + src += src_stride; +#endif + + --N; + } +} +#endif + + +/* specialized copy and swap for source stride 0 */ +#if (@src_contig@ == 0) && @is_aligned@ +static void +@prefix@_@oper@_size@elsize@_srcstride0(char *dst, + npy_intp dst_stride, + char *src, npy_intp NPY_UNUSED(src_stride), + npy_intp N, npy_intp NPY_UNUSED(itemsize), + void *NPY_UNUSED(data)) +{ +#if @elsize@ != 16 + @type@ temp = @swap@@elsize@(*((@type@ *)src)); +#else + npy_uint64 temp0, temp1; +# if @is_swap@ == 0 + temp0 = (*((npy_uint64 *)src)); + temp1 = (*((npy_uint64 *)src + 1)); +# elif @is_swap@ == 1 + temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1)); + temp1 = _NPY_SWAP8(*((npy_uint64 *)src)); +# elif @is_swap@ == 2 + temp0 = _NPY_SWAP8(*((npy_uint64 *)src)); + temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1)); +# endif +#endif + while (N > 0) { +#if @elsize@ != 16 + *((@type@ *)dst) = temp; +#else + *((npy_uint64 *)dst) = temp0; + *((npy_uint64 *)dst + 1) = temp1; +#endif +#if @dst_contig@ + dst += @elsize@; +#else + dst += dst_stride; +#endif + --N; + } +} +#endif + +#endif/* @elsize@ >= @minelsize@ */ + +/**end repeat2**/ +/**end repeat1**/ +/**end repeat**/ + +static void +_strided_to_strided(char *dst, npy_intp dst_stride, + char *src, npy_intp src_stride, + npy_intp N, npy_intp itemsize, + void *NPY_UNUSED(data)) +{ + while (N > 0) { + memcpy(dst, src, itemsize); + dst += dst_stride; + src += src_stride; + --N; + } +} + +static void +_swap_strided_to_strided(char *dst, npy_intp dst_stride, + char *src, npy_intp src_stride, + npy_intp N, npy_intp itemsize, + void *NPY_UNUSED(data)) +{ + char *a, *b, c; + + while (N > 0) { + memcpy(dst, src, itemsize); + /* general in-place swap */ + a = dst; + b = dst + itemsize - 1; + while (a < b) { + c = *a; + *a = *b; + *b = c; + ++a; --b; + } + dst += dst_stride; + src += src_stride; + --N; + } +} + +static void +_swap_pair_strided_to_strided(char *dst, npy_intp dst_stride, + char *src, npy_intp src_stride, + npy_intp N, npy_intp itemsize, + void *NPY_UNUSED(data)) +{ + char *a, *b, c; + npy_intp itemsize_half = itemsize / 2; + + while (N > 0) { + memcpy(dst, src, itemsize); + /* general in-place swap */ + a = dst; + b = dst + itemsize_half - 1; + while (a < b) { + c = *a; + *a = *b; + *b = c; + ++a; --b; + } + /* general in-place swap */ + a = dst + itemsize_half; + b = dst + 2*itemsize_half - 1; + while (a < b) { + c = *a; + *a = *b; + *b = c; + ++a; --b; + } + dst += dst_stride; + src += src_stride; + --N; + } +} + +static void +_strided_to_contig(char *dst, npy_intp NPY_UNUSED(dst_stride), + char *src, npy_intp src_stride, + npy_intp N, npy_intp itemsize, + void *NPY_UNUSED(data)) +{ + while (N > 0) { + memcpy(dst, src, itemsize); + dst += itemsize; + src += src_stride; + --N; + } +} + +static void +_contig_to_strided(char *dst, npy_intp dst_stride, + char 
*src, npy_intp NPY_UNUSED(src_stride), + npy_intp N, npy_intp itemsize, + void *NPY_UNUSED(data)) +{ + while (N > 0) { + memcpy(dst, src, itemsize); + dst += dst_stride; + src += itemsize; + --N; + } +} + +static void +_contig_to_contig(char *dst, npy_intp NPY_UNUSED(dst_stride), + char *src, npy_intp NPY_UNUSED(src_stride), + npy_intp N, npy_intp itemsize, + void *NPY_UNUSED(data)) +{ + memcpy(dst, src, itemsize*N); +} + + +NPY_NO_EXPORT PyArray_StridedTransferFn +PyArray_GetStridedCopyFn(npy_intp aligned, npy_intp src_stride, + npy_intp dst_stride, npy_intp itemsize) +{ +/* + * Skip the "unaligned" versions on CPUs which support unaligned + * memory accesses. + */ +#ifndef NPY_USE_UNALIGNED_ACCESS + if (aligned) { +#endif/*!NPY_USE_UNALIGNED_ACCESS*/ + + /* contiguous dst */ + if (itemsize != 0 && dst_stride == itemsize) { + /* constant src */ + if (src_stride == 0) { + switch (itemsize) { +/**begin repeat + * #elsize = 1, 2, 4, 8, 16# + */ + case @elsize@: + return + &_aligned_strided_to_contig_size@elsize@_srcstride0; +/**end repeat**/ + } + } + /* contiguous src */ + else if (src_stride == itemsize) { + return &_contig_to_contig; + } + /* general src */ + else { + switch (itemsize) { +/**begin repeat + * #elsize = 1, 2, 4, 8, 16# + */ + case @elsize@: + return &_aligned_strided_to_contig_size@elsize@; +/**end repeat**/ + } + } + + return &_strided_to_contig; + } + /* general dst */ + else { + /* constant src */ + if (src_stride == 0) { + switch (itemsize) { +/**begin repeat + * #elsize = 1, 2, 4, 8, 16# + */ + case @elsize@: + return + &_aligned_strided_to_strided_size@elsize@_srcstride0; +/**end repeat**/ + } + } + /* contiguous src */ + else if (src_stride == itemsize) { + switch (itemsize) { +/**begin repeat + * #elsize = 1, 2, 4, 8, 16# + */ + case @elsize@: + return &_aligned_contig_to_strided_size@elsize@; +/**end repeat**/ + } + + return &_contig_to_strided; + } + else { + switch (itemsize) { +/**begin repeat + * #elsize = 1, 2, 4, 8, 16# + */ + case @elsize@: + return &_aligned_strided_to_strided_size@elsize@; +/**end repeat**/ + } + } + } + +#ifndef NPY_USE_UNALIGNED_ACCESS + } + else { + /* contiguous dst */ + if (itemsize != 0 && dst_stride == itemsize) { + /* contiguous src */ + if (itemsize != 0 && src_stride == itemsize) { + return &_contig_to_contig; + } + /* general src */ + else { + switch (itemsize) { + case 1: + return &_aligned_strided_to_contig_size1; +/**begin repeat + * #elsize = 2, 4, 8, 16# + */ + case @elsize@: + return &_strided_to_contig_size@elsize@; +/**end repeat**/ + } + } + + return &_strided_to_contig; + } + /* general dst */ + else { + /* contiguous src */ + if (itemsize != 0 && src_stride == itemsize) { + switch (itemsize) { + case 1: + return &_aligned_contig_to_strided_size1; +/**begin repeat + * #elsize = 2, 4, 8, 16# + */ + case @elsize@: + return &_contig_to_strided_size@elsize@; +/**end repeat**/ + } + + return &_contig_to_strided; + } + /* general src */ + else { + switch (itemsize) { + case 1: + return &_aligned_strided_to_strided_size1; +/**begin repeat + * #elsize = 2, 4, 8, 16# + */ + case @elsize@: + return &_strided_to_strided_size@elsize@; +/**end repeat**/ + } + } + } + } +#endif/*!NPY_USE_UNALIGNED_ACCESS*/ + + return &_strided_to_strided; +} + +/* + * PyArray_GetStridedCopySwapFn and PyArray_GetStridedCopySwapPairFn are + * nearly identical, so can do a repeat for them. 
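+ * (The pair variants are used for complex types: the two halves of each
+ * element are byte-swapped separately.)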
+ */ +/**begin repeat + * #function = PyArray_GetStridedCopySwapFn, PyArray_GetStridedCopySwapPairFn# + * #tag = , _pair# + * #not_pair = 1, 0# + */ + +NPY_NO_EXPORT PyArray_StridedTransferFn +@function@(npy_intp aligned, npy_intp src_stride, + npy_intp dst_stride, npy_intp itemsize) +{ +/* + * Skip the "unaligned" versions on CPUs which support unaligned + * memory accesses. + */ +#ifndef NPY_USE_UNALIGNED_ACCESS + if (aligned) { +#endif/*!NPY_USE_UNALIGNED_ACCESS*/ + + /* contiguous dst */ + if (itemsize != 0 && dst_stride == itemsize) { + /* constant src */ + if (src_stride == 0) { + switch (itemsize) { +/**begin repeat1 + * #elsize = 2, 4, 8, 16# + */ +#if @not_pair@ || @elsize@ > 2 + case @elsize@: + return + &_aligned_swap@tag@_strided_to_contig_size@elsize@_srcstride0; +#endif +/**end repeat1**/ + } + } + /* contiguous src */ + else if (src_stride == itemsize) { + switch (itemsize) { +/**begin repeat1 + * #elsize = 2, 4, 8, 16# + */ +#if @not_pair@ || @elsize@ > 2 + case @elsize@: + return &_aligned_swap@tag@_contig_to_contig_size@elsize@; +#endif +/**end repeat1**/ + } + } + /* general src */ + else { + switch (itemsize) { +/**begin repeat1 + * #elsize = 2, 4, 8, 16# + */ +#if @not_pair@ || @elsize@ > 2 + case @elsize@: + return &_aligned_swap@tag@_strided_to_contig_size@elsize@; +#endif +/**end repeat1**/ + } + } + } + /* general dst */ + else { + /* constant src */ + if (src_stride == 0) { + switch (itemsize) { +/**begin repeat1 + * #elsize = 2, 4, 8, 16# + */ +#if @not_pair@ || @elsize@ > 2 + case @elsize@: + return + &_aligned_swap@tag@_strided_to_strided_size@elsize@_srcstride0; +#endif +/**end repeat1**/ + } + } + /* contiguous src */ + else if (src_stride == itemsize) { + switch (itemsize) { +/**begin repeat1 + * #elsize = 2, 4, 8, 16# + */ +#if @not_pair@ || @elsize@ > 2 + case @elsize@: + return &_aligned_swap@tag@_contig_to_strided_size@elsize@; +#endif +/**end repeat1**/ + } + + return &_contig_to_strided; + } + else { + switch (itemsize) { +/**begin repeat1 + * #elsize = 2, 4, 8, 16# + */ +#if @not_pair@ || @elsize@ > 2 + case @elsize@: + return &_aligned_swap@tag@_strided_to_strided_size@elsize@; +#endif +/**end repeat1**/ + } + } + } + +#ifndef NPY_USE_UNALIGNED_ACCESS + } + else { + /* contiguous dst */ + if (itemsize != 0 && dst_stride == itemsize) { + /* contiguous src */ + if (itemsize != 0 && src_stride == itemsize) { + switch (itemsize) { +/**begin repeat1 + * #elsize = 2, 4, 8, 16# + */ +#if @not_pair@ || @elsize@ > 2 + case @elsize@: + return &_swap@tag@_contig_to_contig_size@elsize@; +#endif +/**end repeat1**/ + } + } + /* general src */ + else { + switch (itemsize) { +/**begin repeat1 + * #elsize = 2, 4, 8, 16# + */ +#if @not_pair@ || @elsize@ > 2 + case @elsize@: + return &_swap@tag@_strided_to_contig_size@elsize@; +#endif +/**end repeat1**/ + } + } + + return &_strided_to_contig; + } + /* general dst */ + else { + /* contiguous src */ + if (itemsize != 0 && src_stride == itemsize) { + switch (itemsize) { +/**begin repeat1 + * #elsize = 2, 4, 8, 16# + */ +#if @not_pair@ || @elsize@ > 2 + case @elsize@: + return &_swap@tag@_contig_to_strided_size@elsize@; +#endif +/**end repeat1**/ + } + + return &_contig_to_strided; + } + /* general src */ + else { + switch (itemsize) { +/**begin repeat1 + * #elsize = 2, 4, 8, 16# + */ +#if @not_pair@ || @elsize@ > 2 + case @elsize@: + return &_swap@tag@_strided_to_strided_size@elsize@; +#endif +/**end repeat1**/ + } + } + } + } +#endif/*!NPY_USE_UNALIGNED_ACCESS*/ + + return &_swap@tag@_strided_to_strided; +} + +/**end 
repeat**/ + +/* Does a simple aligned cast */ +typedef struct { + void *freefunc; + PyArray_VectorUnaryFunc *castfunc; +} _strided_cast_data; + +static void +_aligned_strided_to_strided_cast(char *dst, npy_intp dst_stride, + char *src, npy_intp src_stride, + npy_intp N, npy_intp itemsize, + void *data) +{ + PyArray_VectorUnaryFunc *castfunc = ((_strided_cast_data *)data)->castfunc; + + while (N > 0) { + castfunc(src, dst, 1, NULL, NULL); + dst += dst_stride; + src += src_stride; + --N; + } +} + +static void +_aligned_contig_to_contig_cast(char *dst, npy_intp dst_stride, + char *src, npy_intp src_stride, + npy_intp N, npy_intp itemsize, + void *data) +{ + PyArray_VectorUnaryFunc *castfunc = ((_strided_cast_data *)data)->castfunc; + + castfunc(src, dst, N, NULL, NULL); +} + + + +NPY_NO_EXPORT int +PyArray_GetTransferFunction(int aligned, + npy_intp src_stride, npy_intp dst_stride, + PyArray_Descr *from, PyArray_Descr *to, + PyArray_StridedTransferFn *outstransfer, + void **outtransferdata) +{ + /* First look at the possibilities of just a copy or swap */ + if (from->elsize == to->elsize && from->type_num < NPY_OBJECT && + to->type_num < NPY_OBJECT && + from->kind == to->kind) { + /* This is a straight copy */ + if (from->elsize == 1 || PyArray_ISNBO(from->byteorder) == + PyArray_ISNBO(to->byteorder)) { + *outstransfer = PyArray_GetStridedCopyFn(aligned, + src_stride, dst_stride, + from->elsize); + *outtransferdata = NULL; + return (*outstransfer == NULL) ? NPY_FAIL : NPY_SUCCEED; + } + /* This is a straight copy + byte swap */ + else if (!PyTypeNum_ISCOMPLEX(from->type_num)) { + *outstransfer = PyArray_GetStridedCopySwapFn(aligned, + src_stride, dst_stride, + from->elsize); + *outtransferdata = NULL; + return (*outstransfer == NULL) ? NPY_FAIL : NPY_SUCCEED; + } + /* This is a straight copy + element pair byte swap */ + else { + *outstransfer = PyArray_GetStridedCopySwapPairFn(aligned, + src_stride, dst_stride, + from->elsize); + *outtransferdata = NULL; + return (*outstransfer == NULL) ? NPY_FAIL : NPY_SUCCEED; + } + } + + /* TODO check for fields & subarrays */ + + /* Check whether a simple cast will suffice */ + if (from->type_num < NPY_OBJECT && to->type_num < NPY_OBJECT && + PyArray_ISNBO(from->type_num) && PyArray_ISNBO(to->type_num)) { + PyArray_VectorUnaryFunc *castfunc = + PyArray_GetCastFunc(from, to->type_num); + if (!castfunc) { + *outstransfer = NULL; + *outtransferdata = NULL; + return NPY_FAIL; + } + + if (aligned) { + /* Allocate the data that describes the cast */ + _strided_cast_data *data = + PyArray_malloc(sizeof(_strided_cast_data)); + if (data == NULL) { + PyErr_NoMemory(); + *outstransfer = NULL; + *outtransferdata = NULL; + return NPY_FAIL; + } + data->freefunc = (void*)&(PyArray_free); + data->castfunc = castfunc; + + /* Choose the contiguous cast if we can */ + if (src_stride == from->elsize && dst_stride == to->elsize) { + *outstransfer = _aligned_contig_to_contig_cast; + } + else { + *outstransfer = _aligned_strided_to_strided_cast; + } + *outtransferdata = data; + + return NPY_SUCCEED; + } + + /* TODO wrap the cast in an alignment operation */ + } + + /* TODO: write more complicated transfer code! 
*/ + *outstransfer = NULL; + *outtransferdata = NULL; + PyErr_SetString(PyExc_RuntimeError, + "General transfer function support has not been written yet"); + return NPY_FAIL; +} + +typedef void (*_npy_stridedtransfer_dealloc)(void *); +NPY_NO_EXPORT void +PyArray_FreeStridedTransferData(void *transferdata) +{ + if (transferdata != NULL) { + _npy_stridedtransfer_dealloc dealloc = + *((_npy_stridedtransfer_dealloc *)transferdata); + dealloc(transferdata); + } +} + + + +NPY_NO_EXPORT npy_intp +PyArray_TransferNDimToStrided(npy_intp ndim, + char *dst, npy_intp dst_stride, + char *src, npy_intp *src_strides, npy_intp src_strides_inc, + npy_intp *coords, npy_intp coords_inc, + npy_intp *shape, npy_intp shape_inc, + npy_intp count, npy_intp itemsize, + PyArray_StridedTransferFn stransfer, + void *data) +{ + npy_intp i, M, N, coord0, shape0, src_stride0, coord1, shape1, src_stride1; + + /* Finish off dimension 0 */ + coord0 = coords[0]; + shape0 = shape[0]; + src_stride0 = src_strides[0]; + N = shape0 - coord0; + if (N >= count) { + stransfer(dst, dst_stride, src, src_stride0, count, itemsize, data); + return 0; + } + stransfer(dst, dst_stride, src, src_stride0, N, itemsize, data); + count -= N; + + /* If it's 1-dimensional, there's no more to copy */ + if (ndim == 1) { + return count; + } + + /* Adjust the src and dst pointers */ + coord1 = (coords + coords_inc)[0]; + shape1 = (shape + shape_inc)[0]; + src_stride1 = (src_strides + src_strides_inc)[0]; + src = src - coord0*src_stride0 + src_stride1; + dst += N*dst_stride; + + /* Finish off dimension 1 */ + M = (shape1 - coord1 - 1); + N = shape0*M; + for (i = 0; i < M; ++i) { + if (shape0 >= count) { + stransfer(dst, dst_stride, src, src_stride0, + count, itemsize, data); + return 0; + } + else { + stransfer(dst, dst_stride, src, src_stride0, + shape0, itemsize, data); + } + count -= shape0; + src += src_stride1; + dst += shape0*dst_stride; + } + + /* If it's 2-dimensional, there's no more to copy */ + if (ndim == 2) { + return count; + } + + /* General-case loop for everything else */ + else { + /* Iteration structure for dimensions 2 and up */ + struct { + npy_intp coord, shape, src_stride; + } it[NPY_MAXDIMS]; + + /* Copy the coordinates and shape */ + coords += 2*coords_inc; + shape += 2*shape_inc; + src_strides += 2*src_strides_inc; + for (i = 0; i < ndim-2; ++i) { + it[i].coord = coords[0]; + it[i].shape = shape[0]; + it[i].src_stride = src_strides[0]; + coords += coords_inc; + shape += shape_inc; + src_strides += src_strides_inc; + } + + for (;;) { + /* Adjust the src pointer from the dimension 0 and 1 loop */ + src = src - shape1*src_stride1; + + /* Increment to the next coordinate */ + for (i = 0; i < ndim-2; ++i) { + src += it[i].src_stride; + if (++it[i].coord >= it[i].shape) { + it[i].coord = 0; + src -= it[i].src_stride*it[i].shape; + } + else { + break; + } + } + /* If the last dimension rolled over, we're done */ + if (i == ndim-2) { + return count; + } + + /* A loop for dimensions 0 and 1 */ + for (i = 0; i < shape1; ++i) { + if (shape0 >= count) { + stransfer(dst, dst_stride, src, src_stride0, + count, itemsize, data); + return 0; + } + else { + stransfer(dst, dst_stride, src, src_stride0, + shape0, itemsize, data); + } + count -= shape0; + src += src_stride1; + dst += shape0*dst_stride; + } + } + } +} + +NPY_NO_EXPORT npy_intp +PyArray_TransferStridedToNDim(npy_intp ndim, + char *dst, npy_intp *dst_strides, npy_intp dst_strides_inc, + char *src, npy_intp src_stride, + npy_intp *coords, npy_intp coords_inc, + npy_intp *shape, 
npy_intp shape_inc, + npy_intp count, npy_intp itemsize, + PyArray_StridedTransferFn stransfer, + void *data) +{ + npy_intp i, M, N, coord0, shape0, dst_stride0, coord1, shape1, dst_stride1; + + /* Finish off dimension 0 */ + coord0 = coords[0]; + shape0 = shape[0]; + dst_stride0 = dst_strides[0]; + N = shape0 - coord0; + if (N >= count) { + stransfer(dst, dst_stride0, src, src_stride, count, itemsize, data); + return 0; + } + stransfer(dst, dst_stride0, src, src_stride, N, itemsize, data); + count -= N; + + /* If it's 1-dimensional, there's no more to copy */ + if (ndim == 1) { + return count; + } + + /* Adjust the src and dst pointers */ + coord1 = (coords + coords_inc)[0]; + shape1 = (shape + shape_inc)[0]; + dst_stride1 = (dst_strides + dst_strides_inc)[0]; + dst = dst - coord0*dst_stride0 + dst_stride1; + src += N*src_stride; + + /* Finish off dimension 1 */ + M = (shape1 - coord1 - 1); + N = shape0*M; + for (i = 0; i < M; ++i) { + if (shape0 >= count) { + stransfer(dst, dst_stride0, src, src_stride, + count, itemsize, data); + return 0; + } + else { + stransfer(dst, dst_stride0, src, src_stride, + shape0, itemsize, data); + } + count -= shape0; + dst += dst_stride1; + src += shape0*src_stride; + } + + /* If it's 2-dimensional, there's no more to copy */ + if (ndim == 2) { + return count; + } + + /* General-case loop for everything else */ + else { + /* Iteration structure for dimensions 2 and up */ + struct { + npy_intp coord, shape, dst_stride; + } it[NPY_MAXDIMS]; + + /* Copy the coordinates and shape */ + coords += 2*coords_inc; + shape += 2*shape_inc; + dst_strides += 2*dst_strides_inc; + for (i = 0; i < ndim-2; ++i) { + it[i].coord = coords[0]; + it[i].shape = shape[0]; + it[i].dst_stride = dst_strides[0]; + coords += coords_inc; + shape += shape_inc; + dst_strides += dst_strides_inc; + } + + for (;;) { + /* Adjust the dst pointer from the dimension 0 and 1 loop */ + dst = dst - shape1*dst_stride1; + + /* Increment to the next coordinate */ + for (i = 0; i < ndim-2; ++i) { + dst += it[i].dst_stride; + if (++it[i].coord >= it[i].shape) { + it[i].coord = 0; + dst -= it[i].dst_stride*it[i].shape; + } + else { + break; + } + } + /* If the last dimension rolled over, we're done */ + if (i == ndim-2) { + return count; + } + + /* A loop for dimensions 0 and 1 */ + for (i = 0; i < shape1; ++i) { + if (shape0 >= count) { + stransfer(dst, dst_stride0, src, src_stride, + count, itemsize, data); + return 0; + } + else { + stransfer(dst, dst_stride0, src, src_stride, + shape0, itemsize, data); + } + count -= shape0; + dst += dst_stride1; + src += shape0*src_stride; + } + } + } +} diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.h b/numpy/core/src/multiarray/lowlevel_strided_loops.h new file mode 100644 index 000000000..b3d9c9de0 --- /dev/null +++ b/numpy/core/src/multiarray/lowlevel_strided_loops.h @@ -0,0 +1,153 @@ +#ifndef __LOWLEVEL_STRIDED_LOOPS_H +#define __LOWLEVEL_STRIDED_LOOPS_H + +/* + * This function pointer is for functions that transfer an arbitrarily strided + * input to a an arbitrarily strided output. It may be a fully general + * function, or a specialized function when the strides or item size + * have special values. + * + * Examples of transfer functions are a straight copy, a byte-swap, + * and a casting operation, + * + * The 'transferdata' parameter is slightly special, and must always contain + * a pointer to a deallocation routine at its beginning. 
The function + * PyArray_FreeStridedTransferFn should be used to deallocate such + * pointers, and calls the function pointer. + * + */ +typedef void (*PyArray_StridedTransferFn)(char *dst, npy_intp dst_stride, + char *src, npy_intp src_stride, + npy_intp N, npy_intp itemsize, + void *transferdata); + +/* + * Deallocates a PyArray_StridedTransferFunction data object. See + * the comment with the function typedef for more details. + */ +NPY_NO_EXPORT void +PyArray_FreeStridedTransferData(void *transferdata); + +/* + * Gives back a function pointer to a specialized function for copying + * strided memory. Returns NULL if there is a problem with the inputs. + * + * aligned: + * Should be 1 if the src and dst pointers are always aligned, + * 0 otherwise. + * src_stride: + * Should be the src stride if it will always be the same, + * NPY_MAX_INTP otherwise. + * dst_stride: + * Should be the dst stride if it will always be the same, + * NPY_MAX_INTP otherwise. + * itemsize: + * Should be the item size if it will always be the same, 0 otherwise. + * + */ +NPY_NO_EXPORT PyArray_StridedTransferFn +PyArray_GetStridedCopyFn(npy_intp aligned, npy_intp src_stride, + npy_intp dst_stride, npy_intp itemsize); + +/* + * Gives back a function pointer to a specialized function for copying + * and swapping strided memory. This assumes each element is a single + * value to be swapped. + * + * Parameters are as for PyArray_GetStridedCopyFn. + */ +NPY_NO_EXPORT PyArray_StridedTransferFn +PyArray_GetStridedCopySwapFn(npy_intp aligned, npy_intp src_stride, + npy_intp dst_stride, npy_intp itemsize); + +/* + * Gives back a function pointer to a specialized function for copying + * and swapping strided memory. This assumes each element is a pair + * of values, each of which needs to be swapped. + * + * Parameters are as for PyArray_GetStridedCopyFn. + */ +NPY_NO_EXPORT PyArray_StridedTransferFn +PyArray_GetStridedCopySwapPairFn(npy_intp aligned, npy_intp src_stride, + npy_intp dst_stride, npy_intp itemsize); + +/* + * If it's possible, gives back a transfer function which casts and/or + * byte swaps data with the dtype 'from' into data with the dtype 'to'. + * If the outtransferdata is populated with a non-NULL value, it + * must be deallocated with the PyArray_free function when the transfer + * function is no longer required. + * + * Returns NPY_SUCCEED or NPY_FAIL. + */ +NPY_NO_EXPORT int +PyArray_GetTransferFunction(int aligned, + npy_intp src_stride, npy_intp dst_stride, + PyArray_Descr *from, PyArray_Descr *to, + PyArray_StridedTransferFn *outstransfer, + void **outtransferdata); + +/* + * These two functions copy or convert the data of an n-dimensional array + * to/from a 1-dimensional strided buffer. These functions will only call + * 'stransfer' with the provided dst_stride/src_stride and + * dst_strides[0]/src_strides[0], so the caller can use those values to + * specialize the function. + * + * The return value is the number of elements it couldn't copy. A return value + * of 0 means all elements were copied, a larger value means the end of + * the n-dimensional array was reached before 'count' elements were copied. + * + * ndim: + * The number of dimensions of the n-dimensional array. + * dst/src: + * The destination or src starting pointer. + * dst_stride/src_stride: + * The stride of the 1-dimensional strided buffer + * dst_strides/src_strides: + * The strides of the n-dimensional array. + * dst_strides_inc/src_strides_inc: + * How much to add to the ..._strides pointer to get to the next stride. 
+ * coords: + * The starting coordinates in the n-dimensional array. + * coords_inc: + * How much to add to the coords pointer to get to the next coordinate. + * shape: + * The shape of the n-dimensional array. + * shape_inc: + * How much to add to the shape pointer to get to the next shape entry. + * count: + * How many elements to transfer + * itemsize: + * How big each element is. If transfering between elements of different + * sizes, for example a casting operation, the 'stransfer' function + * should be specialized for that, in which case 'stransfer' will ignore + * this parameter. + * stransfer: + * The strided transfer function. + * transferdata: + * An auxiliary data pointer passed to the strided transfer function. + * If a non-NULL value is returned, it must be deallocated with the + * function PyArray_FreeStridedTransferData. + */ +NPY_NO_EXPORT npy_intp +PyArray_TransferNDimToStrided(npy_intp ndim, + char *dst, npy_intp dst_stride, + char *src, npy_intp *src_strides, npy_intp src_strides_inc, + npy_intp *coords, npy_intp coords_inc, + npy_intp *shape, npy_intp shape_inc, + npy_intp count, npy_intp itemsize, + PyArray_StridedTransferFn stransfer, + void *transferdata); + +NPY_NO_EXPORT npy_intp +PyArray_TransferStridedToNDim(npy_intp ndim, + char *dst, npy_intp *dst_strides, npy_intp dst_strides_inc, + char *src, npy_intp src_stride, + npy_intp *coords, npy_intp coords_inc, + npy_intp *shape, npy_intp shape_inc, + npy_intp count, npy_intp itemsize, + PyArray_StridedTransferFn stransfer, + void *transferdata); + +#endif diff --git a/numpy/core/src/multiarray/new_iterator.c.src b/numpy/core/src/multiarray/new_iterator.c.src index 62c5929ca..f4a1d0b9a 100644 --- a/numpy/core/src/multiarray/new_iterator.c.src +++ b/numpy/core/src/multiarray/new_iterator.c.src @@ -6,6 +6,7 @@ #include <numpy/ndarrayobject.h> #include "new_iterator.h" +#include "lowlevel_strided_loops.h" /* Adjust ptr to be the next 16-byte aligned position after adding 'amount' */ #define NPYITER_INC_BUFFERPTR(ptr, amount) \ @@ -34,21 +35,29 @@ #define NPY_ITFLAG_FORCEDORDER 0x020 /* The inner loop is handled outside the iterator */ #define NPY_ITFLAG_NOINNER 0x040 +/* The iterator is buffered */ +#define NPY_ITFLAG_BUFFER 0x080 +/* The iterator is buffered */ +#define NPY_ITFLAG_GROWINNER 0x100 /* Internal iterator per-operand iterator flags */ /* The operand will be written to */ -#define NPY_OP_ITFLAG_WRITE 0x001 +#define NPY_OP_ITFLAG_WRITE 0x01 /* The operand will be read from */ -#define NPY_OP_ITFLAG_READ 0x002 +#define NPY_OP_ITFLAG_READ 0x02 /* The operand may have a temporary copy made */ -#define NPY_OP_ITFLAG_COPY 0x004 +#define NPY_OP_ITFLAG_COPY 0x04 /* The operand needs casting */ -#define NPY_OP_ITFLAG_CAST 0x008 +#define NPY_OP_ITFLAG_CAST 0x08 /* The operand needs a byte swap and/or alignment operation */ -#define NPY_OP_ITFLAG_COPYSWAP 0x010 -/* The operand should not be allocated with a subtype */ -#define NPY_OP_ITFLAG_NOSUBTYPE 0x020 +#define NPY_OP_ITFLAG_COPYSWAP 0x10 +/* The operand never needs buffering */ +#define NPY_OP_ITFLAG_BUFNEVER 0x20 +/* The operand needs to be byte-swapped */ +#define NPY_OP_ITFLAG_BUFSWAP 0x40 +/* The operand is aligned */ +#define NPY_OP_ITFLAG_ALIGNED 0x80 /* Internal flag, for the type of operands */ #define NPY_ITER_OP_ARRAY 0 @@ -79,6 +88,8 @@ ((NPY_SIZEOF_INTP)*(niter)) #define NIT_OPITFLAGS_SIZEOF(itflags, ndim, niter) \ (NPY_INTP_ALIGNED(niter)) +#define NIT_BUFFERDATA_SIZEOF(itflags, ndim, niter) \ + ((itflags&NPY_ITFLAG_BUFFER) ? 
((NPY_SIZEOF_INTP)*(3 + 7*niter)) : 0) /* Byte offsets of the iterator members */ #define NIT_ITERSIZE_OFFSET() \ @@ -98,9 +109,12 @@ #define NIT_OPITFLAGS_OFFSET(itflags, ndim, niter) \ (NIT_OBJECTS_OFFSET(itflags, ndim, niter) + \ NIT_OBJECTS_SIZEOF(itflags, ndim, niter)) -#define NIT_AXISDATA_OFFSET(itflags, ndim, niter) \ +#define NIT_BUFFERDATA_OFFSET(itflags, ndim, niter) \ (NIT_OPITFLAGS_OFFSET(itflags, ndim, niter) + \ NIT_OPITFLAGS_SIZEOF(itflags, ndim, niter)) +#define NIT_AXISDATA_OFFSET(itflags, ndim, niter) \ + (NIT_BUFFERDATA_OFFSET(itflags, ndim, niter) + \ + NIT_BUFFERDATA_SIZEOF(itflags, ndim, niter)) /* Internal-only ITERATOR DATA MEMBER ACCESS */ #define NIT_ITFLAGS(iter) \ @@ -121,9 +135,27 @@ (char*)(iter) + NIT_OBJECTS_OFFSET(itflags, ndim, niter))) #define NIT_OPITFLAGS(iter) ( \ (char*)(iter) + NIT_OPITFLAGS_OFFSET(itflags, ndim, niter)) +#define NIT_BUFFERDATA(iter) \ + ((char*)(iter) + NIT_BUFFERDATA_OFFSET(itflags, ndim, niter)) #define NIT_AXISDATA(iter) \ ((char*)(iter) + NIT_AXISDATA_OFFSET(itflags, ndim, niter)) +/* Internal-only BUFFERDATA MEMBER ACCESS */ +#define NBF_BUFFERSIZE(bufferdata) (*((npy_intp *)(bufferdata))) +#define NBF_SIZE(bufferdata) (*((npy_intp *)(bufferdata) + 1)) +#define NBF_POS(bufferdata) (*((npy_intp *)(bufferdata) + 2)) +#define NBF_STRIDES(bufferdata) ((npy_intp *)(bufferdata) + 3) +#define NBF_PTRS(bufferdata) ((char **)(bufferdata) + 3 + 1*(niter)) +#define NBF_READTRANSFERFN(bufferdata) \ + ((PyArray_StridedTransferFn *)(bufferdata) + 3 + 2*(niter)) +#define NBF_READTRANSFERDATA(bufferdata) \ + ((void **)(bufferdata) + 3 + 3*(niter)) +#define NBF_WRITETRANSFERFN(bufferdata) \ + ((PyArray_StridedTransferFn *)(bufferdata) + 3 + 4*(niter)) +#define NBF_WRITETRANSFERDATA(bufferdata) \ + ((void **)(bufferdata) + 3 + 5*(niter)) +#define NBF_BUFFERS(bufferdata) ((char **)(bufferdata) + 3 + 6*(niter)) + /* Internal-only AXISDATA MEMBER ACCESS. 
*/ #define NAD_SHAPE(axisdata) (*((npy_intp *)(axisdata))) #define NAD_COORD(axisdata) (*((npy_intp *)(axisdata) + 1)) @@ -202,11 +234,20 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype, npy_intp op_ndim, npy_intp *shape, PyArray_Descr *op_dtype, npy_intp *op_axes); +static int +npyiter_allocate_buffers(NpyIter *iter); +static int +npyiter_jumpforward(NpyIter *iter, npy_intp count); +static void +npyiter_copy_from_buffers(NpyIter *iter); +static void +npyiter_copy_to_buffers(NpyIter *iter); + /* The constructor for an iterator over multiple objects */ NpyIter* NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, npy_uint32 *op_flags, PyArray_Descr **op_request_dtypes, - npy_intp oa_ndim, npy_intp **op_axes) + npy_intp oa_ndim, npy_intp **op_axes, npy_intp buffersize) { npy_uint32 itflags = NPY_ITFLAG_IDENTPERM; npy_intp idim, ndim = 0; @@ -215,10 +256,6 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, /* The iterator being constructed */ NpyIter *iter; - /* For tracking buffer space at the end of the iterator memory */ - npy_intp bufferspace = 0; - char *bufferptr = 0; - /* Per-operand values */ PyArrayObject *op[NPY_MAXARGS]; PyArray_Descr *op_dtype[NPY_MAXARGS]; @@ -228,6 +265,7 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, char **op_dataptr; npy_intp *perm; + char *bufferdata = NULL; char axes_dupcheck[NPY_MAXDIMS]; int any_allocate_if_null = 0, any_missing_dtypes = 0, allocate_output_scalars = 0; @@ -289,12 +327,12 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, } } - /* Checks the flags (C|F)_ORDER_INDEX, COORDS, and NO_INNER_ITERATION */ + /* Checks the global iterator flags */ if (!pyiter_check_global_flags(flags, &itflags)) { return NULL; } - /* If offsets were requested, make sure copying/buffering is disabled */ + /* If offsets were requested, make sure copying is disabled */ if (itflags&NPY_ITFLAG_HASOFFSETS) { for (iiter = 0; iiter < niter; ++iiter) { if (op_flags[iiter]&(NPY_ITER_COPY| @@ -307,6 +345,15 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, } } + /* + * If buffering is enabled, and no buffersize was given, use a default + * chosen to be big enough to get some amortization benefits, but + * small enough to be cache-friendly. + */ + if (itflags&NPY_ITFLAG_BUFFER && buffersize <= 0) { + buffersize = 256; + } + /* Prepare all the operands */ for (iiter = 0; iiter < niter; ++iiter) { /* @@ -386,18 +433,8 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, } /* Allocate memory for the iterator */ - if (bufferspace > 0) { - npy_intp itsize = NIT_SIZEOF_ITERATOR(itflags, ndim, niter); - iter = (NpyIter*) - PyArray_malloc(itsize + bufferspace); - /* Initialize the buffer pointer to just after the iterator memory */ - bufferptr = (char*)iter; - NPYITER_INC_BUFFERPTR(bufferptr, itsize); - } - else { - iter = (NpyIter*) - PyArray_malloc(NIT_SIZEOF_ITERATOR(itflags, ndim, niter)); - } + iter = (NpyIter*) + PyArray_malloc(NIT_SIZEOF_ITERATOR(itflags, ndim, niter)); /* Fill in the base data */ NIT_ITFLAGS(iter) = itflags; @@ -453,12 +490,33 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, /* Set resetindex to zero as well (it's just after the resetdataptr) */ op_dataptr[niter] = 0; + /* + * Initialize buffer data (must set the buffers and transferdata + * to NULL before we might deallocate the iterator). 
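+     * (NpyIter_Deallocate frees any non-NULL entries in these arrays, so
+     * they must not hold uninitialized garbage.)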
+ */ + if (itflags&NPY_ITFLAG_BUFFER) { + bufferdata = NIT_BUFFERDATA(iter); + memset(NBF_BUFFERS(bufferdata), 0, (niter+1)*NPY_SIZEOF_INTP); + for (iiter = 0; iiter < niter; ++iiter) { + NBF_READTRANSFERDATA(bufferdata)[iiter] = NULL; + NBF_WRITETRANSFERDATA(bufferdata)[iiter] = NULL; + } + } + /* Fill in the AXISDATA arrays and set the ITERSIZE field */ if (!npyiter_fill_axisdata(iter, op, op_ndim, op_dataptr, op_axes)) { NpyIter_Deallocate(iter); return NULL; } + if (itflags&NPY_ITFLAG_BUFFER) { + /* No point in a buffer bigger than the iteration size */ + if (buffersize > NIT_ITERSIZE(iter)) { + buffersize = NIT_ITERSIZE(iter); + } + NBF_BUFFERSIZE(bufferdata) = buffersize; + } + /* * If an index was requested, compute the strides for it. * Note that we must do this before changing the order of the @@ -574,7 +632,7 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, PyTypeObject *op_subtype; /* Check whether the subtype was disabled */ - if (op_itflags[iiter]&NPY_OP_ITFLAG_NOSUBTYPE) { + if (op_flags[iiter]&NPY_ITER_NO_SUBTYPE) { op_subtype = &PyArray_Type; } else { @@ -600,20 +658,16 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, */ npyiter_replace_axisdata(iter, iiter, op[iiter], op_ndim[iiter], PyArray_DATA(op[iiter]), op_axes ? op_axes[iiter] : NULL); + + /* Newly allocated arrays need no buffering */ + op_itflags[iiter] |= NPY_OP_ITFLAG_BUFNEVER; + op_itflags[iiter] &= ~(NPY_OP_ITFLAG_COPYSWAP|NPY_OP_ITFLAG_CAST); } - else if (op_itflags[iiter]& - (NPY_OP_ITFLAG_CAST|NPY_OP_ITFLAG_COPYSWAP)) { + else if ((op_itflags[iiter]& + (NPY_OP_ITFLAG_CAST|NPY_OP_ITFLAG_COPYSWAP)) && + (op_itflags[iiter]&NPY_OP_ITFLAG_COPY)) { PyArrayObject *temp; - /* Copying or buffering must be enabled for casting/conversion */ - if (!(op_itflags[iiter]&NPY_OP_ITFLAG_COPY)) { - PyErr_SetString(PyExc_TypeError, - "Iterator input required copying or buffering, " - "but neither copying nor buffering was enabled"); - NpyIter_Deallocate(iter); - return NULL; - } - /* Allocate the temporary array, if possible */ temp = npyiter_new_temp_array(iter, &PyArray_Type, PyArray_NDIM(op[iiter]), @@ -651,7 +705,44 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, */ npyiter_replace_axisdata(iter, iiter, op[iiter], op_ndim[iiter], PyArray_DATA(op[iiter]), op_axes ? op_axes[iiter] : NULL); + + /* Copied arrays need no buffering */ + op_itflags[iiter] |= NPY_OP_ITFLAG_BUFNEVER; + op_itflags[iiter] &= ~(NPY_OP_ITFLAG_COPYSWAP|NPY_OP_ITFLAG_CAST); + } + else { + /* + * Buffering must be enabled for casting/conversion if copy + * wasn't specified. + */ + if (op_itflags[iiter]& + (NPY_OP_ITFLAG_CAST|NPY_OP_ITFLAG_COPYSWAP) && + !(itflags&NPY_ITFLAG_BUFFER)) { + PyErr_SetString(PyExc_TypeError, + "Iterator input required copying or buffering, " + "but neither copying nor buffering was enabled"); + NpyIter_Deallocate(iter); + return NULL; + } + + /* + * If the operand is aligned, any buffering can use aligned + * optimizations. 
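+     * (PyArray_GetTransferFunction selects specialized aligned loops when
+     * this flag is set.)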
+ */ + if (PyArray_ISALIGNED(op[iiter])) { + op_itflags[iiter] |= NPY_OP_ITFLAG_ALIGNED; + } + + /* + * If the operand has a different byte order than the + * desired data type, need a swap + */ + if (PyArray_ISNBO(PyArray_DESCR(op[iiter])->byteorder) != + PyArray_ISNBO(op_dtype[iiter]->byteorder)) { + op_itflags[iiter] |= NPY_OP_ITFLAG_BUFSWAP; + } } + } /* @@ -665,7 +756,7 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, } /* Now that the axes are finished, adjust ITERSIZE if necessary */ - if (itflags&NPY_ITFLAG_NOINNER) { + if ((itflags&NPY_ITFLAG_NOINNER) && !(itflags&NPY_ITFLAG_BUFFER)) { char *axisdata = NIT_AXISDATA(iter); NIT_ITERSIZE(iter) /= NAD_SHAPE(axisdata); } @@ -673,13 +764,29 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags, /* Copy the per-op itflags into the iterator */ memcpy(NIT_OPITFLAGS(iter), op_itflags, niter); + if (itflags&NPY_ITFLAG_BUFFER) { + /* Allocate the buffers */ + if (!npyiter_allocate_buffers(iter)) { + NpyIter_Deallocate(iter); + return NULL; + } + + /* BUFFERED + NOINNER may not have a predictable itersize */ + if (itflags&NPY_ITFLAG_NOINNER) { + NIT_ITERSIZE(iter) = 0; + } + + /* Prepare the next buffers and set pos/size */ + npyiter_copy_to_buffers(iter); + } + return iter; } /* The constructor for an iterator over one object */ NpyIter* NpyIter_New(PyArrayObject *op, npy_uint32 flags, PyArray_Descr* dtype, - npy_intp a_ndim, npy_intp *axes) + npy_intp a_ndim, npy_intp *axes, npy_intp buffersize) { /* Split the flags into separate global and op flags */ npy_uint32 op_flags = flags&NPY_ITER_PER_OP_FLAGS; @@ -687,22 +794,52 @@ NpyIter_New(PyArrayObject *op, npy_uint32 flags, PyArray_Descr* dtype, if (a_ndim > 0) { return NpyIter_MultiNew(1, &op, flags, &op_flags, &dtype, - a_ndim, &axes); + a_ndim, &axes, buffersize); } else { - return NpyIter_MultiNew(1, &op, flags, &op_flags, &dtype, 0, NULL); + return NpyIter_MultiNew(1, &op, flags, &op_flags, &dtype, + 0, NULL, buffersize); } } int NpyIter_Deallocate(NpyIter *iter) { - /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/ + npy_uint32 itflags = NIT_ITFLAGS(iter); npy_intp ndim = NIT_NDIM(iter); npy_intp iiter, niter = NIT_NITER(iter); PyArray_Descr **dtype = NIT_DTYPES(iter); PyArrayObject **object = NIT_OBJECTS(iter); + /* Deallocate any buffers and buffering data */ + if (itflags&NPY_ITFLAG_BUFFER) { + char *bufferdata = NIT_BUFFERDATA(iter); + char **buffers; + void **transferdata; + + /* buffers */ + buffers = NBF_BUFFERS(bufferdata); + for(iiter = 0; iiter < niter; ++iiter, ++buffers) { + if (*buffers) { + PyArray_free(*buffers); + } + } + /* read bufferdata */ + transferdata = NBF_READTRANSFERDATA(bufferdata); + for(iiter = 0; iiter < niter; ++iiter, ++transferdata) { + if (*transferdata) { + PyArray_FreeStridedTransferData(*transferdata); + } + } + /* write bufferdata */ + transferdata = NBF_WRITETRANSFERDATA(bufferdata); + for(iiter = 0; iiter < niter; ++iiter, ++transferdata) { + if (*transferdata) { + PyArray_FreeStridedTransferData(*transferdata); + } + } + } + /* Deallocate all the dtypes and objects that were iterated */ for(iiter = 0; iiter < niter; ++iiter, ++dtype, ++object) { Py_XDECREF(*dtype); @@ -739,8 +876,6 @@ int NpyIter_RemoveInnerLoop(NpyIter *iter) npy_intp ndim = NIT_NDIM(iter); npy_intp niter = NIT_NITER(iter); - char *axisdata; - /* Check conditions under which this can be done */ if (itflags&(NPY_ITFLAG_HASINDEX|NPY_ITFLAG_HASCOORDS)) { PyErr_SetString(PyExc_ValueError, @@ -754,8 +889,15 @@ int 
NpyIter_RemoveInnerLoop(NpyIter *iter) NIT_ITFLAGS(iter) = itflags; /* Adjust ITERSIZE */ - axisdata = NIT_AXISDATA(iter); - NIT_ITERSIZE(iter) /= NAD_SHAPE(axisdata); + if (itflags&NPY_ITFLAG_BUFFER) { + /* BUFFERED + NOINNER may not have a predictable itersize */ + NIT_ITERSIZE(iter) = 0; + + } + else { + char *axisdata = NIT_AXISDATA(iter); + NIT_ITERSIZE(iter) /= NAD_SHAPE(axisdata); + } } /* Reset the iterator */ @@ -789,6 +931,11 @@ void NpyIter_Reset(NpyIter *iter) ptrs[istrides] = resetdataptr[istrides]; } } + + if (itflags&NPY_ITFLAG_BUFFER) { + /* Prepare the next buffers and set pos/size */ + npyiter_copy_to_buffers(iter); + } } /* @@ -818,6 +965,13 @@ int NpyIter_GotoCoords(NpyIter *iter, npy_intp *coords) return NPY_FAIL; } + if (itflags&NPY_ITFLAG_BUFFER) { + PyErr_SetString(PyExc_ValueError, + "Cannot call GotoCoords on an iterator which " + "is buffered"); + return NPY_FAIL; + } + perm = NIT_PERM(iter); dataptr = NIT_RESETDATAPTR(iter); axisdata = NIT_AXISDATA(iter); @@ -897,6 +1051,13 @@ int NpyIter_GotoIndex(NpyIter *iter, npy_intp index) return NPY_FAIL; } + if (itflags&NPY_ITFLAG_BUFFER) { + PyErr_SetString(PyExc_ValueError, + "Cannot call GotoIndex on an iterator which " + "is buffered"); + return NPY_FAIL; + } + if (index < 0 || index >= NIT_ITERSIZE(iter)) { PyErr_SetString(PyExc_IndexError, "Iterator GotoIndex called with out-of-bounds " @@ -947,14 +1108,15 @@ int NpyIter_GotoIndex(NpyIter *iter, npy_intp index) return NPY_SUCCEED; } -/* SPECIALIZED iternext functions */ +/* SPECIALIZED iternext functions that handle the non-buffering part */ - /* The combination HASINDEX | NOINNER is excluded in the New functions */ /**begin repeat * #const_itflags = 0, * NPY_ITFLAG_HASINDEX, - * NPY_ITFLAG_NOINNER# - * #tag_itflags = 0, IND, NOINN# + * NPY_ITFLAG_NOINNER, + * NPY_ITFLAG_BUFFER, + * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_BUFFER# + * #tag_itflags = 0, IND, NOINN, BUF, INDuBUF# */ /**begin repeat1 * #const_ndim = 1, 2, 100# @@ -965,7 +1127,7 @@ int NpyIter_GotoIndex(NpyIter *iter, npy_intp index) * #tag_niter = 1, 2, ANY# */ -/* Specialized iternext (@const_itflags@,@const_ndim@,@const_niter@) */ +/* Specialized iternext (@const_itflags@,@tag_ndim@,@tag_niter@) */ NPY_NO_EXPORT int npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_niter@( NpyIter *iter) @@ -1107,6 +1269,53 @@ npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_niter@( /**end repeat1**/ /**end repeat**/ +/* iternext function that handle the buffering part */ +NPY_NO_EXPORT int +npyiter_buffered_iternext(NpyIter *iter) +{ + npy_uint32 itflags = NIT_ITFLAGS(iter); + npy_intp ndim = NIT_NDIM(iter); + npy_intp niter = NIT_NITER(iter); + + char *bufferdata = NIT_BUFFERDATA(iter); + + /* + * If the iterator handles the inner loop, need to increment all + * the coordinates and pointers + */ + if (!(itflags&NPY_ITFLAG_NOINNER)) { + /* Increment within the buffer */ + if (++NBF_POS(bufferdata) < NBF_SIZE(bufferdata)) { + npy_intp iiter, *strides; + char **ptrs; + + strides = NBF_STRIDES(bufferdata); + ptrs = NBF_PTRS(bufferdata); + for (iiter = 0; iiter < niter; ++iiter) { + ptrs[iiter] += strides[iiter]; + } + return 1; + } + } + + /* Write back to the arrays */ + npyiter_copy_from_buffers(iter); + + /* Increment to the next buffer */ + if (!npyiter_jumpforward(iter, NBF_SIZE(bufferdata))) { + return 0; + } + + /* Prepare the next buffers and set pos/size */ + npyiter_copy_to_buffers(iter); + + return 1; +} + +/**end repeat2**/ +/**end repeat1**/ +/**end repeat**/ + /* Specialization of 
iternext for when the iteration size is 1 */ NPY_NO_EXPORT int npyiter_iternext_sizeone(NpyIter *iter) @@ -1130,11 +1339,18 @@ NpyIter_IterNext_Fn NpyIter_GetIterNext(NpyIter *iter) } /* + * If buffering is enabled, don't specialize further. + */ + if (itflags&NPY_ITFLAG_BUFFER) { + return &npyiter_buffered_iternext; + } + + /* * Ignore all the flags that don't affect the iterator memory - * layout or the iternext function. Currently only HASINDEX - * and NOINNER affect them. + * layout or the iternext function. Currently only HASINDEX, + * NOINNER, and BUFFER affect them. */ - itflags &= (NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NOINNER); + itflags &= (NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NOINNER|NPY_ITFLAG_BUFFER); /* Switch statements let the compiler optimize this most effectively */ switch (itflags) { @@ -1199,8 +1415,15 @@ NpyIter_IterNext_Fn NpyIter_GetIterNext(NpyIter *iter) * NPY_ITFLAG_IDENTPERM, * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM, * NPY_ITFLAG_NEGPERM, - * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM# - * #tag_itflags = 0, IND, IDP, INDuIDP, NEGP, INDuNEGP# + * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM, + * NPY_ITFLAG_BUFFER, + * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_BUFFER, + * NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER, + * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER, + * NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER, + * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER# + * #tag_itflags = 0, IND, IDP, INDuIDP, NEGP, INDuNEGP, + * BUF, INDuBUF, IDPuBUF, INDuIDPuBUF, NEGPuBUF, INDuNEGPuBUF# */ NPY_NO_EXPORT void npyiter_getcoord_itflags@tag_itflags@(NpyIter *iter, npy_intp *outcoord) @@ -1264,7 +1487,8 @@ NpyIter_GetCoords_Fn NpyIter_GetGetCoords(NpyIter *iter) */ itflags &= (NPY_ITFLAG_HASINDEX | NPY_ITFLAG_IDENTPERM | - NPY_ITFLAG_NEGPERM); + NPY_ITFLAG_NEGPERM | + NPY_ITFLAG_BUFFER); switch (itflags) { /**begin repeat @@ -1273,8 +1497,15 @@ NpyIter_GetCoords_Fn NpyIter_GetGetCoords(NpyIter *iter) * NPY_ITFLAG_IDENTPERM, * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM, * NPY_ITFLAG_NEGPERM, - * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM# - * #tag_itflags = 0, IND, IDP, INDuIDP, NEGP, INDuNEGP# + * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM, + * NPY_ITFLAG_BUFFER, + * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_BUFFER, + * NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER, + * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER, + * NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER, + * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER# + * #tag_itflags = 0, IND, IDP, INDuIDP, NEGP, INDuNEGP, + * BUF, INDuBUF, IDPuBUF, INDuIDPuBUF, NEGPuBUF, INDuNEGPuBUF# */ case @const_itflags@: return npyiter_getcoord_itflags@tag_itflags@; @@ -1326,7 +1557,7 @@ npy_intp NpyIter_GetIterSize(NpyIter *iter) int NpyIter_GetShape(NpyIter *iter, npy_intp *outshape) { - const npy_uint32 itflags = NIT_ITFLAGS(iter); + npy_uint32 itflags = NIT_ITFLAGS(iter); npy_intp ndim = NIT_NDIM(iter); npy_intp niter = NIT_NITER(iter); @@ -1363,9 +1594,16 @@ char **NpyIter_GetDataPtrArray(NpyIter *iter) npy_intp ndim = NIT_NDIM(iter); npy_intp niter = NIT_NITER(iter); - char* axisdata = NIT_AXISDATA(iter); + char* data; - return NAD_PTRS(axisdata); + if (itflags&NPY_ITFLAG_BUFFER) { + data = NIT_BUFFERDATA(iter); + return NBF_PTRS(data); + } + else { + data = NIT_AXISDATA(iter); + return NAD_PTRS(data); + } } PyArray_Descr **NpyIter_GetDescrArray(NpyIter *iter) @@ -1488,22 +1726,38 @@ void NpyIter_GetWriteFlags(NpyIter *iter, char *outwriteflags) npy_intp *NpyIter_GetInnerStrideArray(NpyIter *iter) { - /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/ + npy_uint32 
itflags = NIT_ITFLAGS(iter); npy_intp ndim = NIT_NDIM(iter); npy_intp niter = NIT_NITER(iter); - char* axisdata = NIT_AXISDATA(iter); - return NAD_STRIDES(axisdata); + char* data; + + if (itflags&NPY_ITFLAG_BUFFER) { + data = NIT_BUFFERDATA(iter); + return NBF_STRIDES(data); + } + else { + data = NIT_AXISDATA(iter); + return NAD_STRIDES(data); + } } npy_intp* NpyIter_GetInnerLoopSizePtr(NpyIter *iter) { - /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/ + npy_uint32 itflags = NIT_ITFLAGS(iter); npy_intp ndim = NIT_NDIM(iter); npy_intp niter = NIT_NITER(iter); - char* axisdata = NIT_AXISDATA(iter); - return &NAD_SHAPE(axisdata); + char* data; + + if (itflags&NPY_ITFLAG_BUFFER) { + data = NIT_BUFFERDATA(iter); + return &NBF_SIZE(data); + } + else { + data = NIT_AXISDATA(iter); + return &NAD_SHAPE(data); + } } /* Checks 'flags' for (C|F)_ORDER_INDEX, COORDS, and NO_INNER_ITERATION, @@ -1560,6 +1814,13 @@ pyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags) } /* Check if offsets were requested instead of pointers */ if (flags&NPY_ITER_OFFSETS) { + /* Buffering is incompatible with offsets */ + if (flags&(NPY_ITER_BUFFERED|NPY_ITER_BUFFERED_GROWINNER)) { + PyErr_SetString(PyExc_ValueError, + "Iterator flag BUFFERED cannot be used " + "with the flag OFFSETS"); + return 0; + } (*itflags) |= NPY_ITFLAG_HASOFFSETS; } /* Check if the caller wants to handle inner iteration */ @@ -1572,6 +1833,13 @@ pyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags) } (*itflags) |= NPY_ITFLAG_NOINNER; } + /* Buffering */ + if (flags&(NPY_ITER_BUFFERED|NPY_ITER_BUFFERED_GROWINNER)) { + (*itflags) |= NPY_ITFLAG_BUFFER; + if (flags&NPY_ITER_BUFFERED_GROWINNER) { + (*itflags) |= NPY_ITFLAG_GROWINNER; + } + } return 1; } @@ -1717,11 +1985,7 @@ pyiter_prepare_operand(PyArrayObject **op, PyArray_Descr *op_request_dtype, /* NULL operands must be automatically allocated outputs */ if (*op == NULL) { /* ALLOCATE should be enabled */ - if (op_flags&NPY_ITER_ALLOCATE) { - if (op_flags&NPY_ITER_NO_SUBTYPE) { - *op_itflags |= NPY_OP_ITFLAG_NOSUBTYPE; - } - } else { + if (!(op_flags&NPY_ITER_ALLOCATE)) { PyErr_SetString(PyExc_ValueError, "Iterator input was NULL, but automatic allocation as an " "output wasn't requested"); @@ -2944,6 +3208,380 @@ npyiter_promote_types(int type1, int type2) return NPY_NOTYPE; } +static int +npyiter_allocate_buffers(NpyIter *iter) +{ + npy_uint32 itflags = NIT_ITFLAGS(iter); + npy_intp ndim = NIT_NDIM(iter); + npy_intp iiter, niter = NIT_NITER(iter); + + char *op_itflags = NIT_OPITFLAGS(iter); + char *bufferdata = NIT_BUFFERDATA(iter), *axisdata = NIT_AXISDATA(iter); + PyArrayObject **op = NIT_OBJECTS(iter); + PyArray_Descr **op_dtype = NIT_DTYPES(iter); + npy_intp *strides = NAD_STRIDES(axisdata), op_stride; + npy_intp buffersize = NBF_BUFFERSIZE(bufferdata); + char *buffer; + + PyArray_StridedTransferFn stransfer = NULL; + void *transferdata = NULL; + + for (iiter = 0; iiter < niter; ++iiter) { + char flags = op_itflags[iiter]; + op_stride = strides[iiter]; + + /* + * If we have determined that a buffer may be needed, + * allocate one. 
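+         * The buffer holds up to 'buffersize' elements of the operand's
+         * iteration dtype (op_dtype), not of the array's original dtype.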
+ */ + if (!(flags&NPY_OP_ITFLAG_BUFNEVER)) { + npy_intp itemsize = op_dtype[iiter]->elsize; + buffer = PyArray_malloc(itemsize*buffersize); + if (buffer == NULL) { + PyErr_NoMemory(); + return 0; + } + NBF_BUFFERS(bufferdata)[iiter] = buffer; + + /* Also need to get an appropriate transfer functions */ + if (flags&NPY_OP_ITFLAG_READ) { + if (PyArray_GetTransferFunction( + (flags&NPY_OP_ITFLAG_ALIGNED) != 0, + op_stride, + op_dtype[iiter]->elsize, + PyArray_DESCR(op[iiter]), + op_dtype[iiter], + &stransfer, + &transferdata) != NPY_SUCCEED) { + return 0; + } + NBF_READTRANSFERFN(bufferdata)[iiter] = stransfer; + NBF_READTRANSFERDATA(bufferdata)[iiter] = transferdata; + } + else { + NBF_READTRANSFERFN(bufferdata)[iiter] = NULL; + } + if (flags&NPY_OP_ITFLAG_WRITE) { + if (PyArray_GetTransferFunction( + (flags&NPY_OP_ITFLAG_ALIGNED) != 0, + op_dtype[iiter]->elsize, + op_stride, + op_dtype[iiter], + PyArray_DESCR(op[iiter]), + &stransfer, + &transferdata) != NPY_SUCCEED) { + return 0; + } + NBF_WRITETRANSFERFN(bufferdata)[iiter] = stransfer; + NBF_WRITETRANSFERDATA(bufferdata)[iiter] = transferdata; + } + else { + NBF_WRITETRANSFERFN(bufferdata)[iiter] = NULL; + } + } + } + + return 1; +} + +/* + * This function jumps the iterator forward by 'count' steps. + * It is used by the buffering code, and only affects the non-buffering + * data. + * + * Returns 1 if it landed on a valid element, 0 if it went + * past the end. + */ +static int +npyiter_jumpforward(NpyIter *iter, npy_intp count) +{ + npy_uint32 itflags = NIT_ITFLAGS(iter); + npy_intp idim, ndim = NIT_NDIM(iter); + npy_intp niter = NIT_NITER(iter); + + char *axisdata, **dataptr; + npy_intp sizeof_axisdata; + npy_intp delta, coord, shape; + npy_intp istrides, nstrides; + + sizeof_axisdata = NIT_SIZEOF_AXISDATA(itflags, ndim, niter); + axisdata = NIT_AXISDATA(iter); + + /* Go forward through axisdata, incrementing the coordinates */ + for (idim = 0; idim < ndim; ++idim, axisdata += sizeof_axisdata) { + shape = NAD_SHAPE(axisdata); + delta = count % shape; + coord = NAD_COORD(axisdata) + delta; + count -= delta; + if (coord >= shape) { + coord -= shape; + count += shape; + } + NAD_COORD(axisdata) = coord; + if (count == 0) { + break; + } + count /= shape; + } + + /* If it incremented past the end, set to the last coordinate */ + if (count > 0) { + axisdata = NIT_AXISDATA(iter); + for (idim = 0; idim < ndim; ++idim, axisdata += sizeof_axisdata) { + NAD_COORD(axisdata) = NAD_SHAPE(axisdata)-1; + } + return 0; + } + + /* Go backwards through axisdata, applying the new coordinates */ + dataptr = NIT_RESETDATAPTR(iter); + nstrides = NAD_NSTRIDES(); + + /* + * Set the pointers, from the slowest-changing to the + * fastest-changing. The successive pointers accumulate + * the offsets, starting from the original data pointers. + */ + axisdata = NIT_AXISDATA(iter) + (ndim-1)*sizeof_axisdata; + for (idim = 0; idim < ndim; ++idim, axisdata -= sizeof_axisdata) { + npy_intp *strides; + char **ptrs; + + coord = NAD_COORD(axisdata); + strides = NAD_STRIDES(axisdata); + ptrs = NAD_PTRS(axisdata); + + for (istrides = 0; istrides < nstrides; ++istrides) { + ptrs[istrides] = dataptr[istrides] + coord*strides[istrides]; + } + + dataptr = ptrs; + } + + return 1; +} + +/* + * This checks how much space is left before we reach the end of + * the iterator, and whether the whole buffer can be done with one + * stride. 
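+ * ('One stride' means the remaining elements all lie within dimension 0,
+ * so a single strided transfer covers them.)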
+
+/*
+ * This checks how much space is left before we reach the end of
+ * the iterator, and whether the whole buffer can be done with one
+ * stride.
+ */
+static npy_intp
+npyiter_checkspaceleft(NpyIter *iter, npy_intp count,
+                       int *out_is_onestride)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    char *axisdata;
+    npy_intp sizeof_axisdata;
+    npy_intp coord, shape;
+    npy_intp spaceleft = 1, factor = 1;
+
+    sizeof_axisdata = NIT_SIZEOF_AXISDATA(itflags, ndim, niter);
+    axisdata = NIT_AXISDATA(iter);
+
+    /* Go forward through axisdata, calculating the space left */
+    for (idim = 0; idim < ndim && spaceleft < count;
+                            ++idim, axisdata += sizeof_axisdata) {
+        shape = NAD_SHAPE(axisdata);
+        coord = NAD_COORD(axisdata);
+        spaceleft += (shape-coord-1) * factor;
+        factor *= shape;
+    }
+
+    /* If we broke out after dimension 0, it works with one stride */
+    if (idim == 1) {
+        *out_is_onestride = 1;
+    }
+    else {
+        *out_is_onestride = 0;
+    }
+
+    /* Return either count or how much space is left */
+    if (spaceleft < count) {
+        return spaceleft;
+    }
+    else {
+        return count;
+    }
+}
+
+/*
+ * This gets called after the buffers have been exhausted, and
+ * their data needs to be written back to the arrays. The coordinates
+ * must be positioned for the beginning of the buffer.
+ */
+static void
+npyiter_copy_from_buffers(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp ndim = NIT_NDIM(iter);
+    npy_intp iiter, niter = NIT_NITER(iter);
+
+    char *op_itflags = NIT_OPITFLAGS(iter);
+    char *bufferdata = NIT_BUFFERDATA(iter),
+         *axisdata = NIT_AXISDATA(iter);
+
+    PyArray_Descr **dtypes = NIT_DTYPES(iter);
+    npy_intp transfersize = NBF_SIZE(bufferdata);
+    npy_intp *strides = NBF_STRIDES(bufferdata),
+             *ad_strides = NAD_STRIDES(axisdata);
+    char **ptrs = NBF_PTRS(bufferdata), **ad_ptrs = NAD_PTRS(axisdata);
+    char **buffers = NBF_BUFFERS(bufferdata);
+
+    PyArray_StridedTransferFn stransfer = NULL;
+    void *transferdata = NULL;
+
+    npy_intp axisdata_incr = NIT_SIZEOF_AXISDATA(itflags, ndim, niter) /
+                                NPY_SIZEOF_INTP;
+
+    for (iiter = 0; iiter < niter; ++iiter) {
+        stransfer = NBF_WRITETRANSFERFN(bufferdata)[iiter];
+        transferdata = NBF_WRITETRANSFERDATA(bufferdata)[iiter];
+        if ((stransfer != NULL) && (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE)) {
+            /* Copy back only if the pointer was pointing to the buffer */
+            npy_intp delta = (ptrs[iiter] - buffers[iiter]);
+            if (0 <= delta && delta <= transfersize*dtypes[iiter]->elsize) {
+                /*printf("transfer %p <- %p\n", ad_ptrs[iiter], buffers[iiter]);*/
+                PyArray_TransferStridedToNDim(ndim,
+                        ad_ptrs[iiter], &ad_strides[iiter], axisdata_incr,
+                        buffers[iiter], strides[iiter],
+                        &NAD_COORD(axisdata), axisdata_incr,
+                        &NAD_SHAPE(axisdata), axisdata_incr,
+                        transfersize, dtypes[iiter]->elsize,
+                        stransfer,
+                        transferdata);
+            }
+        }
+    }
+}
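
The space-left computation in npyiter_checkspaceleft is the mirror image of the jump: each axis contributes (shape-coord-1) elements weighted by the product of the faster axes' lengths, plus one for the current element. A standalone sketch with illustrative numbers, not part of the patch:

    #include <stdio.h>

    int main(void)
    {
        long shape[3] = {5, 4, 3};   /* fastest-varying axis first */
        long coord[3] = {3, 3, 1};   /* current position */
        long count = 100;            /* requested buffer size */
        long spaceleft = 1, factor = 1;
        int idim;

        for (idim = 0; idim < 3 && spaceleft < count; ++idim) {
            spaceleft += (shape[idim] - coord[idim] - 1) * factor;
            factor *= shape[idim];
        }

        /* one stride is possible only if the loop stopped after dim 0 */
        printf("is_onestride = %d\n", idim == 1);
        /* 60 elements total, flat position 38, so 22 remain */
        printf("transfersize = %ld\n", spaceleft < count ? spaceleft : count);
        return 0;
    }

npyiter_copy_to_buffers below clips its requested buffersize to exactly this value, so a buffer never runs past the end of the iteration.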
+
+/*
+ * This gets called after the iterator has been positioned to coordinates
+ * for the start of a buffer. It decides which operands need a buffer,
+ * and copies the data into the buffers.
+ */
+static void
+npyiter_copy_to_buffers(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp ndim = NIT_NDIM(iter);
+    npy_intp iiter, niter = NIT_NITER(iter);
+
+    char *op_itflags = NIT_OPITFLAGS(iter);
+    char *bufferdata = NIT_BUFFERDATA(iter),
+         *axisdata = NIT_AXISDATA(iter);
+
+    PyArray_Descr **dtypes = NIT_DTYPES(iter);
+    npy_intp *strides = NBF_STRIDES(bufferdata),
+             *ad_strides = NAD_STRIDES(axisdata);
+    char **ptrs = NBF_PTRS(bufferdata), **ad_ptrs = NAD_PTRS(axisdata);
+    char **buffers = NBF_BUFFERS(bufferdata);
+    npy_intp buffersize = NBF_BUFFERSIZE(bufferdata), transfersize;
+    int is_onestride = 0, any_buffered = 0;
+
+    PyArray_StridedTransferFn stransfer = NULL;
+    void *transferdata = NULL;
+
+    npy_intp axisdata_incr = NIT_SIZEOF_AXISDATA(itflags, ndim, niter) /
+                                NPY_SIZEOF_INTP;
+
+    transfersize = npyiter_checkspaceleft(iter, buffersize, &is_onestride);
+    NBF_SIZE(bufferdata) = transfersize;
+    NBF_POS(bufferdata) = 0;
+
+    for (iiter = 0; iiter < niter; ++iiter) {
+        /*
+         * If the buffer is write-only, these two are NULL, and the buffer
+         * pointers will be set up but the read copy won't be done
+         */
+        stransfer = NBF_READTRANSFERFN(bufferdata)[iiter];
+        transferdata = NBF_READTRANSFERDATA(bufferdata)[iiter];
+        switch (op_itflags[iiter]&
+                        (NPY_OP_ITFLAG_BUFNEVER|
+                         NPY_OP_ITFLAG_BUFSWAP|
+                         NPY_OP_ITFLAG_ALIGNED|
+                         NPY_OP_ITFLAG_CAST)) {
+            /* never need to buffer this operand */
+            case NPY_OP_ITFLAG_BUFNEVER:
+                ptrs[iiter] = ad_ptrs[iiter];
+                strides[iiter] = ad_strides[iiter];
+                stransfer = NULL;
+                break;
+            /* aligned copy */
+            case NPY_OP_ITFLAG_ALIGNED:
+                /*
+                 * Since all we're doing is copying the data to make
+                 * it work with one stride, if it already does there's
+                 * no need to copy.
+                 */
+                if (is_onestride) {
+                    ptrs[iiter] = ad_ptrs[iiter];
+                    strides[iiter] = ad_strides[iiter];
+                    stransfer = NULL;
+                }
+                else {
+                    /* In this case, the buffer is being used */
+                    ptrs[iiter] = buffers[iiter];
+                    strides[iiter] = dtypes[iiter]->elsize;
+                }
+                break;
+            /* unaligned copy */
+            case 0:
+                /*
+                 * If aligned data wasn't requested (as seen by the
+                 * copyswap flag not being set), all we're doing is
+                 * copying the data to make it work with one stride.
+                 * If it already does there's no need to copy.
+                 */
+                if (is_onestride &&
+                            !(op_itflags[iiter]&NPY_OP_ITFLAG_COPYSWAP)) {
+                    ptrs[iiter] = ad_ptrs[iiter];
+                    strides[iiter] = ad_strides[iiter];
+                    stransfer = NULL;
+                }
+                else {
+                    /* In this case, the buffer is being used */
+                    ptrs[iiter] = buffers[iiter];
+                    strides[iiter] = dtypes[iiter]->elsize;
+                }
+                break;
+            default:
+                /* In this case, the buffer is being used */
+                ptrs[iiter] = buffers[iiter];
+                strides[iiter] = dtypes[iiter]->elsize;
+                break;
+        }
+
+        if (stransfer != NULL) {
+            /*printf("transfer %p -> %p\n", ad_ptrs[iiter], ptrs[iiter]);*/
+            any_buffered = 1;
+            PyArray_TransferNDimToStrided(ndim,
+                    ptrs[iiter], strides[iiter],
+                    ad_ptrs[iiter], &ad_strides[iiter], axisdata_incr,
+                    &NAD_COORD(axisdata), axisdata_incr,
+                    &NAD_SHAPE(axisdata), axisdata_incr,
+                    transfersize, dtypes[iiter]->elsize,
+                    stransfer,
+                    transferdata);
+        }
+    }
+
+    /*
+     * If buffering wasn't needed, we can grow the inner
+     * loop to as large as possible.
+     */
+    if (!any_buffered && (itflags&NPY_ITFLAG_GROWINNER)) {
+        npy_intp maxsize = NAD_SHAPE(axisdata)-NAD_COORD(axisdata);
+        if (maxsize > transfersize) {
+            NBF_SIZE(bufferdata) = maxsize;
+        }
+    }
+}
+
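
How these four static functions compose is not shown in this diff; a hedged in-file sketch of the intended flow follows. user_inner_loop is a hypothetical stand-in for user code, and the loop structure itself is an assumption, not the patch's actual driver:

    /* hypothetical consumer of one buffered chunk */
    static void user_inner_loop(char **ptrs, npy_intp *strides,
                                npy_intp count);

    static int
    npyiter_buffered_loop_sketch(NpyIter *iter)
    {
        char *bufferdata = NIT_BUFFERDATA(iter);

        for (;;) {
            npy_intp transfersize;

            /* classify operands and fill any read buffers; this also
             * sets NBF_SIZE, possibly grown by GROWINNER */
            npyiter_copy_to_buffers(iter);
            transfersize = NBF_SIZE(bufferdata);

            /* process 'transfersize' elements through the buffer
             * pointers and single strides set up above */
            user_inner_loop(NBF_PTRS(bufferdata),
                            NBF_STRIDES(bufferdata), transfersize);

            /* flush buffered write operands back to their arrays
             * while the coordinates still mark the buffer's start */
            npyiter_copy_from_buffers(iter);

            /* advance the real coordinates past what was processed */
            if (!npyiter_jumpforward(iter, transfersize)) {
                break;  /* walked off the end; iteration is done */
            }
        }
        return 1;
    }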
 /* For debugging */
 NPY_NO_EXPORT void
 NpyIter_DebugPrint(NpyIter *iter)
@@ -2972,6 +3610,10 @@ NpyIter_DebugPrint(NpyIter *iter)
         printf("FORCEDORDER ");
     if (itflags&NPY_ITFLAG_NOINNER)
         printf("NOINNER ");
+    if (itflags&NPY_ITFLAG_BUFFER)
+        printf("BUFFER ");
+    if (itflags&NPY_ITFLAG_GROWINNER)
+        printf("GROWINNER ");
     printf("\n");
     printf("NDim: %d\n", (int)ndim);
     printf("NIter: %d\n", (int)niter);
@@ -3018,22 +3660,63 @@ NpyIter_DebugPrint(NpyIter *iter)
     printf("OpItFlags:\n");
     for (iiter = 0; iiter < niter; ++iiter) {
         printf(" Flags[%d]: ", (int)iiter);
-        if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_WRITE)
-            printf("WRITE ");
         if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_READ)
             printf("READ ");
+        if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_WRITE)
+            printf("WRITE ");
         if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_COPY)
             printf("COPY ");
         if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_CAST)
             printf("CAST ");
         if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_COPYSWAP)
             printf("COPYSWAP ");
-        if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_NOSUBTYPE)
-            printf("NOSUBTYPE ");
+        if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_BUFNEVER)
+            printf("BUFNEVER ");
+        if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_BUFSWAP)
+            printf("BUFSWAP ");
+        if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_ALIGNED)
+            printf("ALIGNED ");
         printf("\n");
     }
     printf("\n");
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        char *bufferdata = NIT_BUFFERDATA(iter);
+        printf("BufferData:\n");
+        printf(" BufferSize: %d\n", (int)NBF_BUFFERSIZE(bufferdata));
+        printf(" Size: %d\n", (int)NBF_SIZE(bufferdata));
+        printf(" Pos: %d\n", (int)NBF_POS(bufferdata));
+        printf(" Strides: ");
+        for (iiter = 0; iiter < niter; ++iiter)
+            printf("%d ", (int)NBF_STRIDES(bufferdata)[iiter]);
+        printf("\n");
+        printf(" Ptrs: ");
+        for (iiter = 0; iiter < niter; ++iiter)
+            printf("%p ", NBF_PTRS(bufferdata)[iiter]);
+        printf("\n");
+        printf(" ReadTransferFn: ");
+        for (iiter = 0; iiter < niter; ++iiter)
+            printf("%p ", NBF_READTRANSFERFN(bufferdata)[iiter]);
+        printf("\n");
+        printf(" ReadTransferData: ");
+        for (iiter = 0; iiter < niter; ++iiter)
+            printf("%p ", NBF_READTRANSFERDATA(bufferdata)[iiter]);
+        printf("\n");
+        printf(" WriteTransferFn: ");
+        for (iiter = 0; iiter < niter; ++iiter)
+            printf("%p ", NBF_WRITETRANSFERFN(bufferdata)[iiter]);
+        printf("\n");
+        printf(" WriteTransferData: ");
+        for (iiter = 0; iiter < niter; ++iiter)
+            printf("%p ", NBF_WRITETRANSFERDATA(bufferdata)[iiter]);
+        printf("\n");
+        printf(" Buffers: ");
+        for (iiter = 0; iiter < niter; ++iiter)
+            printf("%p ", NBF_BUFFERS(bufferdata)[iiter]);
+        printf("\n");
+        printf("\n");
+    }
+
     axisdata = NIT_AXISDATA(iter);
     sizeof_axisdata = NIT_SIZEOF_AXISDATA(itflags, ndim, niter);
     for (idim = 0; idim < ndim; ++idim, axisdata += sizeof_axisdata) {
diff --git a/numpy/core/src/multiarray/new_iterator.h b/numpy/core/src/multiarray/new_iterator.h
index 63351471a..905642d79 100644
--- a/numpy/core/src/multiarray/new_iterator.h
+++ b/numpy/core/src/multiarray/new_iterator.h
@@ -16,13 +16,13 @@ typedef void (*NpyIter_GetCoords_Fn )(NpyIter *iter,
 
 /* Allocate a new iterator over one array object */
 NpyIter*
 NpyIter_New(PyArrayObject* op, npy_uint32 flags, PyArray_Descr* dtype,
-            npy_intp a_ndim, npy_intp *axes);
+            npy_intp a_ndim, npy_intp *axes, npy_intp buffersize);
 
 /* Allocate a new iterator over multiple array objects */
 NpyIter*
 NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
                  npy_uint32 *op_flags,
                  PyArray_Descr **op_request_dtypes,
-                 npy_intp oa_ndim, npy_intp **op_axes);
+                 npy_intp oa_ndim, npy_intp **op_axes, npy_intp buffersize);
 
 /* Removes coords support from an iterator */
 int NpyIter_RemoveCoords(NpyIter *iter);
@@ -107,6 +107,10 @@ NPY_NO_EXPORT void NpyIter_DebugPrint(NpyIter *iter);
 #define NPY_ITER_COMMON_DATA_TYPE          0x00000080
 /* Produce offsets instead of pointers into the data */
 #define NPY_ITER_OFFSETS                   0x00000100
+/* Enables buffering */
+#define NPY_ITER_BUFFERED                  0x00000200
+/* Enables buffering, and grows the inner loop when possible */
+#define NPY_ITER_BUFFERED_GROWINNER        0x00000400
 
 /*** Per-operand flags that may be passed to the iterator constructors ***/
diff --git a/numpy/core/src/multiarray/new_iterator_pywrap.c b/numpy/core/src/multiarray/new_iterator_pywrap.c
index 3ec9008fb..0695d7f4c 100644
--- a/numpy/core/src/multiarray/new_iterator_pywrap.c
+++ b/numpy/core/src/multiarray/new_iterator_pywrap.c
@@ -79,12 +79,12 @@ static int
 npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds)
 {
     static char *kwlist[] = {"op", "flags", "op_flags", "op_dtypes",
-                             "op_axes"};
+                             "op_axes", "buffersize", NULL};
 
     PyObject *op_in, *flags_in = NULL, *op_flags_in = NULL,
              *op_dtypes_in = NULL, *op_axes_in = NULL;
 
-    npy_intp iiter;
+    npy_intp iiter, buffersize = 0;
     npy_intp niter = 0;
     PyArrayObject *op[NPY_MAXARGS];
 
@@ -101,9 +101,9 @@ npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds)
         return -1;
     }
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OOOO", kwlist,
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OOOOn", kwlist,
                     &op_in, &flags_in, &op_flags_in, &op_dtypes_in,
-                    &op_axes_in)) {
+                    &op_axes_in, &buffersize)) {
         return -1;
     }
 
@@ -170,6 +170,14 @@ npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds)
         }
         /* Use switch statements to quickly isolate the right flag */
         switch (str[0]) {
+            case 'b':
+                if (strcmp(str, "buffered") == 0) {
+                    flag = NPY_ITER_BUFFERED;
+                }
+                else if (strcmp(str, "buffered_growinner") == 0) {
+                    flag = NPY_ITER_BUFFERED_GROWINNER;
+                }
+                break;
             case 'c':
                 if (length >= 6) switch (str[5]) {
                     case 'e':
@@ -531,7 +539,8 @@ npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds)
 
     self->iter = NpyIter_MultiNew(niter, op, flags, op_flags,
                                   (PyArray_Descr**)op_request_dtypes,
-                                  oa_ndim, oa_ndim > 0 ? op_axes : NULL);
+                                  oa_ndim, oa_ndim > 0 ? op_axes : NULL,
+                                  buffersize);
 
     if (self->iter == NULL) {
         goto fail;
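
With the header and wrapper changes in place, buffering is requested when the iterator is constructed. The sketch below uses only the constructor signature and accessors visible in this diff; NPY_ITER_READONLY, NPY_ITER_NO_INNER_ITERATION, and NpyIter_Deallocate are assumed to be declared elsewhere in new_iterator.h, and the whole function is illustrative rather than part of the patch:

    #include <Python.h>
    #include <stdio.h>
    #include <numpy/ndarrayobject.h>
    #include "new_iterator.h"

    static int
    inspect_buffered_iter(PyArrayObject *a, PyArrayObject *b)
    {
        PyArrayObject *op[2] = {a, b};
        npy_uint32 op_flags[2] = {NPY_ITER_READONLY, NPY_ITER_READONLY};
        npy_intp *inner_strides, *inner_sizeptr;
        NpyIter *iter;

        /* BUFFERED_GROWINNER enables buffering and lets the inner loop
         * grow when no copying is needed; a buffersize of 0 requests
         * the iterator's default buffer size */
        iter = NpyIter_MultiNew(2, op,
                    NPY_ITER_NO_INNER_ITERATION | NPY_ITER_BUFFERED_GROWINNER,
                    op_flags, NULL, 0, NULL, 0);
        if (iter == NULL) {
            return -1;
        }

        /* with NPY_ITFLAG_BUFFER set, these accessors now read from the
         * bufferdata instead of the axisdata (see the changes to
         * NpyIter_GetInnerStrideArray/GetInnerLoopSizePtr above) */
        inner_strides = NpyIter_GetInnerStrideArray(iter);
        inner_sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
        printf("inner loop: %d elements, strides %d and %d\n",
               (int)*inner_sizeptr, (int)inner_strides[0],
               (int)inner_strides[1]);

        NpyIter_DebugPrint(iter);
        NpyIter_Deallocate(iter);
        return 0;
    }

On the Python side, npyiter_init above exposes the same request as the flag strings "buffered" or "buffered_growinner" together with the new buffersize keyword.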