diff options
Diffstat (limited to 'doc/source/reference')
-rw-r--r-- | doc/source/reference/arrays.nditer.cython.rst | 147 | ||||
-rw-r--r-- | doc/source/reference/arrays.nditer.rst | 181 | ||||
-rw-r--r-- | doc/source/reference/c-api/array.rst | 2 | ||||
-rw-r--r-- | doc/source/reference/c-api/index.rst | 2 | ||||
-rw-r--r-- | doc/source/reference/index.rst | 2 | ||||
-rw-r--r-- | doc/source/reference/maskedarray.baseclass.rst | 1 | ||||
-rw-r--r-- | doc/source/reference/routines.ma.rst | 1 |
7 files changed, 173 insertions, 163 deletions
diff --git a/doc/source/reference/arrays.nditer.cython.rst b/doc/source/reference/arrays.nditer.cython.rst new file mode 100644 index 000000000..2cc7763ed --- /dev/null +++ b/doc/source/reference/arrays.nditer.cython.rst @@ -0,0 +1,147 @@ +Putting the Inner Loop in Cython +================================ + +Those who want really good performance out of their low level operations +should strongly consider directly using the iteration API provided +in C, but for those who are not comfortable with C or C++, Cython +is a good middle ground with reasonable performance tradeoffs. For +the :class:`nditer` object, this means letting the iterator take care +of broadcasting, dtype conversion, and buffering, while giving the inner +loop to Cython. + +For our example, we'll create a sum of squares function. To start, +let's implement this function in straightforward Python. We want to +support an 'axis' parameter similar to the numpy :func:`sum` function, +so we will need to construct a list for the `op_axes` parameter. +Here's how this looks. + +.. admonition:: Example + + >>> def axis_to_axeslist(axis, ndim): + ... if axis is None: + ... return [-1] * ndim + ... else: + ... if type(axis) is not tuple: + ... axis = (axis,) + ... axeslist = [1] * ndim + ... for i in axis: + ... axeslist[i] = -1 + ... ax = 0 + ... for i in range(ndim): + ... if axeslist[i] != -1: + ... axeslist[i] = ax + ... ax += 1 + ... return axeslist + ... + >>> def sum_squares_py(arr, axis=None, out=None): + ... axeslist = axis_to_axeslist(axis, arr.ndim) + ... it = np.nditer([arr, out], flags=['reduce_ok', + ... 'buffered', 'delay_bufalloc'], + ... op_flags=[['readonly'], ['readwrite', 'allocate']], + ... op_axes=[None, axeslist], + ... op_dtypes=['float64', 'float64']) + ... with it: + ... it.operands[1][...] = 0 + ... it.reset() + ... for x, y in it: + ... y[...] += x*x + ... return it.operands[1] + ... + >>> a = np.arange(6).reshape(2,3) + >>> sum_squares_py(a) + array(55.0) + >>> sum_squares_py(a, axis=-1) + array([ 5., 50.]) + +To Cython-ize this function, we replace the inner loop (y[...] += x*x) with +Cython code that's specialized for the float64 dtype. With the +'external_loop' flag enabled, the arrays provided to the inner loop will +always be one-dimensional, so very little checking needs to be done. + +Here's the listing of sum_squares.pyx:: + + import numpy as np + cimport numpy as np + cimport cython + + def axis_to_axeslist(axis, ndim): + if axis is None: + return [-1] * ndim + else: + if type(axis) is not tuple: + axis = (axis,) + axeslist = [1] * ndim + for i in axis: + axeslist[i] = -1 + ax = 0 + for i in range(ndim): + if axeslist[i] != -1: + axeslist[i] = ax + ax += 1 + return axeslist + + @cython.boundscheck(False) + def sum_squares_cy(arr, axis=None, out=None): + cdef np.ndarray[double] x + cdef np.ndarray[double] y + cdef int size + cdef double value + + axeslist = axis_to_axeslist(axis, arr.ndim) + it = np.nditer([arr, out], flags=['reduce_ok', 'external_loop', + 'buffered', 'delay_bufalloc'], + op_flags=[['readonly'], ['readwrite', 'allocate']], + op_axes=[None, axeslist], + op_dtypes=['float64', 'float64']) + with it: + it.operands[1][...] = 0 + it.reset() + for xarr, yarr in it: + x = xarr + y = yarr + size = x.shape[0] + for i in range(size): + value = x[i] + y[i] = y[i] + value * value + return it.operands[1] + +On this machine, building the .pyx file into a module looked like the +following, but you may have to find some Cython tutorials to tell you +the specifics for your system configuration.:: + + $ cython sum_squares.pyx + $ gcc -shared -pthread -fPIC -fwrapv -O2 -Wall -I/usr/include/python2.7 -fno-strict-aliasing -o sum_squares.so sum_squares.c + +Running this from the Python interpreter produces the same answers +as our native Python/NumPy code did. + +.. admonition:: Example + + >>> from sum_squares import sum_squares_cy + >>> a = np.arange(6).reshape(2,3) + >>> sum_squares_cy(a) + array(55.0) + >>> sum_squares_cy(a, axis=-1) + array([ 5., 50.]) + +Doing a little timing in IPython shows that the reduced overhead and +memory allocation of the Cython inner loop is providing a very nice +speedup over both the straightforward Python code and an expression +using NumPy's built-in sum function.:: + + >>> a = np.random.rand(1000,1000) + + >>> timeit sum_squares_py(a, axis=-1) + 10 loops, best of 3: 37.1 ms per loop + + >>> timeit np.sum(a*a, axis=-1) + 10 loops, best of 3: 20.9 ms per loop + + >>> timeit sum_squares_cy(a, axis=-1) + 100 loops, best of 3: 11.8 ms per loop + + >>> np.all(sum_squares_cy(a, axis=-1) == np.sum(a*a, axis=-1)) + True + + >>> np.all(sum_squares_py(a, axis=-1) == np.sum(a*a, axis=-1)) + True diff --git a/doc/source/reference/arrays.nditer.rst b/doc/source/reference/arrays.nditer.rst index 7dab09a71..2db12a408 100644 --- a/doc/source/reference/arrays.nditer.rst +++ b/doc/source/reference/arrays.nditer.rst @@ -1,5 +1,9 @@ .. currentmodule:: numpy +.. for doctests + The last section on Cython is 'included' at the end of this file. The tests + for that section are disabled. + .. _arrays.nditer: ********************* @@ -218,21 +222,21 @@ produce identical results to the ones in the previous section. >>> it = np.nditer(a, flags=['f_index']) >>> while not it.finished: ... print("%d <%d>" % (it[0], it.index), end=' ') - ... it.iternext() + ... is_not_finished = it.iternext() ... 0 <0> 1 <2> 2 <4> 3 <1> 4 <3> 5 <5> >>> it = np.nditer(a, flags=['multi_index']) >>> while not it.finished: ... print("%d <%s>" % (it[0], it.multi_index), end=' ') - ... it.iternext() + ... is_not_finished = it.iternext() ... 0 <(0, 0)> 1 <(0, 1)> 2 <(0, 2)> 3 <(1, 0)> 4 <(1, 1)> 5 <(1, 2)> >>> with np.nditer(a, flags=['multi_index'], op_flags=['writeonly']) as it: ... while not it.finished: ... it[0] = it.multi_index[1] - it.multi_index[0] - ... it.iternext() + ... is_not_finished = it.iternext() ... >>> a array([[ 0, 1, 2], @@ -316,12 +320,13 @@ specified as an iterator flag. ... op_dtypes=['complex128']): ... print(np.sqrt(x), end=' ') ... - 1.73205080757j 1.41421356237j 1j 0j (1+0j) (1.41421356237+0j) + 1.7320508075688772j 1.4142135623730951j 1j 0j (1+0j) (1.4142135623730951+0j) >>> for x in np.nditer(a, flags=['buffered'], op_dtypes=['complex128']): ... print(np.sqrt(x), end=' ') ... - 1.73205080757j 1.41421356237j 1j 0j (1+0j) (1.41421356237+0j) + 1.7320508075688772j 1.4142135623730951j 1j 0j (1+0j) (1.4142135623730951+0j) + The iterator uses NumPy's casting rules to determine whether a specific conversion is permitted. By default, it enforces 'safe' casting. This means, @@ -405,8 +410,8 @@ which includes the input shapes to help diagnose the problem. ... print("%d:%d" % (x,y), end=' ') ... Traceback (most recent call last): - File "<stdin>", line 1, in <module> - ValueError: operands could not be broadcast together with shapes (2) (2,3) + ... + ValueError: operands could not be broadcast together with shapes (2,) (2,3) Iterator-Allocated Output Arrays -------------------------------- @@ -482,9 +487,9 @@ reasons. >>> square(np.arange(6).reshape(2,3), out=b) Traceback (most recent call last): - File "<stdin>", line 1, in <module> - File "<stdin>", line 4, in square - ValueError: non-broadcastable output operand with shape (3) doesn't match the broadcast shape (2,3) + ... + ValueError: non-broadcastable output operand with shape (3,) doesn't + match the broadcast shape (2,3) Outer Product Iteration ----------------------- @@ -550,7 +555,7 @@ For a simple example, consider taking the sum of all elements in an array. >>> a = np.arange(24).reshape(2,3,4) >>> b = np.array(0) - >>> with np.nditer([a, b], flags=['reduce_ok', 'external_loop'], + >>> with np.nditer([a, b], flags=['reduce_ok'], ... op_flags=[['readonly'], ['readwrite']]) as it: ... for x,y in it: ... y[...] += x @@ -568,7 +573,7 @@ sums along the last axis of `a`. .. admonition:: Example >>> a = np.arange(24).reshape(2,3,4) - >>> it = np.nditer([a, None], flags=['reduce_ok', 'external_loop'], + >>> it = np.nditer([a, None], flags=['reduce_ok'], ... op_flags=[['readonly'], ['readwrite', 'allocate']], ... op_axes=[None, [0,1,-1]]) >>> with it: @@ -602,7 +607,7 @@ buffering. .. admonition:: Example >>> a = np.arange(24).reshape(2,3,4) - >>> it = np.nditer([a, None], flags=['reduce_ok', 'external_loop', + >>> it = np.nditer([a, None], flags=['reduce_ok', ... 'buffered', 'delay_bufalloc'], ... op_flags=[['readonly'], ['readwrite', 'allocate']], ... op_axes=[None, [0,1,-1]]) @@ -617,150 +622,8 @@ buffering. array([[ 6, 22, 38], [54, 70, 86]]) -Putting the Inner Loop in Cython -================================ - -Those who want really good performance out of their low level operations -should strongly consider directly using the iteration API provided -in C, but for those who are not comfortable with C or C++, Cython -is a good middle ground with reasonable performance tradeoffs. For -the :class:`nditer` object, this means letting the iterator take care -of broadcasting, dtype conversion, and buffering, while giving the inner -loop to Cython. - -For our example, we'll create a sum of squares function. To start, -let's implement this function in straightforward Python. We want to -support an 'axis' parameter similar to the numpy :func:`sum` function, -so we will need to construct a list for the `op_axes` parameter. -Here's how this looks. - -.. admonition:: Example - - >>> def axis_to_axeslist(axis, ndim): - ... if axis is None: - ... return [-1] * ndim - ... else: - ... if type(axis) is not tuple: - ... axis = (axis,) - ... axeslist = [1] * ndim - ... for i in axis: - ... axeslist[i] = -1 - ... ax = 0 - ... for i in range(ndim): - ... if axeslist[i] != -1: - ... axeslist[i] = ax - ... ax += 1 - ... return axeslist - ... - >>> def sum_squares_py(arr, axis=None, out=None): - ... axeslist = axis_to_axeslist(axis, arr.ndim) - ... it = np.nditer([arr, out], flags=['reduce_ok', 'external_loop', - ... 'buffered', 'delay_bufalloc'], - ... op_flags=[['readonly'], ['readwrite', 'allocate']], - ... op_axes=[None, axeslist], - ... op_dtypes=['float64', 'float64']) - ... with it: - ... it.operands[1][...] = 0 - ... it.reset() - ... for x, y in it: - ... y[...] += x*x - ... return it.operands[1] - ... - >>> a = np.arange(6).reshape(2,3) - >>> sum_squares_py(a) - array(55.0) - >>> sum_squares_py(a, axis=-1) - array([ 5., 50.]) - -To Cython-ize this function, we replace the inner loop (y[...] += x*x) with -Cython code that's specialized for the float64 dtype. With the -'external_loop' flag enabled, the arrays provided to the inner loop will -always be one-dimensional, so very little checking needs to be done. - -Here's the listing of sum_squares.pyx:: - - import numpy as np - cimport numpy as np - cimport cython - - def axis_to_axeslist(axis, ndim): - if axis is None: - return [-1] * ndim - else: - if type(axis) is not tuple: - axis = (axis,) - axeslist = [1] * ndim - for i in axis: - axeslist[i] = -1 - ax = 0 - for i in range(ndim): - if axeslist[i] != -1: - axeslist[i] = ax - ax += 1 - return axeslist - - @cython.boundscheck(False) - def sum_squares_cy(arr, axis=None, out=None): - cdef np.ndarray[double] x - cdef np.ndarray[double] y - cdef int size - cdef double value - - axeslist = axis_to_axeslist(axis, arr.ndim) - it = np.nditer([arr, out], flags=['reduce_ok', 'external_loop', - 'buffered', 'delay_bufalloc'], - op_flags=[['readonly'], ['readwrite', 'allocate']], - op_axes=[None, axeslist], - op_dtypes=['float64', 'float64']) - with it: - it.operands[1][...] = 0 - it.reset() - for xarr, yarr in it: - x = xarr - y = yarr - size = x.shape[0] - for i in range(size): - value = x[i] - y[i] = y[i] + value * value - return it.operands[1] - -On this machine, building the .pyx file into a module looked like the -following, but you may have to find some Cython tutorials to tell you -the specifics for your system configuration.:: - - $ cython sum_squares.pyx - $ gcc -shared -pthread -fPIC -fwrapv -O2 -Wall -I/usr/include/python2.7 -fno-strict-aliasing -o sum_squares.so sum_squares.c - -Running this from the Python interpreter produces the same answers -as our native Python/NumPy code did. - -.. admonition:: Example - - >>> from sum_squares import sum_squares_cy - >>> a = np.arange(6).reshape(2,3) - >>> sum_squares_cy(a) - array(55.0) - >>> sum_squares_cy(a, axis=-1) - array([ 5., 50.]) - -Doing a little timing in IPython shows that the reduced overhead and -memory allocation of the Cython inner loop is providing a very nice -speedup over both the straightforward Python code and an expression -using NumPy's built-in sum function.:: - - >>> a = np.random.rand(1000,1000) - - >>> timeit sum_squares_py(a, axis=-1) - 10 loops, best of 3: 37.1 ms per loop - - >>> timeit np.sum(a*a, axis=-1) - 10 loops, best of 3: 20.9 ms per loop - - >>> timeit sum_squares_cy(a, axis=-1) - 100 loops, best of 3: 11.8 ms per loop - - >>> np.all(sum_squares_cy(a, axis=-1) == np.sum(a*a, axis=-1)) - True +.. for doctests + Include Cython section separately. Those tests are skipped entirely via an + entry in RST_SKIPLIST - >>> np.all(sum_squares_py(a, axis=-1) == np.sum(a*a, axis=-1)) - True +.. include:: arrays.nditer.cython.rst diff --git a/doc/source/reference/c-api/array.rst b/doc/source/reference/c-api/array.rst index 46af1b45b..22b15fc57 100644 --- a/doc/source/reference/c-api/array.rst +++ b/doc/source/reference/c-api/array.rst @@ -834,7 +834,7 @@ General check of Python Type .. c:function:: PyArray_IsPythonScalar(op) Evaluates true if *op* is a builtin Python scalar object (int, - float, complex, str, unicode, long, bool). + float, complex, bytes, str, long, bool). .. c:function:: PyArray_IsAnyScalar(op) diff --git a/doc/source/reference/c-api/index.rst b/doc/source/reference/c-api/index.rst index 56fe8e473..bb1ed154e 100644 --- a/doc/source/reference/c-api/index.rst +++ b/doc/source/reference/c-api/index.rst @@ -21,7 +21,7 @@ experience at first. Be assured that the task becomes easier with practice, and you may be surprised at how simple the C-code can be to understand. Even if you don't think you can write C-code from scratch, it is much easier to understand and modify already-written source code -then create it *de novo*. +than create it *de novo*. Python extensions are especially straightforward to understand because they all have a very similar structure. Admittedly, NumPy is not a diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst index 6742d605a..d042a1849 100644 --- a/doc/source/reference/index.rst +++ b/doc/source/reference/index.rst @@ -12,7 +12,7 @@ NumPy Reference This reference manual details functions, modules, and objects included in NumPy, describing what they are and what they do. -For learning how to use NumPy, see also :ref:`user`. +For learning how to use NumPy, see the :ref:`complete documentation <manual>`. .. toctree:: diff --git a/doc/source/reference/maskedarray.baseclass.rst b/doc/source/reference/maskedarray.baseclass.rst index 9864f21ea..57bbaa8f8 100644 --- a/doc/source/reference/maskedarray.baseclass.rst +++ b/doc/source/reference/maskedarray.baseclass.rst @@ -125,7 +125,6 @@ Conversion MaskedArray.__float__ MaskedArray.__int__ - MaskedArray.__long__ MaskedArray.view MaskedArray.astype diff --git a/doc/source/reference/routines.ma.rst b/doc/source/reference/routines.ma.rst index 5b2098c7a..346ce2a1b 100644 --- a/doc/source/reference/routines.ma.rst +++ b/doc/source/reference/routines.ma.rst @@ -396,6 +396,7 @@ Miscellanea ma.allequal ma.allclose ma.apply_along_axis + ma.apply_over_axes ma.arange ma.choose ma.ediff1d |