1 files changed, 22 insertions, 159 deletions
diff --git a/doc/source/reference/arrays.nditer.rst b/doc/source/reference/arrays.nditer.rst
index 7dab09a71..2db12a408 100644
--- a/doc/source/reference/arrays.nditer.rst
+++ b/doc/source/reference/arrays.nditer.rst
@@ -1,5 +1,9 @@
 .. currentmodule:: numpy
 
+.. for doctests
+   The last section on Cython is 'included' at the end of this file. The tests
+   for that section are disabled.
+
 .. _arrays.nditer:
 
 *********************
@@ -218,21 +222,21 @@ produce identical results to the ones in the previous section.
     >>> it = np.nditer(a, flags=['f_index'])
     >>> while not it.finished:
     ...     print("%d <%d>" % (it[0], it.index), end=' ')
-    ...     it.iternext()
+    ...     is_not_finished = it.iternext()
     ...
     0 <0> 1 <2> 2 <4> 3 <1> 4 <3> 5 <5>
 
     >>> it = np.nditer(a, flags=['multi_index'])
     >>> while not it.finished:
     ...     print("%d <%s>" % (it[0], it.multi_index), end=' ')
-    ...     it.iternext()
+    ...     is_not_finished = it.iternext()
     ...
     0 <(0, 0)> 1 <(0, 1)> 2 <(0, 2)> 3 <(1, 0)> 4 <(1, 1)> 5 <(1, 2)>
 
     >>> with np.nditer(a, flags=['multi_index'], op_flags=['writeonly']) as it:
     ...     while not it.finished:
     ...         it[0] = it.multi_index[1] - it.multi_index[0]
-    ...         it.iternext()
+    ...         is_not_finished = it.iternext()
     ...
     >>> a
     array([[ 0,  1,  2],
@@ -316,12 +320,13 @@ specified as an iterator flag.
     ...                 op_dtypes=['complex128']):
     ...     print(np.sqrt(x), end=' ')
     ...
-    1.73205080757j 1.41421356237j 1j 0j (1+0j) (1.41421356237+0j)
+    1.7320508075688772j 1.4142135623730951j 1j 0j (1+0j) (1.4142135623730951+0j)
 
     >>> for x in np.nditer(a, flags=['buffered'], op_dtypes=['complex128']):
     ...     print(np.sqrt(x), end=' ')
     ...
-    1.73205080757j 1.41421356237j 1j 0j (1+0j) (1.41421356237+0j)
+    1.7320508075688772j 1.4142135623730951j 1j 0j (1+0j) (1.4142135623730951+0j)
+
 
 The iterator uses NumPy's casting rules to determine whether a specific
 conversion is permitted. By default, it enforces 'safe' casting. This means,
@@ -405,8 +410,8 @@ which includes the input shapes to help diagnose the problem.
     ...     print("%d:%d" % (x,y), end=' ')
     ...
     Traceback (most recent call last):
-      File "<stdin>", line 1, in <module>
-    ValueError: operands could not be broadcast together with shapes (2) (2,3)
+    ...
+    ValueError: operands could not be broadcast together with shapes (2,) (2,3)
 
 Iterator-Allocated Output Arrays
 --------------------------------
@@ -482,9 +487,9 @@ reasons.
 
     >>> square(np.arange(6).reshape(2,3), out=b)
     Traceback (most recent call last):
-      File "<stdin>", line 1, in <module>
-      File "<stdin>", line 4, in square
-    ValueError: non-broadcastable output operand with shape (3) doesn't match the broadcast shape (2,3)
+      ...
+    ValueError: non-broadcastable output operand with shape (3,) doesn't
+    match the broadcast shape (2,3)
 
 Outer Product Iteration
 -----------------------
@@ -550,7 +555,7 @@ For a simple example, consider taking the sum of all elements in an array.
 
     >>> a = np.arange(24).reshape(2,3,4)
     >>> b = np.array(0)
-    >>> with np.nditer([a, b], flags=['reduce_ok', 'external_loop'],
+    >>> with np.nditer([a, b], flags=['reduce_ok'],
     ...                     op_flags=[['readonly'], ['readwrite']]) as it:
     ...     for x,y in it:
     ...         y[...] += x
@@ -568,7 +573,7 @@ sums along the last axis of `a`.
 .. admonition:: Example
 
     >>> a = np.arange(24).reshape(2,3,4)
-    >>> it = np.nditer([a, None], flags=['reduce_ok', 'external_loop'],
+    >>> it = np.nditer([a, None], flags=['reduce_ok'],
     ...             op_flags=[['readonly'], ['readwrite', 'allocate']],
     ...             op_axes=[None, [0,1,-1]])
     >>> with it:
@@ -602,7 +607,7 @@ buffering.
 .. admonition:: Example
 
     >>> a = np.arange(24).reshape(2,3,4)
-    >>> it = np.nditer([a, None], flags=['reduce_ok', 'external_loop',
+    >>> it = np.nditer([a, None], flags=['reduce_ok',
     ...                                  'buffered', 'delay_bufalloc'],
     ...             op_flags=[['readonly'], ['readwrite', 'allocate']],
     ...             op_axes=[None, [0,1,-1]])
@@ -617,150 +622,8 @@ buffering.
     array([[ 6, 22, 38],
            [54, 70, 86]])
 
-Putting the Inner Loop in Cython
-================================
-
-Those who want really good performance out of their low level operations
-should strongly consider directly using the iteration API provided
-in C, but for those who are not comfortable with C or C++, Cython
-is a good middle ground with reasonable performance tradeoffs. For
-the :class:`nditer` object, this means letting the iterator take care
-of broadcasting, dtype conversion, and buffering, while giving the inner
-loop to Cython.
-
-For our example, we'll create a sum of squares function. To start,
-let's implement this function in straightforward Python. We want to
-support an 'axis' parameter similar to the numpy :func:`sum` function,
-so we will need to construct a list for the `op_axes` parameter.
-Here's how this looks.
-
-.. admonition:: Example
-
-    >>> def axis_to_axeslist(axis, ndim):
-    ...     if axis is None:
-    ...         return [-1] * ndim
-    ...     else:
-    ...         if type(axis) is not tuple:
-    ...             axis = (axis,)
-    ...         axeslist = [1] * ndim
-    ...         for i in axis:
-    ...             axeslist[i] = -1
-    ...         ax = 0
-    ...         for i in range(ndim):
-    ...             if axeslist[i] != -1:
-    ...                 axeslist[i] = ax
-    ...                 ax += 1
-    ...         return axeslist
-    ...
-    >>> def sum_squares_py(arr, axis=None, out=None):
-    ...     axeslist = axis_to_axeslist(axis, arr.ndim)
-    ...     it = np.nditer([arr, out], flags=['reduce_ok', 'external_loop',
-    ...                                       'buffered', 'delay_bufalloc'],
-    ...                 op_flags=[['readonly'], ['readwrite', 'allocate']],
-    ...                 op_axes=[None, axeslist],
-    ...                 op_dtypes=['float64', 'float64'])
-    ...     with it:
-    ...         it.operands[1][...] = 0
-    ...         it.reset()
-    ...         for x, y in it:
-    ...             y[...] += x*x
-    ...         return it.operands[1]
-    ...
-    >>> a = np.arange(6).reshape(2,3)
-    >>> sum_squares_py(a)
-    array(55.0)
-    >>> sum_squares_py(a, axis=-1)
-    array([  5.,  50.])
-
-To Cython-ize this function, we replace the inner loop (y[...] += x*x) with
-Cython code that's specialized for the float64 dtype. With the
-'external_loop' flag enabled, the arrays provided to the inner loop will
-always be one-dimensional, so very little checking needs to be done.
-
-Here's the listing of sum_squares.pyx::
-
-    import numpy as np
-    cimport numpy as np
-    cimport cython
-
-    def axis_to_axeslist(axis, ndim):
-        if axis is None:
-            return [-1] * ndim
-        else:
-            if type(axis) is not tuple:
-                axis = (axis,)
-            axeslist = [1] * ndim
-            for i in axis:
-                axeslist[i] = -1
-            ax = 0
-            for i in range(ndim):
-                if axeslist[i] != -1:
-                    axeslist[i] = ax
-                    ax += 1
-            return axeslist
-
-    @cython.boundscheck(False)
-    def sum_squares_cy(arr, axis=None, out=None):
-        cdef np.ndarray[double] x
-        cdef np.ndarray[double] y
-        cdef int size
-        cdef double value
-
-        axeslist = axis_to_axeslist(axis, arr.ndim)
-        it = np.nditer([arr, out], flags=['reduce_ok', 'external_loop',
-                                          'buffered', 'delay_bufalloc'],
-                    op_flags=[['readonly'], ['readwrite', 'allocate']],
-                    op_axes=[None, axeslist],
-                    op_dtypes=['float64', 'float64'])
-        with it:
-            it.operands[1][...] = 0
-            it.reset()
-            for xarr, yarr in it:
-                x = xarr
-                y = yarr
-                size = x.shape[0]
-                for i in range(size):
-                   value = x[i]
-                   y[i] = y[i] + value * value
-            return it.operands[1]
-
-On this machine, building the .pyx file into a module looked like the
-following, but you may have to find some Cython tutorials to tell you
-the specifics for your system configuration.::
-
-    $ cython sum_squares.pyx
-    $ gcc -shared -pthread -fPIC -fwrapv -O2 -Wall -I/usr/include/python2.7 -fno-strict-aliasing -o sum_squares.so sum_squares.c
-
-Running this from the Python interpreter produces the same answers
-as our native Python/NumPy code did.
-
-.. admonition:: Example
-
-    >>> from sum_squares import sum_squares_cy
-    >>> a = np.arange(6).reshape(2,3)
-    >>> sum_squares_cy(a)
-    array(55.0)
-    >>> sum_squares_cy(a, axis=-1)
-    array([  5.,  50.])
-
-Doing a little timing in IPython shows that the reduced overhead and
-memory allocation of the Cython inner loop is providing a very nice
-speedup over both the straightforward Python code and an expression
-using NumPy's built-in sum function.::
-
-    >>> a = np.random.rand(1000,1000)
-
-    >>> timeit sum_squares_py(a, axis=-1)
-    10 loops, best of 3: 37.1 ms per loop
-
-    >>> timeit np.sum(a*a, axis=-1)
-    10 loops, best of 3: 20.9 ms per loop
-
-    >>> timeit sum_squares_cy(a, axis=-1)
-    100 loops, best of 3: 11.8 ms per loop
-
-    >>> np.all(sum_squares_cy(a, axis=-1) == np.sum(a*a, axis=-1))
-    True
+.. for doctests
+   Include Cython section separately. Those tests are skipped entirely via an
+   entry in RST_SKIPLIST
 
-    >>> np.all(sum_squares_py(a, axis=-1) == np.sum(a*a, axis=-1))
-    True
+.. include:: arrays.nditer.cython.rst