7 files changed, 173 insertions, 163 deletions
diff --git a/doc/source/reference/arrays.nditer.cython.rst b/doc/source/reference/arrays.nditer.cython.rst
new file mode 100644
index 000000000..2cc7763ed
--- /dev/null
+++ b/doc/source/reference/arrays.nditer.cython.rst
@@ -0,0 +1,147 @@
+Putting the Inner Loop in Cython
+================================
+
+Those who want really good performance out of their low level operations
+should strongly consider directly using the iteration API provided
+in C, but for those who are not comfortable with C or C++, Cython
+is a good middle ground with reasonable performance tradeoffs. For
+the :class:`nditer` object, this means letting the iterator take care
+of broadcasting, dtype conversion, and buffering, while giving the inner
+loop to Cython.
+
+For our example, we'll create a sum of squares function. To start,
+let's implement this function in straightforward Python. We want to
+support an 'axis' parameter similar to the numpy :func:`sum` function,
+so we will need to construct a list for the `op_axes` parameter.
+Here's how this looks.
+
+.. admonition:: Example
+
+    >>> def axis_to_axeslist(axis, ndim):
+    ...     if axis is None:
+    ...         return [-1] * ndim
+    ...     else:
+    ...         if type(axis) is not tuple:
+    ...             axis = (axis,)
+    ...         axeslist = [1] * ndim
+    ...         for i in axis:
+    ...             axeslist[i] = -1
+    ...         ax = 0
+    ...         for i in range(ndim):
+    ...             if axeslist[i] != -1:
+    ...                 axeslist[i] = ax
+    ...                 ax += 1
+    ...         return axeslist
+    ...
+    >>> def sum_squares_py(arr, axis=None, out=None):
+    ...     axeslist = axis_to_axeslist(axis, arr.ndim)
+    ...     it = np.nditer([arr, out], flags=['reduce_ok',
+    ...                                       'buffered', 'delay_bufalloc'],
+    ...                 op_flags=[['readonly'], ['readwrite', 'allocate']],
+    ...                 op_axes=[None, axeslist],
+    ...                 op_dtypes=['float64', 'float64'])
+    ...     with it:
+    ...         it.operands[1][...] = 0
+    ...         it.reset()
+    ...         for x, y in it:
+    ...             y[...] += x*x
+    ...         return it.operands[1]
+    ...
+    >>> a = np.arange(6).reshape(2,3)
+    >>> sum_squares_py(a)
+    array(55.0)
+    >>> sum_squares_py(a, axis=-1)
+    array([  5.,  50.])
+
+To Cython-ize this function, we replace the inner loop (y[...] += x*x) with
+Cython code that's specialized for the float64 dtype. With the
+'external_loop' flag enabled, the arrays provided to the inner loop will
+always be one-dimensional, so very little checking needs to be done.
+
+Here's the listing of sum_squares.pyx::
+
+    import numpy as np
+    cimport numpy as np
+    cimport cython
+
+    def axis_to_axeslist(axis, ndim):
+        if axis is None:
+            return [-1] * ndim
+        else:
+            if type(axis) is not tuple:
+                axis = (axis,)
+            axeslist = [1] * ndim
+            for i in axis:
+                axeslist[i] = -1
+            ax = 0
+            for i in range(ndim):
+                if axeslist[i] != -1:
+                    axeslist[i] = ax
+                    ax += 1
+            return axeslist
+
+    @cython.boundscheck(False)
+    def sum_squares_cy(arr, axis=None, out=None):
+        cdef np.ndarray[double] x
+        cdef np.ndarray[double] y
+        cdef int size
+        cdef double value
+
+        axeslist = axis_to_axeslist(axis, arr.ndim)
+        it = np.nditer([arr, out], flags=['reduce_ok', 'external_loop',
+                                          'buffered', 'delay_bufalloc'],
+                    op_flags=[['readonly'], ['readwrite', 'allocate']],
+                    op_axes=[None, axeslist],
+                    op_dtypes=['float64', 'float64'])
+        with it:
+            it.operands[1][...] = 0
+            it.reset()
+            for xarr, yarr in it:
+                x = xarr
+                y = yarr
+                size = x.shape[0]
+                for i in range(size):
+                   value = x[i]
+                   y[i] = y[i] + value * value
+            return it.operands[1]
+
+On this machine, building the .pyx file into a module looked like the
+following, but you may have to find some Cython tutorials to tell you
+the specifics for your system configuration.::
+
+    $ cython sum_squares.pyx
+    $ gcc -shared -pthread -fPIC -fwrapv -O2 -Wall -I/usr/include/python2.7 -fno-strict-aliasing -o sum_squares.so sum_squares.c
+
+Running this from the Python interpreter produces the same answers
+as our native Python/NumPy code did.
+
+.. admonition:: Example
+
+    >>> from sum_squares import sum_squares_cy
+    >>> a = np.arange(6).reshape(2,3)
+    >>> sum_squares_cy(a)
+    array(55.0)
+    >>> sum_squares_cy(a, axis=-1)
+    array([  5.,  50.])
+
+Doing a little timing in IPython shows that the reduced overhead and
+memory allocation of the Cython inner loop is providing a very nice
+speedup over both the straightforward Python code and an expression
+using NumPy's built-in sum function.::
+
+    >>> a = np.random.rand(1000,1000)
+
+    >>> timeit sum_squares_py(a, axis=-1)
+    10 loops, best of 3: 37.1 ms per loop
+
+    >>> timeit np.sum(a*a, axis=-1)
+    10 loops, best of 3: 20.9 ms per loop
+
+    >>> timeit sum_squares_cy(a, axis=-1)
+    100 loops, best of 3: 11.8 ms per loop
+
+    >>> np.all(sum_squares_cy(a, axis=-1) == np.sum(a*a, axis=-1))
+    True
+
+    >>> np.all(sum_squares_py(a, axis=-1) == np.sum(a*a, axis=-1))
+    True
diff --git a/doc/source/reference/arrays.nditer.rst b/doc/source/reference/arrays.nditer.rst
index 7dab09a71..2db12a408 100644
--- a/doc/source/reference/arrays.nditer.rst
+++ b/doc/source/reference/arrays.nditer.rst
@@ -1,5 +1,9 @@
 .. currentmodule:: numpy
 
+.. for doctests
+   The last section on Cython is 'included' at the end of this file. The tests
+   for that section are disabled.
+
 .. _arrays.nditer:
 
 *********************
@@ -218,21 +222,21 @@ produce identical results to the ones in the previous section.
     >>> it = np.nditer(a, flags=['f_index'])
     >>> while not it.finished:
     ...     print("%d <%d>" % (it[0], it.index), end=' ')
-    ...     it.iternext()
+    ...     is_not_finished = it.iternext()
     ...
     0 <0> 1 <2> 2 <4> 3 <1> 4 <3> 5 <5>
 
     >>> it = np.nditer(a, flags=['multi_index'])
     >>> while not it.finished:
     ...     print("%d <%s>" % (it[0], it.multi_index), end=' ')
-    ...     it.iternext()
+    ...     is_not_finished = it.iternext()
     ...
     0 <(0, 0)> 1 <(0, 1)> 2 <(0, 2)> 3 <(1, 0)> 4 <(1, 1)> 5 <(1, 2)>
 
     >>> with np.nditer(a, flags=['multi_index'], op_flags=['writeonly']) as it:
     ...     while not it.finished:
     ...         it[0] = it.multi_index[1] - it.multi_index[0]
-    ...         it.iternext()
+    ...         is_not_finished = it.iternext()
     ...
     >>> a
     array([[ 0,  1,  2],
@@ -316,12 +320,13 @@ specified as an iterator flag.
     ...                 op_dtypes=['complex128']):
     ...     print(np.sqrt(x), end=' ')
     ...
-    1.73205080757j 1.41421356237j 1j 0j (1+0j) (1.41421356237+0j)
+    1.7320508075688772j 1.4142135623730951j 1j 0j (1+0j) (1.4142135623730951+0j)
 
     >>> for x in np.nditer(a, flags=['buffered'], op_dtypes=['complex128']):
     ...     print(np.sqrt(x), end=' ')
     ...
-    1.73205080757j 1.41421356237j 1j 0j (1+0j) (1.41421356237+0j)
+    1.7320508075688772j 1.4142135623730951j 1j 0j (1+0j) (1.4142135623730951+0j)
+
 
 The iterator uses NumPy's casting rules to determine whether a specific
 conversion is permitted. By default, it enforces 'safe' casting. This means,
@@ -405,8 +410,8 @@ which includes the input shapes to help diagnose the problem.
     ...     print("%d:%d" % (x,y), end=' ')
     ...
     Traceback (most recent call last):
-      File "<stdin>", line 1, in <module>
-    ValueError: operands could not be broadcast together with shapes (2) (2,3)
+    ...
+    ValueError: operands could not be broadcast together with shapes (2,) (2,3)
 
 Iterator-Allocated Output Arrays
 --------------------------------
@@ -482,9 +487,9 @@ reasons.
 
     >>> square(np.arange(6).reshape(2,3), out=b)
     Traceback (most recent call last):
-      File "<stdin>", line 1, in <module>
-      File "<stdin>", line 4, in square
-    ValueError: non-broadcastable output operand with shape (3) doesn't match the broadcast shape (2,3)
+      ...
+    ValueError: non-broadcastable output operand with shape (3,) doesn't
+    match the broadcast shape (2,3)
 
 Outer Product Iteration
 -----------------------
@@ -550,7 +555,7 @@ For a simple example, consider taking the sum of all elements in an array.
 
     >>> a = np.arange(24).reshape(2,3,4)
     >>> b = np.array(0)
-    >>> with np.nditer([a, b], flags=['reduce_ok', 'external_loop'],
+    >>> with np.nditer([a, b], flags=['reduce_ok'],
     ...                     op_flags=[['readonly'], ['readwrite']]) as it:
     ...     for x,y in it:
     ...         y[...] += x
@@ -568,7 +573,7 @@ sums along the last axis of `a`.
 .. admonition:: Example
 
     >>> a = np.arange(24).reshape(2,3,4)
-    >>> it = np.nditer([a, None], flags=['reduce_ok', 'external_loop'],
+    >>> it = np.nditer([a, None], flags=['reduce_ok'],
     ...             op_flags=[['readonly'], ['readwrite', 'allocate']],
     ...             op_axes=[None, [0,1,-1]])
     >>> with it:
@@ -602,7 +607,7 @@ buffering.
 .. admonition:: Example
 
     >>> a = np.arange(24).reshape(2,3,4)
-    >>> it = np.nditer([a, None], flags=['reduce_ok', 'external_loop',
+    >>> it = np.nditer([a, None], flags=['reduce_ok',
     ...                                  'buffered', 'delay_bufalloc'],
     ...             op_flags=[['readonly'], ['readwrite', 'allocate']],
     ...             op_axes=[None, [0,1,-1]])
@@ -617,150 +622,8 @@ buffering.
     array([[ 6, 22, 38],
            [54, 70, 86]])
 
-Putting the Inner Loop in Cython
-================================
-
-Those who want really good performance out of their low level operations
-should strongly consider directly using the iteration API provided
-in C, but for those who are not comfortable with C or C++, Cython
-is a good middle ground with reasonable performance tradeoffs. For
-the :class:`nditer` object, this means letting the iterator take care
-of broadcasting, dtype conversion, and buffering, while giving the inner
-loop to Cython.
-
-For our example, we'll create a sum of squares function. To start,
-let's implement this function in straightforward Python. We want to
-support an 'axis' parameter similar to the numpy :func:`sum` function,
-so we will need to construct a list for the `op_axes` parameter.
-Here's how this looks.
-
-.. admonition:: Example
-
-    >>> def axis_to_axeslist(axis, ndim):
-    ...     if axis is None:
-    ...         return [-1] * ndim
-    ...     else:
-    ...         if type(axis) is not tuple:
-    ...             axis = (axis,)
-    ...         axeslist = [1] * ndim
-    ...         for i in axis:
-    ...             axeslist[i] = -1
-    ...         ax = 0
-    ...         for i in range(ndim):
-    ...             if axeslist[i] != -1:
-    ...                 axeslist[i] = ax
-    ...                 ax += 1
-    ...         return axeslist
-    ...
-    >>> def sum_squares_py(arr, axis=None, out=None):
-    ...     axeslist = axis_to_axeslist(axis, arr.ndim)
-    ...     it = np.nditer([arr, out], flags=['reduce_ok', 'external_loop',
-    ...                                       'buffered', 'delay_bufalloc'],
-    ...                 op_flags=[['readonly'], ['readwrite', 'allocate']],
-    ...                 op_axes=[None, axeslist],
-    ...                 op_dtypes=['float64', 'float64'])
-    ...     with it:
-    ...         it.operands[1][...] = 0
-    ...         it.reset()
-    ...         for x, y in it:
-    ...             y[...] += x*x
-    ...         return it.operands[1]
-    ...
-    >>> a = np.arange(6).reshape(2,3)
-    >>> sum_squares_py(a)
-    array(55.0)
-    >>> sum_squares_py(a, axis=-1)
-    array([  5.,  50.])
-
-To Cython-ize this function, we replace the inner loop (y[...] += x*x) with
-Cython code that's specialized for the float64 dtype. With the
-'external_loop' flag enabled, the arrays provided to the inner loop will
-always be one-dimensional, so very little checking needs to be done.
-
-Here's the listing of sum_squares.pyx::
-
-    import numpy as np
-    cimport numpy as np
-    cimport cython
-
-    def axis_to_axeslist(axis, ndim):
-        if axis is None:
-            return [-1] * ndim
-        else:
-            if type(axis) is not tuple:
-                axis = (axis,)
-            axeslist = [1] * ndim
-            for i in axis:
-                axeslist[i] = -1
-            ax = 0
-            for i in range(ndim):
-                if axeslist[i] != -1:
-                    axeslist[i] = ax
-                    ax += 1
-            return axeslist
-
-    @cython.boundscheck(False)
-    def sum_squares_cy(arr, axis=None, out=None):
-        cdef np.ndarray[double] x
-        cdef np.ndarray[double] y
-        cdef int size
-        cdef double value
-
-        axeslist = axis_to_axeslist(axis, arr.ndim)
-        it = np.nditer([arr, out], flags=['reduce_ok', 'external_loop',
-                                          'buffered', 'delay_bufalloc'],
-                    op_flags=[['readonly'], ['readwrite', 'allocate']],
-                    op_axes=[None, axeslist],
-                    op_dtypes=['float64', 'float64'])
-        with it:
-            it.operands[1][...] = 0
-            it.reset()
-            for xarr, yarr in it:
-                x = xarr
-                y = yarr
-                size = x.shape[0]
-                for i in range(size):
-                   value = x[i]
-                   y[i] = y[i] + value * value
-            return it.operands[1]
-
-On this machine, building the .pyx file into a module looked like the
-following, but you may have to find some Cython tutorials to tell you
-the specifics for your system configuration.::
-
-    $ cython sum_squares.pyx
-    $ gcc -shared -pthread -fPIC -fwrapv -O2 -Wall -I/usr/include/python2.7 -fno-strict-aliasing -o sum_squares.so sum_squares.c
-
-Running this from the Python interpreter produces the same answers
-as our native Python/NumPy code did.
-
-.. admonition:: Example
-
-    >>> from sum_squares import sum_squares_cy
-    >>> a = np.arange(6).reshape(2,3)
-    >>> sum_squares_cy(a)
-    array(55.0)
-    >>> sum_squares_cy(a, axis=-1)
-    array([  5.,  50.])
-
-Doing a little timing in IPython shows that the reduced overhead and
-memory allocation of the Cython inner loop is providing a very nice
-speedup over both the straightforward Python code and an expression
-using NumPy's built-in sum function.::
-
-    >>> a = np.random.rand(1000,1000)
-
-    >>> timeit sum_squares_py(a, axis=-1)
-    10 loops, best of 3: 37.1 ms per loop
-
-    >>> timeit np.sum(a*a, axis=-1)
-    10 loops, best of 3: 20.9 ms per loop
-
-    >>> timeit sum_squares_cy(a, axis=-1)
-    100 loops, best of 3: 11.8 ms per loop
-
-    >>> np.all(sum_squares_cy(a, axis=-1) == np.sum(a*a, axis=-1))
-    True
+.. for doctests
+   Include Cython section separately. Those tests are skipped entirely via an
+   entry in RST_SKIPLIST
 
-    >>> np.all(sum_squares_py(a, axis=-1) == np.sum(a*a, axis=-1))
-    True
+.. include:: arrays.nditer.cython.rst
diff --git a/doc/source/reference/c-api/array.rst b/doc/source/reference/c-api/array.rst
index 46af1b45b..22b15fc57 100644
--- a/doc/source/reference/c-api/array.rst
+++ b/doc/source/reference/c-api/array.rst
@@ -834,7 +834,7 @@ General check of Python Type
 .. c:function:: PyArray_IsPythonScalar(op)
 
     Evaluates true if *op* is a builtin Python scalar object (int,
-    float, complex, str, unicode, long, bool).
+    float, complex, bytes, str, long, bool).
 
 .. c:function:: PyArray_IsAnyScalar(op)
 
diff --git a/doc/source/reference/c-api/index.rst b/doc/source/reference/c-api/index.rst
index 56fe8e473..bb1ed154e 100644
--- a/doc/source/reference/c-api/index.rst
+++ b/doc/source/reference/c-api/index.rst
@@ -21,7 +21,7 @@ experience at first. Be assured that the task becomes easier with
 practice, and you may be surprised at how simple the C-code can be to
 understand. Even if you don't think you can write C-code from scratch,
 it is much easier to understand and modify already-written source code
-then create it *de novo*.
+than create it *de novo*.
 
 Python extensions are especially straightforward to understand because
 they all have a very similar structure. Admittedly, NumPy is not a
diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst
index 6742d605a..d042a1849 100644
--- a/doc/source/reference/index.rst
+++ b/doc/source/reference/index.rst
@@ -12,7 +12,7 @@ NumPy Reference
 
 This reference manual details functions, modules, and objects
 included in NumPy, describing what they are and what they do.
-For learning how to use NumPy, see also :ref:`user`.
+For learning how to use NumPy, see the :ref:`complete documentation <manual>`.
 
 
 .. toctree::
diff --git a/doc/source/reference/maskedarray.baseclass.rst b/doc/source/reference/maskedarray.baseclass.rst
index 9864f21ea..57bbaa8f8 100644
--- a/doc/source/reference/maskedarray.baseclass.rst
+++ b/doc/source/reference/maskedarray.baseclass.rst
@@ -125,7 +125,6 @@ Conversion
 
    MaskedArray.__float__
    MaskedArray.__int__
-   MaskedArray.__long__
 
    MaskedArray.view
    MaskedArray.astype
diff --git a/doc/source/reference/routines.ma.rst b/doc/source/reference/routines.ma.rst
index 5b2098c7a..346ce2a1b 100644
--- a/doc/source/reference/routines.ma.rst
+++ b/doc/source/reference/routines.ma.rst
@@ -396,6 +396,7 @@ Miscellanea
    ma.allequal
    ma.allclose
    ma.apply_along_axis
+   ma.apply_over_axes
    ma.arange
    ma.choose
    ma.ediff1d