16 files changed, 253 insertions, 355 deletions
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index b8223f7c6..f550e2d41 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -901,14 +901,6 @@ add_newdoc('numpy.core.multiarray', 'count_nonzero',
     ----------
     a : array_like
         The array for which to count non-zeros.
-    axis : None or int or tuple of ints, optional
-        Axis or axes along which a reduction is performed.
-        The default (`axis` = None) is perform a reduction over all
-        the dimensions of the input array.
-    keepdims : bool, optional
-        If this is set to True, the axes which are reduced are left
-        in the result as dimensions with size one. With this option,
-        the result will broadcast correctly against the original `arr`.
 
     Returns
     -------
@@ -925,11 +917,6 @@ add_newdoc('numpy.core.multiarray', 'count_nonzero',
     4
     >>> np.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]])
     5
-    >>> np.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]], axis=1)
-    array([2, 3])
-    >>> np.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]], axis=1, keepdims=True)
-    array([[2],
-           [3]])
     """)
 
 add_newdoc('numpy.core.multiarray','set_typeDict',
diff --git a/numpy/core/SConscript b/numpy/core/SConscript
index 5d6f5fe41..6f4484b6f 100644
--- a/numpy/core/SConscript
+++ b/numpy/core/SConscript
@@ -469,7 +469,6 @@ if ENABLE_SEPARATE_COMPILATION:
         pjoin('src', 'multiarray', 'item_selection.c'),
         pjoin('src', 'multiarray', 'calculation.c'),
         pjoin('src', 'multiarray', 'common.c'),
-        pjoin('src', 'multiarray', 'reduction.c'),
         pjoin('src', 'multiarray', 'refcount.c'),
         pjoin('src', 'multiarray', 'conversion_utils.c'),
         pjoin('src', 'multiarray', 'usertypes.c'),
@@ -498,6 +497,7 @@ env.DistutilsPythonExtension('multiarray_tests', source=multiarray_tests_src)
 if ENABLE_SEPARATE_COMPILATION:
     umathmodule_src.extend([pjoin('src', 'umath', 'ufunc_object.c')])
     umathmodule_src.extend([pjoin('src', 'umath', 'ufunc_type_resolution.c')])
+    umathmodule_src.extend([pjoin('src', 'umath', 'reduction.c')])
     umathmodule_src.extend(umath_loops_src)
 else:
     umathmodule_src = [pjoin('src', 'umath', 'umathmodule_onefile.c')]
diff --git a/numpy/core/code_generators/cversions.txt b/numpy/core/code_generators/cversions.txt
index 3599f47c7..99ea072ca 100644
--- a/numpy/core/code_generators/cversions.txt
+++ b/numpy/core/code_generators/cversions.txt
@@ -9,5 +9,5 @@
 # Version 6 (NumPy 1.6) added new iterator, half float and casting functions,
 # PyArray_CountNonzero, PyArray_NewLikeArray and PyArray_MatrixProduct2.
 0x00000006 = e61d5dc51fa1c6459328266e215d6987
-# Version 7 (NumPy 1.7) added API for NA, improved datetime64, misc utilities.
-0x00000007 = 280023b3ecfc2ad0326874917f6f16f9
+# Version 7 (NumPy 1.7) improved datetime64, misc utilities.
+0x00000007 = 1768b6c404a3d5a2a6bfe7c68f89e3aa
diff --git a/numpy/core/code_generators/genapi.py b/numpy/core/code_generators/genapi.py
index 7973fd373..3860fe6d7 100644
--- a/numpy/core/code_generators/genapi.py
+++ b/numpy/core/code_generators/genapi.py
@@ -48,7 +48,6 @@ API_FILES = [join('multiarray', 'array_assign_array.c'),
              join('multiarray', 'nditer_pywrap.c'),
              join('multiarray', 'nditer_templ.c.src'),
              join('multiarray', 'number.c'),
-             join('multiarray', 'reduction.c'),
              join('multiarray', 'refcount.c'),
              join('multiarray', 'scalartypes.c.src'),
              join('multiarray', 'scalarapi.c'),
@@ -58,6 +57,7 @@ API_FILES = [join('multiarray', 'array_assign_array.c'),
              join('umath', 'loops.c.src'),
              join('umath', 'ufunc_object.c'),
              join('umath', 'ufunc_type_resolution.c'),
+             join('umath', 'reduction.c'),
             ]
 THIS_DIR = os.path.dirname(__file__)
 API_FILES = [os.path.join(THIS_DIR, '..', 'src', a) for a in API_FILES]
diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py
index befa099ae..fd2b9628e 100644
--- a/numpy/core/code_generators/numpy_api.py
+++ b/numpy/core/code_generators/numpy_api.py
@@ -319,12 +319,11 @@ multiarray_funcs_api = {
     # End 1.6 API
     'NpyIter_IsFirstVisit':                 281,
     'PyArray_SetBaseObject':                282,
-    'PyArray_ReduceWrapper':                283,
-    'PyArray_CreateSortedStridePerm':       284,
-    'PyArray_RemoveAxesInPlace':            285,
-    'PyArray_DebugPrint':                   286,
-    'PyArray_FailUnlessWriteable':          287,
-    'PyArray_SetUpdateIfCopyBase':          288,
+    'PyArray_CreateSortedStridePerm':       283,
+    'PyArray_RemoveAxesInPlace':            284,
+    'PyArray_DebugPrint':                   285,
+    'PyArray_FailUnlessWriteable':          286,
+    'PyArray_SetUpdateIfCopyBase':          287,
 }
 
 ufunc_types_api = {
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index 3862865ae..9605c20f8 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -1670,112 +1670,6 @@ typedef struct {
 } npy_stride_sort_item;
 
 /************************************************************
- * Typedefs used by PyArray_ReduceWrapper, new in 1.7.
- ************************************************************/
-
-/*
- * This is a function for assigning a reduction identity to the result,
- * before doing the reduction computation. The
- * value in 'data' is passed through from PyArray_ReduceWrapper.
- *
- * This function could, for example, simply be a call like
- *      return PyArray_AssignZero(result, NULL, NULL);
- *
- * It should return -1 on failure, or 0 on success.
- */
-typedef int (PyArray_AssignReduceIdentityFunc)(PyArrayObject *result,
-                                               void *data);
-
-/*
- * This is a function for the reduce loop.
- *
- * The needs_api parameter indicates whether it's ok to release the GIL during
- * the loop, such as when the iternext() function never calls
- * a function which could raise a Python exception.
- *
- * Ths skip_first_count parameter indicates how many elements need to be
- * skipped based on NpyIter_IsFirstVisit checks. This can only be positive
- * when the 'assign_identity' parameter was NULL when calling
- * PyArray_ReduceWrapper.
- *
- * The loop gets two data pointers and two strides, and should
- * look roughly like this:
- *  {
- *      NPY_BEGIN_THREADS_DEF;
- *      if (!needs_api) {
- *          NPY_BEGIN_THREADS;
- *      }
- *      // This first-visit loop can be skipped if 'assign_identity' was non-NULL
- *      if (skip_first_count > 0) {
- *          do {
- *              char *data0 = dataptr[0], *data1 = dataptr[1];
- *              npy_intp stride0 = strideptr[0], stride1 = strideptr[1];
- *              npy_intp count = *countptr;
- *
- *              // Skip any first-visit elements
- *              if (NpyIter_IsFirstVisit(iter, 0)) {
- *                  if (stride0 == 0) {
- *                      --count;
- *                      --skip_first_count;
- *                      data1 += stride1;
- *                  }
- *                  else {
- *                      skip_first_count -= count;
- *                      count = 0;
- *                  }
- *              }
- *
- *              while (count--) {
- *                  *(result_t *)data0 = my_reduce_op(*(result_t *)data0,
- *                                                    *(operand_t *)data1);
- *                  data0 += stride0;
- *                  data1 += stride1;
- *              }
- *
- *              // Jump to the faster loop when skipping is done
- *              if (skip_first_count == 0) {
- *                  if (iternext(iter)) {
- *                      break;
- *                  }
- *                  else {
- *                      goto finish_loop;
- *                  }
- *              }
- *          } while (iternext(iter));
- *      }
- *      do {
- *          char *data0 = dataptr[0], *data1 = dataptr[1];
- *          npy_intp stride0 = strideptr[0], stride1 = strideptr[1];
- *          npy_intp count = *countptr;
- *
- *          while (count--) {
- *              *(result_t *)data0 = my_reduce_op(*(result_t *)data0,
- *                                                *(operand_t *)data1);
- *              data0 += stride0;
- *              data1 += stride1;
- *          }
- *      } while (iternext(iter));
- *  finish_loop:
- *      if (!needs_api) {
- *          NPY_END_THREADS;
- *      }
- *      return (needs_api && PyErr_Occurred()) ? -1 : 0;
- *  }
- *
- * If needs_api is True, this function should call PyErr_Occurred()
- * to check if an error occurred during processing, and return -1 for
- * error, 0 for success.
- */
-typedef int (PyArray_ReduceLoopFunc)(NpyIter *iter,
-                                            char **dataptr,
-                                            npy_intp *strideptr,
-                                            npy_intp *countptr,
-                                            NpyIter_IterNextFunc *iternext,
-                                            int needs_api,
-                                            npy_intp skip_first_count,
-                                            void *data);
-
-/************************************************************
  * This is the form of the struct that's returned pointed by the
  * PyCObject attribute of an array __array_struct__. See
  * http://numpy.scipy.org/array_interface.shtml for the full
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 96b1576a3..92d352e9a 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -719,7 +719,6 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'numpymemoryview.h'),
             join('src', 'multiarray', 'number.h'),
             join('src', 'multiarray', 'numpyos.h'),
-            join('src', 'multiarray', 'reduction.h'),
             join('src', 'multiarray', 'refcount.h'),
             join('src', 'multiarray', 'scalartypes.h'),
             join('src', 'multiarray', 'sequence.h'),
@@ -784,7 +783,6 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'number.c'),
             join('src', 'multiarray', 'numpymemoryview.c'),
             join('src', 'multiarray', 'numpyos.c'),
-            join('src', 'multiarray', 'reduction.c'),
             join('src', 'multiarray', 'refcount.c'),
             join('src', 'multiarray', 'sequence.c'),
             join('src', 'multiarray', 'shape.c'),
@@ -847,6 +845,7 @@ def configuration(parent_package='',top_path=None):
 
     umath_src = [
             join('src', 'umath', 'umathmodule.c'),
+            join('src', 'umath', 'reduction.c'),
             join('src', 'umath', 'funcs.inc.src'),
             join('src', 'umath', 'loops.c.src'),
             join('src', 'umath', 'ufunc_object.c'),
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 49c304990..977580d70 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -17,7 +17,6 @@
 #include "arrayobject.h"
 #include "ctors.h"
 #include "lowlevel_strided_loops.h"
-#include "reduction.h"
 
 #include "item_selection.h"
 
@@ -1921,99 +1920,6 @@ count_boolean_trues(int ndim, char *data, npy_intp *ashape, npy_intp *astrides)
     return count;
 }
 
-static int
-assign_reduce_identity_zero(PyArrayObject *result, void *data)
-{
-    return PyArray_AssignZero(result, NULL);
-}
-
-static int
-reduce_count_nonzero_loop(NpyIter *iter,
-                                            char **dataptr,
-                                            npy_intp *strides,
-                                            npy_intp *countptr,
-                                            NpyIter_IterNextFunc *iternext,
-                                            int needs_api,
-                                            npy_intp skip_first_count,
-                                            void *data)
-{
-    PyArray_NonzeroFunc *nonzero = (PyArray_NonzeroFunc *)data;
-    PyArrayObject *arr = NpyIter_GetOperandArray(iter)[1];
-
-    NPY_BEGIN_THREADS_DEF;
-
-    if (!needs_api) {
-        NPY_BEGIN_THREADS;
-    }
-
-    /*
-     * 'skip_first_count' will always be 0 because we are doing a reduction
-     * with an identity.
-     */
-
-    do {
-        char *data0 = dataptr[0], *data1 = dataptr[1];
-        npy_intp stride0 = strides[0], stride1 = strides[1];
-        npy_intp count = *countptr;
-
-        while (count--) {
-            if (nonzero(data1, arr)) {
-                ++(*(npy_intp *)data0);
-            }
-            data0 += stride0;
-            data1 += stride1;
-        }
-    } while (iternext(iter));
-
-    if (!needs_api) {
-        NPY_END_THREADS;
-    }
-
-    return (needs_api && PyErr_Occurred()) ? -1 : 0;
-}
-
-/*
- * A full reduction version of PyArray_CountNonzero, supporting
- * an 'out' parameter and doing the count as a reduction along
- * selected axes.
- */
-NPY_NO_EXPORT PyObject *
-PyArray_ReduceCountNonzero(PyArrayObject *arr, PyArrayObject *out,
-                        npy_bool *axis_flags, int keepdims)
-{
-    PyArray_NonzeroFunc *nonzero;
-    PyArrayObject *result;
-    PyArray_Descr *dtype;
-
-    nonzero = PyArray_DESCR(arr)->f->nonzero;
-    if (nonzero == NULL) {
-        PyErr_SetString(PyExc_TypeError,
-                    "Cannot count the number of non-zeros for a dtype "
-                    "which doesn't have a 'nonzero' function");
-        return NULL;
-    }
-
-    dtype = PyArray_DescrFromType(NPY_INTP);
-    if (dtype == NULL) {
-        return NULL;
-    }
-
-    result = PyArray_ReduceWrapper(arr, out, NULL,
-                            PyArray_DESCR(arr), dtype,
-                            NPY_SAME_KIND_CASTING,
-                            axis_flags, 1, keepdims, 0,
-                            &assign_reduce_identity_zero,
-                            &reduce_count_nonzero_loop,
-                            nonzero, 0, "count_nonzero");
-    Py_DECREF(dtype);
-    if (out == NULL && result != NULL) {
-        return PyArray_Return(result);
-    }
-    else {
-        return (PyObject *)result;
-    }
-}
-
 /*NUMPY_API
  * Counts the number of non-zero elements in the array.
  *
diff --git a/numpy/core/src/multiarray/item_selection.h b/numpy/core/src/multiarray/item_selection.h
index 5f4ba1faa..90bb5100d 100644
--- a/numpy/core/src/multiarray/item_selection.h
+++ b/numpy/core/src/multiarray/item_selection.h
@@ -27,15 +27,4 @@ NPY_NO_EXPORT int
 PyArray_MultiIndexSetItem(PyArrayObject *self, npy_intp *multi_index,
                                                 PyObject *obj);
 
-/*
- * A full reduction version of PyArray_CountNonzero, supporting
- * an 'out' parameter and doing the count as a reduction along
- * selected axes.
- */
-NPY_NO_EXPORT PyObject *
-PyArray_ReduceCountNonzero(PyArrayObject *arr, PyArrayObject *out,
-                        npy_bool *axis_flags, int keepdims);
-
-
-
 #endif
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index fd4183c66..1ab7823ad 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -51,7 +51,6 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 #include "item_selection.h"
 #include "shape.h"
 #include "ctors.h"
-#include "reduction.h"
 
 /* Only here for API compatibility */
 NPY_NO_EXPORT PyTypeObject PyBigArray_Type;
@@ -1895,20 +1894,11 @@ array_zeros(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 static PyObject *
 array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
 {
-    static char *kwlist[] = {"arr", "axis", "out", "keepdims", NULL};
-
-    PyObject *array_in, *axis_in = NULL, *out_in = NULL;
-    PyObject *ret = NULL;
-    PyArrayObject *array, *out = NULL;
-    npy_bool axis_flags[NPY_MAXDIMS];
-    int keepdims = 0;
+    PyObject *array_in;
+    PyArrayObject *array;
+    npy_intp count;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds,
-                                "O|OOi:count_nonzero", kwlist,
-                                &array_in,
-                                &axis_in,
-                                &out_in,
-                                &keepdims)) {
+    if (!PyArg_ParseTuple(args, "O", &array_in)) {
         return NULL;
     }
 
@@ -1917,27 +1907,25 @@ array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
         return NULL;
     }
 
-    if (PyArray_ConvertMultiAxis(axis_in, PyArray_NDIM(array),
-                                        axis_flags) != NPY_SUCCEED) {
-        Py_DECREF(array);
-        return NULL;
-    }
-
-    if (out_in != NULL) {
-        if (PyArray_Check(out_in)) {
-            out = (PyArrayObject *)out_in;
-        }
-        else {
-            PyErr_SetString(PyExc_TypeError, "'out' must be an array");
-            return NULL;
-        }
-    }
-
-    ret = PyArray_ReduceCountNonzero(array, out, axis_flags, keepdims);
+    count =  PyArray_CountNonzero(array);
 
     Py_DECREF(array);
 
-    return ret;
+    if (count == -1) {
+        return NULL;
+    }
+#if defined(NPY_PY3K)
+    return PyLong_FromSsize_t(count);
+#elif PY_VERSION_HEX >= 0x02050000
+    return PyInt_FromSsize_t(count);
+#else
+    if ((npy_intp)((long)count) == count) {
+        return PyInt_FromLong(count);
+    }
+    else {
+        return PyLong_FromVoidPtr((void*)count);
+    }
+#endif
 }
 
 static PyObject *
diff --git a/numpy/core/src/multiarray/multiarraymodule_onefile.c b/numpy/core/src/multiarray/multiarraymodule_onefile.c
index 9a4956cfa..9410263e4 100644
--- a/numpy/core/src/multiarray/multiarraymodule_onefile.c
+++ b/numpy/core/src/multiarray/multiarraymodule_onefile.c
@@ -48,7 +48,6 @@
 #include "array_assign.c"
 #include "array_assign_scalar.c"
 #include "array_assign_array.c"
-#include "reduction.c"
 #include "ucsnarrow.c"
 
 #include "arrayobject.c"
diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c
index f466c6d0d..be301e26d 100644
--- a/numpy/core/src/umath/reduction.c
+++ b/numpy/core/src/umath/reduction.c
@@ -166,6 +166,24 @@ conform_reduce_result(int ndim, npy_bool *axis_flags,
     return (PyArrayObject *)ret;
 }
 
+/*
+ * Creates a result for reducing 'operand' along the axes specified
+ * in 'axis_flags'. If 'dtype' isn't NULL, this function steals a
+ * reference to 'dtype'.
+ *
+ * If 'out' isn't NULL, this function creates a view conforming
+ * to the number of dimensions of 'operand', adding a singleton dimension
+ * for each reduction axis specified. In this case, 'dtype' is ignored
+ * (but its reference is still stolen), and the caller must handle any
+ * type conversion/validity check for 'out'
+ *
+ * If 'subok' is true, creates a result with the subtype of 'operand',
+ * otherwise creates one with the base ndarray class.
+ *
+ * If 'out' is NULL, it allocates a new array whose shape matches that of
+ * 'operand', except for at the reduction axes. If 'dtype' is NULL, the dtype
+ * of 'operand' is used for the result.
+ */
 NPY_NO_EXPORT PyArrayObject *
 PyArray_CreateReduceResult(PyArrayObject *operand, PyArrayObject *out,
                            PyArray_Descr *dtype, npy_bool *axis_flags,
@@ -215,6 +233,42 @@ check_nonreorderable_axes(int ndim, npy_bool *axis_flags, const char *funcname)
     return 0;
 }
 
+/*
+ * This function initializes a result array for a reduction operation
+ * which has no identity. This means it needs to copy the first element
+ * it sees along the reduction axes to result, then return a view of
+ * the operand which excludes that element.
+ *
+ * If a reduction has an identity, such as 0 or 1, the result should
+ * be initialized by calling PyArray_AssignZero(result, NULL)
+ * or PyArray_AssignOne(result, NULL), because this
+ * function raises an exception when there are no elements to reduce.
+ *
+ * This means it copies the subarray indexed at zero along each reduction axis
+ * into 'result', then returns a view into 'operand' excluding those copied
+ * elements.
+ *
+ * result  : The array into which the result is computed. This must have
+ *           the same number of dimensions as 'operand', but for each
+ *           axis i where 'axis_flags[i]' is True, it has a single element.
+ * operand : The array being reduced.
+ * axis_flags : An array of boolean flags, one for each axis of 'operand'.
+ *              When a flag is True, it indicates to reduce along that axis.
+ * reorderable : If True, the reduction being done is reorderable, which
+ *               means specifying multiple axes of reduction at once is ok,
+ *               and the reduction code may calculate the reduction in an
+ *               arbitrary order. The calculation may be reordered because
+ *               of cache behavior or multithreading requirements.
+ * out_skip_first_count : This gets populated with the number of first-visit
+ *                        elements that should be skipped during the
+ *                        iteration loop.
+ * funcname : The name of the reduction operation, for the purpose of
+ *            better quality error messages. For example, "numpy.max"
+ *            would be a good name for NumPy's max function.
+ *
+ * Returns a view which contains the remaining elements on which to do
+ * the reduction.
+ */
 NPY_NO_EXPORT PyArrayObject *
 PyArray_InitializeReduceResult(
                     PyArrayObject *result, PyArrayObject *operand,
@@ -274,7 +328,7 @@ PyArray_InitializeReduceResult(
     /*
      * Copy the elements into the result to start.
      */
-    if (PyArray_AssignArray(result, op_view, NULL, NPY_UNSAFE_CASTING) < 0) {
+    if (PyArray_CopyInto(result, op_view) < 0) {
         Py_DECREF(op_view);
         return NULL;
     }
@@ -313,8 +367,7 @@ PyArray_InitializeReduceResult(
     return op_view;
 }
 
-/*NUMPY_API
- * 
+/*
  * This function executes all the standard NumPy reduction function
  * boilerplate code, just calling assign_identity and the appropriate
  * inner loop function where necessary.
@@ -344,19 +397,28 @@ PyArray_InitializeReduceResult(
  * data        : Data which is passed to assign_identity and the inner loop.
  * buffersize  : Buffer size for the iterator. For the default, pass in 0.
  * funcname    : The name of the reduction function, for error messages.
+ *
+ * TODO FIXME: if you squint, this is essentially a second independent
+ * implementation of generalized ufuncs with signature (i)->(), plus a few
+ * extra bells and whistles. (Indeed, as far as I can tell, it was originally
+ * split out to support a fancy version of count_nonzero... which is not
+ * actually a reduction function at all, it's just a (i)->() function!) So
+ * probably these two implementations should be merged into one. (In fact it
+ * would be quite nice to support axis= and keepdims etc. for arbitrary
+ * generalized ufuncs!)
  */
 NPY_NO_EXPORT PyArrayObject *
-PyArray_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
-                        PyArrayObject *wheremask,
-                        PyArray_Descr *operand_dtype,
-                        PyArray_Descr *result_dtype,
-                        NPY_CASTING casting,
-                        npy_bool *axis_flags, int reorderable,
-                        int keepdims,
-                        int subok,
-                        PyArray_AssignReduceIdentityFunc *assign_identity,
-                        PyArray_ReduceLoopFunc *loop,
-                        void *data, npy_intp buffersize, const char *funcname)
+PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
+                      PyArrayObject *wheremask,
+                      PyArray_Descr *operand_dtype,
+                      PyArray_Descr *result_dtype,
+                      NPY_CASTING casting,
+                      npy_bool *axis_flags, int reorderable,
+                      int keepdims,
+                      int subok,
+                      PyArray_AssignReduceIdentityFunc *assign_identity,
+                      PyArray_ReduceLoopFunc *loop,
+                      void *data, npy_intp buffersize, const char *funcname)
 {
     PyArrayObject *result = NULL, *op_view = NULL;
     npy_intp skip_first_count = 0;
diff --git a/numpy/core/src/umath/reduction.h b/numpy/core/src/umath/reduction.h
index 76ab9e18a..43cd071e0 100644
--- a/numpy/core/src/umath/reduction.h
+++ b/numpy/core/src/umath/reduction.h
@@ -1,70 +1,154 @@
 #ifndef _NPY_PRIVATE__REDUCTION_H_
 #define _NPY_PRIVATE__REDUCTION_H_
 
+/************************************************************
+ * Typedefs used by PyUFunc_ReduceWrapper, new in 1.7.
+ ************************************************************/
+
 /*
- * This function initializes a result array for a reduction operation
- * which has no identity. This means it needs to copy the first element
- * it sees along the reduction axes to result, then return a view of
- * the operand which excludes that element.
- *
- * If a reduction has an identity, such as 0 or 1, the result should
- * be initialized by calling PyArray_AssignZero(result, NULL, NULL)
- * or PyArray_AssignOne(result, NULL, NULL), because this
- * function raises an exception when there are no elements to reduce.
- *
- * This means it copies the subarray indexed at zero along each reduction axis
- * into 'result', then returns a view into 'operand' excluding those copied
- * elements.
+ * This is a function for assigning a reduction identity to the result,
+ * before doing the reduction computation. The
+ * value in 'data' is passed through from PyUFunc_ReduceWrapper.
  *
- * result  : The array into which the result is computed. This must have
- *           the same number of dimensions as 'operand', but for each
- *           axis i where 'axis_flags[i]' is True, it has a single element.
- * operand : The array being reduced.
- * axis_flags : An array of boolean flags, one for each axis of 'operand'.
- *              When a flag is True, it indicates to reduce along that axis.
- * reorderable : If True, the reduction being done is reorderable, which
- *               means specifying multiple axes of reduction at once is ok,
- *               and the reduction code may calculate the reduction in an
- *               arbitrary order. The calculation may be reordered because
- *               of cache behavior or multithreading requirements.
- * out_skip_first_count : This gets populated with the number of first-visit
- *                        elements that should be skipped during the
- *                        iteration loop.
- * funcname : The name of the reduction operation, for the purpose of
- *            better quality error messages. For example, "numpy.max"
- *            would be a good name for NumPy's max function.
+ * This function could, for example, simply be a call like
+ *      return PyArray_AssignZero(result, NULL);
  *
- * Returns a view which contains the remaining elements on which to do
- * the reduction.
+ * It should return -1 on failure, or 0 on success.
  */
-NPY_NO_EXPORT PyArrayObject *
-PyArray_InitializeReduceResult(
-                    PyArrayObject *result, PyArrayObject *operand,
-                    npy_bool *axis_flags, int reorderable,
-                    npy_intp *out_skip_first_count, const char *funcname);
+typedef int (PyArray_AssignReduceIdentityFunc)(PyArrayObject *result,
+                                               void *data);
 
 /*
- * Creates a result for reducing 'operand' along the axes specified
- * in 'axis_flags'. If 'dtype' isn't NULL, this function steals a
- * reference to 'dtype'.
+ * This is a function for the reduce loop.
+ *
+ * The needs_api parameter indicates whether the loop requires the Python
+ * API. The GIL may be released during the loop only when it is false, such
+ * as when iternext() never calls a function which could raise an exception.
+ *
+ * The skip_first_count parameter indicates how many elements need to be
+ * skipped based on NpyIter_IsFirstVisit checks. This can only be positive
+ * when the 'assign_identity' parameter was NULL when calling
+ * PyUFunc_ReduceWrapper.
+ *
+ * The loop gets two data pointers and two strides, and should
+ * look roughly like this:
+ *  {
+ *      NPY_BEGIN_THREADS_DEF;
+ *      if (!needs_api) {
+ *          NPY_BEGIN_THREADS;
+ *      }
+ *      // This first-visit loop can be skipped if 'assign_identity' was non-NULL
+ *      if (skip_first_count > 0) {
+ *          do {
+ *              char *data0 = dataptr[0], *data1 = dataptr[1];
+ *              npy_intp stride0 = strideptr[0], stride1 = strideptr[1];
+ *              npy_intp count = *countptr;
  *
- * If 'out' isn't NULL, this function creates a view conforming
- * to the number of dimensions of 'operand', adding a singleton dimension
- * for each reduction axis specified. In this case, 'dtype' is ignored
- * (but its reference is still stolen), and the caller must handle any
- * type conversion/validity check for 'out'
+ *              // Skip any first-visit elements
+ *              if (NpyIter_IsFirstVisit(iter, 0)) {
+ *                  if (stride0 == 0) {
+ *                      --count;
+ *                      --skip_first_count;
+ *                      data1 += stride1;
+ *                  }
+ *                  else {
+ *                      skip_first_count -= count;
+ *                      count = 0;
+ *                  }
+ *              }
  *
- * If 'subok' is true, creates a result with the subtype of 'operand',
- * otherwise creates on with the base ndarray class.
+ *              while (count--) {
+ *                  *(result_t *)data0 = my_reduce_op(*(result_t *)data0,
+ *                                                    *(operand_t *)data1);
+ *                  data0 += stride0;
+ *                  data1 += stride1;
+ *              }
  *
- * If 'out' is NULL, it allocates a new array whose shape matches that of
- * 'operand', except for at the reduction axes. If 'dtype' is NULL, the dtype
- * of 'operand' is used for the result.
+ *              // Jump to the faster loop when skipping is done
+ *              if (skip_first_count == 0) {
+ *                  if (iternext(iter)) {
+ *                      break;
+ *                  }
+ *                  else {
+ *                      goto finish_loop;
+ *                  }
+ *              }
+ *          } while (iternext(iter));
+ *      }
+ *      do {
+ *          char *data0 = dataptr[0], *data1 = dataptr[1];
+ *          npy_intp stride0 = strideptr[0], stride1 = strideptr[1];
+ *          npy_intp count = *countptr;
+ *
+ *          while (count--) {
+ *              *(result_t *)data0 = my_reduce_op(*(result_t *)data0,
+ *                                                *(operand_t *)data1);
+ *              data0 += stride0;
+ *              data1 += stride1;
+ *          }
+ *      } while (iternext(iter));
+ *  finish_loop:
+ *      if (!needs_api) {
+ *          NPY_END_THREADS;
+ *      }
+ *      return (needs_api && PyErr_Occurred()) ? -1 : 0;
+ *  }
+ *
+ * If needs_api is True, this function should call PyErr_Occurred()
+ * to check if an error occurred during processing, and return -1 for
+ * error, 0 for success.
+ */
+typedef int (PyArray_ReduceLoopFunc)(NpyIter *iter,
+                                            char **dataptr,
+                                            npy_intp *strideptr,
+                                            npy_intp *countptr,
+                                            NpyIter_IterNextFunc *iternext,
+                                            int needs_api,
+                                            npy_intp skip_first_count,
+                                            void *data);
+
+/*
+ * This function executes all the standard NumPy reduction function
+ * boilerplate code, just calling assign_identity and the appropriate
+ * inner loop function where necessary.
+ *
+ * operand     : The array to be reduced.
+ * out         : NULL, or the array into which to place the result.
+ * wheremask   : NOT YET SUPPORTED, but this parameter is placed here
+ *               so that support can be added in the future without breaking
+ *               API compatibility. Pass in NULL.
+ * operand_dtype : The dtype the inner loop expects for the operand.
+ * result_dtype : The dtype the inner loop expects for the result.
+ * casting     : The casting rule to apply to the operands.
+ * axis_flags  : Flags indicating the reduction axes of 'operand'.
+ * reorderable : If True, the reduction being done is reorderable, which
+ *               means specifying multiple axes of reduction at once is ok,
+ *               and the reduction code may calculate the reduction in an
+ *               arbitrary order. The calculation may be reordered because
+ *               of cache behavior or multithreading requirements.
+ * keepdims    : If true, leaves the reduction dimensions in the result
+ *               with size one.
+ * subok       : If true, the result uses the subclass of operand, otherwise
+ *               it is always a base class ndarray.
+ * assign_identity : If NULL, PyArray_InitializeReduceResult is used, otherwise
+ *               this function is called to initialize the result to
+ *               the reduction's unit.
+ * loop        : The loop which does the reduction.
+ * data        : Data which is passed to assign_identity and the inner loop.
+ * buffersize  : Buffer size for the iterator. For the default, pass in 0.
+ * funcname    : The name of the reduction function, for error messages.
  */
 NPY_NO_EXPORT PyArrayObject *
-PyArray_CreateReduceResult(PyArrayObject *operand, PyArrayObject *out,
-                    PyArray_Descr *dtype, npy_bool *axis_flags,
-                    int keepdims, int subok,
-                    const char *funcname);
+PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
+                      PyArrayObject *wheremask,
+                      PyArray_Descr *operand_dtype,
+                      PyArray_Descr *result_dtype,
+                      NPY_CASTING casting,
+                      npy_bool *axis_flags, int reorderable,
+                      int keepdims,
+                      int subok,
+                      PyArray_AssignReduceIdentityFunc *assign_identity,
+                      PyArray_ReduceLoopFunc *loop,
+                      void *data, npy_intp buffersize, const char *funcname);
 
 #endif
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 8de2dad39..3e0306bd2 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -41,6 +41,7 @@
 #include "numpy/arrayscalars.h"
 #include "lowlevel_strided_loops.h"
 #include "ufunc_type_resolution.h"
+#include "reduction.h"
 
 #include "ufunc_object.h"
 
@@ -2624,13 +2625,13 @@ PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
         return NULL;
     }
 
-    result = PyArray_ReduceWrapper(arr, out, NULL, dtype, dtype,
-                                NPY_UNSAFE_CASTING,
-                                axis_flags, reorderable,
-                                keepdims, 0,
-                                assign_identity,
-                                reduce_loop,
-                                ufunc, buffersize, ufunc_name);
+    result = PyUFunc_ReduceWrapper(arr, out, NULL, dtype, dtype,
+                                   NPY_UNSAFE_CASTING,
+                                   axis_flags, reorderable,
+                                   keepdims, 0,
+                                   assign_identity,
+                                   reduce_loop,
+                                   ufunc, buffersize, ufunc_name);
 
     Py_DECREF(dtype);
     Py_XDECREF(errobj);
diff --git a/numpy/core/src/umath/umathmodule_onefile.c b/numpy/core/src/umath/umathmodule_onefile.c
index 2255daf76..62c7727e8 100644
--- a/numpy/core/src/umath/umathmodule_onefile.c
+++ b/numpy/core/src/umath/umathmodule_onefile.c
@@ -2,4 +2,5 @@
 
 #include "ufunc_object.c"
 #include "ufunc_type_resolution.c"
+#include "reduction.c"
 #include "umathmodule.c"
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 2cedb87e9..480b43811 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -595,17 +595,6 @@ class TestNonzero(TestCase):
         assert_equal(np.nonzero(x['a'].T), ([0,1,1,2],[1,1,2,0]))
         assert_equal(np.nonzero(x['b'].T), ([0,0,1,2,2],[0,1,2,0,2]))
 
-    def test_count_nonzero_axis(self):
-        a = array([[0,1,0],[2,3,0]])
-        assert_equal(np.count_nonzero(a, axis=()), [[0,1,0],[1,1,0]])
-        assert_equal(np.count_nonzero(a, axis=0), [1,2,0])
-        assert_equal(np.count_nonzero(a, axis=1), [1,2])
-        assert_equal(np.count_nonzero(a, axis=(0,1)), 3)
-
-        res = array([-1,-1,-1], dtype='i2')
-        np.count_nonzero(a, axis=0, out=res)
-        assert_equal(res, [1,2,0])
-
 class TestIndex(TestCase):
     def test_boolean(self):
         a = rand(3,5,8)