8 files changed, 143 insertions, 50 deletions
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index 8596b9c9c..b402d2150 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -896,10 +896,21 @@ add_newdoc('numpy.core.multiarray', 'count_nonzero',
     ----------
     a : array_like
         The array for which to count non-zeros.
+    axis : None or int or tuple of ints, optional
+        Axis or axes along which a reduction is performed.
+        The default (`axis` = None) is to perform a reduction over all
+        the dimensions of the input array.
+    skipna : bool, optional
+        If this is set to True, any NA elements in the array are skipped
+        instead of propagating.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
-    count : int
+    count : int or array of int
         Number of non-zero values in the array.
 
     See Also
@@ -910,14 +921,18 @@ add_newdoc('numpy.core.multiarray', 'count_nonzero',
     --------
     >>> np.count_nonzero(np.eye(4))
     4
-
     >>> np.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]])
     5
+    >>> np.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]], axis=1)
+    array([2, 3])
+    >>> np.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]], axis=1, keepdims=True)
+    array([[2],
+           [3]])
     """)
 
 add_newdoc('numpy.core.multiarray', 'count_reduce_items',
     """
-    count_reduce_items(arr, axis=None, skipna=False)
+    count_reduce_items(arr, axis=None, skipna=False, keepdims=False)
 
     Counts the number of items a reduction with the same `axis`
     and `skipna` parameter values would use. The purpose of this
@@ -941,6 +956,10 @@ add_newdoc('numpy.core.multiarray', 'count_reduce_items',
         counted. The only time this function does any actual counting
         instead of a cheap multiply of a few sizes is when `skipna` is
         true and `arr` has an NA mask.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `arr`.
 
     Returns
     -------
@@ -5330,7 +5349,7 @@ add_newdoc('numpy.core', 'ufunc', ('types',
 
 add_newdoc('numpy.core', 'ufunc', ('reduce',
     """
-    reduce(a, axis=0, dtype=None, out=None, skipna=False)
+    reduce(a, axis=0, dtype=None, out=None, skipna=False, keepdims=False)
 
     Reduces `a`'s dimension by one, by applying ufunc along one axis.
 
@@ -5380,6 +5399,10 @@ add_newdoc('numpy.core', 'ufunc', ('reduce',
         were not counted in the array. The default, False, causes the
         NA values to propagate, so if any element in a set of elements
         being reduced is NA, the result will be NA.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index 2364fbfe8..4682386d7 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -2336,7 +2336,7 @@ def round_(a, decimals=0, out=None):
     return round(decimals, out)
 
 
-def mean(a, axis=None, dtype=None, out=None):
+def mean(a, axis=None, dtype=None, out=None, skipna=False, keepdims=False):
     """
     Compute the arithmetic mean along the specified axis.
 
@@ -2361,6 +2361,13 @@ def mean(a, axis=None, dtype=None, out=None):
         is ``None``; if provided, it must have the same shape as the
         expected output, but the type will be cast if necessary.
         See `doc.ufuncs` for details.
+    skipna : bool, optional
+        If this is set to True, skips any NA values during calculation
+        instead of propagating them.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
@@ -2407,11 +2414,29 @@ def mean(a, axis=None, dtype=None, out=None):
     0.55000000074505806
 
     """
-    try:
-        mean = a.mean
-    except AttributeError:
-        return _wrapit(a, 'mean', axis, dtype, out)
-    return mean(axis, dtype, out)
+    if not (type(a) is mu.ndarray):
+        try:
+            mean = a.mean
+            return mean(axis=axis, dtype=dtype, out=out)
+        except AttributeError:
+            pass
+
+    arr = asarray(a)
+
+    # Upgrade bool, unsigned int, and int to float64
+    if dtype is None and arr.dtype.kind in ['b','u','i']:
+        ret = um.add.reduce(arr, axis=axis, dtype='f8',
+                            out=out, skipna=skipna, keepdims=keepdims)
+    else:
+        ret = um.add.reduce(arr, axis=axis, dtype=dtype,
+                            out=out, skipna=skipna, keepdims=keepdims)
+    rcount = mu.count_reduce_items(arr, axis=axis,
+                            skipna=skipna, keepdims=keepdims)
+    if type(ret) is mu.ndarray:
+        um.true_divide(ret, rcount, out=ret, casting='unsafe')
+    else:
+        ret = ret / float(rcount)
+    return ret
 
 
 def std(a, axis=None, dtype=None, out=None, ddof=0):
@@ -2458,14 +2483,15 @@ def std(a, axis=None, dtype=None, out=None, ddof=0):
     The standard deviation is the square root of the average of the squared
     deviations from the mean, i.e., ``std = sqrt(mean(abs(x - x.mean())**2))``.
 
-    The average squared deviation is normally calculated as ``x.sum() / N``, where
-    ``N = len(x)``.  If, however, `ddof` is specified, the divisor ``N - ddof``
-    is used instead. In standard statistical practice, ``ddof=1`` provides an
-    unbiased estimator of the variance of the infinite population. ``ddof=0``
-    provides a maximum likelihood estimate of the variance for normally
-    distributed variables. The standard deviation computed in this function
-    is the square root of the estimated variance, so even with ``ddof=1``, it
-    will not be an unbiased estimate of the standard deviation per se.
+    The average squared deviation is normally calculated as
+    ``x.sum() / N``, where ``N = len(x)``.  If, however, `ddof` is specified,
+    the divisor ``N - ddof`` is used instead. In standard statistical
+    practice, ``ddof=1`` provides an unbiased estimator of the variance
+    of the infinite population. ``ddof=0`` provides a maximum likelihood
+    estimate of the variance for normally distributed variables. The
+    standard deviation computed in this function is the square root of
+    the estimated variance, so even with ``ddof=1``, it will not be an
+    unbiased estimate of the standard deviation per se.
 
     Note that, for complex numbers, `std` takes the absolute
     value before squaring, so that the result is always real and nonnegative.
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 5257aec8e..ec0b1aa46 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -1984,7 +1984,7 @@ reduce_count_nonzero_masked_inner_loop(NpyIter *iter,
  */
 NPY_NO_EXPORT PyObject *
 PyArray_ReduceCountNonzero(PyArrayObject *arr, PyArrayObject *out,
-                        npy_bool *axis_flags, int skipna)
+                        npy_bool *axis_flags, int skipna, int keepdims)
 {
     PyArray_NonzeroFunc *nonzero;
     PyArrayObject *result;
@@ -2005,7 +2005,7 @@ PyArray_ReduceCountNonzero(PyArrayObject *arr, PyArrayObject *out,
 
     result = PyArray_ReduceWrapper(arr, out,
                             PyArray_DESCR(arr), dtype,
-                            axis_flags, skipna,
+                            axis_flags, skipna, keepdims,
                             &assign_reduce_unit_zero,
                             &reduce_count_nonzero_inner_loop,
                             &reduce_count_nonzero_masked_inner_loop,
diff --git a/numpy/core/src/multiarray/item_selection.h b/numpy/core/src/multiarray/item_selection.h
index 722aaa5d1..5c1741aaf 100644
--- a/numpy/core/src/multiarray/item_selection.h
+++ b/numpy/core/src/multiarray/item_selection.h
@@ -35,7 +35,7 @@ PyArray_MultiIndexSetItem(PyArrayObject *self, npy_intp *multi_index,
  */
 NPY_NO_EXPORT PyObject *
 PyArray_ReduceCountNonzero(PyArrayObject *arr, PyArrayObject *out,
-                        npy_bool *axis_flags, int skipna);
+                        npy_bool *axis_flags, int skipna, int keepdims);
 
 
 
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index e21369e6f..2afb0d84a 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -2166,19 +2166,21 @@ array_zeros(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 static PyObject *
 array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
 {
-    static char *kwlist[] = {"arr", "axis", "out", "skipna", NULL};
+    static char *kwlist[] = {"arr", "axis", "out", "skipna", "keepdims", NULL};
 
     PyObject *array_in, *axis_in = NULL, *out_in = NULL;
     PyObject *ret = NULL;
     PyArrayObject *array, *out = NULL;
     npy_bool axis_flags[NPY_MAXDIMS];
-    int skipna = 0;
+    int skipna = 0, keepdims = 0;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OOi:count_nonzero", kwlist,
+    if (!PyArg_ParseTupleAndKeywords(args, kwds,
+                                "O|OOii:count_nonzero", kwlist,
                                 &array_in,
                                 &axis_in,
                                 &out_in,
-                                &skipna)) {
+                                &skipna,
+                                &keepdims)) {
         return NULL;
     }
 
@@ -2204,7 +2206,7 @@ array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
         }
     }
 
-    ret = PyArray_ReduceCountNonzero(array, out, axis_flags, skipna);
+    ret = PyArray_ReduceCountNonzero(array, out, axis_flags, skipna, keepdims);
 
     Py_DECREF(array);
 
@@ -2214,19 +2216,20 @@ array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
 static PyObject *
 array_count_reduce_items(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
 {
-    static char *kwlist[] = {"arr", "axis", "skipna", NULL};
+    static char *kwlist[] = {"arr", "axis", "skipna", "keepdims", NULL};
 
     PyObject *array_in, *axis_in = NULL;
     PyObject *ret = NULL;
     PyArrayObject *array;
     npy_bool axis_flags[NPY_MAXDIMS];
-    int skipna = 0;
+    int skipna = 0, keepdims = 0;
 
     if (!PyArg_ParseTupleAndKeywords(args, kwds,
-                                "O|Oi:count_reduce_items", kwlist,
+                                "O|Oii:count_reduce_items", kwlist,
                                 &array_in,
                                 &axis_in,
-                                &skipna)) {
+                                &skipna,
+                                &keepdims)) {
         return NULL;
     }
 
@@ -2242,7 +2245,7 @@ array_count_reduce_items(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *k
         return NULL;
     }
 
-    ret = PyArray_CountReduceItems(array, axis_flags, skipna);
+    ret = PyArray_CountReduceItems(array, axis_flags, skipna, keepdims);
 
     Py_DECREF(array);
 
diff --git a/numpy/core/src/multiarray/reduction.c b/numpy/core/src/multiarray/reduction.c
index 1866e3cba..f305b8e06 100644
--- a/numpy/core/src/multiarray/reduction.c
+++ b/numpy/core/src/multiarray/reduction.c
@@ -80,7 +80,7 @@ allocate_reduce_result(PyArrayObject *arr, npy_bool *axis_flags,
  */
 static PyArrayObject *
 conform_reduce_result(int ndim, npy_bool *axis_flags,
-                        PyArrayObject *out, const char *funcname)
+                    PyArrayObject *out, int keepdims, const char *funcname)
 {
     npy_intp strides[NPY_MAXDIMS], shape[NPY_MAXDIMS];
     npy_intp *strides_out = PyArray_STRIDES(out);
@@ -89,6 +89,35 @@ conform_reduce_result(int ndim, npy_bool *axis_flags,
     PyArray_Descr *dtype;
     PyArrayObject_fieldaccess *ret;
 
+    /*
+     * If the 'keepdims' parameter is true, do a simpler validation and
+     * return a new reference to 'out'.
+     */
+    if (keepdims) {
+        if (PyArray_NDIM(out) != ndim) {
+            PyErr_Format(PyExc_ValueError,
+                    "output parameter for reduction operation %s "
+                    "has the wrong number of dimensions (must match "
+                    "the operand's when keepdims=True)", funcname);
+            return NULL;
+        }
+
+        for (idim = 0; idim < ndim; ++idim) {
+            if (axis_flags[idim]) {
+                if (shape_out[idim] != 1) {
+                    PyErr_Format(PyExc_ValueError,
+                            "output parameter for reduction operation %s "
+                            "has a reduction dimension not equal to one "
+                            "(required when keepdims=True)", funcname);
+                    return NULL;
+                }
+            }
+        }
+
+        Py_INCREF(out);
+        return out;
+    }
+
     /* Construct the strides and shape */
     idim_out = 0;
     for (idim = 0; idim < ndim; ++idim) {
@@ -180,7 +209,7 @@ conform_reduce_result(int ndim, npy_bool *axis_flags,
 NPY_NO_EXPORT PyArrayObject *
 PyArray_CreateReduceResult(PyArrayObject *operand, PyArrayObject *out,
                     PyArray_Descr *dtype, npy_bool *axis_flags,
-                    int need_namask, const char *funcname)
+                    int need_namask, int keepdims, const char *funcname)
 {
     PyArrayObject *result;
 
@@ -209,7 +238,7 @@ PyArray_CreateReduceResult(PyArrayObject *operand, PyArrayObject *out,
         }
 
         result = conform_reduce_result(PyArray_NDIM(operand), axis_flags,
-                                        out, funcname);
+                                        out, keepdims, funcname);
     }
 
     return result;
@@ -413,6 +442,8 @@ PyArray_InitializeReduceResult(
  * result_dtype : The dtype the inner loop expects for the result.
  * axis_flags  : Flags indicating the reduction axes of 'operand'.
  * skipna      : If true, NAs are skipped instead of propagating.
+ * keepdims    : If true, leaves the reduction dimensions in the result
+ *               with size one.
  * assign_unit : If NULL, PyArray_InitializeReduceResult is used, otherwise
  *               this function is called to initialize the result to
  *               the reduction's unit.
@@ -426,7 +457,7 @@ NPY_NO_EXPORT PyArrayObject *
 PyArray_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
                         PyArray_Descr *operand_dtype,
                         PyArray_Descr *result_dtype,
-                        npy_bool *axis_flags, int skipna,
+                        npy_bool *axis_flags, int skipna, int keepdims,
                         PyArray_AssignReduceUnitFunc *assign_unit,
                         PyArray_ReduceInnerLoopFunc *inner_loop,
                         PyArray_ReduceInnerLoopFunc *masked_inner_loop,
@@ -466,7 +497,7 @@ PyArray_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
     Py_INCREF(result_dtype);
     result = PyArray_CreateReduceResult(operand, out,
                             result_dtype, axis_flags, !skipna && use_maskna,
-                            funcname);
+                            keepdims, funcname);
     if (result == NULL) {
         goto fail;
     }
@@ -617,7 +648,9 @@ PyArray_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
 finish:
     /* Strip out the extra 'one' dimensions in the result */
     if (out == NULL) {
-        PyArray_RemoveAxesInPlace(result, axis_flags);
+        if (!keepdims) {
+            PyArray_RemoveAxesInPlace(result, axis_flags);
+        }
     }
     else {
         Py_DECREF(result);
@@ -651,7 +684,7 @@ fail:
  */
 NPY_NO_EXPORT PyObject *
 PyArray_CountReduceItems(PyArrayObject *operand,
-                            npy_bool *axis_flags, int skipna)
+                            npy_bool *axis_flags, int skipna, int keepdims)
 {
     int idim, ndim = PyArray_NDIM(operand);
 
@@ -715,7 +748,7 @@ PyArray_CountReduceItems(PyArrayObject *operand,
         }
         result = PyArray_CreateReduceResult(operand, NULL,
                                 result_dtype, axis_flags, 0,
-                                "count_reduce_items");
+                                keepdims, "count_reduce_items");
         if (result == NULL) {
             return NULL;
         }
@@ -767,7 +800,9 @@ PyArray_CountReduceItems(PyArrayObject *operand,
                                     result_data, result_strides_it);
 
         /* Remove the reduction axes and return the result */
-        PyArray_RemoveAxesInPlace(result, axis_flags);
+        if (!keepdims) {
+            PyArray_RemoveAxesInPlace(result, axis_flags);
+        }
         return PyArray_Return(result);
     }
 }
diff --git a/numpy/core/src/multiarray/reduction.h b/numpy/core/src/multiarray/reduction.h
index 849db256b..e2a9806c1 100644
--- a/numpy/core/src/multiarray/reduction.h
+++ b/numpy/core/src/multiarray/reduction.h
@@ -94,6 +94,8 @@ typedef void (PyArray_ReduceInnerLoopFunc)(NpyIter *iter,
  * result_dtype : The dtype the inner loop expects for the result.
  * axis_flags  : Flags indicating the reduction axes of 'operand'.
  * skipna      : If true, NAs are skipped instead of propagating.
+ * keepdims    : If true, leaves the reduction dimensions in the result
+ *               with size one.
  * assign_unit : If NULL, PyArray_InitializeReduceResult is used, otherwise
  *               this function is called to initialize the result to
  *               the reduction's unit.
@@ -107,7 +109,7 @@ NPY_NO_EXPORT PyArrayObject *
 PyArray_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
                         PyArray_Descr *operand_dtype,
                         PyArray_Descr *result_dtype,
-                        npy_bool *axis_flags, int skipna,
+                        npy_bool *axis_flags, int skipna, int keepdims,
                         PyArray_AssignReduceUnitFunc *assign_unit,
                         PyArray_ReduceInnerLoopFunc *inner_loop,
                         PyArray_ReduceInnerLoopFunc *masked_inner_loop,
@@ -127,7 +129,7 @@ PyArray_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
  */
 NPY_NO_EXPORT PyObject *
 PyArray_CountReduceItems(PyArrayObject *operand,
-                            npy_bool *axis_flags, int skipna);
+                            npy_bool *axis_flags, int skipna, int keepdims);
 
 
 #endif
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index d845140ad..b80d3b5af 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -2610,7 +2610,7 @@ initialize_reduce_result(int identity, PyArrayObject *result,
  */
 static PyArrayObject *
 PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
-        int naxes, int *axes, int otype, int skipna)
+        int naxes, int *axes, int otype, int skipna, int keepdims)
 {
     int iaxes, ndim, retcode;
     PyArray_Descr *otype_dtype = NULL;
@@ -2723,7 +2723,7 @@ PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
     Py_XINCREF(otype_dtype);
     result = PyArray_CreateReduceResult(arr, out,
                             otype_dtype, axis_flags, !skipna && use_maskna,
-                            ufunc_name);
+                            keepdims, ufunc_name);
     if (result == NULL) {
         return NULL;
     }
@@ -2911,7 +2911,9 @@ finish:
 
     /* Strip out the extra 'one' dimensions in the result */
     if (out == NULL) {
-        PyArray_RemoveAxesInPlace(result, axis_flags);
+        if (!keepdims) {
+            PyArray_RemoveAxesInPlace(result, axis_flags);
+        }
     }
     else {
         Py_DECREF(result);
@@ -3708,8 +3710,9 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args,
     PyArrayObject *indices = NULL;
     PyArray_Descr *otype = NULL;
     PyArrayObject *out = NULL;
-    int skipna = 0;
-    static char *kwlist1[] = {"array", "axis", "dtype", "out", "skipna", NULL};
+    int skipna = 0, keepdims = 0;
+    static char *kwlist1[] = {"array", "axis", "dtype",
+                                "out", "skipna", "keepdims", NULL};
     static char *kwlist2[] = {"array", "indices", "axis",
                                 "dtype", "out", "skipna", NULL};
     static char *_reduce_type[] = {"reduce", "accumulate", "reduceat", NULL};
@@ -3758,12 +3761,13 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args,
         }
     }
     else {
-        if(!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO&O&i", kwlist1,
+        if(!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO&O&ii", kwlist1,
                                         &op,
                                         &axes_in,
                                         PyArray_DescrConverter2, &otype,
                                         PyArray_OutputConverter, &out,
-                                        &skipna)) {
+                                        &skipna,
+                                        &keepdims)) {
             Py_XDECREF(otype);
             return NULL;
         }
@@ -3930,7 +3934,7 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args,
     switch(operation) {
     case UFUNC_REDUCE:
         ret = PyUFunc_Reduce(self, mp, out, naxes, axes,
-                                              otype->type_num, skipna);
+                                          otype->type_num, skipna, keepdims);
         break;
     case UFUNC_ACCUMULATE:
         if (naxes != 1) {