ENH: missingdata: Implement skipna= support for np.std and np.var

author: Mark Wiebe <mwwiebe@gmail.com> 2011-08-17 23:07:58 -0700
committer: Charles Harris <charlesr.harris@gmail.com> 2011-08-27 07:26:56 -0600
commit: a112fc4a6b28fbb85e1b0c6d423095d13cf7b226 (patch)
tree: 07ce0d495f708debcf76be16f7cfb66ea0a1ddb5 /numpy
parent: 0fa4f22fec4b19e2a8c1d93e5a1f955167c9addd (diff)
download: numpy-a112fc4a6b28fbb85e1b0c6d423095d13cf7b226.tar.gz
5 files changed, 183 insertions, 28 deletions
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index b402d2150..711b7de52 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -4194,7 +4194,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('sort',
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('squeeze',
     """
-    a.squeeze()
+    a.squeeze(axis=None)
 
     Remove single-dimensional entries from the shape of `a`.
 
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index 4682386d7..03cb427cd 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -868,7 +868,7 @@ def resize(a, new_shape):
     return reshape(a, new_shape)
 
 
-def squeeze(a):
+def squeeze(a, axis=None):
     """
     Remove single-dimensional entries from the shape of an array.
 
@@ -876,12 +876,17 @@ def squeeze(a):
     ----------
     a : array_like
         Input data.
+    axis : None or int or tuple of ints, optional
+        Selects a subset of the single-dimensional entries in the
+        shape. If an axis is selected with shape entry greater than
+        one, that axis is left untouched.
 
     Returns
     -------
     squeezed : ndarray
-        The input array, but with with all dimensions of length 1
-        removed.  Whenever possible, a view on `a` is returned.
+        The input array, but with all or a subset of the
+        dimensions of length 1 removed. This is always `a` itself
+        or a view into `a`.
 
     Examples
     --------
@@ -896,7 +901,7 @@ def squeeze(a):
         squeeze = a.squeeze
     except AttributeError:
         return _wrapit(a, 'squeeze')
-    return squeeze()
+    return squeeze(axis=axis)
 
 
 def diagonal(a, offset=0, axis1=0, axis2=1):
@@ -2432,14 +2437,15 @@ def mean(a, axis=None, dtype=None, out=None, skipna=False, keepdims=False):
                             out=out, skipna=skipna, keepdims=keepdims)
     rcount = mu.count_reduce_items(arr, axis=axis,
                             skipna=skipna, keepdims=keepdims)
-    if type(ret) is mu.ndarray:
+    if isinstance(ret, mu.ndarray):
         um.true_divide(ret, rcount, out=ret, casting='unsafe')
     else:
         ret = ret / float(rcount)
     return ret
 
 
-def std(a, axis=None, dtype=None, out=None, ddof=0):
+def std(a, axis=None, dtype=None, out=None, ddof=0,
+                            skipna=False, keepdims=False):
     """
     Compute the standard deviation along the specified axis.
 
@@ -2466,6 +2472,13 @@ def std(a, axis=None, dtype=None, out=None, ddof=0):
         Means Delta Degrees of Freedom.  The divisor used in calculations
         is ``N - ddof``, where ``N`` represents the number of elements.
         By default `ddof` is zero.
+    skipna : bool, optional
+        If this is set to True, skips any NA values during calculation
+        instead of propagating them.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
@@ -2526,14 +2539,25 @@ def std(a, axis=None, dtype=None, out=None, ddof=0):
     0.44999999925552653
 
     """
-    try:
-        std = a.std
-    except AttributeError:
-        return _wrapit(a, 'std', axis, dtype, out, ddof)
-    return std(axis, dtype, out, ddof)
+    if not (type(a) is mu.ndarray):
+        try:
+            std = a.std
+            return std(axis=axis, dtype=dtype, out=out, ddof=ddof)
+        except AttributeError:
+            pass
+
+    ret = var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
+                                skipna=skipna, keepdims=keepdims)
+
+    if isinstance(ret, mu.ndarray):
+        um.sqrt(ret, out=ret)
+    else:
+        ret = um.sqrt(ret)
 
+    return ret
 
-def var(a, axis=None, dtype=None, out=None, ddof=0):
+def var(a, axis=None, dtype=None, out=None, ddof=0,
+                            skipna=False, keepdims=False):
     """
     Compute the variance along the specified axis.
 
@@ -2561,6 +2585,13 @@ def var(a, axis=None, dtype=None, out=None, ddof=0):
         "Delta Degrees of Freedom": the divisor used in the calculation is
         ``N - ddof``, where ``N`` represents the number of elements. By
         default `ddof` is zero.
+    skipna : bool, optional
+        If this is set to True, skips any NA values during calculation
+        instead of propagating them.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
@@ -2600,9 +2631,9 @@ def var(a, axis=None, dtype=None, out=None, ddof=0):
     >>> a = np.array([[1,2],[3,4]])
     >>> np.var(a)
     1.25
-    >>> np.var(a,0)
+    >>> np.var(a, axis=0)
     array([ 1.,  1.])
-    >>> np.var(a,1)
+    >>> np.var(a, axis=1)
     array([ 0.25,  0.25])
 
     In single precision, var() can be inaccurate:
@@ -2613,7 +2644,7 @@ def var(a, axis=None, dtype=None, out=None, ddof=0):
     >>> np.var(a)
     0.20405951142311096
 
-    Computing the standard deviation in float64 is more accurate:
+    Computing the variance in float64 is more accurate:
 
     >>> np.var(a, dtype=np.float64)
     0.20249999932997387
@@ -2621,8 +2652,50 @@ def var(a, axis=None, dtype=None, out=None, ddof=0):
     0.20250000000000001
 
     """
-    try:
-        var = a.var
-    except AttributeError:
-        return _wrapit(a, 'var', axis, dtype, out, ddof)
-    return var(axis, dtype, out, ddof)
+    if not (type(a) is mu.ndarray):
+        try:
+            var = a.var
+            return var(axis=axis, dtype=dtype, out=out, ddof=ddof)
+        except AttributeError:
+            pass
+
+    arr = asarray(a)
+
+    # First compute the mean, saving 'rcount' for reuse later
+    if dtype is None and arr.dtype.kind in ['b','u','i']:
+        arrmean = um.add.reduce(arr, axis=axis, dtype='f8',
+                            skipna=skipna, keepdims=True)
+    else:
+        arrmean = um.add.reduce(arr, axis=axis, dtype=dtype,
+                            skipna=skipna, keepdims=True)
+    rcount = mu.count_reduce_items(arr, axis=axis,
+                            skipna=skipna, keepdims=True)
+    if isinstance(arrmean, mu.ndarray):
+        um.true_divide(arrmean, rcount, out=arrmean, casting='unsafe')
+    else:
+        arrmean = arrmean / float(rcount)
+
+    # arr - arrmean
+    x = arr - arrmean
+
+    # (arr - arrmean) ** 2
+    if arr.dtype.kind == 'c':
+        um.multiply(x, um.conjugate(x), out=x)
+        x = x.real
+    else:
+        um.multiply(x, x, out=x)
+
+    # add.reduce((arr - arrmean) ** 2, axis)
+    ret = um.add.reduce(x, axis=axis, dtype=dtype, out=out,
+                                skipna=skipna, keepdims=keepdims)
+
+    # add.reduce((arr - arrmean) ** 2, axis) / (n - ddof)
+    if not keepdims and isinstance(rcount, mu.ndarray):
+        rcount = rcount.squeeze(axis=axis)
+    rcount -= ddof
+    if isinstance(ret, mu.ndarray):
+        um.true_divide(ret, rcount, out=ret, casting='unsafe')
+    else:
+        ret = ret / float(rcount)
+
+    return ret
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 649ff734d..6a5a5620b 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -16,10 +16,12 @@
 #include "common.h"
 #include "ctors.h"
 #include "calculation.h"
-
-#include "methods.h"
 #include "convert_datatype.h"
 #include "item_selection.h"
+#include "conversion_utils.h"
+#include "shape.h"
+
+#include "methods.h"
 
 
 /* NpyArg_ParseKeywords
@@ -138,12 +140,28 @@ array_reshape(PyArrayObject *self, PyObject *args, PyObject *kwds)
 }
 
 static PyObject *
-array_squeeze(PyArrayObject *self, PyObject *args)
+array_squeeze(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
-    if (!PyArg_ParseTuple(args, "")) {
+    PyObject *axis_in = NULL;
+    npy_bool axis_flags[NPY_MAXDIMS];
+
+    static char *kwlist[] = {"axis", NULL};
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist,
+                                     &axis_in)) {
         return NULL;
     }
-    return PyArray_Squeeze(self);
+
+    if (axis_in == NULL) {
+        return PyArray_Squeeze(self);
+    }
+    else {
+        if (PyArray_ConvertMultiAxis(axis_in, PyArray_NDIM(self),
+                                            axis_flags) != NPY_SUCCEED) {
+            return NULL;
+        }
+
+        return PyArray_SqueezeSelected(self, axis_flags);
+    }
 }
 
 static PyObject *
@@ -160,8 +178,9 @@ array_view(PyArrayObject *self, PyObject *args, PyObject *kwds)
                                      &out_dtype,
                                      &out_type,
                                      &maskna,
-                                     &ownmaskna))
+                                     &ownmaskna)) {
         return NULL;
+    }
 
     /* If user specified a positional argument, guess whether it
        represents a type or a dtype for backward compatibility. */
@@ -2382,7 +2401,7 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = {
         METH_VARARGS | METH_KEYWORDS, NULL},
     {"squeeze",
         (PyCFunction)array_squeeze,
-        METH_VARARGS, NULL},
+        METH_VARARGS | METH_KEYWORDS, NULL},
     {"std",
         (PyCFunction)array_stddev,
         METH_VARARGS | METH_KEYWORDS, NULL},
diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c
index 496d32955..97b0b204a 100644
--- a/numpy/core/src/multiarray/shape.c
+++ b/numpy/core/src/multiarray/shape.c
@@ -708,6 +708,59 @@ PyArray_Squeeze(PyArrayObject *self)
     return (PyObject *)ret;
 }
 
+/*
+ * Just like PyArray_Squeeze, but allows the caller to select
+ * a subset of the size-one dimensions to squeeze out.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_SqueezeSelected(PyArrayObject *self, npy_bool *axis_flags)
+{
+    PyArrayObject *ret;
+    npy_bool unit_dims[NPY_MAXDIMS];
+    int idim, ndim, any_ones;
+    npy_intp *shape;
+
+    ndim = PyArray_NDIM(self);
+    shape = PyArray_SHAPE(self);
+
+    /* Select the requested axes of size one; others are left untouched */
+    any_ones = 0;
+    for (idim = 0; idim < ndim; ++idim) {
+        if (axis_flags[idim] != 0 && shape[idim] == 1) {
+            unit_dims[idim] = 1;
+            any_ones = 1;
+        }
+        else {
+            unit_dims[idim] = 0;
+        }
+    }
+
+    /* If there were no axes to squeeze out, return the same array */
+    if (!any_ones) {
+        Py_INCREF(self);
+        return (PyObject *)self;
+    }
+
+    ret = (PyArrayObject *)PyArray_View(self, NULL, &PyArray_Type);
+    if (ret == NULL) {
+        return NULL;
+    }
+
+    PyArray_RemoveAxesInPlace(ret, unit_dims);
+
+    /*
+     * If self is not a base class ndarray, call its
+     * __array_wrap__ method
+     */
+    if (Py_TYPE(self) != &PyArray_Type) {
+        PyArrayObject *tmp = PyArray_SubclassWrap(self, ret);
+        Py_DECREF(ret);
+        ret = tmp;
+    }
+
+    return (PyObject *)ret;
+}
+
 /*NUMPY_API
  * SwapAxes
  */
@@ -1196,6 +1249,10 @@ build_shape_string(npy_intp n, npy_intp *vals)
  * has a shape entry bigger than one, this effectively selects
  * index zero for that axis.
  *
+ * WARNING: If an axis flagged for removal has a shape equal to zero,
+ *          the array will point to invalid memory. The caller must
+ *          validate this!
+ *
  * For example, this can be used to remove the reduction axes
  * from a reduction result once its computation is complete.
  */
diff --git a/numpy/core/src/multiarray/shape.h b/numpy/core/src/multiarray/shape.h
index a293254a7..0451a463e 100644
--- a/numpy/core/src/multiarray/shape.h
+++ b/numpy/core/src/multiarray/shape.h
@@ -21,5 +21,11 @@ NPY_NO_EXPORT void
 PyArray_CreateMultiSortedStridePerm(int narrays, PyArrayObject **arrays,
                         int ndim, int *out_strideperm);
 
+/*
+ * Just like PyArray_Squeeze, but allows the caller to select
+ * a subset of the size-one dimensions to squeeze out.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_SqueezeSelected(PyArrayObject *self, npy_bool *axis_flags);
 
 #endif
author	Mark Wiebe <mwwiebe@gmail.com>	2011-08-17 23:07:58 -0700
committer	Charles Harris <charlesr.harris@gmail.com>	2011-08-27 07:26:56 -0600
commit	a112fc4a6b28fbb85e1b0c6d423095d13cf7b226 (patch)
tree	07ce0d495f708debcf76be16f7cfb66ea0a1ddb5 /numpy
parent	0fa4f22fec4b19e2a8c1d93e5a1f955167c9addd (diff)
download	numpy-a112fc4a6b28fbb85e1b0c6d423095d13cf7b226.tar.gz