ENH: missingdata: Towards making count_nonzero a full-featured reduction operation

author: Mark Wiebe <mwwiebe@gmail.com> 2011-08-17 14:05:38 -0700
committer: Charles Harris <charlesr.harris@gmail.com> 2011-08-27 07:26:55 -0600
commit: 6bfd819a0897caf6e6db244930c40ed0d17b9e62 (patch)
tree: 2fb8316f2aef0905ac687e15bd39c18b125e7479
parent: a1faa1b6883c47333508a0476c1304b0a8a3f64e (diff)
download: numpy-6bfd819a0897caf6e6db244930c40ed0d17b9e62.tar.gz
5 files changed, 248 insertions, 41 deletions
diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c
index 461bd565f..5e59bf306 100644
--- a/numpy/core/src/multiarray/conversion_utils.c
+++ b/numpy/core/src/multiarray/conversion_utils.c
@@ -177,6 +177,8 @@ PyArray_BufferConverter(PyObject *obj, PyArray_Chunk *buf)
 
 /*NUMPY_API
  * Get axis from an object (possibly None) -- a converter function,
+ *
+ * See also PyArray_ConvertMultiAxis, which also handles a tuple of axes.
  */
 NPY_NO_EXPORT int
 PyArray_AxisConverter(PyObject *obj, int *axis)
@@ -193,6 +195,90 @@ PyArray_AxisConverter(PyObject *obj, int *axis)
     return PY_SUCCEED;
 }
 
+/*
+ * Converts the 'axis_in' parameter into an ndim-length C-array of boolean
+ * flags in 'out_axis_flags', True for each axis specified.
+ *
+ * If axis_in is None or NULL, every flag is True. If it is a tuple, each
+ * axis listed is set; otherwise it is read as a single integer axis.
+ * Negative axes have ndim added. Returns NPY_SUCCEED or NPY_FAIL.
+ */
+NPY_NO_EXPORT int
+PyArray_ConvertMultiAxis(PyObject *axis_in, int ndim, npy_bool *out_axis_flags)
+{
+    /* None (or a NULL pointer) means all of the axes */
+    if (axis_in == Py_None || axis_in == NULL) {
+        memset(out_axis_flags, 1, ndim);
+        return NPY_SUCCEED;
+    }
+    /* A tuple listing the axes to flag; repeated entries are rejected */
+    else if (PyTuple_Check(axis_in)) {
+        int i, naxes;
+
+        memset(out_axis_flags, 0, ndim);
+
+        naxes = PyTuple_Size(axis_in);
+        if (naxes < 0) {
+            return NPY_FAIL;
+        }
+        for (i = 0; i < naxes; ++i) {
+            PyObject *tmp = PyTuple_GET_ITEM(axis_in, i);
+            long axis = PyInt_AsLong(tmp);
+            if (axis == -1 && PyErr_Occurred()) {
+                return NPY_FAIL;
+            }
+            if (axis < 0) {
+                axis += ndim;
+            }
+            if (axis < 0 || axis >= ndim) {
+                PyErr_SetString(PyExc_ValueError,
+                        "'axis' entry is out of bounds");
+                return NPY_FAIL;
+            }
+            if (out_axis_flags[axis]) {
+                PyErr_SetString(PyExc_ValueError,
+                        "duplicate value in 'axis'");
+                return NPY_FAIL;
+            }
+            out_axis_flags[axis] = 1;
+        }
+
+        return NPY_SUCCEED;
+    }
+    /* Try to interpret axis as an integer */
+    else {
+        long axis;
+
+        memset(out_axis_flags, 0, ndim);
+
+        axis = PyInt_AsLong(axis_in);
+        /* TODO: PyNumber_Index would be good to use here */
+        if (axis == -1 && PyErr_Occurred()) {
+            return NPY_FAIL;
+        }
+        if (axis < 0) {
+            axis += ndim;
+        }
+        /*
+         * Special case letting axis=0 slip through for scalars (ndim == 0),
+         * for backwards compatibility reasons; no flag is written for it.
+         */
+        if (axis == 0 && ndim == 0) {
+            return NPY_SUCCEED;
+        }
+
+        if (axis < 0 || axis >= ndim) {
+            PyErr_SetString(PyExc_ValueError,
+                    "'axis' entry is out of bounds");
+            return NPY_FAIL;
+        }
+
+        out_axis_flags[axis] = 1;
+
+        return NPY_SUCCEED;
+    }
+}
+
 /*NUMPY_API
  * Convert an object to true / false
  */
diff --git a/numpy/core/src/multiarray/conversion_utils.h b/numpy/core/src/multiarray/conversion_utils.h
index 64b26b23e..3ebd6ebf0 100644
--- a/numpy/core/src/multiarray/conversion_utils.h
+++ b/numpy/core/src/multiarray/conversion_utils.h
@@ -40,4 +40,15 @@ PyArray_TypestrConvert(int itemsize, int gentype);
 NPY_NO_EXPORT PyObject *
 PyArray_IntTupleFromIntp(int len, intp *vals);
 
+/*
+ * Converts the 'axis_in' parameter into an ndim-length C-array of boolean
+ * flags in 'out_axis_flags', True for each axis specified.
+ *
+ * If axis_in is None or NULL, every flag is True. If it is a tuple, each
+ * axis listed is set; otherwise it is read as a single integer axis.
+ * Negative axes have ndim added. Returns NPY_SUCCEED or NPY_FAIL.
+ */
+NPY_NO_EXPORT int
+PyArray_ConvertMultiAxis(PyObject *axis_in, int ndim, npy_bool *out_axis_flags);
+
 #endif
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index ff217be62..28f243600 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -1891,6 +1891,74 @@ count_boolean_trues(int ndim, char *data, npy_intp *ashape, npy_intp *astrides)
     return count;
 }
 
+/*
+ * Intended as a full reduction version of PyArray_CountNonzero with an
+ * 'out' parameter, 'axis_flags' selection and 'skipna' (skips NA values).
+ * NOTE(review): as shown, the body ends without a return statement and
+ * never fills or releases 'result'; this looks unfinished - confirm.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_ReduceCountNonzero(PyArrayObject *arr, PyArrayObject *out,
+                        npy_bool *axis_flags, int skipna)
+{
+    PyArray_NonzeroFunc *nonzero;
+    int ndim, use_maskna;
+    PyArray_Descr *dtype;
+    PyArrayObject *result = NULL;
+
+    nonzero = PyArray_DESCR(arr)->f->nonzero;
+    if (nonzero == NULL) {
+        PyErr_SetString(PyExc_TypeError,
+                    "Cannot count the number of non-zeros for a dtype "
+                    "which doesn't have a 'nonzero' function");
+        return NULL;
+    }
+
+    ndim = PyArray_NDIM(arr);
+    use_maskna = PyArray_HASMASKNA(arr);
+
+    /*
+     * If 'arr' has an NA mask, but 'out' doesn't, validate that 'arr'
+     * contains no NA values so we can ignore the mask entirely.
+     */
+    if (use_maskna && !skipna && out != NULL && !PyArray_HASMASKNA(out)) {
+        if (PyArray_ContainsNA(arr)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Cannot assign NA value to an array which "
+                    "does not support NAs");
+            return NULL;
+        }
+        else {
+            use_maskna = 0;
+        }
+    }
+
+    /* This reference gets stolen by PyArray_CreateReduceResult */
+    dtype = PyArray_DescrFromType(NPY_INTP);
+    if (dtype == NULL) {
+        return NULL;
+    }
+    /* This either conforms 'out' to the ndim of 'arr', or allocates
+     * a new array appropriate for this reduction.
+     */
+    result = PyArray_CreateReduceResult(arr, out,
+                            dtype, axis_flags, !skipna && use_maskna,
+                            "count_nonzero");
+    if (result == NULL) {
+        return NULL;
+    }
+
+    if (use_maskna) {
+        /*
+         * Do the reduction on the NA mask before the data. This way
+         * we can avoid modifying the outputs which end up masked, obeying
+         * the required NA masking semantics. (The branch below is empty.)
+         */
+        if (!skipna) {
+        }
+    }
+}
+
 /*NUMPY_API
  * Counts the number of non-zero elements in the array. Raises
  * an error if the array contains an NA.
diff --git a/numpy/core/src/multiarray/na_mask.c b/numpy/core/src/multiarray/na_mask.c
index 0cb05beab..07aedfe9e 100644
--- a/numpy/core/src/multiarray/na_mask.c
+++ b/numpy/core/src/multiarray/na_mask.c
@@ -496,8 +496,7 @@ PyArray_IsNA(PyObject *obj)
     }
 }
 
-/*NUMPY_API
- *
+/*
  * This function performs a reduction on the masks for an array.
  * The masks are provided in raw form, with their strides conformed
  * for the reduction.
@@ -511,8 +510,8 @@ PyArray_IsNA(PyObject *obj)
  *
  * Returns 0 on success, -1 on failure.
  */
-NPY_NO_EXPORT int
-PyArray_ReduceMaskNAArray(int ndim, npy_intp *shape,
+static int
+raw_reduce_maskna_array(int ndim, npy_intp *shape,
             PyArray_Descr *src_dtype, char *src_data, npy_intp *src_strides,
             PyArray_Descr *dst_dtype, char *dst_data, npy_intp *dst_strides)
 {
@@ -600,6 +599,70 @@ PyArray_ReduceMaskNAArray(int ndim, npy_intp *shape,
     return 0;
 }
 
+/*NUMPY_API
+ *
+ * This function performs a reduction on the masks for an array, for
+ * use with a reduction where 'skipna=False'. It validates both arrays,
+ * then hands their NA mask dtype/data/strides to raw_reduce_maskna_array.
+ *
+ * result: The result array, which must have the same 'ndim' as
+ *         'operand', with size one along every reduction axis and the
+ *         operand's size elsewhere. This array must have an NA mask.
+ * operand: The operand for which the reduction is being done. This array
+ *          must have an NA mask.
+ *
+ * Returns 0 on success, -1 on failure (ValueError if validation fails).
+ */
+NPY_NO_EXPORT int
+PyArray_ReduceMaskNAArray(PyArrayObject *result, PyArrayObject *operand)
+{
+    int idim, ndim;
+    npy_intp result_strides[NPY_MAXDIMS];
+    npy_intp *result_shape, *operand_shape;
+    npy_intp *result_maskna_strides;
+
+    ndim = PyArray_NDIM(operand);
+    if (ndim != PyArray_NDIM(result)) {
+        PyErr_SetString(PyExc_ValueError,
+                "result and operand must have the same 'ndim' in "
+                "ReduceMaskNAArray");
+        return -1;
+    }
+    if (!PyArray_HASMASKNA(result) || !PyArray_HASMASKNA(operand)) {
+        PyErr_SetString(PyExc_ValueError,
+                "both result and operand must have NA masks in "
+                "ReduceMaskNAArray");
+        return -1;
+    }
+
+    /* Copy result's mask strides, using 0 wherever 'result' has size 1 */
+    result_shape = PyArray_SHAPE(result);
+    operand_shape = PyArray_SHAPE(operand);
+    result_maskna_strides = PyArray_MASKNA_STRIDES(result);
+    for (idim = 0; idim < ndim; ++idim) {
+        if (result_shape[idim] == 1) {
+            result_strides[idim] = 0;
+        }
+        else if (result_shape[idim] != operand_shape[idim]) {
+            PyErr_SetString(PyExc_ValueError,
+                "the result shape must match the operand shape wherever "
+                "it is not 1 in ReduceMaskNAArray");
+            return -1;
+        }
+        else {
+            result_strides[idim] = result_maskna_strides[idim];
+        }
+    }
+
+    return raw_reduce_maskna_array(ndim, PyArray_DIMS(operand),
+                    PyArray_MASKNA_DTYPE(operand),
+                    PyArray_MASKNA_DATA(operand),
+                    PyArray_MASKNA_STRIDES(operand),
+                    PyArray_MASKNA_DTYPE(result),
+                    PyArray_MASKNA_DATA(result),
+                    result_strides);
+}
+
 static void
 _strided_bool_mask_inversion(char *dst, npy_intp dst_stride,
                             char *src, npy_intp src_stride,
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index a4f9bb105..c64563950 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -2505,6 +2505,7 @@ get_masked_binary_op_function(PyUFuncObject *self, PyArrayObject *arr,
     PyArrayObject *op[3] = {arr, arr, NULL};
     PyArray_Descr *dtype[3] = {NULL, NULL, NULL};
     PyObject *type_tup = NULL;
+    char *ufunc_name = self->name ? self->name : "(unknown)";
 
     NPY_UF_DBG_PRINT1("Getting masked binary op function for type number %d\n",
                                 *otype);
@@ -2546,9 +2547,9 @@ get_masked_binary_op_function(PyUFuncObject *self, PyArrayObject *arr,
         for (i = 0; i < 3; ++i) {
             Py_DECREF(dtype[i]);
         }
-        PyErr_SetString(PyExc_RuntimeError,
+        PyErr_Format(PyExc_RuntimeError,
                 "could not find a masked binary loop appropriate for "
-                "reduce ufunc");
+                "reduce ufunc %s", ufunc_name);
         return -1;
     }
 
@@ -2607,7 +2608,7 @@ initialize_reduce_result(int identity, PyArrayObject *result,
  * The axes must already be bounds-checked by the calling function,
  * this function does not validate them.
  */
-static PyObject *
+static PyArrayObject *
 PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
         int naxes, int *axes, int otype, int skipna)
 {
@@ -2659,17 +2660,15 @@ PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
     use_maskna = PyArray_HASMASKNA(arr);
 
     /* Detect whether to ignore the MASKNA */
-    if (use_maskna) {
-        if (!skipna && out != NULL && !PyArray_HASMASKNA(out)) {
-            if (PyArray_ContainsNA(arr)) {
-                PyErr_SetString(PyExc_ValueError,
-                        "Cannot assign NA value to an array which "
-                        "does not support NAs");
-                return NULL;
-            }
-            else {
-                use_maskna = 0;
-            }
+    if (use_maskna && !skipna && out != NULL && !PyArray_HASMASKNA(out)) {
+        if (PyArray_ContainsNA(arr)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Cannot assign NA value to an array which "
+                    "does not support NAs");
+            return NULL;
+        }
+        else {
+            use_maskna = 0;
         }
     }
 
@@ -2737,27 +2736,7 @@ PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
          * the required NA masking semantics.
          */
         if (!skipna) {
-            int idim;
-            npy_intp result_strides[NPY_MAXDIMS];
-            /* Need to make sure the appropriate strides are 0 in 'result' */
-            for (idim = 0; idim < PyArray_NDIM(arr); ++idim) {
-                if (PyArray_DIMS(result)[idim] == 1) {
-                    result_strides[idim] = 0;
-                }
-                else {
-                    result_strides[idim] = PyArray_MASKNA_STRIDES(result)[idim];
-                }
-            }
-            if (!PyArray_HASMASKNA(arr) || !PyArray_HASMASKNA(result))
-                printf ("hasmaskna %d %d\n", PyArray_HASMASKNA(arr),
-                                PyArray_HASMASKNA(result));
-            if (PyArray_ReduceMaskNAArray(ndim, PyArray_DIMS(arr),
-                        PyArray_MASKNA_DTYPE(arr),
-                        PyArray_MASKNA_DATA(arr),
-                        PyArray_MASKNA_STRIDES(arr),
-                        PyArray_MASKNA_DTYPE(result),
-                        PyArray_MASKNA_DATA(result),
-                        result_strides) < 0) {
+            if (PyArray_ReduceMaskNAArray(result, arr) < 0) {
                 goto fail;
             }
 
@@ -2948,7 +2927,7 @@ finish:
     Py_XDECREF(arr_view);
     Py_XDECREF(otype_dtype);
     NPY_AUXDATA_FREE(maskedinnerloopdata);
-    return (PyObject *)result;
+    return result;
 
 fail:
     if (iter != NULL) {
@@ -3952,7 +3931,7 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args,
 
     switch(operation) {
     case UFUNC_REDUCE:
-        ret = (PyArrayObject *)PyUFunc_Reduce(self, mp, out, naxes, axes,
+        ret = PyUFunc_Reduce(self, mp, out, naxes, axes,
                                               otype->type_num, skipna);
         break;
     case UFUNC_ACCUMULATE:
author	Mark Wiebe <mwwiebe@gmail.com>	2011-08-17 14:05:38 -0700
committer	Charles Harris <charlesr.harris@gmail.com>	2011-08-27 07:26:55 -0600
commit	6bfd819a0897caf6e6db244930c40ed0d17b9e62 (patch)
tree	2fb8316f2aef0905ac687e15bd39c18b125e7479
parent	a1faa1b6883c47333508a0476c1304b0a8a3f64e (diff)
download	numpy-6bfd819a0897caf6e6db244930c40ed0d17b9e62.tar.gz