From e73dd6a826ad1b0ccb0e1cbe4aacfeee644f390f Mon Sep 17 00:00:00 2001 From: Mark Wiebe Date: Sun, 31 Jul 2011 09:24:03 -0500 Subject: ENH: missingdata: Progress towards supporting ufunc reduce with NA masks --- numpy/add_newdocs.py | 4 +- numpy/core/src/multiarray/arrayobject.c | 2 +- numpy/core/src/multiarray/multiarraymodule.c | 21 ++++- numpy/core/src/multiarray/na_mask.c | 93 ++++++++++++-------- numpy/core/src/multiarray/na_mask.h | 10 +++ numpy/core/src/private/lowlevel_strided_loops.h | 24 ++++++ numpy/core/src/umath/ufunc_object.c | 109 +++++++++++++++++------- numpy/core/tests/test_maskna.py | 9 ++ numpy/core/tests/test_nditer.py | 19 +++++ 9 files changed, 219 insertions(+), 72 deletions(-) (limited to 'numpy') diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py index 65c9646eb..536cde9d1 100644 --- a/numpy/add_newdocs.py +++ b/numpy/add_newdocs.py @@ -1278,7 +1278,7 @@ add_newdoc('numpy.core.multiarray','correlate', add_newdoc('numpy.core.multiarray', 'arange', """ - arange([start,] stop[, step,], dtype=None) + arange([start,] stop[, step,], dtype=None, maskna=False) Return evenly spaced values within a given interval. @@ -1307,6 +1307,8 @@ add_newdoc('numpy.core.multiarray', 'arange', dtype : dtype The type of the output array. If `dtype` is not given, infer the data type from the other input arguments. + maskna : boolean + If this is true, the returned array will have an NA mask. Returns ------- diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c index f124e5ef0..5215a99d8 100644 --- a/numpy/core/src/multiarray/arrayobject.c +++ b/numpy/core/src/multiarray/arrayobject.c @@ -367,7 +367,7 @@ array_dealloc(PyArrayObject *self) /* If the array has an NA mask, free its associated data */ if (fa->flags & NPY_ARRAY_MASKNA) { - Py_DECREF(fa->maskna_dtype); + Py_XDECREF(fa->maskna_dtype); if (fa->flags & NPY_ARRAY_OWNMASKNA) { PyDataMem_FREE(fa->maskna_data); } diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index d2462ee5b..c7e0cfa9d 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -2574,17 +2574,30 @@ array_correlate2(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) static PyObject * array_arange(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws) { PyObject *o_start = NULL, *o_stop = NULL, *o_step = NULL, *range=NULL; - static char *kwd[]= {"start", "stop", "step", "dtype", NULL}; + static char *kwd[]= {"start", "stop", "step", "dtype", "maskna", NULL}; PyArray_Descr *typecode = NULL; + int maskna = 0; - if(!PyArg_ParseTupleAndKeywords(args, kws, "O|OOO&", kwd, - &o_start, &o_stop, &o_step, - PyArray_DescrConverter2, &typecode)) { + if(!PyArg_ParseTupleAndKeywords(args, kws, "O|OOO&i", kwd, + &o_start, + &o_stop, + &o_step, + PyArray_DescrConverter2, &typecode, + &maskna)) { Py_XDECREF(typecode); return NULL; } range = PyArray_ArangeObj(o_start, o_stop, o_step, typecode); Py_XDECREF(typecode); + + /* Allocate an NA mask if requested */ + if (maskna) { + if (PyArray_AllocateMaskNA((PyArrayObject *)range, 1, 0, 1) < 0) { + Py_DECREF(range); + return NULL; + } + } + return range; } diff --git a/numpy/core/src/multiarray/na_mask.c b/numpy/core/src/multiarray/na_mask.c index a5ad489de..ef80c310c 100644 --- a/numpy/core/src/multiarray/na_mask.c +++ b/numpy/core/src/multiarray/na_mask.c @@ -64,8 +64,7 @@ PyArray_ContainsNA(PyArrayObject *arr) } /* Do the iteration */ - memset(coord, 0, ndim * sizeof(npy_intp)); - do { + NPY_RAW_ITER_START(idim, ndim, coord, shape, strides, data) { char *d = data; /* Process the innermost dimension */ for (i = 0; i < shape[0]; ++i, d += strides[0]) { @@ -73,19 +72,65 @@ PyArray_ContainsNA(PyArrayObject *arr) return 1; } } + } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, strides, data); + } - /* Increment to the next n-dimensional coordinate */ - for (idim = 1; idim < ndim; ++idim) { - if (++coord[idim] == shape[idim]) { - coord[idim] = 0; - data -= (shape[idim] - 1) * strides[idim]; - } - else { - data += strides[idim]; - break; - } + return 0; +} + +/* + * Assigns the mask value to all the NA mask elements of + * the array. + * + * Returns 0 on success, -1 on failure. + */ +NPY_NO_EXPORT int +PyArray_AssignMaskNA(PyArrayObject *arr, npy_mask maskvalue) +{ + int idim, ndim; + char *data; + npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS]; + npy_intp i, coord[NPY_MAXDIMS]; + + /* Need NA support to fill the NA mask */ + if (!PyArray_HASMASKNA(arr)) { + PyErr_SetString(PyExc_ValueError, + "Cannot fill the NA mask of an array which has no NA mask"); + return -1; + } + + if (PyArray_HASFIELDS(arr)) { + /* TODO: need to add field-NA support */ + PyErr_SetString(PyExc_ValueError, + "field-NA support is not implemented yet"); + return -1; + } + + /* Use raw iteration with no heap memory allocation */ + if (PyArray_PrepareOneRawArrayIter( + PyArray_NDIM(arr), PyArray_MASKNA_DATA(arr), + PyArray_DIMS(arr), PyArray_MASKNA_STRIDES(arr), + &ndim, &data, shape, strides) < 0) { + PyErr_Clear(); + return 1; + } + + /* Special case contiguous inner stride */ + if (strides[0] == 1) { + NPY_RAW_ITER_START(idim, ndim, coord, shape, strides, data) { + /* Process the innermost dimension */ + memset(data, maskvalue, shape[0]); + } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, strides, data); + } + /* General inner stride */ + else { + NPY_RAW_ITER_START(idim, ndim, coord, shape, strides, data) { + char *d = data; + /* Process the innermost dimension */ + for (i = 0; i < shape[0]; ++i, d += strides[0]) { + *(npy_mask *)d = maskvalue; } - } while (i < ndim); + } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, strides, data); } return 0; @@ -243,8 +288,6 @@ PyArray_AssignNA(PyArrayObject *arr, NpyNA *na) { NpyNA_fields *fna = (NpyNA_fields *)na; char maskvalue; - PyArray_Descr *maskdtype; - npy_intp strides[NPY_MAXDIMS]; if (!PyArray_HASMASKNA(arr)) { PyErr_SetString(PyExc_ValueError, @@ -269,25 +312,7 @@ PyArray_AssignNA(PyArrayObject *arr, NpyNA *na) maskvalue = (char)NpyMaskValue_Create(0, fna->payload); } - /* Copy the mask value to arr's mask */ - maskdtype = PyArray_DescrFromType(maskvalue == 0 ? NPY_BOOL - : NPY_MASK); - if (maskdtype == NULL) { - return -1; - } - memset(strides, 0, PyArray_NDIM(arr) * sizeof(npy_intp)); - if (PyArray_CastRawNDimArrays(PyArray_NDIM(arr), - PyArray_DIMS(arr), - &maskvalue, PyArray_MASKNA_DATA(arr), - strides, PyArray_MASKNA_STRIDES(arr), - maskdtype, PyArray_MASKNA_DTYPE(arr), - 0) < 0) { - Py_DECREF(maskdtype); - return -1; - } - - Py_DECREF(maskdtype); - return 0; + return PyArray_AssignMaskNA(arr, maskvalue); } /* diff --git a/numpy/core/src/multiarray/na_mask.h b/numpy/core/src/multiarray/na_mask.h index 9cc42b379..22944b781 100644 --- a/numpy/core/src/multiarray/na_mask.h +++ b/numpy/core/src/multiarray/na_mask.h @@ -9,6 +9,16 @@ NPY_NO_EXPORT int PyArray_AssignNA(PyArrayObject *arr, NpyNA *na); +/* + * Assigns the mask value to all the NA mask elements of + * the array. + * + * Returns 0 on success, -1 on failure. + */ +NPY_NO_EXPORT int +PyArray_AssignMaskNA(PyArrayObject *arr, npy_mask maskvalue); + + /* * A ufunc-like function, which returns a boolean or an array * of booleans indicating which values are NA. diff --git a/numpy/core/src/private/lowlevel_strided_loops.h b/numpy/core/src/private/lowlevel_strided_loops.h index e9a1dd40d..711239f35 100644 --- a/numpy/core/src/private/lowlevel_strided_loops.h +++ b/numpy/core/src/private/lowlevel_strided_loops.h @@ -327,6 +327,10 @@ PyArray_TransferMaskedStridedToNDim(npy_intp ndim, * not be stored as a PyArrayObject. For example, to iterate over * the NA mask of an array. * + * You can use this together with NPY_RAW_ITER_START and + * NPY_RAW_ITER_ONE_NEXT to handle the looping boilerplate of everything + * but the innermost loop (which is for idim == 0). + * * Returns 0 on success, -1 on failure. */ NPY_NO_EXPORT int @@ -335,6 +339,26 @@ PyArray_PrepareOneRawArrayIter(int ndim, char *data, int *out_ndim, char **out_data, npy_intp *out_shape, npy_intp *out_strides); +/* Start raw iteration */ +#define NPY_RAW_ITER_START(idim, ndim, coord, shape, strides, data) \ + memset((coord), 0, (ndim) * sizeof(coord[0])); \ + do { + +/* Increment to the next n-dimensional coordinate for one raw array */ +#define NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, strides, data) \ + for ((idim) = 1; (idim) < (ndim); ++(idim)) { \ + if (++(coord)[idim] == (shape)[idim]) { \ + (coord)[idim] = 0; \ + (data) -= ((shape)[idim] - 1) * (strides)[idim]; \ + } \ + else { \ + (data) += (strides)[idim]; \ + break; \ + } \ + } \ + } while ((idim) < (ndim)); \ + + /* * TRIVIAL ITERATION diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 131671cb7..643394d6a 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -2450,7 +2450,8 @@ get_binary_op_function(PyUFuncObject *self, int *otype, static PyObject * PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out, - int axis, int otype, int operation, char *opname) + int axis, int otype, int skipna, + int operation, char *opname) { PyArrayObject *op[2]; PyArray_Descr *op_dtypes[2] = {NULL, NULL}; @@ -2458,7 +2459,7 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr, int *op_axes[2] = {op_axes_arrays[0], op_axes_arrays[1]}; npy_uint32 op_flags[2]; int i, idim, ndim, otype_final; - int needs_api, need_outer_iterator; + int needs_api, need_outer_iterator, use_maskna = 0; NpyIter *iter = NULL, *iter_inner = NULL; @@ -2482,6 +2483,19 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr, printf("\n"); #endif + use_maskna = PyArray_HASMASKNA(arr); + if (use_maskna) { + if (operation == UFUNC_ACCUMULATE) { + PyErr_SetString(PyExc_RuntimeError, + "ufunc accumulate doesn't support NA masked arrays yet"); + return NULL; + } + } + /* If there's no NA mask, there are no NAs to skip */ + else { + skipna = 0; + } + if (PyUFunc_GetPyValues(opname, &buffersize, &errormask, &errobj) < 0) { return NULL; } @@ -2554,18 +2568,23 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr, } /* The per-operand flags for the outer loop */ - op_flags[0] = NPY_ITER_READWRITE| - NPY_ITER_NO_BROADCAST| - NPY_ITER_ALLOCATE| + op_flags[0] = NPY_ITER_READWRITE | + NPY_ITER_NO_BROADCAST | + NPY_ITER_ALLOCATE | NPY_ITER_NO_SUBTYPE; op_flags[1] = NPY_ITER_READONLY; + if (use_maskna) { + op_flags[0] |= NPY_ITER_USE_MASKNA; + op_flags[1] |= NPY_ITER_USE_MASKNA; + } + op[0] = out; op[1] = arr; need_outer_iterator = (ndim > 1); if (operation == UFUNC_ACCUMULATE) { - /* This is because we can't buffer, so must do UPDATEIFCOPY */ + /* We can't buffer, so must do UPDATEIFCOPY */ if (!PyArray_ISALIGNED(arr) || (out && !PyArray_ISALIGNED(out)) || !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(arr)) || (out && @@ -2655,11 +2674,17 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr, if (out == NULL) { goto fail; } + + if (use_maskna) { + if (PyArray_AllocateMaskNA(out, 1, 0, 1) < 0) { + goto fail; + } + } } } /* - * If the reduction unit has size zero, either return the reduction + * If the reduction axis has size zero, either return the reduction * unit for UFUNC_REDUCE, or return the zero-sized output array * for UFUNC_ACCUMULATE. */ @@ -2709,14 +2734,20 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr, op_flags[1] = NPY_ITER_READONLY| NPY_ITER_ALIGNED; + if (use_maskna) { + op_flags[0] |= NPY_ITER_USE_MASKNA; + op_flags[1] |= NPY_ITER_USE_MASKNA; + } + op_axes[0][0] = -1; op_axes[1][0] = axis; - iter_inner = NpyIter_AdvancedNew(2, op, NPY_ITER_EXTERNAL_LOOP| - NPY_ITER_BUFFERED| - NPY_ITER_DELAY_BUFALLOC| - NPY_ITER_GROWINNER| - NPY_ITER_REDUCE_OK| + iter_inner = NpyIter_AdvancedNew(2, op, + NPY_ITER_EXTERNAL_LOOP | + NPY_ITER_BUFFERED | + NPY_ITER_DELAY_BUFALLOC | + NPY_ITER_GROWINNER | + NPY_ITER_REDUCE_OK | NPY_ITER_REFS_OK, NPY_CORDER, NPY_UNSAFE_CASTING, op_flags, op_dtypes, @@ -3061,18 +3092,18 @@ fail: */ static PyObject * PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out, - int axis, int otype) + int axis, int otype, int skipna) { - return PyUFunc_ReductionOp(self, arr, out, axis, otype, + return PyUFunc_ReductionOp(self, arr, out, axis, otype, skipna, UFUNC_REDUCE, "reduce"); } static PyObject * PyUFunc_Accumulate(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out, - int axis, int otype) + int axis, int otype, int skipna) { - return PyUFunc_ReductionOp(self, arr, out, axis, otype, + return PyUFunc_ReductionOp(self, arr, out, axis, otype, skipna, UFUNC_ACCUMULATE, "accumulate"); } @@ -3097,7 +3128,7 @@ PyUFunc_Accumulate(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out, */ static PyObject * PyUFunc_Reduceat(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *ind, - PyArrayObject *out, int axis, int otype) + PyArrayObject *out, int axis, int otype, int skipna) { PyArrayObject *op[3]; PyArray_Descr *op_dtypes[3] = {NULL, NULL, NULL}; @@ -3124,6 +3155,12 @@ PyUFunc_Reduceat(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *ind, int buffersize = 0, errormask = 0; PyObject *errobj = NULL; + if (PyArray_HASMASKNA(arr)) { + PyErr_SetString(PyExc_RuntimeError, + "ufunc reduceat doesn't support NA masked arrays yet"); + return NULL; + } + NPY_BEGIN_THREADS_DEF; reduceat_ind = (npy_intp *)PyArray_DATA(ind); @@ -3468,8 +3505,10 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args, PyArrayObject *indices = NULL; PyArray_Descr *otype = NULL; PyArrayObject *out = NULL; - static char *kwlist1[] = {"array", "axis", "dtype", "out", NULL}; - static char *kwlist2[] = {"array", "indices", "axis", "dtype", "out", NULL}; + int skipna = 0; + static char *kwlist1[] = {"array", "axis", "dtype", "out", "skipna", NULL}; + static char *kwlist2[] = {"array", "indices", "axis", + "dtype", "out", "skipna", NULL}; static char *_reduce_type[] = {"reduce", "accumulate", "reduceat", NULL}; if (self == NULL) { @@ -3498,10 +3537,13 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args, if (operation == UFUNC_REDUCEAT) { PyArray_Descr *indtype; indtype = PyArray_DescrFromType(PyArray_INTP); - if(!PyArg_ParseTupleAndKeywords(args, kwds, "OO|iO&O&", kwlist2, - &op, &obj_ind, &axis, + if(!PyArg_ParseTupleAndKeywords(args, kwds, "OO|iO&O&i", kwlist2, + &op, + &obj_ind, + &axis, PyArray_DescrConverter2, &otype, - PyArray_OutputConverter, &out)) { + PyArray_OutputConverter, &out, + &skipna)) { Py_XDECREF(otype); return NULL; } @@ -3513,10 +3555,12 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args, } } else { - if(!PyArg_ParseTupleAndKeywords(args, kwds, "O|iO&O&", kwlist1, - &op, &axis, + if(!PyArg_ParseTupleAndKeywords(args, kwds, "O|iO&O&i", kwlist1, + &op, + &axis, PyArray_DescrConverter2, &otype, - PyArray_OutputConverter, &out)) { + PyArray_OutputConverter, &out, + &skipna)) { Py_XDECREF(otype); return NULL; } @@ -3528,7 +3572,8 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args, else { context = NULL; } - mp = (PyArrayObject *)PyArray_FromAny(op, NULL, 0, 0, 0, context); + mp = (PyArrayObject *)PyArray_FromAny(op, NULL, 0, 0, + NPY_ARRAY_ALLOWNA, context); Py_XDECREF(context); if (mp == NULL) { return NULL; @@ -3579,14 +3624,14 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args, && ((strcmp(self->name,"add") == 0) || (strcmp(self->name,"multiply") == 0))) { if (PyTypeNum_ISBOOL(typenum)) { - typenum = PyArray_LONG; + typenum = NPY_LONG; } else if ((size_t)PyArray_DESCR(mp)->elsize < sizeof(long)) { if (PyTypeNum_ISUNSIGNED(typenum)) { - typenum = PyArray_ULONG; + typenum = NPY_ULONG; } else { - typenum = PyArray_LONG; + typenum = NPY_LONG; } } } @@ -3597,15 +3642,15 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args, switch(operation) { case UFUNC_REDUCE: ret = (PyArrayObject *)PyUFunc_Reduce(self, mp, out, axis, - otype->type_num); + otype->type_num, skipna); break; case UFUNC_ACCUMULATE: ret = (PyArrayObject *)PyUFunc_Accumulate(self, mp, out, axis, - otype->type_num); + otype->type_num, skipna); break; case UFUNC_REDUCEAT: ret = (PyArrayObject *)PyUFunc_Reduceat(self, mp, indices, out, - axis, otype->type_num); + axis, otype->type_num, skipna); Py_DECREF(indices); break; } diff --git a/numpy/core/tests/test_maskna.py b/numpy/core/tests/test_maskna.py index 631a4c990..1d3899c7b 100644 --- a/numpy/core/tests/test_maskna.py +++ b/numpy/core/tests/test_maskna.py @@ -356,5 +356,14 @@ def test_array_maskna_view_array_assignment_1D(): # TODO: fancy indexing is next... +def test_ufunc_1D(): + a = np.arange(3, maskna=True) + b = np.arange(3) + + # An NA mask is produced if an operand has one + c = a + b + assert_(c.flags.maskna) + assert_equal(c, [0,2,4]) + if __name__ == "__main__": run_module_suite() diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py index 7020e3d21..cb562ba3f 100644 --- a/numpy/core/tests/test_nditer.py +++ b/numpy/core/tests/test_nditer.py @@ -2485,6 +2485,25 @@ def test_iter_maskna(): assert_equal(np.isna(a), [1,0,1]) assert_equal(a_orig, [1.5,3,5.5]) + # Copying NA values with buffering and external_loop + a_orig[...] = [1.5,2.5,3.5] + b_orig[...] = [4.5,5.5,6.5] + a[...] = [np.NA, np.NA, 5.5] + b[...] = [np.NA, 3.5, np.NA] + it = np.nditer([a,b], ['buffered','external_loop'], + [['writeonly','use_maskna'], + ['readonly','use_maskna']], + op_dtypes=['i4','i4'], + casting='unsafe') + for x, y in it: + assert_equal(x.size, 3) + x[...] = y + # The 3.5 in b gets truncated to 3, because the iterator is processing + # elements as int32 values. + assert_equal(a[1], 3) + assert_equal(np.isna(a), [1,0,1]) + assert_equal(a_orig, [1.5,3,5.5]) + # WRITEMASKED and MASKNA aren't supported together yet mask = np.array([1,1,0], dtype='?') assert_raises(ValueError, np.nditer, [a,b,mask], [], -- cgit v1.2.1