10 files changed, 509 insertions, 24 deletions
diff --git a/doc/neps/missing-data.rst b/doc/neps/missing-data.rst
index 038ea9370..e83bd2189 100644
--- a/doc/neps/missing-data.rst
+++ b/doc/neps/missing-data.rst
@@ -743,6 +743,38 @@ to be consistent with the result of np.sum([])::
     >>> np.sum([])
     0.0
 
+Boolean Indexing
+================
+
+Indexing using a boolean array containing NAs does not have a consistent
+interpretation according to the NA abstraction. For example::
+
+    >>> a = np.array([1, 2])
+    >>> mask = np.array([np.NA, True], maskna=True)
+    >>> a[mask]
+    What should happen here?
+
+Since the NA represents a valid but unknown value, and it is a boolean,
+it has two possible underlying values::
+
+    >>> a[np.array([True, True])]
+    array([1, 2])
+    >>> a[np.array([False, True])]
+    array([2])
+
+The thing which changes is the length of the output array, nothing which
+itself can be substituted for NA. For this reason, at least initially,
+NumPy will raise an exception for this case.
+
+Another possibility is to add an inconsistency, and follow the approach
+R uses. That is, to produce the following::
+
+    >>> a[mask]
+    array([NA, 2], maskna=True)
+
+If, in user testing, this is found necessary for pragmatic reasons,
+the feature should be added even though it is inconsistent.
+
 PEP 3118
 ========
 
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index fe086c997..62cc12233 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -1058,6 +1058,8 @@ typedef void (NpyIter_GetMultiIndexFunc)(NpyIter *iter,
 #define NPY_ITER_ARRAYMASK                  0x20000000
 /* Split this operand up into data and an NA mask */
 #define NPY_ITER_USE_MASKNA                 0x40000000
+/* Iterate over the data, even if it has an NA mask and without USE_MASKNA */
+#define NPY_ITER_IGNORE_MASKNA              0x80000000
 
 #define NPY_ITER_GLOBAL_FLAGS               0x0000ffff
 #define NPY_ITER_PER_OP_FLAGS               0xffff0000
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
index a14c7de36..5ffd93886 100644
--- a/numpy/core/src/multiarray/dtype_transfer.c
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -3873,6 +3873,76 @@ PyArray_CastRawArrays(npy_intp count,
 }
 
 /*
+ * Prepares shape and strides for a simple raw array iteration.
+ * This sorts the strides into FORTRAN order, reverses any negative
+ * strides, then coalesces axes where possible. The results are
+ * filled in the output parameters.
+ *
+ * This is intended for simple, lightweight iteration over arrays
+ * where no buffering of any kind is needed, and the array may
+ * not be stored as a PyArrayObject. For example, to iterate over
+ * the NA mask of an array.
+ *
+ * The arrays shape, out_shape, strides, and out_strides must all
+ * point to different data.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_PrepareOneRawArrayIter(int ndim, char *data,
+                            npy_intp *shape, npy_intp *strides,
+                            int *out_ndim, char **out_data,
+                            npy_intp *out_shape, npy_intp *out_strides)
+{
+    _npy_stride_sort_item strideperm[NPY_MAXDIMS];
+    int i, j;
+
+    /* Sort the axes based on the destination strides */
+    PyArray_CreateSortedStridePerm(ndim, strides, strideperm);
+    for (i = 0; i < ndim; ++i) {
+        int iperm = ndim - strideperm[i].perm - 1;
+        out_shape[i] = shape[iperm];
+        out_strides[i] = strides[iperm];
+    }
+
+    /* Reverse any negative strides */
+    for (i = 0; i < ndim; ++i) {
+        npy_intp stride_entry = out_strides[i], shape_entry = out_shape[i];
+
+        if (stride_entry < 0) {
+            data += stride_entry * (shape_entry - 1);
+            out_strides[i] = -stride_entry;
+        }
+    }
+
+    /* Coalesce any dimensions where possible */
+    i = 0;
+    for (j = 1; j < ndim; ++j) {
+        if (out_shape[i] == 1) {
+            /* Drop axis i */
+            out_shape[i] = out_shape[j];
+            out_strides[i] = out_strides[j];
+        }
+        else if (out_shape[j] == 1) {
+            /* Drop axis j */
+        }
+        else if (out_strides[i] * out_shape[i] == out_strides[j]) {
+            /* Coalesce axes i and j */
+            out_shape[i] *= out_shape[j];
+        }
+        else {
+            /* Can't coalesce, go to next i */
+            ++i;
+        }
+    }
+    ndim = i+1;
+
+    *out_data = data;
+    *out_ndim = ndim;
+    return 0;
+}
+
+/*
  * Casts the elements from one n-dimensional array to another n-dimensional
  * array with identical shape but possibly different strides and dtypes.
  * Does not account for overlap.
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 54de27e05..4ba7b7881 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -17,6 +17,7 @@
 #include "common.h"
 #include "ctors.h"
 #include "lowlevel_strided_loops.h"
+#include "item_selection.h"
 
 #define _check_axis PyArray_CheckAxis
 
@@ -1690,6 +1691,77 @@ PyArray_Compress(PyArrayObject *self, PyObject *condition, int axis,
     return ret;
 }
 
+/*
+ * Counts the number of True values in a raw boolean array. This
+ * is a low-overhead function which does no heap allocations.
+ *
+ * Returns -1 on error.
+ */
+NPY_NO_EXPORT npy_intp
+count_boolean_trues(int ndim, char *data, npy_intp *ashape, npy_intp *astrides)
+{
+    int idim;
+    npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
+    npy_intp i, coord[NPY_MAXDIMS];
+    npy_intp count = 0;
+
+    /* Use raw iteration with no heap memory allocation */
+    if (PyArray_PrepareOneRawArrayIter(
+                    ndim, data, ashape, astrides,
+                    &ndim, &data, shape, strides) < 0) {
+        return -1;
+    }
+
+    /* Do the iteration */
+    memset(coord, 0, ndim * sizeof(npy_intp));
+    /* Special case for contiguous inner loop */
+    if (strides[0] == 1) {
+        do {
+            char *d = data;
+            /* Process the innermost dimension */
+            for (i = 0; i < shape[0]; ++i, ++d) {
+                count += (*d != 0);
+            }
+
+            /* Increment to the next n-dimensional coordinate */
+            for (idim = 1; idim < ndim; ++idim) {
+                if (++coord[idim] == shape[idim]) {
+                    coord[idim] = 0;
+                    data -= (shape[idim] - 1) * strides[idim];
+                }
+                else {
+                    data += strides[idim];
+                    break;
+                }
+            }
+        } while (i < ndim);
+    }
+    /* General inner loop */
+    else {
+        do {
+            char *d = data;
+            /* Process the innermost dimension */
+            for (i = 0; i < shape[0]; ++i, d += strides[0]) {
+                count += (*d != 0);
+            }
+
+            /* Increment to the next n-dimensional coordinate */
+            for (idim = 1; idim < ndim; ++idim) {
+                if (++coord[idim] == shape[idim]) {
+                    coord[idim] = 0;
+                    data -= (shape[idim] - 1) * strides[idim];
+                }
+                else {
+                    data += strides[idim];
+                    break;
+                }
+            }
+        } while (i < ndim);
+    }
+
+    return count;
+}
+
 /*NUMPY_API
  * Counts the number of non-zero elements in the array
  *
@@ -1698,7 +1770,7 @@ PyArray_Compress(PyArrayObject *self, PyObject *condition, int axis,
 NPY_NO_EXPORT npy_intp
 PyArray_CountNonzero(PyArrayObject *self)
 {
-    PyArray_NonzeroFunc *nonzero = PyArray_DESCR(self)->f->nonzero;
+    PyArray_NonzeroFunc *nonzero;
     char *data;
     npy_intp stride, count;
     npy_intp nonzero_count = 0;
@@ -1708,6 +1780,14 @@ PyArray_CountNonzero(PyArrayObject *self)
     char **dataptr;
     npy_intp *strideptr, *innersizeptr;
 
+    /* Special low-overhead version specific to the boolean type */
+    if (PyArray_DESCR(self)->type_num == NPY_BOOL) {
+        return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),
+                        PyArray_DIMS(self), PyArray_STRIDES(self));
+    }
+
+    nonzero = PyArray_DESCR(self)->f->nonzero;
+
     /* If it's a trivial one-dimensional loop, don't use an iterator */
     if (PyArray_TRIVIALLY_ITERABLE(self)) {
         PyArray_PREPARE_TRIVIAL_ITERATION(self, count, data, stride);
@@ -1767,7 +1847,7 @@ PyArray_CountNonzero(PyArrayObject *self)
 
     NpyIter_Deallocate(iter);
 
-    return nonzero_count;
+    return PyErr_Occurred() ? -1 : nonzero_count;
 }
 
 /*NUMPY_API
diff --git a/numpy/core/src/multiarray/item_selection.h b/numpy/core/src/multiarray/item_selection.h
new file mode 100644
index 000000000..9023908ca
--- /dev/null
+++ b/numpy/core/src/multiarray/item_selection.h
@@ -0,0 +1,13 @@
+#ifndef _NPY_PRIVATE__ITEM_SELECTION_H_
+#define _NPY_PRIVATE__ITEM_SELECTION_H_
+
+/*
+ * Counts the number of True values in a raw boolean array. This
+ * is a low-overhead function which does no heap allocations.
+ *
+ * Returns -1 on error.
+ */
+NPY_NO_EXPORT npy_intp
+count_boolean_trues(int ndim, char *data, npy_intp *ashape, npy_intp *astrides);
+
+#endif
diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c
index 8371702f9..69988f553 100644
--- a/numpy/core/src/multiarray/iterators.c
+++ b/numpy/core/src/multiarray/iterators.c
@@ -397,14 +397,22 @@ NPY_NO_EXPORT PyObject *
 PyArray_IterNew(PyObject *obj)
 {
     PyArrayIterObject *it;
-    PyArrayObject *ao = (PyArrayObject *)obj;
+    PyArrayObject *ao;
 
-    if (!PyArray_Check(ao)) {
+    if (!PyArray_Check(obj)) {
         PyErr_BadInternalCall();
         return NULL;
     }
+    ao = (PyArrayObject *)obj;
+
+    if (PyArray_HASMASKNA(ao)) {
+        PyErr_SetString(PyExc_ValueError,
+                "Old-style NumPy iterators do not support NA masks, "
+                "use numpy.nditer instead");
+        return NULL;
+    }
 
-    it = (PyArrayIterObject *)_pya_malloc(sizeof(PyArrayIterObject));
+    it = (PyArrayIterObject *)PyArray_malloc(sizeof(PyArrayIterObject));
     PyObject_Init((PyObject *)it, &PyArrayIter_Type);
     /* it = PyObject_New(PyArrayIterObject, &PyArrayIter_Type);*/
     if (it == NULL) {
@@ -425,6 +433,13 @@ PyArray_BroadcastToShape(PyObject *obj, npy_intp *dims, int nd)
     int i, diff, j, compat, k;
     PyArrayObject *ao = (PyArrayObject *)obj;
 
+    if (PyArray_HASMASKNA(ao)) {
+        PyErr_SetString(PyExc_ValueError,
+                "Old-style NumPy iterators do not support NA masks, "
+                "use numpy.nditer instead");
+        return NULL;
+    }
+
     if (PyArray_NDIM(ao) > nd) {
         goto err;
     }
@@ -442,7 +457,7 @@ PyArray_BroadcastToShape(PyObject *obj, npy_intp *dims, int nd)
     if (!compat) {
         goto err;
     }
-    it = (PyArrayIterObject *)_pya_malloc(sizeof(PyArrayIterObject));
+    it = (PyArrayIterObject *)PyArray_malloc(sizeof(PyArrayIterObject));
     PyObject_Init((PyObject *)it, &PyArrayIter_Type);
 
     if (it == NULL) {
@@ -1506,7 +1521,7 @@ PyArray_MultiIterFromObjects(PyObject **mps, int n, int nadd, ...)
                      "array objects (inclusive).", NPY_MAXARGS);
         return NULL;
     }
-    multi = _pya_malloc(sizeof(PyArrayMultiIterObject));
+    multi = PyArray_malloc(sizeof(PyArrayMultiIterObject));
     if (multi == NULL) {
         return PyErr_NoMemory();
     }
@@ -1571,7 +1586,7 @@ PyArray_MultiIterNew(int n, ...)
 
     /* fprintf(stderr, "multi new...");*/
 
-    multi = _pya_malloc(sizeof(PyArrayMultiIterObject));
+    multi = PyArray_malloc(sizeof(PyArrayMultiIterObject));
     if (multi == NULL) {
         return PyErr_NoMemory();
     }
@@ -1634,7 +1649,7 @@ arraymultiter_new(PyTypeObject *NPY_UNUSED(subtype), PyObject *args, PyObject *k
         return NULL;
     }
 
-    multi = _pya_malloc(sizeof(PyArrayMultiIterObject));
+    multi = PyArray_malloc(sizeof(PyArrayMultiIterObject));
     if (multi == NULL) {
         return PyErr_NoMemory();
     }
@@ -2031,7 +2046,7 @@ PyArray_NeighborhoodIterNew(PyArrayIterObject *x, npy_intp *bounds,
     int i;
     PyArrayNeighborhoodIterObject *ret;
 
-    ret = _pya_malloc(sizeof(*ret));
+    ret = PyArray_malloc(sizeof(*ret));
     if (ret == NULL) {
         return NULL;
     }
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 396f0a9e4..10e3f82b2 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -16,6 +16,8 @@
 #include "iterators.h"
 #include "mapping.h"
 #include "na_singleton.h"
+#include "lowlevel_strided_loops.h"
+#include "item_selection.h"
 
 #define SOBJ_NOTFANCY 0
 #define SOBJ_ISFANCY 1
@@ -496,7 +498,6 @@ static int
 fancy_indexing_check(PyObject *args)
 {
     int i, n;
-    PyObject *obj;
     int retval = SOBJ_NOTFANCY;
 
     if (PyTuple_Check(args)) {
@@ -505,10 +506,11 @@ fancy_indexing_check(PyObject *args)
             return SOBJ_TOOMANY;
         }
         for (i = 0; i < n; i++) {
-            obj = PyTuple_GET_ITEM(args,i);
+            PyObject *obj = PyTuple_GET_ITEM(args,i);
             if (PyArray_Check(obj)) {
-                if (PyArray_ISINTEGER((PyArrayObject *)obj) ||
-                    PyArray_ISBOOL((PyArrayObject *)obj)) {
+                int type_num = PyArray_DESCR((PyArrayObject *)obj)->type_num;
+                if (PyTypeNum_ISINTEGER(type_num) ||
+                                        PyTypeNum_ISBOOL(type_num)) {
                     retval = SOBJ_ISFANCY;
                 }
                 else {
@@ -522,8 +524,8 @@ fancy_indexing_check(PyObject *args)
         }
     }
     else if (PyArray_Check(args)) {
-        if ((PyArray_TYPE((PyArrayObject *)args)==NPY_BOOL) ||
-            (PyArray_ISINTEGER((PyArrayObject *)args))) {
+        int type_num = PyArray_DESCR((PyArrayObject *)args)->type_num;
+        if (PyTypeNum_ISINTEGER(type_num) || PyTypeNum_ISBOOL(type_num)) {
             return SOBJ_ISFANCY;
         }
         else {
@@ -543,13 +545,14 @@ fancy_indexing_check(PyObject *args)
             return SOBJ_ISFANCY;
         }
         for (i = 0; i < n; i++) {
-            obj = PySequence_GetItem(args, i);
+            PyObject *obj = PySequence_GetItem(args, i);
             if (obj == NULL) {
                 return SOBJ_ISFANCY;
             }
             if (PyArray_Check(obj)) {
-                if (PyArray_ISINTEGER((PyArrayObject *)obj) ||
-                                    PyArray_ISBOOL((PyArrayObject *)obj)) {
+                int type_num = PyArray_DESCR((PyArrayObject *)obj)->type_num;
+                if (PyTypeNum_ISINTEGER(type_num) ||
+                                            PyTypeNum_ISBOOL(type_num)) {
                     retval = SOBJ_LISTTUP;
                 }
                 else {
@@ -560,7 +563,7 @@ fancy_indexing_check(PyObject *args)
                 retval = SOBJ_LISTTUP;
             }
             else if (PySlice_Check(obj) || obj == Py_Ellipsis ||
-                    obj == Py_None) {
+                                                    obj == Py_None) {
                 retval = SOBJ_NOTFANCY;
             }
             Py_DECREF(obj);
@@ -672,6 +675,211 @@ array_subscript_simple(PyArrayObject *self, PyObject *op)
     return (PyObject *)ret;
 }
 
+/*
+ * Implements boolean indexing. This produces a one-dimensional
+ * array which picks out all of the elements of 'self' for which
+ * the corresponding element of 'op' is True.
+ *
+ * This operation is somewhat unfortunate, because to produce
+ * a one-dimensional output array, it has to choose a particular
+ * iteration order, in the case of NumPy that is always C order even
+ * though this function allows different choices.
+ */
+NPY_NO_EXPORT PyArrayObject *
+array_boolean_subscript(PyArrayObject *self, PyArrayObject *bmask, NPY_ORDER order)
+{
+    npy_intp size, itemsize;
+    char *ret_data, *ret_maskna_data = NULL;
+    PyArray_Descr *dtype;
+    PyArrayObject *ret;
+    int self_has_maskna = PyArray_HASMASKNA(self), needs_api = 0;
+
+    if (PyArray_DESCR(bmask)->type_num != NPY_BOOL) {
+        PyErr_SetString(PyExc_TypeError,
+                "NumPy boolean array indexing requires a boolean index");
+        return NULL;
+    }
+
+    /* See the Boolean Indexing section of the missing data NEP */
+    if (PyArray_ContainsNA(bmask)) {
+        PyErr_SetString(PyExc_ValueError,
+                "The boolean mask array may not contain any NA values");
+        return NULL;
+    }
+
+    /*
+     * Since we've checked that the mask contains no NAs, we
+     * can do a straightforward count of the boolean True values
+     * in the raw mask data array.
+     */
+    size = count_boolean_trues(PyArray_NDIM(bmask), PyArray_DATA(bmask),
+                                PyArray_DIMS(bmask), PyArray_STRIDES(bmask));
+    /* Correction factor for broadcasting 'bmask' to 'self' */
+    size *= PyArray_SIZE(self) / PyArray_SIZE(bmask);
+
+    /* Allocate the output of the boolean indexing */
+    dtype = PyArray_DESCR(self);
+    Py_INCREF(dtype);
+    ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(self), dtype, 1, &size,
+                                NULL, NULL, 0, (PyObject *)self);
+    if (ret == NULL) {
+        return NULL;
+    }
+
+    /* Allocate an NA mask for ret if required */
+    if (self_has_maskna) {
+        if (PyArray_AllocateMaskNA(ret, 1, 0) < 0) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+        ret_maskna_data = PyArray_MASKNA_DATA(ret);
+    }
+
+    itemsize = dtype->elsize;
+    ret_data = PyArray_DATA(ret);
+
+    /* Create an iterator for the data */
+    if (size > 0) {
+        NpyIter *iter;
+        PyArrayObject *op[2] = {self, bmask};
+        npy_uint32 flags, op_flags[2];
+        npy_intp fixed_strides[3];
+        PyArray_StridedTransferFn *stransfer = NULL;
+        NpyAuxData *transferdata = NULL;
+
+        NpyIter_IterNextFunc *iternext;
+        npy_intp innersize, *innerstrides;
+        char **dataptrs;
+
+        /* Set up the iterator */
+        flags = NPY_ITER_EXTERNAL_LOOP;
+        if (self_has_maskna) {
+            op_flags[0] = NPY_ITER_READONLY | NPY_ITER_USE_MASKNA;
+        }
+        else {
+            op_flags[0] = NPY_ITER_READONLY;
+        }
+        /*
+         * Since we already checked PyArray_ContainsNA(bmask), can
+         * ignore any MASKNA of bmask.
+         */
+        op_flags[1] = NPY_ITER_READONLY | NPY_ITER_IGNORE_MASKNA;
+
+        iter = NpyIter_MultiNew(2, op, flags, order, NPY_NO_CASTING,
+                                op_flags, NULL);
+        if (iter == NULL) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+
+        /* Get a dtype transfer function */
+        NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+        if (PyArray_GetDTypeTransferFunction(PyArray_ISALIGNED(self),
+                        fixed_strides[0], itemsize,
+                        dtype, dtype,
+                        0,
+                        &stransfer, &transferdata,
+                        &needs_api) != NPY_SUCCEED) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+
+        /* Get the values needed for the inner loop */
+        iternext = NpyIter_GetIterNext(iter, NULL);
+        if (iternext == NULL) {
+            Py_DECREF(ret);
+            NPY_AUXDATA_FREE(transferdata);
+            return NULL;
+        }
+        innerstrides = NpyIter_GetInnerStrideArray(iter);
+        dataptrs = NpyIter_GetDataPtrArray(iter);
+
+        /* Regular inner loop */
+        if (!self_has_maskna) {
+            npy_intp self_stride = innerstrides[0];
+            npy_intp bmask_stride = innerstrides[1];
+            npy_intp subloopsize;
+            do {
+                innersize = *NpyIter_GetInnerLoopSizePtr(iter);
+                char *self_data = dataptrs[0];
+                char *bmask_data = dataptrs[1];
+
+                while (innersize > 0) {
+                    /* Skip masked values */
+                    subloopsize = 0;
+                    while (subloopsize < innersize && *bmask_data == 0) {
+                        ++subloopsize;
+                        bmask_data += bmask_stride;
+                    }
+                    innersize -= subloopsize;
+                    self_data += subloopsize * self_stride;
+                    /* Process unmasked values */
+                    subloopsize = 0;
+                    while (subloopsize < innersize && *bmask_data != 0) {
+                        ++subloopsize;
+                        bmask_data += bmask_stride;
+                    }
+                    stransfer(ret_data, itemsize, self_data, self_stride,
+                                subloopsize, itemsize, transferdata);
+                    innersize -= subloopsize;
+                    self_data += subloopsize * self_stride;
+                }
+            } while (iternext(iter));
+        }
+        /* NA masked inner loop */
+        else {
+            npy_intp i;
+            npy_intp self_stride = innerstrides[0];
+            npy_intp bmask_stride = innerstrides[1];
+            npy_intp maskna_stride = innerstrides[2];
+            npy_intp subloopsize;
+            do {
+                innersize = *NpyIter_GetInnerLoopSizePtr(iter);
+                char *self_data = dataptrs[0];
+                char *bmask_data = dataptrs[1];
+                char *maskna_data = dataptrs[2];
+
+                while (innersize > 0) {
+                    /* Skip masked values */
+                    subloopsize = 0;
+                    while (subloopsize < innersize && *bmask_data == 0) {
+                        ++subloopsize;
+                        bmask_data += bmask_stride;
+                    }
+                    innersize -= subloopsize;
+                    self_data += subloopsize * self_stride;
+                    /* Process unmasked values */
+                    subloopsize = 0;
+                    while (subloopsize < innersize && *bmask_data != 0) {
+                        ++subloopsize;
+                        bmask_data += bmask_stride;
+                    }
+                    /*
+                     * Because it's a newly allocated array, we
+                     * don't have to be careful about not overwriting
+                     * NA masked values. If 'ret' were an output parameter,
+                     * we would have to avoid that.
+                     */
+                    stransfer(ret_data, itemsize, self_data, self_stride,
+                                subloopsize, itemsize, transferdata);
+                    /* Copy the mask */
+                    for (i = 0; i < subloopsize; ++i) {
+                        *ret_maskna_data++ = *maskna_data;
+                        maskna_data += maskna_stride;
+                    }
+                    innersize -= subloopsize;
+                    self_data += subloopsize * self_stride;
+                }
+            } while (iternext(iter));
+        }
+
+        NpyIter_Deallocate(iter);
+        NPY_AUXDATA_FREE(transferdata);
+    }
+
+    return ret;
+}
+
 NPY_NO_EXPORT PyObject *
 array_subscript(PyArrayObject *self, PyObject *op)
 {
diff --git a/numpy/core/src/multiarray/na_mask.c b/numpy/core/src/multiarray/na_mask.c
index 92eab30e5..b7af2c824 100644
--- a/numpy/core/src/multiarray/na_mask.c
+++ b/numpy/core/src/multiarray/na_mask.c
@@ -43,11 +43,50 @@ NPY_NO_EXPORT npy_bool
 PyArray_ContainsNA(PyArrayObject *arr)
 {
     /* Need NA support to contain NA */
-    if (!PyArray_HasNASupport(arr)) {
-        return 0;
-    }
+    if (PyArray_HASMASKNA(arr)) {
+        int idim, ndim;
+        char *data;
+        npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
+        npy_intp i, coord[NPY_MAXDIMS];
+
+        if (PyArray_HASFIELDS(arr)) {
+            /* TODO: need to add field-NA support */
+            return 1;
+        }
+
+        /* Use raw iteration with no heap memory allocation */
+        if (PyArray_PrepareOneRawArrayIter(
+                        PyArray_NDIM(arr), PyArray_MASKNA_DATA(arr),
+                        PyArray_DIMS(arr), PyArray_MASKNA_STRIDES(arr),
+                        &ndim, &data, shape, strides) < 0) {
+            PyErr_Clear();
+            return 1;
+        }
+
+        /* Do the iteration */
+        memset(coord, 0, ndim * sizeof(npy_intp));
+        do {
+            char *d = data;
+            /* Process the innermost dimension */
+            for (i = 0; i < shape[0]; ++i, d += strides[0]) {
+                if (!NpyMaskValue_IsExposed((npy_mask)(*d))) {
+                    return 1;
+                }
+            }
 
-    /* TODO: Loop through NA mask */
+            /* Increment to the next n-dimensional coordinate */
+            for (idim = 1; idim < ndim; ++idim) {
+                if (++coord[idim] == shape[idim]) {
+                    coord[idim] = 0;
+                    data -= (shape[idim] - 1) * strides[idim];
+                }
+                else {
+                    data += strides[idim];
+                    break;
+                }
+            }
+        } while (i < ndim);
+    }
 
     return 0;
 }
diff --git a/numpy/core/src/private/lowlevel_strided_loops.h b/numpy/core/src/private/lowlevel_strided_loops.h
index c606e58f3..e9a1dd40d 100644
--- a/numpy/core/src/private/lowlevel_strided_loops.h
+++ b/numpy/core/src/private/lowlevel_strided_loops.h
@@ -316,6 +316,25 @@ PyArray_TransferMaskedStridedToNDim(npy_intp ndim,
                 PyArray_MaskedStridedTransferFn *stransfer,
                 NpyAuxData *data);
 
+/*
+ * Prepares shape and strides for a simple raw array iteration.
+ * This sorts the strides into FORTRAN order, reverses any negative
+ * strides, then coalesces axes where possible. The results are
+ * filled in the output parameters.
+ *
+ * This is intended for simple, lightweight iteration over arrays
+ * where no buffering of any kind is needed, and the array may
+ * not be stored as a PyArrayObject. For example, to iterate over
+ * the NA mask of an array.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_PrepareOneRawArrayIter(int ndim, char *data,
+                            npy_intp *shape, npy_intp *strides,
+                            int *out_ndim, char **out_data,
+                            npy_intp *out_shape, npy_intp *out_strides);
+
 
 /*
  *            TRIVIAL ITERATION
diff --git a/numpy/core/tests/test_na.py b/numpy/core/tests/test_na.py
index 3b179b126..98ad67f27 100644
--- a/numpy/core/tests/test_na.py
+++ b/numpy/core/tests/test_na.py
@@ -225,6 +225,13 @@ def test_array_maskna_isna_1D():
     a[2:10:3] = np.NA
     assert_equal(np.isna(a), [0,0,1,1,0,1,1,0,1,0])
 
+    # Checking isna of a boolean mask index
+    mask = np.array([1,1,0,0,0,1,0,1,1,0], dtype='?')
+    assert_equal(np.isna(a[mask]), [0,0,1,0,1])
+    # Assigning NA to a boolean masked index
+    a[mask] = np.NA
+    assert_equal(np.isna(a), [1,1,1,1,0,1,1,1,1,0])
+
     # TODO: fancy indexing is next...