Merge branch 'pull-141'

* pull-141: (167 commits) ENH: missingdata: Make PyArray_Converter and PyArray_OutputConverter safer for legacy code DOC: missingdata: Add a mention of the design NEP, and masks vs bitpatterns DOC: missingdata: Updates from pull request feedback DOC: missingdata: Updates based on pull request feedback ENH: nditer: Change the Python nditer exposure to automatically add NPY_ITER_USE_MASKNA ENH: missingdata: Make comparisons with NA return NA(dtype='bool') BLD: core: onefile build fix and Python3 compatibility change DOC: Mention the update to np.all and np.any in the release notes TST: dtype: Adjust void dtype test to pass without raising a zero-size exception STY: Remove trailing whitespace TST: missingdata: Write some tests for the np.any and np.all NA behavior ENH: missingdata: Make numpy.all follow the NA && False == False rule ENH: missingdata: Make numpy.all follow the NA || True == True rule DOC: missingdata: Also show what assigning a non-NA value does in each case DOC: missingdata: Add introductory documentation for NA-masked arrays ENH: core: Rename PyArrayObject_fieldaccess to PyArrayObject_fields DOC: missingdata: Some tweaks to the NA mask documentation DOC: missingdata: Add example of a C-API function supporting NA masks DOC: missingdata: Documenting C API for NA-masked arrays ENH: missingdata: Finish adding C-API access to the NpyNA object ...
author: Charles Harris <charlesr.harris@gmail.com> 2011-08-27 21:46:08 -0600
committer: Charles Harris <charlesr.harris@gmail.com> 2011-08-27 21:46:08 -0600
commit: 9ecd91b7bf8c77d696ec9856ba10896d8f60309a (patch)
tree: 9884131ece5eada06212538c591965bf5928afa2 /numpy
parent: aa55ba7437fbe6b8772a360a641b5aa7d3e669e0 (diff)
parent: 10fac981763e87f949bed15c66127fc380fa9b27 (diff)
download: numpy-9ecd91b7bf8c77d696ec9856ba10896d8f60309a.tar.gz
105 files changed, 18213 insertions, 4849 deletions
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index 1a3f9e461..ce61c5f0e 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -197,6 +197,15 @@ add_newdoc('numpy.core', 'nditer',
           * "allocate" causes the array to be allocated if it is None
             in the `op` parameter.
           * "no_subtype" prevents an "allocate" operand from using a subtype.
+          * "arraymask" indicates that this operand is the mask to use
+            for selecting elements when writing to operands with the
+            'writemasked' flag set. The iterator does not enforce this,
+            but when writing from a buffer back to the array, it only
+            copies those elements indicated by this mask.
+          * 'writemasked' indicates that only elements where the chosen
+            'arraymask' operand is True will be written to.
+          * 'use_maskna' indicates that this operand should be treated
+            like an NA-masked array.
     op_dtypes : dtype or tuple of dtype(s), optional
         The required data type(s) of the operands. If copying or buffering
         is enabled, the data will be converted to/from their original types.
@@ -631,7 +640,7 @@ add_newdoc('numpy.core', 'broadcast', ('reset',
 
 add_newdoc('numpy.core.multiarray', 'array',
     """
-    array(object, dtype=None, copy=True, order=None, subok=False, ndmin=0)
+    array(object, dtype=None, copy=True, order=None, subok=False, ndmin=0, maskna=None, ownmaskna=False)
 
     Create an array.
 
@@ -667,6 +676,19 @@ add_newdoc('numpy.core.multiarray', 'array',
         Specifies the minimum number of dimensions that the resulting
         array should have.  Ones will be pre-pended to the shape as
         needed to meet this requirement.
+    maskna : bool or None, optional
+        If this is set to True, it forces the array to have an NA mask.
+        If the input is an array without a mask, this means a view with
+        an NA mask is created. If the input is an array with a mask, the
+        mask is preserved as-is.
+
+        If this is set to False, it forces the array to not have an NA
+        mask. If the input is an array with a mask, and has no NA values,
+        it will create a copy of the input without an NA mask.
+    ownmaskna : bool, optional
+        If this is set to True, forces the array to have a mask which
+        it owns. It may still return a view of the data from the input,
+        but the result will always own its own mask.
 
     Returns
     -------
@@ -736,6 +758,8 @@ add_newdoc('numpy.core.multiarray', 'empty',
     order : {'C', 'F'}, optional
         Whether to store multi-dimensional data in C (row-major) or
         Fortran (column-major) order in memory.
+    maskna : boolean
+        If this is true, the returned array will have an NA mask.
 
     See Also
     --------
@@ -884,6 +908,35 @@ add_newdoc('numpy.core.multiarray', 'zeros',
 
     """)
 
+add_newdoc('numpy.core.multiarray', 'isna',
+    """
+    isna(a)
+
+    Returns an array with True for each element of *a* that is NA.
+
+    Parameters
+    ----------
+    a : array_like
+        The array for which to check for NA.
+
+    Returns
+    -------
+    result : bool or array of bool
+        True where the corresponding element of *a* is NA, False otherwise.
+
+    Examples
+    --------
+    >>> np.isna(np.NA)
+    True
+    >>> np.isna(1.5)
+    False
+    >>> np.isna(np.nan)
+    False
+    >>> a = np.array([0, np.NA, 3.5, np.NA])
+    >>> np.isna(a)
+    array([False,  True, False,  True], dtype=bool)
+    """)
+
 add_newdoc('numpy.core.multiarray', 'count_nonzero',
     """
     count_nonzero(a)
@@ -894,10 +947,21 @@ add_newdoc('numpy.core.multiarray', 'count_nonzero',
     ----------
     a : array_like
         The array for which to count non-zeros.
+    axis : None or int or tuple of ints, optional
+        Axis or axes along which a reduction is performed.
+        The default (`axis` = None) is to perform a reduction over all
+        the dimensions of the input array.
+    skipna : bool, optional
+        If this is set to True, any NA elements in the array are skipped
+        instead of propagating.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
-    count : int
+    count : int or array of int
         Number of non-zero values in the array.
 
     See Also
@@ -908,9 +972,67 @@ add_newdoc('numpy.core.multiarray', 'count_nonzero',
     --------
     >>> np.count_nonzero(np.eye(4))
     4
-
     >>> np.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]])
     5
+    >>> np.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]], axis=1)
+    array([2, 3])
+    >>> np.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]], axis=1, keepdims=True)
+    array([[2],
+           [3]])
+    """)
+
+add_newdoc('numpy.core.multiarray', 'count_reduce_items',
+    """
+    count_reduce_items(arr, axis=None, skipna=False, keepdims=False)
+
+    Counts the number of items a reduction with the same `axis`
+    and `skipna` parameter values would use. The purpose of this
+    function is for the creation of reduction operations
+    which use the item count, such as :func:`mean`.
+
+    When `skipna` is False or `arr` doesn't have an NA mask,
+    the result is simply the product of the reduction axis
+    sizes, returned as a single scalar.
+
+    Parameters
+    ----------
+    arr : array_like
+        The array for which to count the reduce items.
+    axis : None or int or tuple of ints, optional
+        Axis or axes along which a reduction is performed.
+        The default (`axis` = None) is to perform a reduction over all
+        the dimensions of the input array.
+    skipna : bool, optional
+        If this is set to True, any NA elements in the array are not
+        counted. The only time this function does any actual counting
+        instead of a cheap multiply of a few sizes is when `skipna` is
+        true and `arr` has an NA mask.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `arr`.
+
+    Returns
+    -------
+    count : intp or array of intp
+        Number of items that would be used in a reduction with the
+        same `axis` and `skipna` parameter values.
+
+    Examples
+    --------
+    >>> a = np.array([[1,np.NA,1], [1,1,np.NA]])
+
+    >>> np.count_reduce_items(a)
+    6
+    >>> np.count_reduce_items(a, skipna=True)
+    4
+    >>> np.sum(a, skipna=True)
+    4
+
+    >>> np.count_reduce_items(a, axis=0, skipna=True)
+    array([2, 1, 1])
+    >>> np.sum(a, axis=0, skipna=True)
+    array([2, 1, 1])
     """)
 
 add_newdoc('numpy.core.multiarray','set_typeDict',
@@ -1276,7 +1398,7 @@ add_newdoc('numpy.core.multiarray','correlate',
 
 add_newdoc('numpy.core.multiarray', 'arange',
     """
-    arange([start,] stop[, step,], dtype=None)
+    arange([start,] stop[, step,], dtype=None, maskna=False)
 
     Return evenly spaced values within a given interval.
 
@@ -1305,6 +1427,8 @@ add_newdoc('numpy.core.multiarray', 'arange',
     dtype : dtype
         The type of the output array.  If `dtype` is not given, infer the data
         type from the other input arguments.
+    maskna : boolean
+        If this is true, the returned array will have an NA mask.
 
     Returns
     -------
@@ -3174,17 +3298,21 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('conjugate',
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('copy',
     """
-    a.copy(order='C')
+    a.copy(order='C', maskna=None)
 
     Return a copy of the array.
 
     Parameters
     ----------
-    order : {'C', 'F', 'A'}, optional
-        By default, the result is stored in C-contiguous (row-major) order in
-        memory.  If `order` is `F`, the result has 'Fortran' (column-major)
-        order.  If order is 'A' ('Any'), then the result has the same order
-        as the input.
+    order : {'C', 'F', 'A', 'K'}, optional
+        Controls the memory layout of the copy. 'C' means C-order,
+        'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous,
+        'C' otherwise. 'K' means match the layout of `a` as closely
+        as possible.
+    maskna : bool, optional
+        If specified, forces the copy to have or to not have an
+        NA mask. This is a way to remove an NA mask from an array
+        while making a copy.
 
     See also
     --------
@@ -3248,7 +3376,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('diagonal',
 
     Return specified diagonals.
 
-    Refer to `numpy.diagonal` for full documentation.
+    Refer to :func:`numpy.diagonal` for full documentation.
 
     See Also
     --------
@@ -3698,7 +3826,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('put',
 
 add_newdoc('numpy.core.multiarray', 'copyto',
     """
-    copyto(dst, src, casting='same_kind', where=None)
+    copyto(dst, src, casting='same_kind', where=None, preservena=False)
 
     Copies values from `src` into `dst`, broadcasting as necessary.
     Raises a TypeError if the casting rule is violated, and if
@@ -3721,10 +3849,13 @@ add_newdoc('numpy.core.multiarray', 'copyto',
           * 'same_kind' means only safe casts or casts within a kind,
             like float64 to float32, are allowed.
           * 'unsafe' means any data conversions may be done.
-    where : array_like of bool
+    where : array_like of bool, optional
         A boolean array which is broadcasted to match the dimensions
         of `dst`, and selects elements to copy from `src` to `dst`
         wherever it contains the value True.
+    preservena : bool, optional
+        If set to True, leaves any NA values in `dst` untouched. This
+        is similar to the "hard mask" feature in numpy.ma.
 
     """)
 
@@ -4118,7 +4249,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('sort',
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('squeeze',
     """
-    a.squeeze()
+    a.squeeze(axis=None)
 
     Remove single-dimensional entries from the shape of `a`.
 
@@ -5273,7 +5404,7 @@ add_newdoc('numpy.core', 'ufunc', ('types',
 
 add_newdoc('numpy.core', 'ufunc', ('reduce',
     """
-    reduce(a, axis=0, dtype=None, out=None)
+    reduce(a, axis=0, dtype=None, out=None, skipna=False, keepdims=False)
 
     Reduces `a`'s dimension by one, by applying ufunc along one axis.
 
@@ -5295,8 +5426,22 @@ add_newdoc('numpy.core', 'ufunc', ('reduce',
     ----------
     a : array_like
         The array to act on.
-    axis : int, optional
-        The axis along which to apply the reduction.
+    axis : None or int or tuple of ints, optional
+        Axis or axes along which a reduction is performed.
+        The default (`axis` = 0) is to perform a reduction over the first
+        dimension of the input array. `axis` may be negative, in
+        which case it counts from the last to the first axis.
+
+        .. versionadded:: 1.7.0
+
+        If this is `None`, a reduction is performed over all the axes.
+        If this is a tuple of ints, a reduction is performed on multiple
+        axes, instead of a single axis or all the axes as before.
+
+        For operations which are either not commutative or not associative,
+        doing a reduction over multiple axes is not well-defined. The
+        ufuncs do not currently raise an exception in this case, but will
+        likely do so in the future.
     dtype : data-type code, optional
         The type used to represent the intermediate results. Defaults
         to the data-type of the output array if this is provided, or
@@ -5304,6 +5449,15 @@ add_newdoc('numpy.core', 'ufunc', ('reduce',
     out : ndarray, optional
         A location into which the result is stored. If not provided, a
         freshly-allocated array is returned.
+    skipna : bool, optional
+        If this is set to True, the reduction is done as if any NA elements
+        were not counted in the array. The default, False, causes the
+        NA values to propagate, so if any element in a set of elements
+        being reduced is NA, the result will be NA.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
diff --git a/numpy/core/SConscript b/numpy/core/SConscript
index 5bf62ea11..918dc5b46 100644
--- a/numpy/core/SConscript
+++ b/numpy/core/SConscript
@@ -381,6 +381,8 @@ arraytypes_src = env.GenerateFromTemplate(
     pjoin('src', 'multiarray', 'arraytypes.c.src'))
 nditer_src = env.GenerateFromTemplate(
     pjoin('src', 'multiarray', 'nditer_templ.c.src'))
+boolean_ops_src = env.GenerateFromTemplate(
+    pjoin('src', 'multiarray', 'boolean_ops.c.src'))
 lowlevel_strided_loops_src = env.GenerateFromTemplate(
     pjoin('src', 'multiarray', 'lowlevel_strided_loops.c.src'))
 einsum_src = env.GenerateFromTemplate(pjoin('src', 'multiarray', 'einsum.c.src'))
@@ -443,6 +445,10 @@ if ENABLE_SEPARATE_COMPILATION:
     multiarray_src = [pjoin('src', 'multiarray', 'multiarraymodule.c'),
         pjoin('src', 'multiarray', 'hashdescr.c'),
         pjoin('src', 'multiarray', 'arrayobject.c'),
+        pjoin('src', 'multiarray', 'array_assign.c'),
+        pjoin('src', 'multiarray', 'array_assign_scalar.c'),
+        pjoin('src', 'multiarray', 'array_assign_array.c'),
+        pjoin('src', 'multiarray', 'boolean_ops.c'),
         pjoin('src', 'multiarray', 'datetime.c'),
         pjoin('src', 'multiarray', 'datetime_strings.c'),
         pjoin('src', 'multiarray', 'datetime_busday.c'),
@@ -452,6 +458,8 @@ if ENABLE_SEPARATE_COMPILATION:
         pjoin('src', 'multiarray', 'descriptor.c'),
         pjoin('src', 'multiarray', 'iterators.c'),
         pjoin('src', 'multiarray', 'mapping.c'),
+        pjoin('src', 'multiarray', 'na_mask.c'),
+        pjoin('src', 'multiarray', 'na_object.c'),
         pjoin('src', 'multiarray', 'number.c'),
         pjoin('src', 'multiarray', 'getset.c'),
         pjoin('src', 'multiarray', 'sequence.c'),
@@ -463,6 +471,7 @@ if ENABLE_SEPARATE_COMPILATION:
         pjoin('src', 'multiarray', 'item_selection.c'),
         pjoin('src', 'multiarray', 'calculation.c'),
         pjoin('src', 'multiarray', 'common.c'),
+        pjoin('src', 'multiarray', 'reduction.c'),
         pjoin('src', 'multiarray', 'refcount.c'),
         pjoin('src', 'multiarray', 'conversion_utils.c'),
         pjoin('src', 'multiarray', 'usertypes.c'),
@@ -476,6 +485,7 @@ if ENABLE_SEPARATE_COMPILATION:
     multiarray_src.extend(arraytypes_src)
     multiarray_src.extend(scalartypes_src)
     multiarray_src.extend(lowlevel_strided_loops_src)
+    multiarray_src.extend(boolean_ops_src)
     multiarray_src.extend(nditer_src)
     multiarray_src.extend(einsum_src)
     if PYTHON_HAS_UNICODE_WIDE:
diff --git a/numpy/core/_methods.py b/numpy/core/_methods.py
new file mode 100644
index 000000000..80e12298c
--- /dev/null
+++ b/numpy/core/_methods.py
@@ -0,0 +1,97 @@
+# Array methods which are called by both the C-code for the method
+# and the Python code for the NumPy-namespace function
+
+from numpy.core import multiarray as mu
+from numpy.core import umath as um
+from numpy.core.numeric import asanyarray
+
+def _amax(a, axis=None, out=None, skipna=False, keepdims=False):
+    return um.maximum.reduce(a, axis=axis,
+                            out=out, skipna=skipna, keepdims=keepdims)
+
+def _amin(a, axis=None, out=None, skipna=False, keepdims=False):
+    return um.minimum.reduce(a, axis=axis,
+                            out=out, skipna=skipna, keepdims=keepdims)
+
+def _sum(a, axis=None, dtype=None, out=None, skipna=False, keepdims=False):
+    return um.add.reduce(a, axis=axis, dtype=dtype,
+                            out=out, skipna=skipna, keepdims=keepdims)
+
+def _prod(a, axis=None, dtype=None, out=None, skipna=False, keepdims=False):
+    return um.multiply.reduce(a, axis=axis, dtype=dtype,
+                            out=out, skipna=skipna, keepdims=keepdims)
+
+def _mean(a, axis=None, dtype=None, out=None, skipna=False, keepdims=False):
+    arr = asanyarray(a)
+
+    # Upgrade bool, unsigned int, and int to float64
+    if dtype is None and arr.dtype.kind in ['b','u','i']:
+        ret = um.add.reduce(arr, axis=axis, dtype='f8',
+                            out=out, skipna=skipna, keepdims=keepdims)
+    else:
+        ret = um.add.reduce(arr, axis=axis, dtype=dtype,
+                            out=out, skipna=skipna, keepdims=keepdims)
+    rcount = mu.count_reduce_items(arr, axis=axis,
+                            skipna=skipna, keepdims=keepdims)
+    if isinstance(ret, mu.ndarray):
+        ret = um.true_divide(ret, rcount,
+                        out=ret, casting='unsafe', subok=False)
+    else:
+        ret = ret / float(rcount)
+    return ret
+
+def _var(a, axis=None, dtype=None, out=None, ddof=0,
+                            skipna=False, keepdims=False):
+    arr = asanyarray(a)
+
+    # First compute the mean, saving 'rcount' for reuse later
+    if dtype is None and arr.dtype.kind in ['b','u','i']:
+        arrmean = um.add.reduce(arr, axis=axis, dtype='f8',
+                            skipna=skipna, keepdims=True)
+    else:
+        arrmean = um.add.reduce(arr, axis=axis, dtype=dtype,
+                            skipna=skipna, keepdims=True)
+    rcount = mu.count_reduce_items(arr, axis=axis,
+                            skipna=skipna, keepdims=True)
+    if isinstance(arrmean, mu.ndarray):
+        arrmean = um.true_divide(arrmean, rcount,
+                            out=arrmean, casting='unsafe', subok=False)
+    else:
+        arrmean = arrmean / float(rcount)
+
+    # arr - arrmean
+    x = arr - arrmean
+
+    # (arr - arrmean) ** 2
+    if arr.dtype.kind == 'c':
+        x = um.multiply(x, um.conjugate(x), out=x).real
+    else:
+        x = um.multiply(x, x, out=x)
+
+    # add.reduce((arr - arrmean) ** 2, axis)
+    ret = um.add.reduce(x, axis=axis, dtype=dtype, out=out,
+                                skipna=skipna, keepdims=keepdims)
+
+    # add.reduce((arr - arrmean) ** 2, axis) / (n - ddof)
+    if not keepdims and isinstance(rcount, mu.ndarray):
+        rcount = rcount.squeeze(axis=axis)
+    rcount -= ddof
+    if isinstance(ret, mu.ndarray):
+        ret = um.true_divide(ret, rcount,
+                        out=ret, casting='unsafe', subok=False)
+    else:
+        ret = ret / float(rcount)
+
+    return ret
+
+def _std(a, axis=None, dtype=None, out=None, ddof=0,
+                            skipna=False, keepdims=False):
+    ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
+                                skipna=skipna, keepdims=keepdims)
+
+    if isinstance(ret, mu.ndarray):
+        ret = um.sqrt(ret, out=ret)
+    else:
+        ret = um.sqrt(ret)
+
+    return ret
diff --git a/numpy/core/arrayprint.py b/numpy/core/arrayprint.py
index 508056a26..aad83500e 100644
--- a/numpy/core/arrayprint.py
+++ b/numpy/core/arrayprint.py
@@ -15,7 +15,7 @@ __docformat__ = 'restructuredtext'
 import sys
 import numerictypes as _nt
 from umath import maximum, minimum, absolute, not_equal, isnan, isinf
-from multiarray import format_longfloat, datetime_as_string, datetime_data
+from multiarray import format_longfloat, datetime_as_string, datetime_data, isna
 from fromnumeric import ravel
 
 
@@ -29,6 +29,7 @@ _float_output_suppress_small = False
 _line_width = 75
 _nan_str = 'nan'
 _inf_str = 'inf'
+_na_str = 'NA'
 _formatter = None  # formatting function for array elements
 
 if sys.version_info[0] >= 3:
@@ -36,7 +37,8 @@ if sys.version_info[0] >= 3:
 
 def set_printoptions(precision=None, threshold=None, edgeitems=None,
                      linewidth=None, suppress=None,
-                     nanstr=None, infstr=None, formatter=None):
+                     nanstr=None, infstr=None, nastr=None,
+                     formatter=None):
     """
     Set printing options.
 
@@ -63,6 +65,8 @@ def set_printoptions(precision=None, threshold=None, edgeitems=None,
         String representation of floating point not-a-number (default nan).
     infstr : str, optional
         String representation of floating point infinity (default inf).
+    nastr : str, optional
+        String representation of NA missing value (default NA).
     formatter : dict of callables, optional
         If not None, the keys should indicate the type(s) that the respective
         formatting function applies to.  Callables should return a string.
@@ -140,7 +144,7 @@ def set_printoptions(precision=None, threshold=None, edgeitems=None,
 
     global _summaryThreshold, _summaryEdgeItems, _float_output_precision, \
            _line_width, _float_output_suppress_small, _nan_str, _inf_str, \
-           _formatter
+           _na_str, _formatter
     if linewidth is not None:
         _line_width = linewidth
     if threshold is not None:
@@ -155,6 +159,8 @@ def set_printoptions(precision=None, threshold=None, edgeitems=None,
         _nan_str = nanstr
     if infstr is not None:
         _inf_str = infstr
+    if nastr is not None:
+        _na_str = nastr
     _formatter = formatter
 
 def get_printoptions():
@@ -189,6 +195,7 @@ def get_printoptions():
              suppress=_float_output_suppress_small,
              nanstr=_nan_str,
              infstr=_inf_str,
+             nastr=_na_str,
              formatter=_formatter)
     return d
 
@@ -212,9 +219,19 @@ def _leading_trailing(a):
     return b
 
 def _boolFormatter(x):
-    if x: return ' True'
-    else: return 'False'
+    if isna(x):
+        return str(x).replace('NA', _na_str, 1)
+    elif x:
+        return ' True'
+    else:
+        return 'False'
+
 
+def repr_format(x):
+    if isna(x):
+        return str(x).replace('NA', _na_str, 1)
+    else:
+        return repr(x)
 
 def _array2string(a, max_line_width, precision, suppress_small, separator=' ',
                   prefix="", formatter=None):
@@ -247,8 +264,9 @@ def _array2string(a, max_line_width, precision, suppress_small, separator=' ',
                   'longcomplexfloat' : LongComplexFormat(precision),
                   'datetime' : DatetimeFormat(data),
                   'timedelta' : TimedeltaFormat(data),
-                  'numpystr' : repr,
+                  'numpystr' : repr_format,
                   'str' : str}
+
     if formatter is not None:
         fkeys = [k for k in formatter.keys() if formatter[k] is not None]
         if 'all' in fkeys:
@@ -419,16 +437,20 @@ def array2string(a, max_line_width=None, precision=None,
 
     if a.shape == ():
         x = a.item()
-        try:
-            lst = a._format(x)
-            msg = "The `_format` attribute is deprecated in Numpy 2.0 and " \
-                  "will be removed in 2.1. Use the `formatter` kw instead."
-            import warnings
-            warnings.warn(msg, DeprecationWarning)
-        except AttributeError:
-            if isinstance(x, tuple):
-                x = _convert_arrays(x)
-            lst = style(x)
+        if isna(x):
+            lst = str(x).replace('NA', _na_str, 1)
+        else:
+            try:
+                lst = a._format(x)
+                msg = "The `_format` attribute is deprecated in Numpy " \
+                      "2.0 and will be removed in 2.1. Use the " \
+                      "`formatter` kw instead."
+                import warnings
+                warnings.warn(msg, DeprecationWarning)
+            except AttributeError:
+                if isinstance(x, tuple):
+                    x = _convert_arrays(x)
+                lst = style(x)
     elif reduce(product, a.shape) == 0:
         # treat as a null array if any of shape elements == 0
         lst = "[]"
@@ -531,14 +553,17 @@ class FloatFormat(object):
         import numeric as _nc
         errstate = _nc.seterr(all='ignore')
         try:
-            special = isnan(data) | isinf(data)
-            non_zero = absolute(data.compress(not_equal(data, 0) & ~special))
+            special = isnan(data) | isinf(data) | isna(data)
+            special[isna(data)] = False
+            valid = not_equal(data, 0) & ~special
+            valid[isna(data)] = False
+            non_zero = absolute(data.compress(valid))
             if len(non_zero) == 0:
                 max_val = 0.
                 min_val = 0.
             else:
-                max_val = maximum.reduce(non_zero)
-                min_val = minimum.reduce(non_zero)
+                max_val = maximum.reduce(non_zero, skipna=True)
+                min_val = minimum.reduce(non_zero, skipna=True)
                 if max_val >= 1.e8:
                     self.exp_format = True
                 if not self.suppress_small and (min_val < 0.0001
@@ -569,7 +594,8 @@ class FloatFormat(object):
             if _nc.any(special):
                 self.max_str_len = max(self.max_str_len,
                                        len(_nan_str),
-                                       len(_inf_str)+1)
+                                       len(_inf_str)+1,
+                                       len(_na_str))
             if self.sign:
                 format = '%#+'
             else:
@@ -583,7 +609,9 @@ class FloatFormat(object):
         import numeric as _nc
         err = _nc.seterr(invalid='ignore')
         try:
-            if isnan(x):
+            if isna(x):
+                return self.special_fmt % (str(x).replace('NA', _na_str, 1),)
+            elif isnan(x):
                 if self.sign:
                     return self.special_fmt % ('+' + _nan_str,)
                 else:
@@ -626,16 +654,21 @@ _MININT = -sys.maxint-1
 class IntegerFormat(object):
     def __init__(self, data):
         try:
-            max_str_len = max(len(str(maximum.reduce(data))),
-                              len(str(minimum.reduce(data))))
+            max_str_len = max(len(str(maximum.reduce(data, skipna=True))),
+                              len(str(minimum.reduce(data, skipna=True))))
             self.format = '%' + str(max_str_len) + 'd'
         except TypeError, NotImplementedError:
             # if reduce(data) fails, this instance will not be called, just
             # instantiated in formatdict.
             pass
+        except ValueError:
+            # this occurs when everything is NA
+            pass
 
     def __call__(self, x):
-        if _MININT < x < _MAXINT:
+        if isna(x):
+            return str(x).replace('NA', _na_str, 1)
+        elif _MININT < x < _MAXINT:
             return self.format % x
         else:
             return "%s" % x
@@ -648,7 +681,9 @@ class LongFloatFormat(object):
         self.sign = sign
 
     def __call__(self, x):
-        if isnan(x):
+        if isna(x):
+            return str(x).replace('NA', _na_str, 1)
+        elif isnan(x):
             if self.sign:
                 return '+' + _nan_str
             else:
@@ -676,9 +711,12 @@ class LongComplexFormat(object):
         self.imag_format = LongFloatFormat(precision, sign=True)
 
     def __call__(self, x):
-        r = self.real_format(x.real)
-        i = self.imag_format(x.imag)
-        return r + i + 'j'
+        if isna(x):
+            return str(x).replace('NA', _na_str, 1)
+        else:
+            r = self.real_format(x.real)
+            i = self.imag_format(x.imag)
+            return r + i + 'j'
 
 
 class ComplexFormat(object):
@@ -688,14 +726,17 @@ class ComplexFormat(object):
                                        sign=True)
 
     def __call__(self, x):
-        r = self.real_format(x.real, strip_zeros=False)
-        i = self.imag_format(x.imag, strip_zeros=False)
-        if not self.imag_format.exp_format:
-            z = i.rstrip('0')
-            i = z + 'j' + ' '*(len(i)-len(z))
+        if isna(x):
+            return str(x).replace('NA', _na_str, 1)
         else:
-            i = i + 'j'
-        return r + i
+            r = self.real_format(x.real, strip_zeros=False)
+            i = self.imag_format(x.imag, strip_zeros=False)
+            if not self.imag_format.exp_format:
+                z = i.rstrip('0')
+                i = z + 'j' + ' '*(len(i)-len(z))
+            else:
+                i = i + 'j'
+            return r + i
 
 class DatetimeFormat(object):
     def __init__(self, x, unit=None,
@@ -720,7 +761,10 @@ class DatetimeFormat(object):
         self.casting = casting
 
     def __call__(self, x):
-        return "'%s'" % datetime_as_string(x,
+        if isna(x):
+            return str(x).replace('NA', _na_str, 1)
+        else:
+            return "'%s'" % datetime_as_string(x,
                                         unit=self.unit,
                                         timezone=self.timezone,
                                         casting=self.casting)
@@ -734,5 +778,8 @@ class TimedeltaFormat(object):
             self.format = '%' + str(max_str_len) + 'd'
 
     def __call__(self, x):
-        return self.format % x.astype('i8')
+        if isna(x):
+            return str(x).replace('NA', _na_str, 1)
+        else:
+            return self.format % x.astype('i8')
 
diff --git a/numpy/core/code_generators/genapi.py b/numpy/core/code_generators/genapi.py
index 29cbb271f..05f382240 100644
--- a/numpy/core/code_generators/genapi.py
+++ b/numpy/core/code_generators/genapi.py
@@ -21,40 +21,45 @@ from os.path import join
 __docformat__ = 'restructuredtext'
 
 # The files under src/ that are scanned for API functions
-API_FILES = [join('multiarray', 'methods.c'),
+API_FILES = [join('multiarray', 'array_assign_array.c'),
+             join('multiarray', 'array_assign_scalar.c'),
              join('multiarray', 'arrayobject.c'),
-             join('multiarray', 'flagsobject.c'),
-             join('multiarray', 'descriptor.c'),
-             join('multiarray', 'iterators.c'),
-             join('multiarray', 'getset.c'),
-             join('multiarray', 'number.c'),
-             join('multiarray', 'sequence.c'),
-             join('multiarray', 'ctors.c'),
-             join('multiarray', 'convert.c'),
-             join('multiarray', 'shape.c'),
-             join('multiarray', 'item_selection.c'),
-             join('multiarray', 'convert_datatype.c'),
              join('multiarray', 'arraytypes.c.src'),
-             join('multiarray', 'multiarraymodule.c'),
-             join('multiarray', 'scalartypes.c.src'),
-             join('multiarray', 'scalarapi.c'),
+             join('multiarray', 'buffer.c'),
              join('multiarray', 'calculation.c'),
-             join('multiarray', 'usertypes.c'),
-             join('multiarray', 'refcount.c'),
              join('multiarray', 'conversion_utils.c'),
-             join('multiarray', 'buffer.c'),
+             join('multiarray', 'convert.c'),
+             join('multiarray', 'convert_datatype.c'),
+             join('multiarray', 'ctors.c'),
              join('multiarray', 'datetime.c'),
-             join('multiarray', 'datetime_strings.c'),
              join('multiarray', 'datetime_busday.c'),
              join('multiarray', 'datetime_busdaycal.c'),
+             join('multiarray', 'datetime_strings.c'),
+             join('multiarray', 'descriptor.c'),
+             join('multiarray', 'einsum.c.src'),
+             join('multiarray', 'flagsobject.c'),
+             join('multiarray', 'getset.c'),
+             join('multiarray', 'item_selection.c'),
+             join('multiarray', 'iterators.c'),
+             join('multiarray', 'methods.c'),
+             join('multiarray', 'multiarraymodule.c'),
+             join('multiarray', 'na_mask.c'),
+             join('multiarray', 'na_object.c'),
              join('multiarray', 'nditer_api.c'),
              join('multiarray', 'nditer_constr.c'),
              join('multiarray', 'nditer_pywrap.c'),
              join('multiarray', 'nditer_templ.c.src'),
-             join('multiarray', 'einsum.c.src'),
+             join('multiarray', 'number.c'),
+             join('multiarray', 'reduction.c'),
+             join('multiarray', 'refcount.c'),
+             join('multiarray', 'scalartypes.c.src'),
+             join('multiarray', 'scalarapi.c'),
+             join('multiarray', 'sequence.c'),
+             join('multiarray', 'shape.c'),
+             join('multiarray', 'usertypes.c'),
+             join('umath', 'loops.c.src'),
              join('umath', 'ufunc_object.c'),
              join('umath', 'ufunc_type_resolution.c'),
-             join('umath', 'loops.c.src'),
             ]
 THIS_DIR = os.path.dirname(__file__)
 API_FILES = [os.path.join(THIS_DIR, '..', 'src', a) for a in API_FILES]
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 477cd122b..7159d9896 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -11,6 +11,7 @@ sys.path.pop(0)
 Zero = "PyUFunc_Zero"
 One = "PyUFunc_One"
 None_ = "PyUFunc_None"
+ReorderableNone = "PyUFunc_ReorderableNone"
 
 # Sentinel value to specify using the full type description in the
 # function name
@@ -237,7 +238,7 @@ defdict = {
 'add' :
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.add'),
-          'PyUFunc_AdditionTypeResolution',
+          'PyUFunc_AdditionTypeResolver',
           TD(notimes_or_obj),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
@@ -246,9 +247,9 @@ defdict = {
           TD(O, f='PyNumber_Add'),
           ),
 'subtract' :
-    Ufunc(2, 1, Zero,
+    Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
           docstrings.get('numpy.core.umath.subtract'),
-          'PyUFunc_SubtractionTypeResolution',
+          'PyUFunc_SubtractionTypeResolver',
           TD(notimes_or_obj),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
@@ -259,7 +260,7 @@ defdict = {
 'multiply' :
     Ufunc(2, 1, One,
           docstrings.get('numpy.core.umath.multiply'),
-          'PyUFunc_MultiplicationTypeResolution',
+          'PyUFunc_MultiplicationTypeResolver',
           TD(notimes_or_obj),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'qm', 'm'),
@@ -269,9 +270,9 @@ defdict = {
           TD(O, f='PyNumber_Multiply'),
           ),
 'divide' :
-    Ufunc(2, 1, One,
+    Ufunc(2, 1, None, # One is only a unit to the right, not the left
           docstrings.get('numpy.core.umath.divide'),
-          'PyUFunc_DivisionTypeResolution',
+          'PyUFunc_DivisionTypeResolver',
           TD(intfltcmplx),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'md', 'm'),
@@ -280,9 +281,9 @@ defdict = {
           TD(O, f='PyNumber_Divide'),
           ),
 'floor_divide' :
-    Ufunc(2, 1, One,
+    Ufunc(2, 1, None, # One is only a unit to the right, not the left
           docstrings.get('numpy.core.umath.floor_divide'),
-          'PyUFunc_DivisionTypeResolution',
+          'PyUFunc_DivisionTypeResolver',
           TD(intfltcmplx),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'md', 'm'),
@@ -290,9 +291,9 @@ defdict = {
           TD(O, f='PyNumber_FloorDivide'),
           ),
 'true_divide' :
-    Ufunc(2, 1, One,
+    Ufunc(2, 1, None, # One is only a unit to the right, not the left
           docstrings.get('numpy.core.umath.true_divide'),
-          'PyUFunc_DivisionTypeResolution',
+          'PyUFunc_DivisionTypeResolver',
           TD('bBhH', out='d'),
           TD('iIlLqQ', out='d'),
           TD(flts+cmplx),
@@ -309,7 +310,7 @@ defdict = {
           TD(P, f='conjugate'),
           ),
 'fmod' :
-    Ufunc(2, 1, Zero,
+    Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.fmod'),
           None,
           TD(ints),
@@ -330,15 +331,17 @@ defdict = {
           TD(ints+inexact),
           TD(O, f='Py_reciprocal'),
           ),
-'ones_like' :
+# This is no longer used as numpy.ones_like, however it is
+# still used by some internal calls.
+'_ones_like' :
     Ufunc(1, 1, None,
-          docstrings.get('numpy.core.umath.ones_like'),
-          'PyUFunc_OnesLikeTypeResolution',
+          docstrings.get('numpy.core.umath._ones_like'),
+          'PyUFunc_OnesLikeTypeResolver',
           TD(noobj),
           TD(O, f='Py_get_one'),
           ),
 'power' :
-    Ufunc(2, 1, One,
+    Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.power'),
           None,
           TD(ints),
@@ -348,7 +351,7 @@ defdict = {
 'absolute' :
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.absolute'),
-          'PyUFunc_AbsoluteTypeResolution',
+          'PyUFunc_AbsoluteTypeResolver',
           TD(bints+flts+timedeltaonly),
           TD(cmplx, out=('f', 'd', 'g')),
           TD(O, f='PyNumber_Absolute'),
@@ -362,7 +365,7 @@ defdict = {
 'negative' :
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.negative'),
-          'PyUFunc_SimpleUnaryOperationTypeResolution',
+          'PyUFunc_SimpleUnaryOperationTypeResolver',
           TD(bints+flts+timedeltaonly),
           TD(cmplx, f='neg'),
           TD(O, f='PyNumber_Negative'),
@@ -370,13 +373,13 @@ defdict = {
 'sign' :
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.sign'),
-          'PyUFunc_SimpleUnaryOperationTypeResolution',
+          'PyUFunc_SimpleUnaryOperationTypeResolver',
           TD(nobool_or_datetime),
           ),
 'greater' :
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.greater'),
-          'PyUFunc_SimpleBinaryComparisonTypeResolution',
+          'PyUFunc_SimpleBinaryComparisonTypeResolver',
           TD(all, out='?'),
           ),
 'greater_equal' :
@@ -388,80 +391,80 @@ defdict = {
 'less' :
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.less'),
-          'PyUFunc_SimpleBinaryComparisonTypeResolution',
+          'PyUFunc_SimpleBinaryComparisonTypeResolver',
           TD(all, out='?'),
           ),
 'less_equal' :
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.less_equal'),
-          'PyUFunc_SimpleBinaryComparisonTypeResolution',
+          'PyUFunc_SimpleBinaryComparisonTypeResolver',
           TD(all, out='?'),
           ),
 'equal' :
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.equal'),
-          'PyUFunc_SimpleBinaryComparisonTypeResolution',
+          'PyUFunc_SimpleBinaryComparisonTypeResolver',
           TD(all, out='?'),
           ),
 'not_equal' :
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.not_equal'),
-          'PyUFunc_SimpleBinaryComparisonTypeResolution',
+          'PyUFunc_SimpleBinaryComparisonTypeResolver',
           TD(all, out='?'),
           ),
 'logical_and' :
     Ufunc(2, 1, One,
           docstrings.get('numpy.core.umath.logical_and'),
-          'PyUFunc_SimpleBinaryComparisonTypeResolution',
+          'PyUFunc_SimpleBinaryComparisonTypeResolver',
           TD(nodatetime_or_obj, out='?'),
-          TD(P, f='logical_and'),
+          TD(O, f='npy_ObjectLogicalAnd'),
           ),
 'logical_not' :
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.logical_not'),
           None,
           TD(nodatetime_or_obj, out='?'),
-          TD(P, f='logical_not'),
+          TD(O, f='npy_ObjectLogicalNot'),
           ),
 'logical_or' :
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.logical_or'),
-          'PyUFunc_SimpleBinaryComparisonTypeResolution',
+          'PyUFunc_SimpleBinaryComparisonTypeResolver',
           TD(nodatetime_or_obj, out='?'),
-          TD(P, f='logical_or'),
+          TD(O, f='npy_ObjectLogicalOr'),
           ),
 'logical_xor' :
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.logical_xor'),
-          'PyUFunc_SimpleBinaryComparisonTypeResolution',
+          'PyUFunc_SimpleBinaryComparisonTypeResolver',
           TD(nodatetime_or_obj, out='?'),
           TD(P, f='logical_xor'),
           ),
 'maximum' :
-    Ufunc(2, 1, None,
+    Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.maximum'),
-          'PyUFunc_SimpleBinaryOperationTypeResolution',
+          'PyUFunc_SimpleBinaryOperationTypeResolver',
           TD(noobj),
           TD(O, f='npy_ObjectMax')
           ),
 'minimum' :
-    Ufunc(2, 1, None,
+    Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.minimum'),
-          'PyUFunc_SimpleBinaryOperationTypeResolution',
+          'PyUFunc_SimpleBinaryOperationTypeResolver',
           TD(noobj),
           TD(O, f='npy_ObjectMin')
           ),
 'fmax' :
-    Ufunc(2, 1, None,
+    Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.fmax'),
-          'PyUFunc_SimpleBinaryOperationTypeResolution',
+          'PyUFunc_SimpleBinaryOperationTypeResolver',
           TD(noobj),
           TD(O, f='npy_ObjectMax')
           ),
 'fmin' :
-    Ufunc(2, 1, None,
+    Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.fmin'),
-          'PyUFunc_SimpleBinaryOperationTypeResolution',
+          'PyUFunc_SimpleBinaryOperationTypeResolver',
           TD(noobj),
           TD(O, f='npy_ObjectMin')
           ),
@@ -925,7 +928,7 @@ r"""f = PyUFunc_FromFuncAndData(%s_functions, %s_data, %s_signatures, %d,
                                                 name, docstring))
         if uf.typereso != None:
             mlist.append(
-                r"((PyUFuncObject *)f)->type_resolution_function = &%s;" %
+                r"((PyUFuncObject *)f)->type_resolver = &%s;" %
                                                                 uf.typereso)
         mlist.append(r"""PyDict_SetItemString(dictionary, "%s", f);""" % name)
         mlist.append(r"""Py_DECREF(f);""")
diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py
index a256d849a..ca89c28ec 100644
--- a/numpy/core/code_generators/numpy_api.py
+++ b/numpy/core/code_generators/numpy_api.py
@@ -14,10 +14,12 @@ exception, so it should hopefully not get unnoticed).
 
 multiarray_global_vars = {
     'NPY_NUMUSERTYPES':             7,
+    'Npy_NA':                       299,
 }
 
 multiarray_global_vars_types = {
     'NPY_NUMUSERTYPES':             'int',
+    'Npy_NA':                       'PyObject *',
 }
 
 multiarray_scalar_bool_values = {
@@ -68,6 +70,8 @@ multiarray_types_api = {
     'PyTimedeltaArrType_Type':          216,
     'PyHalfArrType_Type':               217,
     'NpyIter_Type':                     218,
+    # End 1.6 API
+    'NpyNA_Type':                       281,
 }
 
 #define NPY_NUMUSERTYPES (*(int *)PyArray_API[6])
@@ -316,9 +320,30 @@ multiarray_funcs_api = {
     'PyArray_ConvertClipmodeSequence':      279,
     'PyArray_MatrixProduct2':               280,
     # End 1.6 API
-    'PyArray_MaskedCopyInto':               281,
-    'PyArray_MaskedMoveInto':               282,
-    'PyArray_SetBaseObject':                      283,
+    'NpyIter_GetFirstMaskNAOp':             282,
+    'NpyIter_GetMaskNAIndexArray':          283,
+    'NpyIter_IsFirstVisit':                 284,
+    'PyArray_SetBaseObject':                285,
+    'PyArray_HasNASupport':                 286,
+    'PyArray_ContainsNA':                   287,
+    'PyArray_AllocateMaskNA':               288,
+    'PyArray_CreateSortedStridePerm':       289,
+    'PyArray_AssignZero':                   290,
+    'PyArray_AssignOne':                    291,
+    'PyArray_AssignNA':                     292,
+    'PyArray_AssignMaskNA':                 293,
+    'PyArray_AssignRawScalar':              294,
+    'PyArray_AssignArray':                  295,
+    'PyArray_ReduceWrapper':                296,
+    'PyArray_RemoveAxesInPlace':            297,
+    'PyArray_DebugPrint':                   298,
+    'NpyNA_GetDType':                       300,
+    'NpyNA_IsMultiNA':                      301,
+    'NpyNA_GetPayload':                     302,
+    'NpyNA_FromObject':                     303,
+    'NpyNA_FromDTypeAndPayload':            304,
+    'PyArray_AllowNAConverter':             305,
+    'PyArray_OutputAllowNAConverter':       306,
 }
 
 ufunc_types_api = {
@@ -366,9 +391,8 @@ ufunc_funcs_api = {
     'PyUFunc_ee_e_As_ff_f':                     37,
     'PyUFunc_ee_e_As_dd_d':                     38,
     # End 1.6 API
-    'PyUFunc_DefaultTypeResolution':            39,
+    'PyUFunc_DefaultTypeResolver':              39,
     'PyUFunc_ValidateCasting':                  40,
-    'PyUFunc_DefaultTypeResolutionMasked':      41,
 }
 
 # List of all the dicts which define the C API
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index 591a898ed..760617340 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -2490,24 +2490,16 @@ add_newdoc('numpy.core.umath', 'not_equal',
 
     """)
 
-add_newdoc('numpy.core.umath', 'ones_like',
+add_newdoc('numpy.core.umath', '_ones_like',
     """
-    Returns an array of ones with the same shape and type as a given array.
-
-    Equivalent to ``a.copy().fill(1)``.
-
-    Please refer to the documentation for `zeros_like` for further details.
+    This function used to be the numpy.ones_like, but now a
+    specific function for that has been written for consistency with
+    the other *_like functions. It is only used internally in a limited
+    fashion now.
 
     See Also
     --------
-    zeros_like, ones
-
-    Examples
-    --------
-    >>> a = np.array([[1, 2, 3], [4, 5, 6]])
-    >>> np.ones_like(a)
-    array([[1, 1, 1],
-           [1, 1, 1]])
+    ones_like
 
     """)
 
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index 602c0ebc5..8e7a556ac 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -16,6 +16,7 @@ import multiarray as mu
 import umath as um
 import numerictypes as nt
 from numeric import asarray, array, asanyarray, concatenate
+import _methods
 _dt_ = nt.sctype2char
 
 import types
@@ -868,7 +869,7 @@ def resize(a, new_shape):
     return reshape(a, new_shape)
 
 
-def squeeze(a):
+def squeeze(a, axis=None):
     """
     Remove single-dimensional entries from the shape of an array.
 
@@ -876,12 +877,19 @@ def squeeze(a):
     ----------
     a : array_like
         Input data.
+    axis : None or int or tuple of ints, optional
+        .. versionadded:: 1.7.0
+
+        Selects a subset of the single-dimensional entries in the
+        shape. If an axis is selected with shape entry greater than
+        one, an error is raised.
 
     Returns
     -------
     squeezed : ndarray
-        The input array, but with with all dimensions of length 1
-        removed.  Whenever possible, a view on `a` is returned.
+        The input array, but with all or a subset of the
+        dimensions of length 1 removed. This is always `a` itself
+        or a view into `a`.
 
     Examples
     --------
@@ -890,13 +898,20 @@ def squeeze(a):
     (1, 3, 1)
     >>> np.squeeze(x).shape
     (3,)
+    >>> np.squeeze(x, axis=(2,)).shape
+    (1, 3)
 
     """
     try:
         squeeze = a.squeeze
     except AttributeError:
         return _wrapit(a, 'squeeze')
-    return squeeze()
+    try:
+        # First try to use the new axis= parameter
+        return squeeze(axis=axis)
+    except TypeError:
+        # For backwards compatibility
+        return squeeze()
 
 
 def diagonal(a, offset=0, axis1=0, axis2=1):
@@ -911,6 +926,8 @@ def diagonal(a, offset=0, axis1=0, axis2=1):
     removing `axis1` and `axis2` and appending an index to the right equal
     to the size of the resulting diagonals.
 
+    As of NumPy 1.7, this function always returns a view into `a`.
+
     Parameters
     ----------
     a : array_like
@@ -1376,7 +1393,7 @@ def clip(a, a_min, a_max, out=None):
     return clip(a_min, a_max, out)
 
 
-def sum(a, axis=None, dtype=None, out=None):
+def sum(a, axis=None, dtype=None, out=None, skipna=False, keepdims=False):
     """
     Sum of array elements over a given axis.
 
@@ -1384,9 +1401,16 @@ def sum(a, axis=None, dtype=None, out=None):
     ----------
     a : array_like
         Elements to sum.
-    axis : integer, optional
-        Axis over which the sum is taken. By default `axis` is None,
-        and all elements are summed.
+    axis : None or int or tuple of ints, optional
+        Axis or axes along which a sum is performed.
+        The default (`axis` = `None`) is to perform a sum over all
+        the dimensions of the input array. `axis` may be negative, in
+        which case it counts from the last to the first axis.
+
+        .. versionadded:: 1.7.0
+
+        If this is a tuple of ints, a sum is performed on multiple
+        axes, instead of a single axis or all the axes as before.
     dtype : dtype, optional
         The type of the returned array and of the accumulator in which
         the elements are summed.  By default, the dtype of `a` is used.
@@ -1399,6 +1423,13 @@ def sum(a, axis=None, dtype=None, out=None):
         (the shape of `a` with `axis` removed, i.e.,
         ``numpy.delete(a.shape, axis)``).  Its type is preserved. See
         `doc.ufuncs` (Section "Output arguments") for more details.
+    skipna : bool, optional
+        If this is set to True, skips any NA values during summation
+        instead of propagating them.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
@@ -1448,14 +1479,19 @@ def sum(a, axis=None, dtype=None, out=None):
             out[...] = res
             return out
         return res
-    try:
-        sum = a.sum
-    except AttributeError:
-        return _wrapit(a, 'sum', axis, dtype, out)
-    return sum(axis, dtype, out)
-
+    elif not (type(a) is mu.ndarray):
+        try:
+            sum = a.sum
+        except AttributeError:
+            return _methods._sum(a, axis=axis, dtype=dtype,
+                                out=out, skipna=skipna, keepdims=keepdims)
+        # NOTE: Dropping the skipna and keepdims parameters here...
+        return sum(axis=axis, dtype=dtype, out=out)
+    else:
+        return _methods._sum(a, axis=axis, dtype=dtype,
+                            out=out, skipna=skipna, keepdims=keepdims)
 
-def product (a, axis=None, dtype=None, out=None):
+def product (a, axis=None, dtype=None, out=None, skipna=False, keepdims=False):
     """
     Return the product of array elements over a given axis.
 
@@ -1464,14 +1500,10 @@ def product (a, axis=None, dtype=None, out=None):
     prod : equivalent function; see for details.
 
     """
-    try:
-        prod = a.prod
-    except AttributeError:
-        return _wrapit(a, 'prod', axis, dtype, out)
-    return prod(axis, dtype, out)
+    return um.multiply.reduce(a, axis=axis, dtype=dtype, out=out, skipna=skipna, keepdims=keepdims)
 
 
-def sometrue(a, axis=None, out=None):
+def sometrue(a, axis=None, out=None, skipna=False, keepdims=False):
     """
     Check whether some values are true.
 
@@ -1482,14 +1514,14 @@ def sometrue(a, axis=None, out=None):
     any : equivalent function
 
     """
-    try:
-        any = a.any
-    except AttributeError:
-        return _wrapit(a, 'any', axis, out)
-    return any(axis, out)
+    arr = asanyarray(a)
 
+    try:
+        return arr.any(axis=axis, out=out, skipna=skipna, keepdims=keepdims)
+    except TypeError:
+        return arr.any(axis=axis, out=out)
 
-def alltrue (a, axis=None, out=None):
+def alltrue (a, axis=None, out=None, skipna=False, keepdims=False):
     """
     Check if all elements of input array are true.
 
@@ -1498,14 +1530,14 @@ def alltrue (a, axis=None, out=None):
     numpy.all : Equivalent function; see for details.
 
     """
-    try:
-        all = a.all
-    except AttributeError:
-        return _wrapit(a, 'all', axis, out)
-    return all(axis, out)
+    arr = asanyarray(a)
 
+    try:
+        return arr.all(axis=axis, out=out, skipna=skipna, keepdims=keepdims)
+    except TypeError:
+        return arr.all(axis=axis, out=out)
 
-def any(a,axis=None, out=None):
+def any(a, axis=None, out=None, skipna=False, keepdims=False):
     """
     Test whether any array element along a given axis evaluates to True.
 
@@ -1515,17 +1547,29 @@ def any(a,axis=None, out=None):
     ----------
     a : array_like
         Input array or object that can be converted to an array.
-    axis : int, optional
-        Axis along which a logical OR is performed.  The default
-        (`axis` = `None`) is to perform a logical OR over a flattened
-        input array. `axis` may be negative, in which case it counts
-        from the last to the first axis.
+    axis : None or int or tuple of ints, optional
+        Axis or axes along which a logical OR reduction is performed.
+        The default (`axis` = `None`) is to perform a logical OR over all
+        the dimensions of the input array. `axis` may be negative, in
+        which case it counts from the last to the first axis.
+
+        .. versionadded:: 1.7.0
+
+        If this is a tuple of ints, a reduction is performed on multiple
+        axes, instead of a single axis or all the axes as before.
     out : ndarray, optional
         Alternate output array in which to place the result.  It must have
         the same shape as the expected output and its type is preserved
         (e.g., if it is of type float, then it will remain so, returning
         1.0 for True and 0.0 for False, regardless of the type of `a`).
         See `doc.ufuncs` (Section "Output arguments") for details.
+    skipna : bool, optional
+        If this is set to True, skips any NA values during reduction
+        instead of propagating them.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
@@ -1569,14 +1613,14 @@ def any(a,axis=None, out=None):
     (191614240, 191614240)
 
     """
-    try:
-        any = a.any
-    except AttributeError:
-        return _wrapit(a, 'any', axis, out)
-    return any(axis, out)
+    arr = asanyarray(a)
 
+    try:
+        return arr.any(axis=axis, out=out, skipna=skipna, keepdims=keepdims)
+    except TypeError:
+        return arr.any(axis=axis, out=out)
 
-def all(a,axis=None, out=None):
+def all(a, axis=None, out=None, skipna=False, keepdims=False):
     """
     Test whether all array elements along a given axis evaluate to True.
 
@@ -1584,17 +1628,29 @@ def all(a,axis=None, out=None):
     ----------
     a : array_like
         Input array or object that can be converted to an array.
-    axis : int, optional
-        Axis along which a logical AND is performed.
-        The default (`axis` = `None`) is to perform a logical AND
-        over a flattened input array.  `axis` may be negative, in which
-        case it counts from the last to the first axis.
+    axis : None or int or tuple of ints, optional
+        Axis or axes along which a logical AND reduction is performed.
+        The default (`axis` = `None`) is to perform a logical AND over all
+        the dimensions of the input array. `axis` may be negative, in
+        which case it counts from the last to the first axis.
+
+        .. versionadded:: 1.7.0
+
+        If this is a tuple of ints, a reduction is performed on multiple
+        axes, instead of a single axis or all the axes as before.
     out : ndarray, optional
         Alternate output array in which to place the result.
         It must have the same shape as the expected output and its
         type is preserved (e.g., if ``dtype(out)`` is float, the result
         will consist of 0.0's and 1.0's).  See `doc.ufuncs` (Section
         "Output arguments") for more details.
+    skipna : bool, optional
+        If this is set to True, skips any NA values during reduction
+        instead of propagating them.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
@@ -1633,12 +1689,12 @@ def all(a,axis=None, out=None):
     (28293632, 28293632, array([ True], dtype=bool))
 
     """
-    try:
-        all = a.all
-    except AttributeError:
-        return _wrapit(a, 'all', axis, out)
-    return all(axis, out)
+    arr = asanyarray(a)
 
+    try:
+        return arr.all(axis=axis, out=out, skipna=skipna, keepdims=keepdims)
+    except TypeError:
+        return arr.all(axis=axis, out=out)
 
 def cumsum (a, axis=None, dtype=None, out=None):
     """
@@ -1771,7 +1827,7 @@ def ptp(a, axis=None, out=None):
     return ptp(axis, out)
 
 
-def amax(a, axis=None, out=None):
+def amax(a, axis=None, out=None, skipna=False, keepdims=False):
     """
     Return the maximum of an array or maximum along an axis.
 
@@ -1785,6 +1841,13 @@ def amax(a, axis=None, out=None):
         Alternate output array in which to place the result.  Must be of
         the same shape and buffer length as the expected output.  See
         `doc.ufuncs` (Section "Output arguments") for more details.
+    skipna : bool, optional
+        If this is set to True, skips any NA values during reduction
+        instead of propagating them.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
@@ -1826,14 +1889,19 @@ def amax(a, axis=None, out=None):
     4.0
 
     """
-    try:
-        amax = a.max
-    except AttributeError:
-        return _wrapit(a, 'max', axis, out)
-    return amax(axis, out)
-
+    if not (type(a) is mu.ndarray):
+        try:
+            amax = a.max
+        except AttributeError:
+            return _methods._amax(a, axis=axis,
+                                out=out, skipna=skipna, keepdims=keepdims)
+        # NOTE: Dropping the skipna and keepdims parameters
+        return amax(axis=axis, out=out)
+    else:
+        return _methods._amax(a, axis=axis,
+                            out=out, skipna=skipna, keepdims=keepdims)
 
-def amin(a, axis=None, out=None):
+def amin(a, axis=None, out=None, skipna=False, keepdims=False):
     """
     Return the minimum of an array or minimum along an axis.
 
@@ -1847,6 +1915,13 @@ def amin(a, axis=None, out=None):
         Alternative output array in which to place the result.  Must
         be of the same shape and buffer length as the expected output.
         See `doc.ufuncs` (Section "Output arguments") for more details.
+    skipna : bool, optional
+        If this is set to True, skips any NA values during reduction
+        instead of propagating them.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
@@ -1888,12 +1963,17 @@ def amin(a, axis=None, out=None):
     0.0
 
     """
-    try:
-        amin = a.min
-    except AttributeError:
-        return _wrapit(a, 'min', axis, out)
-    return amin(axis, out)
-
+    if not (type(a) is mu.ndarray):
+        try:
+            amin = a.min
+        except AttributeError:
+            return _methods._amin(a, axis=axis,
+                                out=out, skipna=skipna, keepdims=keepdims)
+        # NOTE: Dropping the skipna and keepdims parameters
+        return amin(axis=axis, out=out)
+    else:
+        return _methods._amin(a, axis=axis,
+                            out=out, skipna=skipna, keepdims=keepdims)
 
 def alen(a):
     """
@@ -1928,7 +2008,7 @@ def alen(a):
         return len(array(a,ndmin=1))
 
 
-def prod(a, axis=None, dtype=None, out=None):
+def prod(a, axis=None, dtype=None, out=None, skipna=False, keepdims=False):
     """
     Return the product of array elements over a given axis.
 
@@ -1936,9 +2016,16 @@ def prod(a, axis=None, dtype=None, out=None):
     ----------
     a : array_like
         Input data.
-    axis : int, optional
-        Axis over which the product is taken.  By default, the product
-        of all elements is calculated.
+    axis : None or int or tuple of ints, optional
+        Axis or axes along which a product is performed.
+        The default (`axis` = `None`) is to perform a product over all
+        the dimensions of the input array. `axis` may be negative, in
+        which case it counts from the last to the first axis.
+
+        .. versionadded:: 1.7.0
+
+        If this is a tuple of ints, a product is performed on multiple
+        axes, instead of a single axis or all the axes as before.
     dtype : data-type, optional
         The data-type of the returned array, as well as of the accumulator
         in which the elements are multiplied.  By default, if `a` is of
@@ -1949,6 +2036,13 @@ def prod(a, axis=None, dtype=None, out=None):
         Alternative output array in which to place the result. It must have
         the same shape as the expected output, but the type of the
         output values will be cast if necessary.
+    skipna : bool, optional
+        If this is set to True, skips any NA values during reduction
+        instead of propagating them.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
@@ -2002,12 +2096,16 @@ def prod(a, axis=None, dtype=None, out=None):
     True
 
     """
-    try:
-        prod = a.prod
-    except AttributeError:
-        return _wrapit(a, 'prod', axis, dtype, out)
-    return prod(axis, dtype, out)
-
+    if not (type(a) is mu.ndarray):
+        try:
+            prod = a.prod
+        except AttributeError:
+            return _methods._prod(a, axis=axis, dtype=dtype,
+                                out=out, skipna=skipna, keepdims=keepdims)
+        return prod(axis=axis, dtype=dtype, out=out)
+    else:
+        return _methods._prod(a, axis=axis, dtype=dtype,
+                            out=out, skipna=skipna, keepdims=keepdims)
 
 def cumprod(a, axis=None, dtype=None, out=None):
     """
@@ -2296,7 +2394,7 @@ def round_(a, decimals=0, out=None):
     return round(decimals, out)
 
 
-def mean(a, axis=None, dtype=None, out=None):
+def mean(a, axis=None, dtype=None, out=None, skipna=False, keepdims=False):
     """
     Compute the arithmetic mean along the specified axis.
 
@@ -2321,6 +2419,13 @@ def mean(a, axis=None, dtype=None, out=None):
         is ``None``; if provided, it must have the same shape as the
         expected output, but the type will be cast if necessary.
         See `doc.ufuncs` for details.
+    skipna : bool, optional
+        If this is set to True, skips any NA values during calculation
+        instead of propagating them.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
@@ -2367,14 +2472,19 @@ def mean(a, axis=None, dtype=None, out=None):
     0.55000000074505806
 
     """
-    try:
-        mean = a.mean
-    except AttributeError:
-        return _wrapit(a, 'mean', axis, dtype, out)
-    return mean(axis, dtype, out)
+    if not (type(a) is mu.ndarray):
+        try:
+            mean = a.mean
+            return mean(axis=axis, dtype=dtype, out=out)
+        except AttributeError:
+            pass
+
+    return _methods._mean(a, axis=axis, dtype=dtype,
+                            out=out, skipna=skipna, keepdims=keepdims)
 
 
-def std(a, axis=None, dtype=None, out=None, ddof=0):
+def std(a, axis=None, dtype=None, out=None, ddof=0,
+                            skipna=False, keepdims=False):
     """
     Compute the standard deviation along the specified axis.
 
@@ -2401,6 +2511,13 @@ def std(a, axis=None, dtype=None, out=None, ddof=0):
         Means Delta Degrees of Freedom.  The divisor used in calculations
         is ``N - ddof``, where ``N`` represents the number of elements.
         By default `ddof` is zero.
+    skipna : bool, optional
+        If this is set to True, skips any NA values during calculation
+        instead of propagating them.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
@@ -2418,14 +2535,15 @@ def std(a, axis=None, dtype=None, out=None, ddof=0):
     The standard deviation is the square root of the average of the squared
     deviations from the mean, i.e., ``std = sqrt(mean(abs(x - x.mean())**2))``.
 
-    The average squared deviation is normally calculated as ``x.sum() / N``, where
-    ``N = len(x)``.  If, however, `ddof` is specified, the divisor ``N - ddof``
-    is used instead. In standard statistical practice, ``ddof=1`` provides an
-    unbiased estimator of the variance of the infinite population. ``ddof=0``
-    provides a maximum likelihood estimate of the variance for normally
-    distributed variables. The standard deviation computed in this function
-    is the square root of the estimated variance, so even with ``ddof=1``, it
-    will not be an unbiased estimate of the standard deviation per se.
+    The average squared deviation is normally calculated as
+    ``x.sum() / N``, where ``N = len(x)``.  If, however, `ddof` is specified,
+    the divisor ``N - ddof`` is used instead. In standard statistical
+    practice, ``ddof=1`` provides an unbiased estimator of the variance
+    of the infinite population. ``ddof=0`` provides a maximum likelihood
+    estimate of the variance for normally distributed variables. The
+    standard deviation computed in this function is the square root of
+    the estimated variance, so even with ``ddof=1``, it will not be an
+    unbiased estimate of the standard deviation per se.
 
     Note that, for complex numbers, `std` takes the absolute
     value before squaring, so that the result is always real and nonnegative.
@@ -2460,14 +2578,18 @@ def std(a, axis=None, dtype=None, out=None, ddof=0):
     0.44999999925552653
 
     """
-    try:
-        std = a.std
-    except AttributeError:
-        return _wrapit(a, 'std', axis, dtype, out, ddof)
-    return std(axis, dtype, out, ddof)
+    if not (type(a) is mu.ndarray):
+        try:
+            std = a.std
+            return std(axis=axis, dtype=dtype, out=out, ddof=ddof)
+        except AttributeError:
+            pass
 
+    return _methods._std(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
+                                skipna=skipna, keepdims=keepdims)
 
-def var(a, axis=None, dtype=None, out=None, ddof=0):
+def var(a, axis=None, dtype=None, out=None, ddof=0,
+                            skipna=False, keepdims=False):
     """
     Compute the variance along the specified axis.
 
@@ -2495,6 +2617,13 @@ def var(a, axis=None, dtype=None, out=None, ddof=0):
         "Delta Degrees of Freedom": the divisor used in the calculation is
         ``N - ddof``, where ``N`` represents the number of elements. By
         default `ddof` is zero.
+    skipna : bool, optional
+        If this is set to True, skips any NA values during calculation
+        instead of propagating them.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the original `a`.
 
     Returns
     -------
@@ -2534,9 +2663,9 @@ def var(a, axis=None, dtype=None, out=None, ddof=0):
     >>> a = np.array([[1,2],[3,4]])
     >>> np.var(a)
     1.25
-    >>> np.var(a,0)
+    >>> np.var(a, axis=0)
     array([ 1.,  1.])
-    >>> np.var(a,1)
+    >>> np.var(a, axis=1)
     array([ 0.25,  0.25])
 
     In single precision, var() can be inaccurate:
@@ -2547,7 +2676,7 @@ def var(a, axis=None, dtype=None, out=None, ddof=0):
     >>> np.var(a)
     0.20405951142311096
 
-    Computing the standard deviation in float64 is more accurate:
+    Computing the variance in float64 is more accurate:
 
     >>> np.var(a, dtype=np.float64)
     0.20249999932997387
@@ -2555,8 +2684,13 @@ def var(a, axis=None, dtype=None, out=None, ddof=0):
     0.20250000000000001
 
     """
-    try:
-        var = a.var
-    except AttributeError:
-        return _wrapit(a, 'var', axis, dtype, out, ddof)
-    return var(axis, dtype, out, ddof)
+    if not (type(a) is mu.ndarray):
+        try:
+            var = a.var
+            return var(axis=axis, dtype=dtype, out=out, ddof=ddof)
+        except AttributeError:
+            pass
+
+    return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
+                                skipna=skipna, keepdims=keepdims)
+
diff --git a/numpy/core/function_base.py b/numpy/core/function_base.py
index b2f9dc70c..3e919c761 100644
--- a/numpy/core/function_base.py
+++ b/numpy/core/function_base.py
@@ -3,7 +3,7 @@ __all__ = ['logspace', 'linspace']
 import numeric as _nx
 from numeric import array
 
-def linspace(start, stop, num=50, endpoint=True, retstep=False):
+def linspace(start, stop, num=50, endpoint=True, retstep=False, maskna=False):
     """
     Return evenly spaced numbers over a specified interval.
 
@@ -29,6 +29,8 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False):
     retstep : bool, optional
         If True, return (`samples`, `step`), where `step` is the spacing
         between samples.
+    maskna : bool, optional
+        If this is true, the returned array will have an NA mask.
 
     Returns
     -------
@@ -73,22 +75,22 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False):
     """
     num = int(num)
     if num <= 0:
-        return array([], float)
+        return array([], float, maskna=maskna)
     if endpoint:
         if num == 1:
-            return array([float(start)])
+            return array([float(start)], maskna=maskna)
         step = (stop-start)/float((num-1))
-        y = _nx.arange(0, num) * step + start
+        y = _nx.arange(0, num, maskna=maskna) * step + start
         y[-1] = stop
     else:
         step = (stop-start)/float(num)
-        y = _nx.arange(0, num) * step + start
+        y = _nx.arange(0, num, maskna=maskna) * step + start
     if retstep:
         return y, step
     else:
         return y
 
-def logspace(start,stop,num=50,endpoint=True,base=10.0):
+def logspace(start,stop,num=50,endpoint=True,base=10.0, maskna=False):
     """
     Return numbers spaced evenly on a log scale.
 
@@ -114,6 +116,8 @@ def logspace(start,stop,num=50,endpoint=True,base=10.0):
         The base of the log space. The step size between the elements in
         ``ln(samples) / ln(base)`` (or ``log_base(samples)``) is uniform.
         Default is 10.0.
+    maskna : bool, optional
+        If this is true, the returned array will have an NA mask.
 
     Returns
     -------
@@ -162,6 +166,6 @@ def logspace(start,stop,num=50,endpoint=True,base=10.0):
     >>> plt.show()
 
     """
-    y = linspace(start,stop,num=num,endpoint=endpoint)
+    y = linspace(start,stop,num=num,endpoint=endpoint,maskna=maskna)
     return _nx.power(base,y)
 
diff --git a/numpy/core/include/numpy/halffloat.h b/numpy/core/include/numpy/halffloat.h
index c6bb726bc..f9f5b1fd0 100644
--- a/numpy/core/include/numpy/halffloat.h
+++ b/numpy/core/include/numpy/halffloat.h
@@ -52,6 +52,8 @@ npy_half npy_half_nextafter(npy_half x, npy_half y);
 #define NPY_HALF_NINF   (0xfc00u)
 #define NPY_HALF_NAN    (0x7e00u)
 
+#define NPY_MAX_HALF    (0x7bffu)
+
 /*
  * Bit-level conversions
  */
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index 94c45df88..b4046f940 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -204,6 +204,9 @@ typedef enum {
         NPY_UNSAFE_CASTING=4
 } NPY_CASTING;
 
+/* The default casting to use for typical assignment operations */
+#define NPY_DEFAULT_ASSIGN_CASTING NPY_SAME_KIND_CASTING
+
 typedef enum {
         NPY_CLIP=0,
         NPY_WRAP=1,
@@ -287,6 +290,40 @@ typedef enum {
     NPY_BUSDAY_RAISE
 } NPY_BUSDAY_ROLL;
 
+/*********************************************************************
+ * NumPy functions for dealing with masks, such as in masked iteration
+ *********************************************************************/
+
+typedef npy_uint8 npy_mask;
+#define NPY_MASK NPY_UINT8
+
+/*
+ * Bit 0 of the mask indicates whether a value is exposed
+ * or hidden. This is compatible with a 'where=' boolean
+ * mask, because NumPy booleans are 1 byte, and contain
+ * either the value 0 or 1.
+ */
+static NPY_INLINE npy_bool
+NpyMaskValue_IsExposed(npy_mask mask)
+{
+    return (mask & 0x01) != 0;
+}
+
+/*
+ * Bits 1 through 7 of the mask contain the payload.
+ */
+static NPY_INLINE npy_uint8
+NpyMaskValue_GetPayload(npy_mask mask)
+{
+    return ((npy_uint8)mask) >> 1;
+}
+
+static NPY_INLINE npy_mask
+NpyMaskValue_Create(npy_bool exposed, npy_uint8 payload)
+{
+    return (npy_mask)(exposed != 0) | (npy_mask)(payload << 1);
+}
+
 
 #define NPY_ERR(str) fprintf(stderr, #str); fflush(stderr);
 #define NPY_ERR2(str) fprintf(stderr, str); fflush(stderr);
@@ -512,17 +549,17 @@ typedef struct {
  *These are inherited for global data-type if any data-types in the
  * field have them
  */
-#define NPY_FROM_FIELDS    (NPY_NEEDS_INIT | NPY_LIST_PICKLE |             \
+#define NPY_FROM_FIELDS    (NPY_NEEDS_INIT | NPY_LIST_PICKLE | \
                             NPY_ITEM_REFCOUNT | NPY_NEEDS_PYAPI)
 
-#define NPY_OBJECT_DTYPE_FLAGS (NPY_LIST_PICKLE | NPY_USE_GETITEM |       \
+#define NPY_OBJECT_DTYPE_FLAGS (NPY_LIST_PICKLE | NPY_USE_GETITEM | \
                                 NPY_ITEM_IS_POINTER | NPY_ITEM_REFCOUNT | \
                                 NPY_NEEDS_INIT | NPY_NEEDS_PYAPI)
 
 #define PyDataType_FLAGCHK(dtype, flag) \
         (((dtype)->flags & (flag)) == (flag))
 
-#define PyDataType_REFCHK(dtype)                                          \
+#define PyDataType_REFCHK(dtype) \
         PyDataType_FLAGCHK(dtype, NPY_ITEM_REFCOUNT)
 
 typedef struct _PyArray_Descr {
@@ -582,34 +619,67 @@ typedef struct _arr_descr {
  * #define NPY_NO_DEPRECATED_API.
  */
 /* This struct will be moved to a private header in a future release */
-typedef struct tagPyArrayObject_fieldaccess {
-        PyObject_HEAD
-        char *data;             /* pointer to raw data buffer */
-        int nd;                 /* number of dimensions, also called ndim */
-        npy_intp *dimensions;   /* size in each dimension */
-        npy_intp *strides;      /*
-                                 * bytes to jump to get to the
-                                 * next element in each dimension
-                                 */
-        PyObject *base;         /*
-                                 * This object should be decref'd upon
-                                 * deletion of array
-                                 *
-                                 * For views it points to the original
-                                 * array
-                                 *
-                                 * For creation from buffer object it
-                                 * points to an object that shold be
-                                 * decref'd on deletion
-                                 *
-                                 * For UPDATEIFCOPY flag this is an
-                                 * array to-be-updated upon deletion
-                                 * of this one
-                                 */
-        PyArray_Descr *descr;   /* Pointer to type structure */
-        int flags;              /* Flags describing array -- see below */
-        PyObject *weakreflist;  /* For weakreferences */
-} PyArrayObject_fieldaccess;
+typedef struct tagPyArrayObject_fields {
+    PyObject_HEAD
+    /* Pointer to the raw data buffer */
+    char *data;
+    /* The number of dimensions, also called 'ndim' */
+    int nd;
+    /* The size in each dimension, also called 'shape' */
+    npy_intp *dimensions;
+    /*
+     * Number of bytes to jump to get to the
+     * next element in each dimension
+     */
+    npy_intp *strides;
+    /*
+     * This object is decref'd upon
+     * deletion of array, except in the
+     * case of UPDATEIFCOPY which has
+     * special handling.
+     *
+     * For views it points to the original
+     * array, collapsed so no chains of
+     * views occur.
+     *
+     * For creation from buffer object it
+     * points to an object that should be
+     * decref'd on deletion
+     *
+     * For UPDATEIFCOPY flag this is an
+     * array to-be-updated upon deletion
+     * of this one
+     */
+    PyObject *base;
+    /* Pointer to type structure */
+    PyArray_Descr *descr;
+    /* Flags describing array -- see below */
+    int flags;
+    /* For weak references */
+    PyObject *weakreflist;
+
+    /* New fields added as of NumPy 1.7 */
+
+    /*
+     * Descriptor for the mask dtype.
+     *   If no mask: NULL
+     *   If mask   : bool/uint8/structured dtype of mask dtypes
+     */
+    PyArray_Descr *maskna_dtype;
+    /*
+     * Raw data buffer for mask. If the array has the flag
+     * NPY_ARRAY_OWNMASKNA enabled, it owns this memory and
+     * must call PyArray_free on it when destroyed.
+     */
+    char *maskna_data;
+    /*
+     * Just like dimensions and strides point into the same memory
+     * buffer, we now just make that buffer 3x the nd instead of 2x
+     * and use the same buffer. This is always allocated, regardless
+     * of whether there is an NA mask or not.
+     */
+    npy_intp *maskna_strides;
+} PyArrayObject_fields;
 
 /*
  * To hide the implementation details, we only expose
@@ -624,10 +694,10 @@ typedef struct tagPyArrayObject {
  * Can't put this in npy_deprecated_api.h like the others.
  * PyArrayObject field access is deprecated as of NumPy 1.7.
  */
-typedef PyArrayObject_fieldaccess PyArrayObject;
+typedef PyArrayObject_fields PyArrayObject;
 #endif
 
-#define NPY_SIZEOF_PYARRAYOBJECT (sizeof(PyArrayObject_fieldaccess))
+#define NPY_SIZEOF_PYARRAYOBJECT (sizeof(PyArrayObject_fields))
 
 /* Array Flags Object */
 typedef struct PyArrayFlagsObject {
@@ -680,6 +750,9 @@ typedef int (PyArray_FinalizeFunc)(PyArrayObject *, PyObject *);
 /*
  * Means c-style contiguous (last index varies the fastest). The data
  * elements right after each other.
+ *
+ * This flag may be requested in constructor functions.
+ * This flag may be tested for in PyArray_FLAGS(arr).
  */
 #define NPY_ARRAY_C_CONTIGUOUS    0x0001
 
@@ -687,8 +760,11 @@ typedef int (PyArray_FinalizeFunc)(PyArrayObject *, PyObject *);
  * Set if array is a contiguous Fortran array: the first index varies
  * the fastest in memory (strides array is reverse of C-contiguous
  * array)
+ *
+ * This flag may be requested in constructor functions.
+ * This flag may be tested for in PyArray_FLAGS(arr).
  */
-#define NPY_ARRAY_F_CONTIGUOUS       0x0002
+#define NPY_ARRAY_F_CONTIGUOUS    0x0002
 
 /*
  * Note: all 0-d arrays are C_CONTIGUOUS and F_CONTIGUOUS. If a
@@ -698,52 +774,108 @@ typedef int (PyArray_FinalizeFunc)(PyArrayObject *, PyObject *);
 /*
  * If set, the array owns the data: it will be free'd when the array
  * is deleted.
+ *
+ * This flag may be tested for in PyArray_FLAGS(arr).
  */
-#define NPY_ARRAY_OWNDATA       0x0004
+#define NPY_ARRAY_OWNDATA         0x0004
 
 /*
  * An array never has the next four set; they're only used as parameter
  * flags to the the various FromAny functions
+ *
+ * These flags may be requested in constructor functions.
  */
 
 /* Cause a cast to occur regardless of whether or not it is safe. */
-#define NPY_ARRAY_FORCECAST     0x0010
+#define NPY_ARRAY_FORCECAST       0x0010
 
 /*
  * Always copy the array. Returned arrays are always CONTIGUOUS,
  * ALIGNED, and WRITEABLE.
+ *
+ * This flag may be requested in constructor functions.
  */
-#define NPY_ARRAY_ENSURECOPY    0x0020
+#define NPY_ARRAY_ENSURECOPY      0x0020
 
-/* Make sure the returned array is a base-class ndarray */
-#define NPY_ARRAY_ENSUREARRAY   0x0040
+/*
+ * Make sure the returned array is a base-class ndarray
+ *
+ * This flag may be requested in constructor functions.
+ */
+#define NPY_ARRAY_ENSUREARRAY     0x0040
 
 /*
  * Make sure that the strides are in units of the element size Needed
  * for some operations with record-arrays.
+ *
+ * This flag may be requested in constructor functions.
  */
-#define NPY_ARRAY_ELEMENTSTRIDES 0x0080
+#define NPY_ARRAY_ELEMENTSTRIDES  0x0080
 
 /*
  * Array data is aligned on the appropiate memory address for the type
  * stored according to how the compiler would align things (e.g., an
  * array of integers (4 bytes each) starts on a memory address that's
  * a multiple of 4)
+ *
+ * This flag may be requested in constructor functions.
+ * This flag may be tested for in PyArray_FLAGS(arr).
  */
-#define NPY_ARRAY_ALIGNED       0x0100
+#define NPY_ARRAY_ALIGNED         0x0100
 
-/* Array data has the native endianness */
-#define NPY_ARRAY_NOTSWAPPED    0x0200
+/*
+ * Array data has the native endianness
+ *
+ * This flag may be requested in constructor functions.
+ */
+#define NPY_ARRAY_NOTSWAPPED      0x0200
 
-/* Array data is writeable */
-#define NPY_ARRAY_WRITEABLE     0x0400
+/*
+ * Array data is writeable
+ *
+ * This flag may be requested in constructor functions.
+ * This flag may be tested for in PyArray_FLAGS(arr).
+ */
+#define NPY_ARRAY_WRITEABLE       0x0400
 
 /*
  * If this flag is set, then base contains a pointer to an array of
  * the same size that should be updated with the current contents of
  * this array when this array is deallocated
+ *
+ * This flag may be requested in constructor functions.
+ * This flag may be tested for in PyArray_FLAGS(arr).
  */
-#define NPY_ARRAY_UPDATEIFCOPY  0x1000
+#define NPY_ARRAY_UPDATEIFCOPY    0x1000
+
+/*
+ * If this flag is set, then the array has an NA mask corresponding
+ * to the array data. If the flag NPY_ARRAY_OWNMASKNA is requested
+ * in a constructor, this flag is also implied even if it is not set.
+ *
+ * This flag may be requested in constructor functions.
+ * This flag may be tested for in PyArray_FLAGS(arr).
+ */
+#define NPY_ARRAY_MASKNA          0x2000
+
+/*
+ * If this flag is set, then the array owns the memory for the
+ * missing values NA mask.
+ *
+ * This flag may be requested in constructor functions.
+ * This flag may be tested for in PyArray_FLAGS(arr).
+ */
+#define NPY_ARRAY_OWNMASKNA       0x4000
+
+/*
+ * If this flag is set, then arrays which have an NA mask, or arrays
+ * which have an NA dtype are permitted to pass through. If not,
+ * an array with NA support causes an error to be thrown.
+ *
+ * This flag may be requested in constructor functions.
+ */
+#define NPY_ARRAY_ALLOWNA         0x8000
+
 
 #define NPY_ARRAY_BEHAVED      (NPY_ARRAY_ALIGNED | \
                                 NPY_ARRAY_WRITEABLE)
@@ -772,7 +904,7 @@ typedef int (PyArray_FinalizeFunc)(PyArrayObject *, PyObject *);
                                 NPY_ARRAY_F_CONTIGUOUS | \
                                 NPY_ARRAY_ALIGNED)
 
-/* This flag is for the array interface */
+/* This flag is for the array interface, not PyArrayObject */
 #define NPY_ARR_HAS_DESCR  0x0800
 
 
@@ -821,12 +953,12 @@ typedef int (PyArray_FinalizeFunc)(PyArrayObject *, PyObject *);
 #define NPY_BEGIN_THREADS _save = PyEval_SaveThread();
 #define NPY_END_THREADS   do {if (_save) PyEval_RestoreThread(_save);} while (0);
 
-#define NPY_BEGIN_THREADS_DESCR(dtype)                          \
-        do {if (!(PyDataType_FLAGCHK(dtype, NPY_NEEDS_PYAPI)))      \
+#define NPY_BEGIN_THREADS_DESCR(dtype) \
+        do {if (!(PyDataType_FLAGCHK(dtype, NPY_NEEDS_PYAPI))) \
                 NPY_BEGIN_THREADS;} while (0);
 
-#define NPY_END_THREADS_DESCR(dtype)                            \
-        do {if (!(PyDataType_FLAGCHK(dtype, NPY_NEEDS_PYAPI)))      \
+#define NPY_END_THREADS_DESCR(dtype) \
+        do {if (!(PyDataType_FLAGCHK(dtype, NPY_NEEDS_PYAPI))) \
                 NPY_END_THREADS; } while (0);
 
 #define NPY_ALLOW_C_API_DEF  PyGILState_STATE __save__;
@@ -846,9 +978,20 @@ typedef int (PyArray_FinalizeFunc)(PyArrayObject *, PyObject *);
 #endif
 
 /*****************************
- * New iterator object
+ * NA object, added in 1.7
  *****************************/
 
+/* Direct access to the fields of the NA object is just internal to NumPy. */
+typedef struct tagNpyNA {
+        PyObject_HEAD
+} NpyNA;
+
+#define NpyNA_Check(op) PyObject_TypeCheck(op, &NpyNA_Type)
+
+/**********************************
+ * The nditer object, added in 1.6
+ **********************************/
+
 /* The actual structure of the iterator is an internal detail */
 typedef struct NpyIter_InternalOnly NpyIter;
 
@@ -916,6 +1059,10 @@ typedef void (NpyIter_GetMultiIndexFunc)(NpyIter *iter,
 #define NPY_ITER_WRITEMASKED                0x10000000
 /* This array is the mask for all WRITEMASKED operands */
 #define NPY_ITER_ARRAYMASK                  0x20000000
+/* Split this operand up into data and an NA mask */
+#define NPY_ITER_USE_MASKNA                 0x40000000
+/* Iterate over the data, even if it has an NA mask and without USE_MASKNA */
+#define NPY_ITER_IGNORE_MASKNA              0x80000000
 
 #define NPY_ITER_GLOBAL_FLAGS               0x0000ffff
 #define NPY_ITER_PER_OP_FLAGS               0xffff0000
@@ -1278,8 +1425,6 @@ PyArrayNeighborhoodIter_Next2D(PyArrayNeighborhoodIterObject* iter);
 #define PyArray_FORTRAN_IF(m) ((PyArray_CHKFLAGS(m, NPY_ARRAY_F_CONTIGUOUS) ? \
                                NPY_ARRAY_F_CONTIGUOUS : 0))
 
-#define FORTRAN_IF PyArray_FORTRAN_IF
-
 #ifdef NPY_NO_DEPRECATED_API
 /*
  * Changing access macros into functions, to allow for future hiding
@@ -1291,67 +1436,67 @@ PyArrayNeighborhoodIter_Next2D(PyArrayNeighborhoodIterObject* iter);
 static NPY_INLINE int
 PyArray_NDIM(PyArrayObject *arr)
 {
-    return ((PyArrayObject_fieldaccess *)arr)->nd;
+    return ((PyArrayObject_fields *)arr)->nd;
 }
 
 static NPY_INLINE char *
 PyArray_DATA(PyArrayObject *arr)
 {
-    return ((PyArrayObject_fieldaccess *)arr)->data;
+    return ((PyArrayObject_fields *)arr)->data;
 }
 
 static NPY_INLINE npy_intp *
 PyArray_DIMS(PyArrayObject *arr)
 {
-    return ((PyArrayObject_fieldaccess *)arr)->dimensions;
+    return ((PyArrayObject_fields *)arr)->dimensions;
 }
 
 static NPY_INLINE npy_intp *
 PyArray_STRIDES(PyArrayObject *arr)
 {
-    return ((PyArrayObject_fieldaccess *)arr)->strides;
+    return ((PyArrayObject_fields *)arr)->strides;
 }
 
 static NPY_INLINE npy_intp
 PyArray_DIM(PyArrayObject *arr, int idim)
 {
-    return ((PyArrayObject_fieldaccess *)arr)->dimensions[idim];
+    return ((PyArrayObject_fields *)arr)->dimensions[idim];
 }
 
 static NPY_INLINE npy_intp
 PyArray_STRIDE(PyArrayObject *arr, int istride)
 {
-    return ((PyArrayObject_fieldaccess *)arr)->strides[istride];
+    return ((PyArrayObject_fields *)arr)->strides[istride];
 }
 
 static NPY_INLINE PyObject *
 PyArray_BASE(PyArrayObject *arr)
 {
-    return ((PyArrayObject_fieldaccess *)arr)->base;
+    return ((PyArrayObject_fields *)arr)->base;
 }
 
 static NPY_INLINE PyArray_Descr *
 PyArray_DESCR(PyArrayObject *arr)
 {
-    return ((PyArrayObject_fieldaccess *)arr)->descr;
+    return ((PyArrayObject_fields *)arr)->descr;
 }
 
 static NPY_INLINE int
 PyArray_FLAGS(PyArrayObject *arr)
 {
-    return ((PyArrayObject_fieldaccess *)arr)->flags;
+    return ((PyArrayObject_fields *)arr)->flags;
 }
 
 static NPY_INLINE npy_intp
 PyArray_ITEMSIZE(PyArrayObject *arr)
 {
-    return ((PyArrayObject_fieldaccess *)arr)->descr->elsize;
+    return ((PyArrayObject_fields *)arr)->descr->elsize;
 }
 
 static NPY_INLINE int
 PyArray_TYPE(PyArrayObject *arr)
 {
-    return ((PyArrayObject_fieldaccess *)arr)->descr->type_num;
+    return ((PyArrayObject_fields *)arr)->descr->type_num;
 }
 
 static NPY_INLINE int
@@ -1363,18 +1508,15 @@ PyArray_CHKFLAGS(PyArrayObject *arr, int flags)
 static NPY_INLINE PyObject *
 PyArray_GETITEM(PyArrayObject *arr, char *itemptr)
 {
-    return ((PyArrayObject_fieldaccess *)arr)->descr->f->getitem(
-                                                itemptr,
-                                                arr);
+    return ((PyArrayObject_fields *)arr)->descr->f->getitem(
+                                                        itemptr, arr);
 }
 
 static NPY_INLINE int
 PyArray_SETITEM(PyArrayObject *arr, char *itemptr, PyObject *v)
 {
-    return ((PyArrayObject_fieldaccess *)arr)->descr->f->setitem(
-                                                v,
-                                                itemptr,
-                                                arr);
+    return ((PyArrayObject_fields *)arr)->descr->f->setitem(
+                                                        v, itemptr, arr);
 }
 
 /* Same as PyArray_DATA */
@@ -1382,23 +1524,23 @@ PyArray_SETITEM(PyArrayObject *arr, char *itemptr, PyObject *v)
 
 #else
 
-/* Macros are deprecated as of NumPy 1.7. */
-#define PyArray_NDIM(obj) (((PyArrayObject_fieldaccess *)(obj))->nd)
-#define PyArray_BYTES(obj) ((char *)(((PyArrayObject_fieldaccess *)(obj))->data))
-#define PyArray_DATA(obj) ((void *)(((PyArrayObject_fieldaccess *)(obj))->data))
-#define PyArray_DIMS(obj) (((PyArrayObject_fieldaccess *)(obj))->dimensions)
-#define PyArray_STRIDES(obj) (((PyArrayObject_fieldaccess *)(obj))->strides)
+/* These macros are deprecated as of NumPy 1.7. */
+#define PyArray_NDIM(obj) (((PyArrayObject_fields *)(obj))->nd)
+#define PyArray_BYTES(obj) ((char *)(((PyArrayObject_fields *)(obj))->data))
+#define PyArray_DATA(obj) ((void *)(((PyArrayObject_fields *)(obj))->data))
+#define PyArray_DIMS(obj) (((PyArrayObject_fields *)(obj))->dimensions)
+#define PyArray_STRIDES(obj) (((PyArrayObject_fields *)(obj))->strides)
 #define PyArray_DIM(obj,n) (PyArray_DIMS(obj)[n])
 #define PyArray_STRIDE(obj,n) (PyArray_STRIDES(obj)[n])
-#define PyArray_BASE(obj) (((PyArrayObject_fieldaccess *)(obj))->base)
-#define PyArray_DESCR(obj) (((PyArrayObject_fieldaccess *)(obj))->descr)
-#define PyArray_FLAGS(obj) (((PyArrayObject_fieldaccess *)(obj))->flags)
+#define PyArray_BASE(obj) (((PyArrayObject_fields *)(obj))->base)
+#define PyArray_DESCR(obj) (((PyArrayObject_fields *)(obj))->descr)
+#define PyArray_FLAGS(obj) (((PyArrayObject_fields *)(obj))->flags)
 #define PyArray_CHKFLAGS(m, FLAGS) \
-        ((((PyArrayObject_fieldaccess *)(m))->flags & (FLAGS)) == (FLAGS))
+        ((((PyArrayObject_fields *)(m))->flags & (FLAGS)) == (FLAGS))
 #define PyArray_ITEMSIZE(obj) \
-                    (((PyArrayObject_fieldaccess *)(obj))->descr->elsize)
+                    (((PyArrayObject_fields *)(obj))->descr->elsize)
 #define PyArray_TYPE(obj) \
-                    (((PyArrayObject_fieldaccess *)(obj))->descr->type_num)
+                    (((PyArrayObject_fields *)(obj))->descr->type_num)
 #define PyArray_GETITEM(obj,itemptr) \
         PyArray_DESCR(obj)->f->getitem((char *)(itemptr), \
                                      (PyArrayObject *)(obj))
@@ -1409,6 +1551,18 @@ PyArray_SETITEM(PyArrayObject *arr, char *itemptr, PyObject *v)
                                      (PyArrayObject *)(obj))
 #endif
 
+static NPY_INLINE PyArray_Descr *
+PyArray_DTYPE(PyArrayObject *arr)
+{
+    return ((PyArrayObject_fields *)arr)->descr;
+}
+
+static NPY_INLINE npy_intp *
+PyArray_SHAPE(PyArrayObject *arr)
+{
+    return ((PyArrayObject_fields *)arr)->dimensions;
+}
+
 /*
  * Enables the specified array flags. Does no checking,
  * assumes you know what you're doing.
@@ -1416,7 +1570,7 @@ PyArray_SETITEM(PyArrayObject *arr, char *itemptr, PyObject *v)
 static NPY_INLINE void
 PyArray_ENABLEFLAGS(PyArrayObject *arr, int flags)
 {
-    ((PyArrayObject_fieldaccess *)arr)->flags |= flags;
+    ((PyArrayObject_fields *)arr)->flags |= flags;
 }
 
 /*
@@ -1426,9 +1580,37 @@ PyArray_ENABLEFLAGS(PyArrayObject *arr, int flags)
 static NPY_INLINE void
 PyArray_CLEARFLAGS(PyArrayObject *arr, int flags)
 {
-    ((PyArrayObject_fieldaccess *)arr)->flags &= ~flags;
+    ((PyArrayObject_fields *)arr)->flags &= ~flags;
+}
+
+/* Access to the missing values NA mask, added in 1.7 */
+
+static NPY_INLINE PyArray_Descr *
+PyArray_MASKNA_DTYPE(PyArrayObject *arr)
+{
+    return ((PyArrayObject_fields *)arr)->maskna_dtype;
 }
 
+static NPY_INLINE char *
+PyArray_MASKNA_DATA(PyArrayObject *arr)
+{
+    return ((PyArrayObject_fields *)arr)->maskna_data;
+}
+
+/* For the corresponding DIMS, use PyArray_DIMS(arr) */
+static NPY_INLINE npy_intp *
+PyArray_MASKNA_STRIDES(PyArrayObject *arr)
+{
+    return ((PyArrayObject_fields *)arr)->maskna_strides;
+}
+
+static NPY_INLINE npy_bool
+PyArray_HASMASKNA(PyArrayObject *arr)
+{
+    return (((PyArrayObject_fields *)arr)->flags & NPY_ARRAY_MASKNA) != 0;
+}
+
+
 #define PyTypeNum_ISBOOL(type) ((type) == NPY_BOOL)
 
 #define PyTypeNum_ISUNSIGNED(type) (((type) == NPY_UBYTE) ||   \
@@ -1592,46 +1774,154 @@ struct NpyAuxData_tag {
 #define NPY_AUXDATA_CLONE(auxdata) \
     ((auxdata)->clone(auxdata))
 
-/*********************************************************************
- * NumPy functions for dealing with masks, such as in masked iteration
- *********************************************************************/
+/************************************************************
+ * A struct used by PyArray_CreateSortedStridePerm, new in 1.7.
+ ************************************************************/
 
-typedef npy_uint8 npy_mask;
-#define NPY_MASK NPY_UINT8
+typedef struct {
+    npy_intp perm, stride;
+} npy_stride_sort_item;
+
+/************************************************************
+ * Typedefs used by PyArray_ReduceWrapper, new in 1.7.
+ ************************************************************/
 
 /*
- * Bit 0 of the mask indicates whether a value is exposed
- * or hidden. This is compatible with a 'where=' boolean
- * mask, because NumPy booleans are 1 byte, and contain
- * either the value 0 or 1.
+ * This is a function for assigning a reduction identity to the result,
+ * before doing the reduction computation. If 'preservena' is True,
+ * any masked NA values in 'result' should not be overwritten. The
+ * value in 'data' is passed through from PyArray_ReduceWrapper.
+ *
+ * This function could, for example, simply be a call like
+ *      return PyArray_AssignZero(result, NULL, preservena, NULL);
+ *
+ * It should return -1 on failure, or 0 on success.
  */
-static NPY_INLINE npy_bool
-NpyMask_IsExposed(npy_mask mask)
-{
-    return (mask & 0x01) != 0;
-}
+typedef int (PyArray_AssignReduceIdentityFunc)(PyArrayObject *result,
+                                            int preservena, void *data);
 
 /*
- * Bits 1 through 7 of the mask contain the payload.
+ * This is a function for the reduce loop. Both the unmasked and
+ * masked variants have the same prototype, but should behave differently.
+ *
+ * The needs_api parameter indicates whether it's ok to release the GIL during
+ * the loop, such as when the iternext() function never calls
+ * a function which could raise a Python exception.
+ *
+ * The skip_first_count parameter indicates how many elements need to be
+ * skipped based on NpyIter_IsFirstVisit checks. This can only be positive
+ * when the 'assign_identity' parameter was NULL when calling
+ * PyArray_ReduceWrapper.
+ *
+ * The unmasked loop gets two data pointers and two strides, and should
+ * look roughly like this:
+ *  {
+ *      NPY_BEGIN_THREADS_DEF;
+ *      if (!needs_api) {
+ *          NPY_BEGIN_THREADS;
+ *      }
+ *      // This first-visit loop can be skipped if 'assign_identity' was non-NULL
+ *      if (skip_first_count > 0) {
+ *          do {
+ *              char *data0 = dataptr[0], *data1 = dataptr[1];
+ *              npy_intp stride0 = strideptr[0], stride1 = strideptr[1];
+ *              npy_intp count = *countptr;
+ *
+ *              // Skip any first-visit elements
+ *              if (NpyIter_IsFirstVisit(iter, 0)) {
+ *                  if (stride0 == 0) {
+ *                      --count;
+ *                      --skip_first_count;
+ *                      data1 += stride1;
+ *                      //data2 += stride2; // In masked loop
+ *                  }
+ *                  else {
+ *                      skip_first_count -= count;
+ *                      count = 0;
+ *                  }
+ *              }
+ *
+ *              while (count--) {
+ *                  *(result_t *)data0 = my_reduce_op(*(result_t *)data0,
+ *                                                    *(operand_t *)data1);
+ *                  data0 += stride0;
+ *                  data1 += stride1;
+ *              }
+ *
+ *              // Jump to the faster loop when skipping is done
+ *              if (skip_first_count == 0) {
+ *                  if (iternext(iter)) {
+ *                      break;
+ *                  }
+ *                  else {
+ *                      goto finish_loop;
+ *                  }
+ *              }
+ *          } while (iternext(iter));
+ *      }
+ *      do {
+ *          char *data0 = dataptr[0], *data1 = dataptr[1];
+ *          npy_intp stride0 = strideptr[0], stride1 = strideptr[1];
+ *          npy_intp count = *countptr;
+ *
+ *          while (count--) {
+ *              *(result_t *)data0 = my_reduce_op(*(result_t *)data0,
+ *                                                *(operand_t *)data1);
+ *              data0 += stride0;
+ *              data1 += stride1;
+ *          }
+ *      } while (iternext(iter));
+ *  finish_loop:
+ *      if (!needs_api) {
+ *          NPY_END_THREADS;
+ *      }
+ *      return (needs_api && PyErr_Occurred()) ? -1 : 0;
+ *  }
+ *
+ * The masked loop gets three data pointers and three strides, and
+ * looks identical except for the iteration loops which should be
+ * like this:
+ *      do {
+ *          char *data0 = dataptr[0], *data1 = dataptr[1], *data2 = dataptr[2];
+ *          npy_intp stride0 = strideptr[0], stride1 = strideptr[1],
+ *                      stride2 = strideptr[2];
+ *          npy_intp count = *countptr;
+ *
+ *          // Skipping first visits would go here
+ *
+ *          while (count--) {
+ *              if (NpyMaskValue_IsExposed((npy_mask)*data2)) {
+ *                  *(result_t *)data0 = my_reduce_op(*(result_t *)data0,
+ *                                                    *(operand_t *)data1);
+ *              }
+ *              data0 += stride0;
+ *              data1 += stride1;
+ *              data2 += stride2;
+ *          }
+ *
+ *          // Jumping to the faster loop would go here
+ *
+ *      } while (iternext(iter));
+ *
+ * If needs_api is True, this function should call PyErr_Occurred()
+ * to check if an error occurred during processing, and return -1 for
+ * error, 0 for success.
  */
-static NPY_INLINE npy_uint8
-NpyMask_GetPayload(npy_mask mask)
-{
-    return ((npy_uint8)mask) >> 1;
-}
+typedef int (PyArray_ReduceLoopFunc)(NpyIter *iter,
+                                            char **dataptr,
+                                            npy_intp *strideptr,
+                                            npy_intp *countptr,
+                                            NpyIter_IterNextFunc *iternext,
+                                            int needs_api,
+                                            npy_intp skip_first_count,
+                                            void *data);
 
-static NPY_INLINE npy_mask
-NpyMask_Create(npy_bool exposed, npy_uint8 payload)
-{
-    return (npy_mask)(exposed != 0) | (npy_mask)(payload << 1);
-}
-
-/*
+/************************************************************
  * This is the form of the struct that's returned pointed by the
  * PyCObject attribute of an array __array_struct__. See
  * http://numpy.scipy.org/array_interface.shtml for the full
  * documentation.
- */
+ ************************************************************/
 typedef struct {
     int two;              /*
                            * contains the integer 2 as a sanity
diff --git a/numpy/core/include/numpy/npy_deprecated_api.h b/numpy/core/include/numpy/npy_deprecated_api.h
index 413d24d4e..a268f504a 100644
--- a/numpy/core/include/numpy/npy_deprecated_api.h
+++ b/numpy/core/include/numpy/npy_deprecated_api.h
@@ -89,4 +89,10 @@
  */
 #define fortran fortran_
 
+/*
+ * Deprecated as of NumPy 1.7, as it is a namespace-polluting
+ * macro.
+ */
+#define FORTRAN_IF PyArray_FORTRAN_IF
+
 #endif
diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h
index d00fe10ea..dab5fd6e4 100644
--- a/numpy/core/include/numpy/ufuncobject.h
+++ b/numpy/core/include/numpy/ufuncobject.h
@@ -7,25 +7,38 @@
 extern "C" {
 #endif
 
-/* The most generic inner loop for a standard element-wise ufunc */
+/*
+ * The legacy generic inner loop for a standard element-wise or
+ * generalized ufunc.
+ */
 typedef void (*PyUFuncGenericFunction)
             (char **args,
              npy_intp *dimensions,
-             npy_intp *steps,
+             npy_intp *strides,
              void *innerloopdata);
 
 /*
- * The most generic inner loop for a masked standard element-wise ufunc.
- * The mask data and step is at args[narg] and steps[narg], after all
- * the operands.
+ * The most generic one-dimensional inner loop for
+ * a standard element-wise ufunc. This typedef is also
+ * more consistent with the other NumPy function pointer typedefs
+ * than PyUFuncGenericFunction.
  */
-typedef void (*PyUFuncGenericMaskedFunction)
-            (char **args,
-             npy_intp *dimensions,
-             npy_intp *steps,
-             NpyAuxData *innerloopdata);
+typedef void (PyUFunc_StridedInnerLoopFunc)(
+                char **dataptrs, npy_intp *strides,
+                npy_intp count,
+                NpyAuxData *innerloopdata);
 
-/* Forward declaration for the type resolution function */
+/*
+ * The most generic one-dimensional inner loop for
+ * a masked standard element-wise ufunc.
+ */
+typedef void (PyUFunc_MaskedStridedInnerLoopFunc)(
+                char **dataptrs, npy_intp *strides,
+                char *maskptr, npy_intp mask_stride,
+                npy_intp count,
+                NpyAuxData *innerloopdata);
+
+/* Forward declaration for the type resolver and loop selector typedefs */
 struct _tagPyUFuncObject;
 
 /*
@@ -49,10 +62,6 @@ struct _tagPyUFuncObject;
  *                    references to (ufunc->nin + ufunc->nout) new
  *                    dtypes, one for each input and output. These
  *                    dtypes should all be in native-endian format.
- * out_innerloop:     Should be populated with the correct ufunc inner
- *                    loop for the given type.
- * out_innerloopdata: Should be populated with the void* data to
- *                    be passed into the out_innerloop function.
  *
  * Should return 0 on success, -1 on failure (with exception set),
  * or -2 if Py_NotImplemented should be returned.
@@ -62,17 +71,60 @@ typedef int (PyUFunc_TypeResolutionFunc)(
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata);
-typedef int (PyUFunc_TypeResolutionMaskedFunc)(
-                                struct _tagPyUFuncObject *ufunc,
-                                NPY_CASTING casting,
-                                PyArrayObject **operands,
-                                PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericMaskedFunction *out_innerloop,
-                                NpyAuxData **out_innerloopdata);
+                                PyArray_Descr **out_dtypes);
+
+/*
+ * Given an array of DTypes as returned by the PyUFunc_TypeResolutionFunc,
+ * and an array of fixed strides (the array will contain NPY_MAX_INTP for
+ * strides which are not necessarily fixed), returns an inner loop
+ * with associated auxiliary data.
+ *
+ * For backwards compatibility, there is a variant of the inner loop
+ * selection which returns an inner loop irrespective of the strides,
+ * and with a void* static auxiliary data instead of an NpyAuxData *
+ * dynamically allocatable auxiliary data.
+ *
+ * ufunc:             The ufunc object.
+ * dtypes:            An array which has been populated with dtypes,
+ *                    in most cases by the type resolution function
+ *                    for the same ufunc.
+ * fixed_strides:     For each input/output, either the stride that
+ *                    will be used every time the function is called
+ *                    or NPY_MAX_INTP if the stride might change or
+ *                    is not known ahead of time. The loop selection
+ *                    function may use this stride to pick inner loops
+ *                    which are optimized for contiguous or 0-stride
+ *                    cases.
+ * out_innerloop:     Should be populated with the correct ufunc inner
+ *                    loop for the given type.
+ * out_innerloopdata: Should be populated with the void* data to
+ *                    be passed into the out_innerloop function.
+ * out_needs_api:     If the inner loop needs to use the Python API,
+ *                    should set this to 1, otherwise should leave
+ *                    this untouched.
+ */
+typedef int (PyUFunc_LegacyInnerLoopSelectionFunc)(
+                            struct _tagPyUFuncObject *ufunc,
+                            PyArray_Descr **dtypes,
+                            PyUFuncGenericFunction *out_innerloop,
+                            void **out_innerloopdata,
+                            int *out_needs_api);
+typedef int (PyUFunc_InnerLoopSelectionFunc)(
+                            struct _tagPyUFuncObject *ufunc,
+                            PyArray_Descr **dtypes,
+                            npy_intp *fixed_strides,
+                            PyUFunc_StridedInnerLoopFunc **out_innerloop,
+                            NpyAuxData **out_innerloopdata,
+                            int *out_needs_api);
+typedef int (PyUFunc_MaskedInnerLoopSelectionFunc)(
+                            struct _tagPyUFuncObject *ufunc,
+                            PyArray_Descr **dtypes,
+                            PyArray_Descr *mask_dtype,
+                            npy_intp *fixed_strides,
+                            npy_intp fixed_mask_stride,
+                            PyUFunc_MaskedStridedInnerLoopFunc **out_innerloop,
+                            NpyAuxData **out_innerloopdata,
+                            int *out_needs_api);
 
 typedef struct _tagPyUFuncObject {
         PyObject_HEAD
@@ -137,17 +189,27 @@ typedef struct _tagPyUFuncObject {
         char *core_signature;
 
         /*
-         * A function which resolves the types and returns an inner loop.
-         * This is used by the regular ufunc, the reduction operations
-         * have a different set of rules.
+         * A function which resolves the types and fills an array
+         * with the dtypes for the inputs and outputs.
+         */
+        PyUFunc_TypeResolutionFunc *type_resolver;
+        /*
+         * A function which returns an inner loop written for
+         * NumPy 1.6 and earlier ufuncs. This is for backwards
+         * compatibility, and may be NULL if inner_loop_selector
+         * is specified.
+         */
+        PyUFunc_LegacyInnerLoopSelectionFunc *legacy_inner_loop_selector;
+        /*
+         * A function which returns an inner loop for the new mechanism
+         * in NumPy 1.7 and later. If provided, this is used, otherwise
+         * if NULL the legacy_inner_loop_selector is used instead.
          */
-        PyUFunc_TypeResolutionFunc *type_resolution_function;
+        PyUFunc_InnerLoopSelectionFunc *inner_loop_selector;
         /*
-         * A function which resolves the types and returns an inner loop.
-         * This is used by the regular ufunc when it requires using
-         * a mask to select which elements to compute.
+         * A function which returns a masked inner loop for the ufunc.
          */
-        PyUFunc_TypeResolutionMaskedFunc *type_resolution_masked_function;
+        PyUFunc_MaskedInnerLoopSelectionFunc *masked_inner_loop_selector;
 } PyUFuncObject;
 
 #include "arrayobject.h"
@@ -200,9 +262,26 @@ typedef struct _tagPyUFuncObject {
 #define NPY_LOOP_END_THREADS
 #endif
 
+/*
+ * UFunc has unit of 1, and the order of operations can be reordered
+ * This case allows reduction with multiple axes at once.
+ */
 #define PyUFunc_One 1
+/*
+ * UFunc has unit of 0, and the order of operations can be reordered
+ * This case allows reduction with multiple axes at once.
+ */
 #define PyUFunc_Zero 0
+/*
+ * UFunc has no unit, and the order of operations cannot be reordered.
+ * This case does not allow reduction with multiple axes at once.
+ */
 #define PyUFunc_None -1
+/*
+ * UFunc has no unit, and the order of operations can be reordered
+ * This case allows reduction with multiple axes at once.
+ */
+#define PyUFunc_ReorderableNone -2
 
 #define UFUNC_REDUCE 0
 #define UFUNC_ACCUMULATE 1
@@ -231,12 +310,12 @@ typedef struct _loop1d_info {
 
 #define UFUNC_PYVALS_NAME "UFUNC_PYVALS"
 
-#define UFUNC_CHECK_ERROR(arg)                                          \
-        do {if ((((arg)->obj & UFUNC_OBJ_NEEDS_API) && PyErr_Occurred()) ||                         \
-            ((arg)->errormask &&                                        \
-             PyUFunc_checkfperr((arg)->errormask,                       \
-                                (arg)->errobj,                          \
-                                &(arg)->first)))                        \
+#define UFUNC_CHECK_ERROR(arg) \
+        do {if ((((arg)->obj & UFUNC_OBJ_NEEDS_API) && PyErr_Occurred()) || \
+            ((arg)->errormask && \
+             PyUFunc_checkfperr((arg)->errormask, \
+                                (arg)->errobj, \
+                                &(arg)->first))) \
                 goto fail;} while (0)
 
 /* This code checks the IEEE status flags in a platform-dependent way */
@@ -251,12 +330,12 @@ typedef struct _loop1d_info {
 
 #include <machine/fpu.h>
 
-#define UFUNC_CHECK_STATUS(ret) {               \
-        unsigned long fpstatus;                 \
-                                                \
-        fpstatus = ieee_get_fp_control();                               \
+#define UFUNC_CHECK_STATUS(ret) { \
+        unsigned long fpstatus; \
+         \
+        fpstatus = ieee_get_fp_control(); \
         /* clear status bits as well as disable exception mode if on */ \
-        ieee_set_fp_control( 0 );                                       \
+        ieee_set_fp_control( 0 ); \
         ret = ((IEEE_STATUS_DZE & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \
                 | ((IEEE_STATUS_OVF & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \
                 | ((IEEE_STATUS_UNF & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \
@@ -273,13 +352,13 @@ typedef struct _loop1d_info {
 #define UFUNC_NOFPE _control87(MCW_EM, MCW_EM);
 #endif
 
-#define UFUNC_CHECK_STATUS(ret) {                \
-        int fpstatus = (int) _clearfp();                        \
-                                                                        \
+#define UFUNC_CHECK_STATUS(ret) { \
+        int fpstatus = (int) _clearfp(); \
+         \
         ret = ((SW_ZERODIVIDE & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \
-                | ((SW_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0)   \
+                | ((SW_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \
                 | ((SW_UNDERFLOW & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \
-                | ((SW_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0);    \
+                | ((SW_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0); \
         }
 
 /* Solaris --------------------------------------------------------*/
@@ -290,15 +369,15 @@ typedef struct _loop1d_info {
       defined(__NetBSD__)
 #include <ieeefp.h>
 
-#define UFUNC_CHECK_STATUS(ret) {                               \
-        int fpstatus;                                           \
-                                                                \
-        fpstatus = (int) fpgetsticky();                                 \
-        ret = ((FP_X_DZ  & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0)      \
-                | ((FP_X_OFL & fpstatus) ? UFUNC_FPE_OVERFLOW : 0)      \
-                | ((FP_X_UFL & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0)     \
-                | ((FP_X_INV & fpstatus) ? UFUNC_FPE_INVALID : 0);      \
-        (void) fpsetsticky(0);                                          \
+#define UFUNC_CHECK_STATUS(ret) { \
+        int fpstatus; \
+         \
+        fpstatus = (int) fpgetsticky(); \
+        ret = ((FP_X_DZ  & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \
+                | ((FP_X_OFL & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \
+                | ((FP_X_UFL & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \
+                | ((FP_X_INV & fpstatus) ? UFUNC_FPE_INVALID : 0); \
+        (void) fpsetsticky(0); \
         }
 
 #elif defined(__GLIBC__) || defined(__APPLE__) || \
@@ -312,15 +391,15 @@ typedef struct _loop1d_info {
 #include "fenv/fenv.c"
 #endif
 
-#define UFUNC_CHECK_STATUS(ret) {                                       \
-        int fpstatus = (int) fetestexcept(FE_DIVBYZERO | FE_OVERFLOW |  \
-                                          FE_UNDERFLOW | FE_INVALID);   \
+#define UFUNC_CHECK_STATUS(ret) { \
+        int fpstatus = (int) fetestexcept(FE_DIVBYZERO | FE_OVERFLOW | \
+                                          FE_UNDERFLOW | FE_INVALID); \
         ret = ((FE_DIVBYZERO  & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \
                 | ((FE_OVERFLOW   & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \
                 | ((FE_UNDERFLOW  & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \
                 | ((FE_INVALID    & fpstatus) ? UFUNC_FPE_INVALID : 0); \
-        (void) feclearexcept(FE_DIVBYZERO | FE_OVERFLOW |               \
-                             FE_UNDERFLOW | FE_INVALID);                \
+        (void) feclearexcept(FE_DIVBYZERO | FE_OVERFLOW | \
+                             FE_UNDERFLOW | FE_INVALID); \
 }
 
 #elif defined(_AIX)
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index f74b2548c..ba8fc1f52 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -1,11 +1,11 @@
 __all__ = ['newaxis', 'ndarray', 'flatiter', 'nditer', 'nested_iters', 'ufunc',
-           'arange', 'array', 'zeros', 'count_nonzero', 'empty', 'broadcast',
-           'dtype', 'fromstring', 'fromfile', 'frombuffer',
-           'int_asbuffer', 'where', 'argwhere', 'copyto',
+           'arange', 'array', 'zeros', 'count_nonzero', 'count_reduce_items',
+           'empty', 'broadcast', 'dtype', 'fromstring', 'fromfile',
+           'frombuffer', 'int_asbuffer', 'where', 'argwhere', 'copyto',
            'concatenate', 'fastCopyAndTranspose', 'lexsort', 'set_numeric_ops',
            'can_cast', 'promote_types', 'min_scalar_type', 'result_type',
            'asarray', 'asanyarray', 'ascontiguousarray', 'asfortranarray',
-           'isfortran', 'empty_like', 'zeros_like',
+           'isfortran', 'isna', 'empty_like', 'zeros_like', 'ones_like',
            'correlate', 'convolve', 'inner', 'dot', 'einsum', 'outer', 'vdot',
            'alterdot', 'restoredot', 'roll', 'rollaxis', 'cross', 'tensordot',
            'array2string', 'get_printoptions', 'set_printoptions',
@@ -62,8 +62,7 @@ copyto = multiarray.copyto
 ufunc = type(sin)
 
 
-# originally from Fernando Perez's IPython
-def zeros_like(a, dtype=None, order='K', subok=True):
+def zeros_like(a, dtype=None, order='K', subok=True, maskna=False):
     """
     Return an array of zeros with the same shape and type as a given array.
 
@@ -81,6 +80,8 @@ def zeros_like(a, dtype=None, order='K', subok=True):
         'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous,
         'C' otherwise. 'K' means match the layout of `a` as closely
         as possible.
+    maskna : boolean
+        If this is true, the returned array will have an NA mask.
 
     Returns
     -------
@@ -113,11 +114,97 @@ def zeros_like(a, dtype=None, order='K', subok=True):
     array([ 0.,  0.,  0.])
 
     """
-    res = empty_like(a, dtype=dtype, order=order, subok=subok)
+    res = empty_like(a, dtype=dtype, order=order, subok=subok, maskna=maskna)
     multiarray.copyto(res, 0, casting='unsafe')
     return res
 
-# end Fernando's utilities
+def ones(shape, dtype=None, order='C', maskna=False):
+    """
+    Return a new array of given shape and type, filled with ones.
+
+    Please refer to the documentation for `zeros` for further details.
+
+    See Also
+    --------
+    zeros, ones_like
+
+    Examples
+    --------
+    >>> np.ones(5)
+    array([ 1.,  1.,  1.,  1.,  1.])
+
+    >>> np.ones((5,), dtype=np.int)
+    array([1, 1, 1, 1, 1])
+
+    >>> np.ones((2, 1))
+    array([[ 1.],
+           [ 1.]])
+
+    >>> s = (2,2)
+    >>> np.ones(s)
+    array([[ 1.,  1.],
+           [ 1.,  1.]])
+
+    """
+    a = empty(shape, dtype, order, maskna)
+    multiarray.copyto(a, 1, casting='unsafe')
+    return a
+
+def ones_like(a, dtype=None, order='K', subok=True, maskna=False):
+    """
+    Return an array of ones with the same shape and type as a given array.
+
+    With default parameters, is equivalent to ``a.copy().fill(1)``.
+
+    Parameters
+    ----------
+    a : array_like
+        The shape and data-type of `a` define these same attributes of
+        the returned array.
+    dtype : data-type, optional
+        Overrides the data type of the result.
+    order : {'C', 'F', 'A', or 'K'}, optional
+        Overrides the memory layout of the result. 'C' means C-order,
+        'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous,
+        'C' otherwise. 'K' means match the layout of `a` as closely
+        as possible.
+    maskna : boolean
+        If this is true, the returned array will have an NA mask.
+
+    Returns
+    -------
+    out : ndarray
+        Array of ones with the same shape and type as `a`.
+
+    See Also
+    --------
+    zeros_like : Return an array of zeros with shape and type of input.
+    empty_like : Return an empty array with shape and type of input.
+    zeros : Return a new array setting values to zero.
+    ones : Return a new array setting values to one.
+    empty : Return a new uninitialized array.
+
+    Examples
+    --------
+    >>> x = np.arange(6)
+    >>> x = x.reshape((2, 3))
+    >>> x
+    array([[0, 1, 2],
+           [3, 4, 5]])
+    >>> np.ones_like(x)
+    array([[1, 1, 1],
+           [1, 1, 1]])
+
+    >>> y = np.arange(3, dtype=np.float)
+    >>> y
+    array([ 0.,  1.,  2.])
+    >>> np.ones_like(y)
+    array([ 1.,  1.,  1.])
+
+    """
+    res = empty_like(a, dtype=dtype, order=order, subok=subok, maskna=maskna)
+    multiarray.copyto(res, 1, casting='unsafe')
+    return res
 
 
 def extend_all(module):
@@ -142,6 +229,7 @@ arange = multiarray.arange
 array = multiarray.array
 zeros = multiarray.zeros
 count_nonzero = multiarray.count_nonzero
+count_reduce_items = multiarray.count_reduce_items
 empty = multiarray.empty
 empty_like = multiarray.empty_like
 fromstring = multiarray.fromstring
@@ -164,8 +252,9 @@ lexsort = multiarray.lexsort
 compare_chararrays = multiarray.compare_chararrays
 putmask = multiarray.putmask
 einsum = multiarray.einsum
+isna = multiarray.isna
 
-def asarray(a, dtype=None, order=None):
+def asarray(a, dtype=None, order=None, maskna=None, ownmaskna=False):
     """
     Convert the input to an array.
 
@@ -180,6 +269,13 @@ def asarray(a, dtype=None, order=None):
     order : {'C', 'F'}, optional
         Whether to use row-major ('C') or column-major ('F' for FORTRAN)
         memory representation.  Defaults to 'C'.
+    maskna : bool or None, optional
+        If this is set to True, it forces the array to have an NA mask.
+        If this is set to False, it forces the array to not have an NA
+        mask.
+    ownmaskna : bool, optional
+        If this is set to True, forces the array to have a mask which
+        it owns.
 
     Returns
     -------
@@ -233,9 +329,10 @@ def asarray(a, dtype=None, order=None):
     True
 
     """
-    return array(a, dtype, copy=False, order=order)
+    return array(a, dtype, copy=False, order=order,
+                            maskna=maskna, ownmaskna=ownmaskna)
 
-def asanyarray(a, dtype=None, order=None):
+def asanyarray(a, dtype=None, order=None, maskna=None, ownmaskna=False):
     """
     Convert the input to an ndarray, but pass ndarray subclasses through.
 
@@ -250,6 +347,13 @@ def asanyarray(a, dtype=None, order=None):
     order : {'C', 'F'}, optional
         Whether to use row-major ('C') or column-major ('F') memory
         representation.  Defaults to 'C'.
+    maskna : bool or None, optional
+        If this is set to True, it forces the array to have an NA mask.
+        If this is set to False, it forces the array to not have an NA
+        mask.
+    ownmaskna : bool, optional
+        If this is set to True, forces the array to have a mask which
+        it owns.
 
     Returns
     -------
@@ -285,9 +389,10 @@ def asanyarray(a, dtype=None, order=None):
     True
 
     """
-    return array(a, dtype, copy=False, order=order, subok=True)
+    return array(a, dtype, copy=False, order=order, subok=True,
+                                maskna=maskna, ownmaskna=ownmaskna)
 
-def ascontiguousarray(a, dtype=None):
+def ascontiguousarray(a, dtype=None, maskna=None, ownmaskna=False):
     """
     Return a contiguous array in memory (C order).
 
@@ -297,6 +402,13 @@ def ascontiguousarray(a, dtype=None):
         Input array.
     dtype : str or dtype object, optional
         Data-type of returned array.
+    maskna : bool or None, optional
+        If this is set to True, it forces the array to have an NA mask.
+        If this is set to False, it forces the array to not have an NA
+        mask.
+    ownmaskna : bool, optional
+        If this is set to True, forces the array to have a mask which
+        it owns.
 
     Returns
     -------
@@ -321,9 +433,10 @@ def ascontiguousarray(a, dtype=None):
     True
 
     """
-    return array(a, dtype, copy=False, order='C', ndmin=1)
+    return array(a, dtype, copy=False, order='C', ndmin=1,
+                                maskna=maskna, ownmaskna=ownmaskna)
 
-def asfortranarray(a, dtype=None):
+def asfortranarray(a, dtype=None, maskna=None, ownmaskna=False):
     """
     Return an array laid out in Fortran order in memory.
 
@@ -333,6 +446,13 @@ def asfortranarray(a, dtype=None):
         Input array.
     dtype : str or dtype object, optional
         By default, the data-type is inferred from the input data.
+    maskna : bool or None, optional
+        If this is set to True, it forces the array to have an NA mask.
+        If this is set to False, it forces the array to not have an NA
+        mask.
+    ownmaskna : bool, optional
+        If this is set to True, forces the array to have a mask which
+        it owns.
 
     Returns
     -------
@@ -357,7 +477,8 @@ def asfortranarray(a, dtype=None):
     True
 
     """
-    return array(a, dtype, copy=False, order='F', ndmin=1)
+    return array(a, dtype, copy=False, order='F', ndmin=1,
+                                maskna=maskna, ownmaskna=ownmaskna)
 
 def require(a, dtype=None, requirements=None):
     """
@@ -1324,14 +1445,25 @@ def array_repr(arr, max_line_width=None, precision=None, suppress_small=None):
                            ', ', "array(")
     else: # show zero-length shape unless it is (0,)
         lst = "[], shape=%s" % (repr(arr.shape),)
-    typeless = arr.dtype.type in _typelessdata
 
     if arr.__class__ is not ndarray:
         cName= arr.__class__.__name__
     else:
         cName = "array"
-    if typeless and arr.size:
-        return cName + "(%s)" % lst
+
+    skipdtype = (arr.dtype.type in _typelessdata) and arr.size > 0
+
+    if arr.flags.maskna:
+        whichna = isna(arr)
+        # If nothing is NA, explicitly signal the NA-mask
+        if not any(whichna):
+            lst += ", maskna=True"
+        # If everything is NA, can't skip the dtype
+        if skipdtype and all(whichna):
+            skipdtype = False
+
+    if skipdtype:
+        return "%s(%s)" % (cName, lst)
     else:
         typename = arr.dtype.name
         # Quote typename in the output if it is "complex".
@@ -1793,39 +1925,7 @@ def _maketup(descr, val):
         res = [_maketup(fields[name][0],val) for name in dt.names]
         return tuple(res)
 
-def ones(shape, dtype=None, order='C'):
-    """
-    Return a new array of given shape and type, filled with ones.
-
-    Please refer to the documentation for `zeros` for further details.
-
-    See Also
-    --------
-    zeros, ones_like
-
-    Examples
-    --------
-    >>> np.ones(5)
-    array([ 1.,  1.,  1.,  1.,  1.])
-
-    >>> np.ones((5,), dtype=np.int)
-    array([1, 1, 1, 1, 1])
-
-    >>> np.ones((2, 1))
-    array([[ 1.],
-           [ 1.]])
-
-    >>> s = (2,2)
-    >>> np.ones(s)
-    array([[ 1.,  1.],
-           [ 1.,  1.]])
-
-    """
-    a = empty(shape, dtype, order)
-    multiarray.copyto(a, 1, casting='unsafe')
-    return a
-
-def identity(n, dtype=None):
+def identity(n, dtype=None, maskna=False):
     """
     Return the identity array.
 
@@ -1838,6 +1938,8 @@ def identity(n, dtype=None):
         Number of rows (and columns) in `n` x `n` output.
     dtype : data-type, optional
         Data-type of the output.  Defaults to ``float``.
+    maskna : bool, optional
+        If this is true, the returned array will have an NA mask.
 
     Returns
     -------
@@ -1853,8 +1955,8 @@ def identity(n, dtype=None):
            [ 0.,  0.,  1.]])
 
     """
-    a = zeros((n,n), dtype=dtype)
-    a.flat[::n+1] = 1
+    a = zeros((n,n), dtype=dtype, maskna=maskna)
+    a.diagonal()[...] = 1
     return a
 
 def allclose(a, b, rtol=1.e-5, atol=1.e-8):
@@ -1961,7 +2063,7 @@ def array_equal(a1, a2):
         return False
     if a1.shape != a2.shape:
         return False
-    return bool(logical_and.reduce(equal(a1,a2).ravel()))
+    return bool(equal(a1,a2).all())
 
 def array_equiv(a1, a2):
     """
@@ -2003,7 +2105,7 @@ def array_equiv(a1, a2):
     except:
         return False
     try:
-        return bool(logical_and.reduce(equal(a1,a2).ravel()))
+        return bool(equal(a1,a2).all())
     except ValueError:
         return False
 
diff --git a/numpy/core/numerictypes.py b/numpy/core/numerictypes.py
index 017898cf0..6dc82f706 100644
--- a/numpy/core/numerictypes.py
+++ b/numpy/core/numerictypes.py
@@ -86,11 +86,13 @@ __all__ = ['sctypeDict', 'sctypeNA', 'typeDict', 'typeNA', 'sctypes',
            'ScalarType', 'obj2sctype', 'cast', 'nbytes', 'sctype2char',
            'maximum_sctype', 'issctype', 'typecodes', 'find_common_type',
            'issubdtype', 'datetime_data','datetime_as_string',
-           'busday_offset', 'busday_count', 'is_busday', 'busdaycalendar']
+           'busday_offset', 'busday_count', 'is_busday', 'busdaycalendar',
+           'NA', 'NAType']
 
 from numpy.core.multiarray import typeinfo, ndarray, array, \
                       empty, dtype, datetime_data, datetime_as_string, \
-                      busday_offset, busday_count, is_busday, busdaycalendar
+                      busday_offset, busday_count, is_busday, busdaycalendar, \
+                      NA, NAType
 import types as _types
 import sys
 
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 6b5b0df3b..e8f40e806 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -688,6 +688,7 @@ def configuration(parent_package='',top_path=None):
                    join(local_dir, subpath, 'arraytypes.c.src'),
                    join(local_dir, subpath, 'nditer_templ.c.src'),
                    join(local_dir, subpath, 'lowlevel_strided_loops.c.src'),
+                   join(local_dir, subpath, 'boolean_ops.c.src'),
                    join(local_dir, subpath, 'einsum.c.src')]
 
         # numpy.distutils generate .c from .c.src in weird directories, we have
@@ -700,6 +701,7 @@ def configuration(parent_package='',top_path=None):
     multiarray_deps = [
             join('src', 'multiarray', 'arrayobject.h'),
             join('src', 'multiarray', 'arraytypes.h'),
+            join('src', 'multiarray', 'array_assign.h'),
             join('src', 'multiarray', 'buffer.h'),
             join('src', 'multiarray', 'calculation.h'),
             join('src', 'multiarray', 'common.h'),
@@ -718,12 +720,15 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'numpymemoryview.h'),
             join('src', 'multiarray', 'number.h'),
             join('src', 'multiarray', 'numpyos.h'),
+            join('src', 'multiarray', 'reduction.h'),
             join('src', 'multiarray', 'refcount.h'),
             join('src', 'multiarray', 'scalartypes.h'),
             join('src', 'multiarray', 'sequence.h'),
             join('src', 'multiarray', 'shape.h'),
             join('src', 'multiarray', 'ucsnarrow.h'),
             join('src', 'multiarray', 'usertypes.h'),
+            join('src', 'multiarray', 'na_mask.h'),
+            join('src', 'multiarray', 'na_object.h'),
             join('src', 'private', 'lowlevel_strided_loops.h'),
             join('include', 'numpy', 'arrayobject.h'),
             join('include', 'numpy', '_neighborhood_iterator_imp.h'),
@@ -750,6 +755,10 @@ def configuration(parent_package='',top_path=None):
     multiarray_src = [
             join('src', 'multiarray', 'arrayobject.c'),
             join('src', 'multiarray', 'arraytypes.c.src'),
+            join('src', 'multiarray', 'array_assign.c'),
+            join('src', 'multiarray', 'array_assign_scalar.c'),
+            join('src', 'multiarray', 'array_assign_array.c'),
+            join('src', 'multiarray', 'boolean_ops.c.src'),
             join('src', 'multiarray', 'buffer.c'),
             join('src', 'multiarray', 'calculation.c'),
             join('src', 'multiarray', 'common.c'),
@@ -773,6 +782,8 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'mapping.c'),
             join('src', 'multiarray', 'methods.c'),
             join('src', 'multiarray', 'multiarraymodule.c'),
+            join('src', 'multiarray', 'na_mask.c'),
+            join('src', 'multiarray', 'na_object.c'),
             join('src', 'multiarray', 'nditer_templ.c.src'),
             join('src', 'multiarray', 'nditer_api.c'),
             join('src', 'multiarray', 'nditer_constr.c'),
@@ -780,6 +791,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'number.c'),
             join('src', 'multiarray', 'numpymemoryview.c'),
             join('src', 'multiarray', 'numpyos.c'),
+            join('src', 'multiarray', 'reduction.c'),
             join('src', 'multiarray', 'refcount.c'),
             join('src', 'multiarray', 'sequence.c'),
             join('src', 'multiarray', 'shape.c'),
diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py
index 93de19299..8a4c80e27 100644
--- a/numpy/core/shape_base.py
+++ b/numpy/core/shape_base.py
@@ -267,5 +267,10 @@ def hstack(tup):
            [3, 4]])
 
     """
-    return _nx.concatenate(map(atleast_1d,tup),1)
+    arrs = map(atleast_1d,tup)
+    # As a special case, dimension 0 of 1-dimensional arrays is "horizontal"
+    if arrs[0].ndim == 1:
+        return _nx.concatenate(arrs, 0)
+    else:
+        return _nx.concatenate(arrs, 1)
 
diff --git a/numpy/core/src/multiarray/array_assign.c b/numpy/core/src/multiarray/array_assign.c
new file mode 100644
index 000000000..896f32ffd
--- /dev/null
+++ b/numpy/core/src/multiarray/array_assign.c
@@ -0,0 +1,146 @@
+/*
+ * This file implements some helper functions for the array assignment
+ * routines. The actual assignment routines are in array_assign_*.c
+ *
+ * Written by Mark Wiebe (mwwiebe@gmail.com)
+ * Copyright (c) 2011 by Enthought, Inc.
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API
+#define _MULTIARRAYMODULE
+#include <numpy/ndarraytypes.h>
+
+#include "npy_config.h"
+#include "numpy/npy_3kcompat.h"
+
+#include "shape.h"
+
+#include "array_assign.h"
+
+/* Returns 0, or -1 with ValueError set; parameters are in array_assign.h */
+NPY_NO_EXPORT int
+broadcast_strides(int ndim, npy_intp *shape,
+                int strides_ndim, npy_intp *strides_shape, npy_intp *strides,
+                char *strides_name,
+                npy_intp *out_strides)
+{
+    int idim, idim_start = ndim - strides_ndim;
+
+    /* Can't broadcast to fewer dimensions */
+    if (idim_start < 0) {
+        goto broadcast_error;
+    }
+
+    /*
+     * Process from the end to the start, so that 'strides' and 'out_strides'
+     * can point to the same memory.
+     */
+    for (idim = ndim - 1; idim >= idim_start; --idim) {
+        npy_intp strides_shape_value = strides_shape[idim - idim_start];
+        /* A size-one dimension is repeated (stride 0), others must match */
+        if (strides_shape_value == 1) {
+            out_strides[idim] = 0;
+        }
+        else if (strides_shape_value != shape[idim]) {
+            goto broadcast_error;
+        }
+        else {
+            out_strides[idim] = strides[idim - idim_start];
+        }
+    }
+
+    /* New dimensions get a zero stride */
+    for (idim = 0; idim < idim_start; ++idim) {
+        out_strides[idim] = 0;
+    }
+
+    return 0;
+
+broadcast_error: {
+        PyObject *errmsg = PyUString_FromFormat(
+                "could not broadcast %s from shape ", strides_name);
+        PyUString_ConcatAndDel(&errmsg,
+                build_shape_string(strides_ndim, strides_shape));
+        PyUString_ConcatAndDel(&errmsg,
+                PyUString_FromString(" into shape "));
+        PyUString_ConcatAndDel(&errmsg,
+                build_shape_string(ndim, shape));
+        if (errmsg != NULL) {
+            PyErr_SetObject(PyExc_ValueError, errmsg);
+            Py_DECREF(errmsg); /* SetObject took its own reference */
+        }
+        return -1;
+   }
+}
+
+/* Declared and documented in array_assign.h */
+NPY_NO_EXPORT int
+raw_array_is_aligned(int ndim, char *data, npy_intp *strides, int alignment)
+{
+    npy_intp bits;
+    int i;
+
+    /* An alignment of one (or less) places no constraint on the data */
+    if (alignment <= 1) {
+        return 1;
+    }
+    /* OR the address with every stride, then test the bits under the mask */
+    bits = (npy_intp)data;
+    for (i = 0; i < ndim; ++i) {
+        bits |= strides[i];
+    }
+    return ((bits & (alignment - 1)) == 0);
+}
+
+
+/* Computes the half-open byte range [start, end) spanned by the array */
+NPY_NO_EXPORT void
+get_array_memory_extents(PyArrayObject *arr,
+                    npy_uintp *out_start, npy_uintp *out_end)
+{
+    npy_intp *dims = PyArray_DIMS(arr);
+    npy_intp *steps = PyArray_STRIDES(arr);
+    npy_intp i, n = PyArray_NDIM(arr);
+    npy_uintp base = (npy_uintp)PyArray_DATA(arr);
+    /* Lowest and highest element addresses: a closed range for now */
+    npy_uintp lo = base, hi = base;
+
+    for (i = 0; i < n; ++i) {
+        npy_intp extent;
+
+        /* A zero-size array occupies no memory: report an empty range */
+        if (dims[i] == 0) {
+            *out_start = *out_end = base;
+            return;
+        }
+        /* Positive strides raise the top, negative ones lower the bottom */
+        extent = steps[i]*(dims[i]-1);
+        if (steps[i] > 0) {
+            hi += extent;
+        }
+        else if (steps[i] < 0) {
+            lo += extent;
+        }
+    }
+
+    /* One item past the highest element makes the range half-open */
+    *out_start = lo;
+    *out_end = hi + PyArray_DESCR(arr)->elsize;
+}
+
+/* Tests whether the memory extents of two arrays intersect (1) or not (0) */
+NPY_NO_EXPORT int
+arrays_overlap(PyArrayObject *arr1, PyArrayObject *arr2)
+{
+    npy_uintp lo1 = 0, hi1 = 0, lo2 = 0, hi2 = 0;
+
+    get_array_memory_extents(arr1, &lo1, &hi1);
+    get_array_memory_extents(arr2, &lo2, &hi2);
+    /* Half-open ranges are disjoint when one ends at or before the other */
+    return !((hi2 <= lo1) || (hi1 <= lo2));
+}
diff --git a/numpy/core/src/multiarray/array_assign.h b/numpy/core/src/multiarray/array_assign.h
new file mode 100644
index 000000000..c58a63d22
--- /dev/null
+++ b/numpy/core/src/multiarray/array_assign.h
@@ -0,0 +1,96 @@
+#ifndef _NPY_PRIVATE__ARRAY_ASSIGN_H_
+#define _NPY_PRIVATE__ARRAY_ASSIGN_H_
+
+/*
+ * An array assignment function for copying arrays, treating the
+ * arrays as flat according to their respective ordering rules.
+ * This function makes a temporary copy of 'src' if 'src' and
+ * 'dst' overlap, to be able to handle views of the same data with
+ * different strides.
+ *
+ * dst: The destination array.
+ * dst_order: The rule for how 'dst' is to be made flat.
+ * src: The source array.
+ * src_order: The rule for how 'src' is to be made flat.
+ * casting: An exception is raised if the copy violates this
+ *          casting rule.
+ * preservena: If 0, overwrites everything in 'dst', if 1, it
+ *              preserves elements in 'dst' which are NA.
+ * preservewhichna: Must be NULL. When multi-NA support is implemented,
+ *                   this will be an array of flags for 'preservena=True',
+ *                   indicating which NA payload values to preserve.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+/* Not yet implemented
+NPY_NO_EXPORT int
+PyArray_AssignArrayAsFlat(PyArrayObject *dst, NPY_ORDER dst_order,
+                  PyArrayObject *src, NPY_ORDER src_order,
+                  NPY_CASTING casting,
+                  npy_bool preservena, npy_bool *preservewhichna);
+*/
+
+
+
+/******** LOW-LEVEL SCALAR TO ARRAY ASSIGNMENT ********/
+
+/*
+ * Assigns the scalar value to every element of the destination raw array.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+raw_array_assign_scalar(int ndim, npy_intp *shape,
+        PyArray_Descr *dst_dtype, char *dst_data, npy_intp *dst_strides,
+        PyArray_Descr *src_dtype, char *src_data);
+
+/*
+ * Assigns the scalar value to every element of the destination raw array
+ * where the 'wheremask' value is True.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+raw_array_wheremasked_assign_scalar(int ndim, npy_intp *shape,
+        PyArray_Descr *dst_dtype, char *dst_data, npy_intp *dst_strides,
+        PyArray_Descr *src_dtype, char *src_data,
+        PyArray_Descr *wheremask_dtype, char *wheremask_data,
+        npy_intp *wheremask_strides);
+
+/******** LOW-LEVEL ARRAY MANIPULATION HELPERS ********/
+
+/*
+ * Internal detail of how much to buffer during array assignments which
+ * need it. This is for more complex NA masking operations where masks
+ * need to be inverted or combined together.
+ */
+#define NPY_ARRAY_ASSIGN_BUFFERSIZE 8192
+
+/*
+ * Broadcasts strides to match the given dimensions. Can be used,
+ * for instance, to set up a raw iteration.
+ *
+ * 'strides_name' is used to produce an error message if the strides
+ * cannot be broadcast.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+broadcast_strides(int ndim, npy_intp *shape,
+                int strides_ndim, npy_intp *strides_shape, npy_intp *strides,
+                char *strides_name,
+                npy_intp *out_strides);
+
+/*
+ * Checks whether a data pointer + set of strides refers to a raw
+ * array whose data is fully aligned. Returns 1 if so, 0 otherwise.
+ */
+NPY_NO_EXPORT int
+raw_array_is_aligned(int ndim, char *data, npy_intp *strides, int alignment);
+
+/* Returns 1 if the arrays have overlapping data, 0 otherwise */
+NPY_NO_EXPORT int
+arrays_overlap(PyArrayObject *arr1, PyArrayObject *arr2);
+
+
+#endif
diff --git a/numpy/core/src/multiarray/array_assign_array.c b/numpy/core/src/multiarray/array_assign_array.c
new file mode 100644
index 000000000..82a70c0a4
--- /dev/null
+++ b/numpy/core/src/multiarray/array_assign_array.c
@@ -0,0 +1,812 @@
+/*
+ * This file implements assignment from an ndarray to another ndarray.
+ *
+ * Written by Mark Wiebe (mwwiebe@gmail.com)
+ * Copyright (c) 2011 by Enthought, Inc.
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API
+#define _MULTIARRAYMODULE
+#include <numpy/ndarraytypes.h>
+
+#include "npy_config.h"
+#include "numpy/npy_3kcompat.h"
+
+#include "convert_datatype.h"
+#include "methods.h"
+#include "shape.h"
+#include "lowlevel_strided_loops.h"
+#include "na_object.h"
+#include "na_mask.h"
+
+#include "array_assign.h"
+
+/*
+ * Copies 'src' into 'dst', casting from 'src_dtype' to 'dst_dtype'. Both
+ * sets of strides must already have been broadcast to 'shape'.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+raw_array_assign_array(int ndim, npy_intp *shape,
+        PyArray_Descr *dst_dtype, char *dst_data, npy_intp *dst_strides,
+        PyArray_Descr *src_dtype, char *src_data, npy_intp *src_strides)
+{
+    int idim;
+    npy_intp shape_it[NPY_MAXDIMS];
+    npy_intp dst_strides_it[NPY_MAXDIMS];
+    npy_intp src_strides_it[NPY_MAXDIMS];
+    npy_intp coord[NPY_MAXDIMS];
+
+    PyArray_StridedUnaryOp *stransfer = NULL;
+    NpyAuxData *transferdata = NULL;
+    int aligned, needs_api = 0;
+    npy_intp src_itemsize = src_dtype->elsize;
+
+    NPY_BEGIN_THREADS_DEF;
+
+    /* 'aligned' is set only if both 'dst' and 'src' pass the check */
+    aligned = raw_array_is_aligned(ndim,
+                        dst_data, dst_strides, dst_dtype->alignment) &&
+              raw_array_is_aligned(ndim,
+                        src_data, src_strides, src_dtype->alignment);
+
+    /* Use raw iteration with no heap allocation */
+    if (PyArray_PrepareTwoRawArrayIter(
+                    ndim, shape,
+                    dst_data, dst_strides,
+                    src_data, src_strides,
+                    &ndim, shape_it,
+                    &dst_data, dst_strides_it,
+                    &src_data, src_strides_it) < 0) {
+        return -1;
+    }
+
+    /*
+     * Overlap check for the 1D case, which reverses the iteration. Higher
+     * dimensional arrays and opposite strides are copied before getting here.
+     */
+    if (ndim == 1 && src_data < dst_data &&
+                src_data + shape_it[0] * src_strides_it[0] > dst_data) {
+        src_data += (shape_it[0] - 1) * src_strides_it[0];
+        dst_data += (shape_it[0] - 1) * dst_strides_it[0];
+        src_strides_it[0] = -src_strides_it[0];
+        dst_strides_it[0] = -dst_strides_it[0];
+    }
+
+    /* Get the casting function for the (possibly negated) inner strides */
+    if (PyArray_GetDTypeTransferFunction(aligned,
+                        src_strides_it[0], dst_strides_it[0],
+                        src_dtype, dst_dtype,
+                        0,
+                        &stransfer, &transferdata,
+                        &needs_api) != NPY_SUCCEED) {
+        return -1;
+    }
+
+    if (!needs_api) {
+        NPY_BEGIN_THREADS;
+    }
+
+    NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+        /* Transfer one innermost-dimension run of shape_it[0] elements */
+        stransfer(dst_data, dst_strides_it[0], src_data, src_strides_it[0],
+                    shape_it[0], src_itemsize, transferdata);
+    } NPY_RAW_ITER_TWO_NEXT(idim, ndim, coord, shape_it,
+                            dst_data, dst_strides_it,
+                            src_data, src_strides_it);
+
+    if (!needs_api) {
+        NPY_END_THREADS;
+    }
+
+    NPY_AUXDATA_FREE(transferdata);
+
+    return (needs_api && PyErr_Occurred()) ? -1 : 0;
+}
+
+/*
+ * Assigns the array from 'src' to 'dst', wherever the 'wheremask'
+ * value is True. The strides must already have been broadcast.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+raw_array_wheremasked_assign_array(int ndim, npy_intp *shape,
+        PyArray_Descr *dst_dtype, char *dst_data, npy_intp *dst_strides,
+        PyArray_Descr *src_dtype, char *src_data, npy_intp *src_strides,
+        PyArray_Descr *wheremask_dtype, char *wheremask_data,
+        npy_intp *wheremask_strides)
+{
+    int idim;
+    npy_intp shape_it[NPY_MAXDIMS];
+    npy_intp dst_strides_it[NPY_MAXDIMS];
+    npy_intp src_strides_it[NPY_MAXDIMS];
+    npy_intp wheremask_strides_it[NPY_MAXDIMS];
+    npy_intp coord[NPY_MAXDIMS];
+
+    PyArray_MaskedStridedUnaryOp *stransfer = NULL;
+    NpyAuxData *transferdata = NULL;
+    int aligned, needs_api = 0;
+    npy_intp src_itemsize = src_dtype->elsize;
+
+    NPY_BEGIN_THREADS_DEF;
+
+    /* Only 'dst' and 'src' are checked; the mask's alignment is not */
+    aligned = raw_array_is_aligned(ndim,
+                        dst_data, dst_strides, dst_dtype->alignment) &&
+              raw_array_is_aligned(ndim,
+                        src_data, src_strides, src_dtype->alignment);
+
+    /* Use raw iteration with no heap allocation */
+    if (PyArray_PrepareThreeRawArrayIter(
+                    ndim, shape,
+                    dst_data, dst_strides,
+                    src_data, src_strides,
+                    wheremask_data, wheremask_strides,
+                    &ndim, shape_it,
+                    &dst_data, dst_strides_it,
+                    &src_data, src_strides_it,
+                    &wheremask_data, wheremask_strides_it) < 0) {
+        return -1;
+    }
+
+    /*
+     * Overlap check for the 1D case, which reverses the iteration of all
+     * three operands. Higher dimensional arrays are copied before this.
+     */
+    if (ndim == 1 && src_data < dst_data &&
+                src_data + shape_it[0] * src_strides_it[0] > dst_data) {
+        src_data += (shape_it[0] - 1) * src_strides_it[0];
+        dst_data += (shape_it[0] - 1) * dst_strides_it[0];
+        wheremask_data += (shape_it[0] - 1) * wheremask_strides_it[0];
+        src_strides_it[0] = -src_strides_it[0];
+        dst_strides_it[0] = -dst_strides_it[0];
+        wheremask_strides_it[0] = -wheremask_strides_it[0];
+    }
+
+    /* Get the masked casting function for the prepared inner strides */
+    if (PyArray_GetMaskedDTypeTransferFunction(aligned,
+                        src_strides_it[0],
+                        dst_strides_it[0],
+                        wheremask_strides_it[0],
+                        src_dtype, dst_dtype, wheremask_dtype,
+                        0,
+                        &stransfer, &transferdata,
+                        &needs_api) != NPY_SUCCEED) {
+        return -1;
+    }
+
+    if (!needs_api) {
+        NPY_BEGIN_THREADS;
+    }
+
+    NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+        /* Transfer one innermost-dimension run, gated by 'wheremask' */
+        stransfer(dst_data, dst_strides_it[0], src_data, src_strides_it[0],
+                    (npy_mask *)wheremask_data, wheremask_strides_it[0],
+                    shape_it[0], src_itemsize, transferdata);
+    } NPY_RAW_ITER_THREE_NEXT(idim, ndim, coord, shape_it,
+                            dst_data, dst_strides_it,
+                            src_data, src_strides_it,
+                            wheremask_data, wheremask_strides_it);
+
+    if (!needs_api) {
+        NPY_END_THREADS;
+    }
+
+    NPY_AUXDATA_FREE(transferdata);
+
+    return (needs_api && PyErr_Occurred()) ? -1 : 0;
+}
+
+/*
+ * Assigns the elements of 'src' to 'dst' where the 'wheremask'
+ * is True, except for those which are masked as NA according
+ * to 'maskna'. All strides must already be broadcast to 'shape'.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+raw_array_wheremasked_assign_array_preservena(int ndim, npy_intp *shape,
+        PyArray_Descr *dst_dtype, char *dst_data, npy_intp *dst_strides,
+        PyArray_Descr *src_dtype, char *src_data, npy_intp *src_strides,
+        PyArray_Descr *maskna_dtype, char *maskna_data,
+        npy_intp *maskna_strides,
+        PyArray_Descr *wheremask_dtype, char *wheremask_data,
+        npy_intp *wheremask_strides)
+{
+    int idim;
+    npy_intp shape_it[NPY_MAXDIMS];
+    npy_intp dst_strides_it[NPY_MAXDIMS];
+    npy_intp src_strides_it[NPY_MAXDIMS];
+    npy_intp maskna_strides_it[NPY_MAXDIMS];
+    npy_intp wheremask_strides_it[NPY_MAXDIMS];
+    npy_intp coord[NPY_MAXDIMS];
+
+    PyArray_MaskedStridedUnaryOp *stransfer = NULL;
+    NpyAuxData *transferdata = NULL;
+    int aligned, needs_api = 0;
+    npy_intp src_itemsize = src_dtype->elsize;
+
+    PyArray_StridedBinaryOp *maskand_stransfer = NULL;
+    NpyAuxData *maskand_transferdata = NULL;
+
+    char *maskna_buffer;
+    npy_intp maskna_itemsize;
+
+    NPY_BEGIN_THREADS_DEF;
+
+    /* Only 'dst' and 'src' are checked; the masks' alignment is not */
+    aligned = raw_array_is_aligned(ndim,
+                        dst_data, dst_strides, dst_dtype->alignment) &&
+              raw_array_is_aligned(ndim,
+                        src_data, src_strides, src_dtype->alignment);
+
+    /* Use raw iteration with no heap allocation */
+    if (PyArray_PrepareFourRawArrayIter(
+                    ndim, shape,
+                    dst_data, dst_strides,
+                    src_data, src_strides,
+                    maskna_data, maskna_strides,
+                    wheremask_data, wheremask_strides,
+                    &ndim, shape_it,
+                    &dst_data, dst_strides_it,
+                    &src_data, src_strides_it,
+                    &maskna_data, maskna_strides_it,
+                    &wheremask_data, wheremask_strides_it) < 0) {
+        return -1;
+    }
+
+    /* Allocate a contiguous buffer to hold the combined mask of one chunk */
+    maskna_itemsize = maskna_dtype->elsize;
+    maskna_buffer = PyArray_malloc(NPY_ARRAY_ASSIGN_BUFFERSIZE *
+                                    maskna_itemsize);
+    if (maskna_buffer == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    /*
+     * Overlap check for the 1D case, which reverses the iteration of all
+     * four operands. Higher dimensional arrays are copied before this.
+     */
+    if (ndim == 1 && src_data < dst_data &&
+                src_data + shape_it[0] * src_strides_it[0] > dst_data) {
+        src_data += (shape_it[0] - 1) * src_strides_it[0];
+        dst_data += (shape_it[0] - 1) * dst_strides_it[0];
+        maskna_data += (shape_it[0] - 1) * maskna_strides_it[0];
+        wheremask_data += (shape_it[0] - 1) * wheremask_strides_it[0];
+        src_strides_it[0] = -src_strides_it[0];
+        dst_strides_it[0] = -dst_strides_it[0];
+        maskna_strides_it[0] = -maskna_strides_it[0];
+        wheremask_strides_it[0] = -wheremask_strides_it[0];
+    }
+
+    /* Get the casting function; its mask is the contiguous 'maskna_buffer' */
+    if (PyArray_GetMaskedDTypeTransferFunction(aligned,
+                        src_strides_it[0], dst_strides_it[0], maskna_itemsize,
+                        src_dtype, dst_dtype, maskna_dtype,
+                        0,
+                        &stransfer, &transferdata,
+                        &needs_api) != NPY_SUCCEED) {
+        PyArray_free(maskna_buffer);
+        return -1;
+    }
+
+    /*
+     * Get the function to combine 'maskna' with 'wheremask' (a mask "and",
+     * going by its name). The output has the dtype 'maskna_dtype'.
+     */
+    if (PyArray_GetMaskAndFunction(
+                        maskna_strides_it[0], maskna_dtype, 0,
+                        wheremask_strides_it[0], wheremask_dtype, 0,
+                        &maskand_stransfer, &maskand_transferdata) < 0) {
+        PyArray_free(maskna_buffer);
+        NPY_AUXDATA_FREE(transferdata);
+        return -1;
+    }
+
+    if (!needs_api) {
+        NPY_BEGIN_THREADS;
+    }
+
+    NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+        npy_intp buffered_count, count;
+        char *dst_d, *src_d, *maskna_d, *wheremask_d;
+        /* Process the innermost dimension a buffer size at a time */
+        count = shape_it[0];
+        dst_d = dst_data;
+        src_d = src_data;
+        maskna_d = maskna_data;
+        wheremask_d = wheremask_data;
+        do {
+            buffered_count = count < NPY_ARRAY_ASSIGN_BUFFERSIZE
+                                        ? count
+                                        : NPY_ARRAY_ASSIGN_BUFFERSIZE;
+
+            /* Prepare the mask into the buffer */
+            maskand_stransfer(maskna_buffer, maskna_itemsize,
+                        maskna_d, maskna_strides_it[0],
+                        wheremask_d, wheremask_strides_it[0],
+                        buffered_count, maskand_transferdata);
+
+            /* Transfer the data based on the buffered mask */
+            stransfer(dst_d, dst_strides_it[0], src_d, src_strides_it[0],
+                        (npy_mask *)maskna_buffer, maskna_itemsize,
+                        buffered_count, src_itemsize, transferdata);
+
+            dst_d += buffered_count * dst_strides_it[0];
+            src_d += buffered_count * src_strides_it[0];
+            maskna_d += buffered_count * maskna_strides_it[0];
+            wheremask_d += buffered_count * wheremask_strides_it[0];
+            count -= buffered_count;
+        } while (count > 0);
+    } NPY_RAW_ITER_FOUR_NEXT(idim, ndim, coord, shape_it,
+                            dst_data, dst_strides_it,
+                            src_data, src_strides_it,
+                            maskna_data, maskna_strides_it,
+                            wheremask_data, wheremask_strides_it);
+
+    if (!needs_api) {
+        NPY_END_THREADS;
+    }
+
+    PyArray_free(maskna_buffer);
+    NPY_AUXDATA_FREE(transferdata);
+    NPY_AUXDATA_FREE(maskand_transferdata);
+
+    return (needs_api && PyErr_Occurred()) ? -1 : 0;
+}
+
+/*NUMPY_API
+ *
+ * An array assignment function for copying arrays, broadcasting 'src' into
+ * 'dst'. This function makes a temporary copy of 'src' if 'src' and
+ * 'dst' overlap, to be able to handle views of the same data with
+ * different strides.
+ *
+ * dst: The destination array.
+ * src: The source array.
+ * wheremask: If non-NULL, a boolean mask specifying where to copy.
+ * casting: An exception is raised if the copy violates this
+ *          casting rule.
+ * preservena: If 0, overwrites everything in 'dst', if 1, it
+ *              preserves elements in 'dst' which are NA.
+ * preservewhichna: Must be NULL. When multi-NA support is implemented,
+ *                   this will be an array of flags for 'preservena=True',
+ *                   indicating which NA payload values to preserve.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_AssignArray(PyArrayObject *dst, PyArrayObject *src,
+                    PyArrayObject *wheremask,
+                    NPY_CASTING casting,
+                    npy_bool preservena, npy_bool *preservewhichna)
+{
+    int dst_has_maskna = PyArray_HASMASKNA(dst);
+    int src_has_maskna = PyArray_HASMASKNA(src);
+    int copied_src = 0;
+
+    npy_intp src_strides[NPY_MAXDIMS], src_maskna_strides[NPY_MAXDIMS];
+
+    /* Use array_assign_scalar if 'src' NDIM is 0 */
+    if (PyArray_NDIM(src) == 0) {
+        /* If the array is masked, assign to the NA mask */
+        if (PyArray_HASMASKNA(src)) {
+            NpyNA *na = NpyNA_FromObject((PyObject *)src, 1);
+
+            if (na != NULL) {
+                /* TODO: With multi-NA, preservena must also be followed */
+                int retcode = PyArray_AssignNA(dst, na, wheremask,
+                                            preservena, preservewhichna);
+                Py_DECREF(na);
+                return retcode;
+            }
+        }
+
+        return PyArray_AssignRawScalar(
+                            dst, PyArray_DESCR(src), PyArray_DATA(src),
+                            wheremask, casting, preservena, preservewhichna);
+    }
+
+    /*
+     * Performance fix for expressions like "a[1000:6000] += x".  In this
+     * case, first an in-place add is done, followed by an assignment,
+     * equivalently expressed like this:
+     *
+     *   tmp = a[1000:6000]   # Calls array_subscript_nice in mapping.c
+     *   np.add(tmp, x, tmp)
+     *   a[1000:6000] = tmp   # Calls array_ass_sub in mapping.c
+     *
+     * In the assignment the underlying data type, shape, strides, and
+     * data pointers are identical, but src != dst because they are separately
+     * generated slices.  By detecting this and skipping the redundant
+     * copy of values to themselves, we potentially give a big speed boost.
+     *
+     * Note that we don't call EquivTypes, because usually the exact same
+     * dtype object will appear, and we don't want to slow things down
+     * with a complicated comparison.  The comparisons are ordered to
+     * try and reject this with as little work as possible.
+     */
+    if (PyArray_DATA(src) == PyArray_DATA(dst) &&
+                        PyArray_MASKNA_DATA(src) == PyArray_MASKNA_DATA(dst) &&
+                        PyArray_DESCR(src) == PyArray_DESCR(dst) &&
+                        PyArray_NDIM(src) == PyArray_NDIM(dst) &&
+                        PyArray_CompareLists(PyArray_DIMS(src),
+                                             PyArray_DIMS(dst),
+                                             PyArray_NDIM(src)) &&
+                        PyArray_CompareLists(PyArray_STRIDES(src),
+                                             PyArray_STRIDES(dst),
+                                             PyArray_NDIM(src))) {
+        /*printf("Redundant copy operation detected\n");*/
+        return 0;
+    }
+
+    /* Check that 'dst' is writeable */
+    if (!PyArray_ISWRITEABLE(dst)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "cannot assign to a read-only array");
+        goto fail;
+    }
+
+    /* Check the casting rule */
+    if (!PyArray_CanCastTypeTo(PyArray_DESCR(src),
+                                PyArray_DESCR(dst), casting)) {
+        PyObject *errmsg;
+        errmsg = PyUString_FromString("Cannot cast scalar from ");
+        PyUString_ConcatAndDel(&errmsg,
+                PyObject_Repr((PyObject *)PyArray_DESCR(src)));
+        PyUString_ConcatAndDel(&errmsg,
+                PyUString_FromString(" to "));
+        PyUString_ConcatAndDel(&errmsg,
+                PyObject_Repr((PyObject *)PyArray_DESCR(dst)));
+        PyUString_ConcatAndDel(&errmsg,
+                PyUString_FromFormat(" according to the rule %s",
+                        npy_casting_to_string(casting)));
+        PyErr_SetObject(PyExc_TypeError, errmsg);
+        goto fail;
+    }
+
+    if (preservewhichna != NULL) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "multi-NA support is not yet implemented");
+        goto fail;
+    }
+
+    if (src_has_maskna && !dst_has_maskna) {
+        int containsna = PyArray_ContainsNA(src, wheremask, NULL);
+        if (containsna == -1) {
+            goto fail;
+        }
+        else if (containsna) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Cannot assign NA to an array which "
+                    "does not support NAs");
+            goto fail;
+        }
+        else {
+            src_has_maskna = 0;
+        }
+    }
+
+    /*
+     * When ndim is 1 and the strides point in the same direction,
+     * the lower-level inner loop handles copying
+     * of overlapping data. For bigger ndim and opposite-strided 1D
+     * data, we make a temporary copy of 'src' if 'src' and 'dst' overlap.
+     */
+    if (((PyArray_NDIM(dst) == 1 && PyArray_NDIM(src) >= 1 &&
+                    PyArray_STRIDES(dst)[0] *
+                            PyArray_STRIDES(src)[PyArray_NDIM(src) - 1] < 0) ||
+                    PyArray_NDIM(dst) > 1) && arrays_overlap(src, dst)) {
+        PyArrayObject *tmp;
+
+        /*
+         * Allocate a temporary copy array.
+         */
+        tmp = (PyArrayObject *)PyArray_NewLikeArray(dst,
+                                        NPY_KEEPORDER, NULL, 0);
+        if (tmp == NULL) {
+            goto fail;
+        }
+
+        /* Make the temporary copy have an NA mask if necessary */
+        if (PyArray_HASMASKNA(src)) {
+            if (PyArray_AllocateMaskNA(tmp, 1, 0, 1) < 0) {
+                Py_DECREF(tmp);
+                goto fail;
+            }
+        }
+
+        if (PyArray_AssignArray(tmp, src,
+                                NULL, NPY_UNSAFE_CASTING, 0, NULL) < 0) {
+            Py_DECREF(tmp);
+            goto fail;
+        }
+
+        src = tmp;
+        copied_src = 1;
+    }
+
+    /* Broadcast 'src' to 'dst' for raw iteration */
+    if (PyArray_NDIM(src) > PyArray_NDIM(dst)) {
+        int ndim_tmp = PyArray_NDIM(src);
+        npy_intp *src_shape_tmp = PyArray_DIMS(src);
+        npy_intp *src_strides_tmp = PyArray_STRIDES(src);
+        /*
+         * As a special case for backwards compatibility, strip
+         * away unit dimensions from the left of 'src'
+         */
+        while (ndim_tmp > PyArray_NDIM(dst) && src_shape_tmp[0] == 1) {
+            --ndim_tmp;
+            ++src_shape_tmp;
+            ++src_strides_tmp;
+        }
+
+        if (broadcast_strides(PyArray_NDIM(dst), PyArray_DIMS(dst),
+                    ndim_tmp, src_shape_tmp,
+                    src_strides_tmp, "input array",
+                    src_strides) < 0) {
+            goto fail;
+        }
+    }
+    else {
+        if (broadcast_strides(PyArray_NDIM(dst), PyArray_DIMS(dst),
+                    PyArray_NDIM(src), PyArray_DIMS(src),
+                    PyArray_STRIDES(src), "input array",
+                    src_strides) < 0) {
+            goto fail;
+        }
+    }
+
+    if (src_has_maskna) {
+        if (broadcast_strides(PyArray_NDIM(dst), PyArray_DIMS(dst),
+                    PyArray_NDIM(src), PyArray_DIMS(src),
+                    PyArray_MASKNA_STRIDES(src), "input array",
+                    src_maskna_strides) < 0) {
+            goto fail;
+        }
+    }
+
+    if (wheremask == NULL) {
+        /* A straightforward value assignment */
+        if (!preservena || !dst_has_maskna) {
+            /* If assigning to an array with an NA mask, set to all exposed */
+            if (dst_has_maskna) {
+                if (src_has_maskna) {
+                    /* Assign the NA mask */
+                    if (raw_array_assign_array(
+                                    PyArray_NDIM(dst), PyArray_DIMS(dst),
+                                    PyArray_MASKNA_DTYPE(dst),
+                                            PyArray_MASKNA_DATA(dst),
+                                            PyArray_MASKNA_STRIDES(dst),
+                                    PyArray_MASKNA_DTYPE(src),
+                                            PyArray_MASKNA_DATA(src),
+                                            src_maskna_strides) < 0) {
+                        goto fail;
+                    }
+
+                    /* Assign the values based on the 'src' NA mask */
+                    if (raw_array_wheremasked_assign_array(
+                                    PyArray_NDIM(dst), PyArray_DIMS(dst),
+                                    PyArray_DESCR(dst), PyArray_DATA(dst),
+                                            PyArray_STRIDES(dst),
+                                    PyArray_DESCR(src), PyArray_DATA(src),
+                                            src_strides,
+                                    PyArray_MASKNA_DTYPE(src),
+                                            PyArray_MASKNA_DATA(src),
+                                            src_maskna_strides) < 0) {
+                        goto fail;
+                    }
+
+                    goto finish;
+                }
+                else {
+                    if (PyArray_AssignMaskNA(dst, 1, NULL,
+                                        preservena, preservewhichna) < 0) {
+                        goto fail;
+                    }
+                }
+            }
+
+            /* Do the assignment with raw array iteration */
+            if (raw_array_assign_array(PyArray_NDIM(dst), PyArray_DIMS(dst),
+                    PyArray_DESCR(dst), PyArray_DATA(dst), PyArray_STRIDES(dst),
+                    PyArray_DESCR(src), PyArray_DATA(src), src_strides) < 0) {
+                goto fail;
+            }
+        }
+        /* A value assignment without overwriting NA values */
+        else {
+            if (src_has_maskna) {
+                /* Assign the NA mask, wheremasked with the 'dst' NA mask */
+                if (raw_array_wheremasked_assign_array(
+                                PyArray_NDIM(dst), PyArray_DIMS(dst),
+                                PyArray_MASKNA_DTYPE(dst),
+                                        PyArray_MASKNA_DATA(dst),
+                                        PyArray_MASKNA_STRIDES(dst),
+                                PyArray_MASKNA_DTYPE(src),
+                                        PyArray_MASKNA_DATA(src),
+                                        src_maskna_strides,
+                                PyArray_MASKNA_DTYPE(dst),
+                                        PyArray_MASKNA_DATA(dst),
+                                        PyArray_MASKNA_STRIDES(dst)) < 0) {
+                    goto fail;
+                }
+            }
+
+            /*
+             * The 'dst' NA mask now has exposed precisely the values we
+             * want to assign, so use it for this assignment.
+             */
+            if (raw_array_wheremasked_assign_array(
+                    PyArray_NDIM(dst), PyArray_DIMS(dst),
+                    PyArray_DESCR(dst), PyArray_DATA(dst),
+                            PyArray_STRIDES(dst),
+                    PyArray_DESCR(src), PyArray_DATA(src),
+                            PyArray_STRIDES(src),
+                    PyArray_MASKNA_DTYPE(dst), PyArray_MASKNA_DATA(dst),
+                            PyArray_MASKNA_STRIDES(dst)) < 0) {
+                goto fail;
+            }
+        }
+    }
+    else {
+        npy_intp wheremask_strides[NPY_MAXDIMS];
+        int containsna = PyArray_ContainsNA(wheremask, NULL, NULL);
+
+        if (containsna == -1) {
+            goto fail;
+        }
+        else if (containsna) {
+            if (!dst_has_maskna) {
+                PyErr_SetString(PyExc_ValueError,
+                        "Cannot assign NA to an array which "
+                        "does not support NAs");
+                goto fail;
+            }
+            else {
+                /* TODO: add support for this */
+                PyErr_SetString(PyExc_ValueError,
+                        "A where mask with NA values is not supported "
+                        "yet");
+                goto fail;
+            }
+        }
+
+        /* Broadcast the wheremask to 'dst' for raw iteration */
+        if (broadcast_strides(PyArray_NDIM(dst), PyArray_DIMS(dst),
+                    PyArray_NDIM(wheremask), PyArray_DIMS(wheremask),
+                    PyArray_STRIDES(wheremask), "where mask",
+                    wheremask_strides) < 0) {
+            goto fail;
+        }
+
+        /* A straightforward where-masked assignment */
+        if (!preservena || !dst_has_maskna) {
+            /* If assigning to an array with an NA mask, set to all exposed */
+            if (dst_has_maskna) {
+                /*
+                 * TODO: If the where mask has NA values, this part
+                 *       changes too.
+                 */
+                if (src_has_maskna) {
+                    /* Assign the NA mask */
+                    if (raw_array_wheremasked_assign_array(
+                                    PyArray_NDIM(dst), PyArray_DIMS(dst),
+                                    PyArray_MASKNA_DTYPE(dst),
+                                            PyArray_MASKNA_DATA(dst),
+                                            PyArray_MASKNA_STRIDES(dst),
+                                    PyArray_MASKNA_DTYPE(src),
+                                            PyArray_MASKNA_DATA(src),
+                                            src_maskna_strides,
+                                    PyArray_DESCR(wheremask),
+                                            PyArray_DATA(wheremask),
+                                            wheremask_strides) < 0) {
+                        goto fail;
+                    }
+
+                    /*
+                     * Assign the values based on the wheremask, not
+                     * overwriting values also masked by the 'src' NA mask
+                     */
+                    if (raw_array_wheremasked_assign_array_preservena(
+                                    PyArray_NDIM(dst), PyArray_DIMS(dst),
+                                    PyArray_DESCR(dst), PyArray_DATA(dst),
+                                            PyArray_STRIDES(dst),
+                                    PyArray_DESCR(src), PyArray_DATA(src),
+                                            src_strides,
+                                    PyArray_MASKNA_DTYPE(src),
+                                            PyArray_MASKNA_DATA(src),
+                                            src_maskna_strides,
+                                    PyArray_DESCR(wheremask),
+                                            PyArray_DATA(wheremask),
+                                            wheremask_strides)) {
+                        goto fail;
+                    }
+
+                    goto finish;
+                }
+                else {
+                    if (PyArray_AssignMaskNA(dst, 1, wheremask,
+                                        preservena, preservewhichna) < 0) {
+                        goto fail;
+                    }
+                }
+            }
+
+            /* Do the masked assignment with raw array iteration */
+            if (raw_array_wheremasked_assign_array(
+                    PyArray_NDIM(dst), PyArray_DIMS(dst),
+                    PyArray_DESCR(dst), PyArray_DATA(dst), PyArray_STRIDES(dst),
+                    PyArray_DESCR(src), PyArray_DATA(src), src_strides,
+                    PyArray_DESCR(wheremask), PyArray_DATA(wheremask),
+                            wheremask_strides) < 0) {
+                goto fail;
+            }
+        }
+        /* A masked value assignment without overwriting NA values */
+        else {
+            if (src_has_maskna) {
+                /*
+                 * Assign the NA mask, wheremasked with the 'dst' NA mask
+                 * and the parameter 'wheremask'
+                 */
+                if (raw_array_wheremasked_assign_array_preservena(
+                                PyArray_NDIM(dst), PyArray_DIMS(dst),
+                                PyArray_MASKNA_DTYPE(dst),
+                                        PyArray_MASKNA_DATA(dst),
+                                        PyArray_MASKNA_STRIDES(dst),
+                                PyArray_MASKNA_DTYPE(src),
+                                        PyArray_MASKNA_DATA(src),
+                                        src_maskna_strides,
+                                PyArray_MASKNA_DTYPE(dst),
+                                        PyArray_MASKNA_DATA(dst),
+                                        PyArray_MASKNA_STRIDES(dst),
+                                PyArray_DESCR(wheremask),
+                                        PyArray_DATA(wheremask),
+                                        wheremask_strides) < 0) {
+                    goto fail;
+                }
+            }
+
+            /*
+             * The 'dst' NA mask together with the 'wheremask' now have
+             * exposed precisely the values we want to assign, so this
+             * is another wheremasked preservena assignment.
+             */
+            if (raw_array_wheremasked_assign_array_preservena(
+                    PyArray_NDIM(dst), PyArray_DIMS(dst),
+                    PyArray_DESCR(dst), PyArray_DATA(dst),
+                            PyArray_STRIDES(dst),
+                    PyArray_DESCR(src), PyArray_DATA(src),
+                            PyArray_STRIDES(src),
+                    PyArray_MASKNA_DTYPE(dst), PyArray_MASKNA_DATA(dst),
+                    PyArray_MASKNA_STRIDES(dst),
+                    PyArray_DESCR(wheremask), PyArray_DATA(wheremask),
+                    wheremask_strides) < 0) {
+                goto fail;
+            }
+        }
+    }
+
+finish:
+    if (copied_src) {
+        Py_DECREF(src);
+    }
+    return 0;
+
+fail:
+    if (copied_src) {
+        Py_DECREF(src);
+    }
+    return -1;
+}
+
diff --git a/numpy/core/src/multiarray/array_assign_scalar.c b/numpy/core/src/multiarray/array_assign_scalar.c
new file mode 100644
index 000000000..2dbad89e4
--- /dev/null
+++ b/numpy/core/src/multiarray/array_assign_scalar.c
@@ -0,0 +1,515 @@
+/*
+ * This file implements assignment from a scalar to an ndarray.
+ *
+ * Written by Mark Wiebe (mwwiebe@gmail.com)
+ * Copyright (c) 2011 by Enthought, Inc.
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API
+#define _MULTIARRAYMODULE
+#include <numpy/ndarraytypes.h>
+
+#include "npy_config.h"
+#include "numpy/npy_3kcompat.h"
+
+#include "convert_datatype.h"
+#include "methods.h"
+#include "shape.h"
+#include "lowlevel_strided_loops.h"
+#include "na_mask.h"
+
+#include "array_assign.h"
+
+/*
+ * Fills a raw destination array with one source element, casting from
+ * 'src_dtype' to 'dst_dtype' as needed.
+ *
+ * ndim, shape: number of dimensions and extents of the destination.
+ * dst_dtype, dst_data, dst_strides: dtype, base pointer and strides of
+ *                                   the destination.
+ * src_dtype, src_data: dtype of the source element and a pointer to it.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+raw_array_assign_scalar(int ndim, npy_intp *shape,
+        PyArray_Descr *dst_dtype, char *dst_data, npy_intp *dst_strides,
+        PyArray_Descr *src_dtype, char *src_data)
+{
+    /* Iteration state filled in by PyArray_PrepareOneRawArrayIter */
+    npy_intp it_shape[NPY_MAXDIMS], it_dst_strides[NPY_MAXDIMS];
+    npy_intp it_coord[NPY_MAXDIMS];
+    int axis;
+
+    int is_aligned, uses_python_api = 0;
+    PyArray_StridedUnaryOp *transfer_fn = NULL;
+    NpyAuxData *transfer_aux = NULL;
+    npy_intp src_elsize = src_dtype->elsize;
+    /* Non-zero when 'src_data' is not a multiple of the source alignment */
+    npy_intp src_misalignment =
+                    (npy_intp)src_data & (src_dtype->alignment - 1);
+
+    NPY_BEGIN_THREADS_DEF;
+
+    /* The aligned code path requires both 'dst' and 'src' to be aligned */
+    is_aligned = raw_array_is_aligned(ndim, dst_data, dst_strides,
+                                    dst_dtype->alignment);
+    if (src_misalignment) {
+        is_aligned = 0;
+    }
+
+    /*
+     * Set up raw iteration (no heap allocation). Note that this
+     * overwrites the local 'ndim' and 'dst_data' with the values
+     * to iterate with.
+     */
+    if (PyArray_PrepareOneRawArrayIter(
+                    ndim, shape,
+                    dst_data, dst_strides,
+                    &ndim, it_shape,
+                    &dst_data, it_dst_strides) < 0) {
+        return -1;
+    }
+
+    /*
+     * Look up the casting/copy function for the innermost dimension.
+     * The source stride given is 0 and the destination stride is that
+     * of the innermost iteration dimension.
+     */
+    if (PyArray_GetDTypeTransferFunction(is_aligned,
+                        0, it_dst_strides[0],
+                        src_dtype, dst_dtype,
+                        0,
+                        &transfer_fn, &transfer_aux,
+                        &uses_python_api) != NPY_SUCCEED) {
+        return -1;
+    }
+
+    /* Only enter the threads section if the transfer needs no Python API */
+    if (!uses_python_api) {
+        NPY_BEGIN_THREADS;
+    }
+
+    NPY_RAW_ITER_START(axis, ndim, it_coord, it_shape) {
+        /* One call handles a full run along the innermost dimension */
+        transfer_fn(dst_data, it_dst_strides[0], src_data, 0,
+                    it_shape[0], src_elsize, transfer_aux);
+    } NPY_RAW_ITER_ONE_NEXT(axis, ndim, it_coord,
+                            it_shape, dst_data, it_dst_strides);
+
+    if (!uses_python_api) {
+        NPY_END_THREADS;
+    }
+
+    NPY_AUXDATA_FREE(transfer_aux);
+
+    /* A Python-API-using transfer may have left an exception set */
+    if (uses_python_api && PyErr_Occurred()) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Fills a raw destination array with one source element, but only at
+ * the positions where the 'wheremask' value is True.
+ *
+ * ndim, shape: number of dimensions and extents, used for both the
+ *              destination and the where mask.
+ * dst_dtype, dst_data, dst_strides: dtype, base pointer and strides of
+ *                                   the destination.
+ * src_dtype, src_data: dtype of the source element and a pointer to it.
+ * wheremask_dtype, wheremask_data, wheremask_strides: dtype, base
+ *              pointer and strides of the where mask.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+raw_array_wheremasked_assign_scalar(int ndim, npy_intp *shape,
+        PyArray_Descr *dst_dtype, char *dst_data, npy_intp *dst_strides,
+        PyArray_Descr *src_dtype, char *src_data,
+        PyArray_Descr *wheremask_dtype, char *wheremask_data,
+        npy_intp *wheremask_strides)
+{
+    /* Iteration state filled in by PyArray_PrepareTwoRawArrayIter */
+    npy_intp it_shape[NPY_MAXDIMS], it_dst_strides[NPY_MAXDIMS];
+    npy_intp it_mask_strides[NPY_MAXDIMS];
+    npy_intp it_coord[NPY_MAXDIMS];
+    int axis;
+
+    int is_aligned, uses_python_api = 0;
+    PyArray_MaskedStridedUnaryOp *transfer_fn = NULL;
+    NpyAuxData *transfer_aux = NULL;
+    npy_intp src_elsize = src_dtype->elsize;
+    /* Non-zero when 'src_data' is not a multiple of the source alignment */
+    npy_intp src_misalignment =
+                    (npy_intp)src_data & (src_dtype->alignment - 1);
+
+    NPY_BEGIN_THREADS_DEF;
+
+    /* The aligned code path requires both 'dst' and 'src' to be aligned */
+    is_aligned = raw_array_is_aligned(ndim, dst_data, dst_strides,
+                                    dst_dtype->alignment);
+    if (src_misalignment) {
+        is_aligned = 0;
+    }
+
+    /*
+     * Set up raw iteration (no heap allocation) over 'dst' and the
+     * where mask together. This overwrites the local 'ndim', 'dst_data'
+     * and 'wheremask_data' with the values to iterate with.
+     */
+    if (PyArray_PrepareTwoRawArrayIter(
+                    ndim, shape,
+                    dst_data, dst_strides,
+                    wheremask_data, wheremask_strides,
+                    &ndim, it_shape,
+                    &dst_data, it_dst_strides,
+                    &wheremask_data, it_mask_strides) < 0) {
+        return -1;
+    }
+
+    /*
+     * Look up the masked casting/copy function for the innermost
+     * dimension. The source stride given is 0; the destination and mask
+     * strides are those of the innermost iteration dimension.
+     */
+    if (PyArray_GetMaskedDTypeTransferFunction(is_aligned,
+                        0, it_dst_strides[0], it_mask_strides[0],
+                        src_dtype, dst_dtype, wheremask_dtype,
+                        0,
+                        &transfer_fn, &transfer_aux,
+                        &uses_python_api) != NPY_SUCCEED) {
+        return -1;
+    }
+
+    /* Only enter the threads section if the transfer needs no Python API */
+    if (!uses_python_api) {
+        NPY_BEGIN_THREADS;
+    }
+
+    NPY_RAW_ITER_START(axis, ndim, it_coord, it_shape) {
+        /* One call handles a full run along the innermost dimension */
+        transfer_fn(dst_data, it_dst_strides[0], src_data, 0,
+                    (npy_mask *)wheremask_data, it_mask_strides[0],
+                    it_shape[0], src_elsize, transfer_aux);
+    } NPY_RAW_ITER_TWO_NEXT(axis, ndim, it_coord, it_shape,
+                            dst_data, it_dst_strides,
+                            wheremask_data, it_mask_strides);
+
+    if (!uses_python_api) {
+        NPY_END_THREADS;
+    }
+
+    NPY_AUXDATA_FREE(transfer_aux);
+
+    /* A Python-API-using transfer may have left an exception set */
+    if (uses_python_api && PyErr_Occurred()) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Assigns the scalar value to every element of the destination raw array
+ * where the 'wheremask' is True, except for those which are masked as NA
+ * according to 'maskna'.
+ *
+ * ndim, shape: number of dimensions and extents, shared by the
+ *              destination, the NA mask and the where mask.
+ * dst_dtype, dst_data, dst_strides: dtype, base pointer and strides of
+ *                                   the destination.
+ * src_dtype, src_data: dtype of the source element and a pointer to it.
+ * maskna_dtype, maskna_data, maskna_strides: dtype, base pointer and
+ *              strides of the NA mask.
+ * wheremask_dtype, wheremask_data, wheremask_strides: dtype, base
+ *              pointer and strides of the where mask.
+ *
+ * The two masks are combined, a chunk at a time, into a temporary
+ * heap-allocated buffer of at most NPY_ARRAY_ASSIGN_BUFFERSIZE mask
+ * elements, and that combined mask drives the masked transfer.
+ *
+ * Returns 0 on success, -1 on failure. A MemoryError is set if the
+ * temporary mask buffer cannot be allocated.
+ */
+NPY_NO_EXPORT int
+raw_array_wheremasked_assign_scalar_preservena(int ndim, npy_intp *shape,
+        PyArray_Descr *dst_dtype, char *dst_data, npy_intp *dst_strides,
+        PyArray_Descr *src_dtype, char *src_data,
+        PyArray_Descr *maskna_dtype, char *maskna_data,
+        npy_intp *maskna_strides,
+        PyArray_Descr *wheremask_dtype, char *wheremask_data,
+        npy_intp *wheremask_strides)
+{
+    int idim;
+    /* Iteration state filled in by PyArray_PrepareThreeRawArrayIter */
+    npy_intp shape_it[NPY_MAXDIMS], dst_strides_it[NPY_MAXDIMS];
+    npy_intp maskna_strides_it[NPY_MAXDIMS];
+    npy_intp wheremask_strides_it[NPY_MAXDIMS];
+    npy_intp coord[NPY_MAXDIMS];
+
+    /* Masked transfer function (src -> dst) and its auxiliary data */
+    PyArray_MaskedStridedUnaryOp *stransfer = NULL;
+    NpyAuxData *transferdata = NULL;
+    int aligned, needs_api = 0;
+    npy_intp src_itemsize = src_dtype->elsize;
+
+    /* Function combining the two masks, and its auxiliary data */
+    PyArray_StridedBinaryOp *maskand_stransfer = NULL;
+    NpyAuxData *maskand_transferdata = NULL;
+
+    /* Temporary buffer receiving the combined mask, one chunk at a time */
+    char *maskna_buffer;
+    npy_intp maskna_itemsize;
+
+    NPY_BEGIN_THREADS_DEF;
+
+    /* Check alignment: both 'dst' and 'src' must be aligned */
+    aligned = raw_array_is_aligned(ndim, dst_data, dst_strides,
+                                    dst_dtype->alignment);
+    if (((npy_intp)src_data & (src_dtype->alignment - 1)) != 0) {
+        aligned = 0;
+    }
+
+    /*
+     * Use raw iteration with no heap allocation. This overwrites the
+     * local 'ndim' and the three data pointers with the values to
+     * iterate with.
+     */
+    if (PyArray_PrepareThreeRawArrayIter(
+                    ndim, shape,
+                    dst_data, dst_strides,
+                    maskna_data, maskna_strides,
+                    wheremask_data, wheremask_strides,
+                    &ndim, shape_it,
+                    &dst_data, dst_strides_it,
+                    &maskna_data, maskna_strides_it,
+                    &wheremask_data, wheremask_strides_it) < 0) {
+        return -1;
+    }
+
+    /*
+     * Allocate a buffer for the combined mask. It holds
+     * NPY_ARRAY_ASSIGN_BUFFERSIZE elements of 'maskna_dtype'.
+     */
+    maskna_itemsize = maskna_dtype->elsize;
+    maskna_buffer = PyArray_malloc(NPY_ARRAY_ASSIGN_BUFFERSIZE *
+                                    maskna_itemsize);
+    if (maskna_buffer == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    /*
+     * Get the function to do the casting. The mask stride passed here is
+     * 'maskna_itemsize' rather than a stride of the NA mask array,
+     * because the mask the transfer reads is the packed 'maskna_buffer'.
+     */
+    if (PyArray_GetMaskedDTypeTransferFunction(aligned,
+                        0, dst_strides_it[0], maskna_itemsize,
+                        src_dtype, dst_dtype, maskna_dtype,
+                        0,
+                        &stransfer, &transferdata,
+                        &needs_api) != NPY_SUCCEED) {
+        PyArray_free(maskna_buffer);
+        return -1;
+    }
+
+    /*
+     * Get the function which combines ("ands") the NA mask with the
+     * where mask. The output of the binary operation is the dtype
+     * 'maskna_dtype'.
+     * NOTE(review): the literal 0 following each dtype is presumably a
+     * per-mask "invert" flag -- confirm against the declaration of
+     * PyArray_GetMaskAndFunction.
+     */
+    if (PyArray_GetMaskAndFunction(
+                        maskna_strides_it[0], maskna_dtype, 0,
+                        wheremask_strides_it[0], wheremask_dtype, 0,
+                        &maskand_stransfer, &maskand_transferdata) < 0) {
+        /* Undo both earlier acquisitions before bailing out */
+        PyArray_free(maskna_buffer);
+        NPY_AUXDATA_FREE(transferdata);
+        return -1;
+    }
+
+    /* Only enter the threads section if the transfer needs no Python API */
+    if (!needs_api) {
+        NPY_BEGIN_THREADS;
+    }
+
+    NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+        npy_intp buffered_count, count;
+        char *dst_d, *maskna_d, *wheremask_d;
+        /*
+         * Process the innermost dimension a buffer size at a time,
+         * walking local copies of the pointers so that the iterator's
+         * own 'dst_data'/'maskna_data'/'wheremask_data' stay untouched.
+         */
+        count = shape_it[0];
+        dst_d = dst_data;
+        maskna_d = maskna_data;
+        wheremask_d = wheremask_data;
+        do {
+            /* Chunk length: what remains, capped at the buffer capacity */
+            buffered_count = count < NPY_ARRAY_ASSIGN_BUFFERSIZE
+                                        ? count
+                                        : NPY_ARRAY_ASSIGN_BUFFERSIZE;
+
+            /* Prepare the mask into the buffer */
+            maskand_stransfer(maskna_buffer, maskna_itemsize,
+                        maskna_d, maskna_strides_it[0],
+                        wheremask_d, wheremask_strides_it[0],
+                        buffered_count, maskand_transferdata);
+
+            /* Transfer the data based on the buffered mask */
+            stransfer(dst_d, dst_strides_it[0], src_data, 0,
+                        (npy_mask *)maskna_buffer, maskna_itemsize,
+                        buffered_count, src_itemsize, transferdata);
+
+            /* Step past the chunk just processed */
+            dst_d += buffered_count * dst_strides_it[0];
+            maskna_d += buffered_count * maskna_strides_it[0];
+            wheremask_d += buffered_count * wheremask_strides_it[0];
+            count -= buffered_count;
+        } while (count > 0);
+    } NPY_RAW_ITER_THREE_NEXT(idim, ndim, coord, shape_it,
+                            dst_data, dst_strides_it,
+                            maskna_data, maskna_strides_it,
+                            wheremask_data, wheremask_strides_it);
+
+    if (!needs_api) {
+        NPY_END_THREADS;
+    }
+
+    /* Release the mask buffer and both functions' auxiliary data */
+    PyArray_free(maskna_buffer);
+    NPY_AUXDATA_FREE(transferdata);
+    NPY_AUXDATA_FREE(maskand_transferdata);
+
+    /* A Python-API-using transfer may have left an exception set */
+    return (needs_api && PyErr_Occurred()) ? -1 : 0;
+}
+
+
+/*NUMPY_API
+ *
+ * Assigns a scalar value specified by 'src_dtype' and 'src_data'
+ * to elements of 'dst'.
+ *
+ * dst: The destination array.
+ * src_dtype: The data type of the source scalar.
+ * src_data: The memory element of the source scalar.
+ * wheremask: If non-NULL, a boolean mask specifying where to copy.
+ * casting: An exception is raised if the assignment violates this
+ *          casting rule.
+ * preservena: If 0, overwrites everything in 'dst', if 1, it
+ *              preserves elements in 'dst' which are NA.
+ * preservewhichna: Must be NULL. When multi-NA support is implemented,
+ *                   this will be an array of flags for 'preservena=True',
+ *                   indicating which NA payload values to preserve.
+ *
+ * Errors set by this function itself:
+ *   RuntimeError - 'dst' is read-only, or 'preservewhichna' is non-NULL.
+ *   TypeError    - the scalar cannot be cast to the dtype of 'dst' under
+ *                  the 'casting' rule.
+ *   ValueError   - 'wheremask' contains NA values.
+ *   MemoryError  - a temporary for the converted scalar could not be
+ *                  allocated.
+ *
+ * The memory pointed to by 'src_data' is only read; it is never freed
+ * by this function.
+ *
+ * This function is implemented in array_assign_scalar.c.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_AssignRawScalar(PyArrayObject *dst,
+                    PyArray_Descr *src_dtype, char *src_data,
+                    PyArrayObject *wheremask,
+                    NPY_CASTING casting,
+                    npy_bool preservena, npy_bool *preservewhichna)
+{
+    /*
+     * 'allocated_src_data' is 1 only while 'src_data' points at a heap
+     * temporary owned by this function (see the cleanup code at the end).
+     */
+    int allocated_src_data = 0, dst_has_maskna = PyArray_HASMASKNA(dst);
+    /* On-stack storage for the converted scalar when it is small enough */
+    npy_longlong scalarbuffer[4];
+
+    /* Check that 'dst' is writeable */
+    if (!PyArray_ISWRITEABLE(dst)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "cannot assign a scalar value to a read-only array");
+        return -1;
+    }
+
+    /* Check the casting rule */
+    if (!can_cast_scalar_to(src_dtype, src_data,
+                            PyArray_DESCR(dst), casting)) {
+        PyObject *errmsg;
+        errmsg = PyUString_FromString("Cannot cast scalar from ");
+        PyUString_ConcatAndDel(&errmsg,
+                PyObject_Repr((PyObject *)src_dtype));
+        PyUString_ConcatAndDel(&errmsg,
+                PyUString_FromString(" to "));
+        PyUString_ConcatAndDel(&errmsg,
+                PyObject_Repr((PyObject *)PyArray_DESCR(dst)));
+        PyUString_ConcatAndDel(&errmsg,
+                PyUString_FromFormat(" according to the rule %s",
+                        npy_casting_to_string(casting)));
+        PyErr_SetObject(PyExc_TypeError, errmsg);
+        return -1;
+    }
+
+    if (preservewhichna != NULL) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "multi-NA support is not yet implemented");
+        return -1;
+    }
+
+    /*
+     * Make a copy of the src data if it's a different dtype than 'dst'
+     * or isn't aligned, and the destination we're copying to has
+     * more than one element. To avoid having to manage object lifetimes,
+     * we also skip this if 'dst' has an object dtype.
+     */
+    if ((!PyArray_EquivTypes(PyArray_DESCR(dst), src_dtype) ||
+                ((npy_intp)src_data & (src_dtype->alignment - 1)) != 0) &&
+                    PyArray_SIZE(dst) > 1 &&
+                    !PyDataType_REFCHK(PyArray_DESCR(dst))) {
+        char *tmp_src_data;
+
+        /*
+         * Use the on-stack buffer to store the aligned/cast version,
+         * or allocate some memory if more space is needed.
+         */
+        if (sizeof(scalarbuffer) >= PyArray_DESCR(dst)->elsize) {
+            tmp_src_data = (char *)&scalarbuffer[0];
+        }
+        else {
+            tmp_src_data = PyArray_malloc(PyArray_DESCR(dst)->elsize);
+            if (tmp_src_data == NULL) {
+                /*
+                 * 'allocated_src_data' is still 0 here, so the cleanup
+                 * at 'fail' will not try to free anything.
+                 */
+                PyErr_NoMemory();
+                goto fail;
+            }
+            allocated_src_data = 1;
+        }
+        if (PyArray_CastRawArrays(1, src_data, tmp_src_data, 0, 0,
+                            src_dtype, PyArray_DESCR(dst), 0) != NPY_SUCCEED) {
+            /*
+             * Point 'src_data' at the temporary before jumping to the
+             * cleanup code, so that a heap temporary is what gets freed
+             * there rather than the caller's buffer.
+             */
+            src_data = tmp_src_data;
+            goto fail;
+        }
+
+        /* Replace src_data/src_dtype */
+        src_data = tmp_src_data;
+        src_dtype = PyArray_DESCR(dst);
+    }
+
+    if (wheremask == NULL) {
+        /* A straightforward value assignment */
+        if (!preservena || !dst_has_maskna) {
+            /* If assigning to an array with an NA mask, set to all exposed */
+            if (dst_has_maskna) {
+                if (PyArray_AssignMaskNA(dst, 1, NULL,
+                                    preservena, preservewhichna) < 0) {
+                    goto fail;
+                }
+            }
+
+            /* Do the assignment with raw array iteration */
+            if (raw_array_assign_scalar(PyArray_NDIM(dst), PyArray_DIMS(dst),
+                    PyArray_DESCR(dst), PyArray_DATA(dst), PyArray_STRIDES(dst),
+                    src_dtype, src_data) < 0) {
+                goto fail;
+            }
+        }
+        /* A value assignment without overwriting NA values */
+        else {
+            /* The NA mask of 'dst' itself serves as the where mask */
+            if (raw_array_wheremasked_assign_scalar(
+                    PyArray_NDIM(dst), PyArray_DIMS(dst),
+                    PyArray_DESCR(dst), PyArray_DATA(dst), PyArray_STRIDES(dst),
+                    src_dtype, src_data,
+                    PyArray_MASKNA_DTYPE(dst), PyArray_MASKNA_DATA(dst),
+                    PyArray_MASKNA_STRIDES(dst)) < 0) {
+                goto fail;
+            }
+        }
+    }
+    else {
+        npy_intp wheremask_strides[NPY_MAXDIMS];
+        int containsna = PyArray_ContainsNA(wheremask, NULL, NULL);
+
+        if (containsna == -1) {
+            goto fail;
+        }
+        else if (containsna) {
+            if (!dst_has_maskna) {
+                PyErr_SetString(PyExc_ValueError,
+                        "Cannot assign NA to an array which "
+                        "does not support NAs");
+                goto fail;
+            }
+            else {
+                /* TODO: add support for this */
+                PyErr_SetString(PyExc_ValueError,
+                        "A where mask with NA values is not supported "
+                        "yet");
+                goto fail;
+            }
+        }
+
+        /* Broadcast the wheremask to 'dst' for raw iteration */
+        if (broadcast_strides(PyArray_NDIM(dst), PyArray_DIMS(dst),
+                    PyArray_NDIM(wheremask), PyArray_DIMS(wheremask),
+                    PyArray_STRIDES(wheremask), "where mask",
+                    wheremask_strides) < 0) {
+            goto fail;
+        }
+
+        /* A straightforward where-masked assignment */
+        if (!preservena || !dst_has_maskna) {
+            /* If assigning to an array with an NA mask, set to all exposed */
+            if (dst_has_maskna) {
+                /*
+                 * TODO: If the where mask has NA values, this part
+                 *       changes too.
+                 */
+                if (PyArray_AssignMaskNA(dst, 1, wheremask,
+                                    preservena, preservewhichna) < 0) {
+                    goto fail;
+                }
+            }
+
+            /* Do the masked assignment with raw array iteration */
+            if (raw_array_wheremasked_assign_scalar(
+                    PyArray_NDIM(dst), PyArray_DIMS(dst),
+                    PyArray_DESCR(dst), PyArray_DATA(dst), PyArray_STRIDES(dst),
+                    src_dtype, src_data,
+                    PyArray_DESCR(wheremask), PyArray_DATA(wheremask),
+                    wheremask_strides) < 0) {
+                goto fail;
+            }
+        }
+        /* A masked value assignment without overwriting NA values */
+        else {
+            if (raw_array_wheremasked_assign_scalar_preservena(
+                    PyArray_NDIM(dst), PyArray_DIMS(dst),
+                    PyArray_DESCR(dst), PyArray_DATA(dst), PyArray_STRIDES(dst),
+                    src_dtype, src_data,
+                    PyArray_MASKNA_DTYPE(dst), PyArray_MASKNA_DATA(dst),
+                    PyArray_MASKNA_STRIDES(dst),
+                    PyArray_DESCR(wheremask), PyArray_DATA(wheremask),
+                    wheremask_strides) < 0) {
+                goto fail;
+            }
+        }
+    }
+
+    /* Release the heap temporary holding the converted scalar, if any */
+    if (allocated_src_data) {
+        PyArray_free(src_data);
+    }
+
+    return 0;
+
+fail:
+    /*
+     * Whenever 'allocated_src_data' is set, 'src_data' points at the heap
+     * temporary (never at the caller's buffer), so this free is safe.
+     */
+    if (allocated_src_data) {
+        PyArray_free(src_data);
+    }
+
+    return -1;
+}
diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c
index e7ed8ba02..f79bf1737 100644
--- a/numpy/core/src/multiarray/arrayobject.c
+++ b/numpy/core/src/multiarray/arrayobject.c
@@ -50,6 +50,8 @@ maintainer email:  oliphant.travis@ieee.org
 #include "getset.h"
 #include "sequence.h"
 #include "buffer.h"
+#include "na_object.h"
+#include "na_mask.h"
 
 /*NUMPY_API
   Compute the size of an array (in number of items)
@@ -91,18 +93,41 @@ PyArray_SetBaseObject(PyArrayObject *arr, PyObject *obj)
                 "dependency more than once");
         return -1;
     }
+
     /*
      * Don't allow chains of views, always set the base
-     * to the owner of the data
+     * to the owner of the data. That is, either the first object
+     * which isn't an array, the first object with an NA mask
+     * which owns that NA mask, or the first object which owns
+     * its own data.
      */
-    while (PyArray_Check(obj) &&
-                            (PyObject *)arr != obj &&
-                            PyArray_BASE((PyArrayObject *)obj) != NULL) {
-        PyObject *tmp = PyArray_BASE((PyArrayObject *)obj);
+    while (PyArray_Check(obj) && (PyObject *)arr != obj) {
+        PyArrayObject *obj_arr = (PyArrayObject *)obj;
+        PyObject *tmp;
+
+        /* If 'obj' owns its own data, stop collapsing */
+        if (PyArray_CHKFLAGS(obj_arr, NPY_ARRAY_OWNDATA)) {
+            break;
+        }
+        /*
+         * If 'arr' doesn't own its NA mask, then if
+         * 'obj' is NA masked and owns the mask, stop collapsing
+         */
+        if (!PyArray_CHKFLAGS(arr, NPY_ARRAY_OWNMASKNA) &&
+                        PyArray_CHKFLAGS(obj_arr, NPY_ARRAY_OWNMASKNA)) {
+            break;
+        }
+        /* If 'obj' has no base, stop collapsing */
+        tmp = PyArray_BASE(obj_arr);
+        if (tmp == NULL) {
+            break;
+        }
+        /* Step up to the base: take a reference to it, drop the one on 'obj' */
         Py_INCREF(tmp);
         Py_DECREF(obj);
         obj = tmp;
     }
+
     /* Disallow circular references */
     if ((PyObject *)arr == obj) {
         Py_DECREF(obj);
@@ -110,7 +135,8 @@ PyArray_SetBaseObject(PyArrayObject *arr, PyObject *obj)
                 "Cannot create a circular NumPy array 'base' dependency");
         return -1;
     }
-    ((PyArrayObject_fieldaccess *)arr)->base = obj;
+
+    ((PyArrayObject_fields *)arr)->base = obj;
 
     return 0;
 }
@@ -120,7 +146,7 @@ PyArray_SetBaseObject(PyArrayObject *arr, PyObject *obj)
 NPY_NO_EXPORT int
 PyArray_CopyObject(PyArrayObject *dest, PyObject *src_object)
 {
-    int ret;
+    int ret, contains_na = 0;
     PyArrayObject *src;
     PyArray_Descr *dtype = NULL;
     int ndim = 0;
@@ -155,8 +181,17 @@ PyArray_CopyObject(PyArrayObject *dest, PyObject *src_object)
      * Get either an array object we can copy from, or its parameters
      * if there isn't a convenient array available.
      */
-    if (PyArray_GetArrayParamsFromObject(src_object, PyArray_DESCR(dest),
-                0, &dtype, &ndim, dims, &src, NULL) < 0) {
+    if (PyArray_GetArrayParamsFromObjectEx(src_object, PyArray_DESCR(dest),
+                0, &dtype, &ndim, dims, &contains_na, &src, NULL) < 0) {
+        Py_DECREF(src_object);
+        return -1;
+    }
+
+    if (contains_na && !(PyArray_HasNASupport(dest) ||
+                         PyArray_DESCR(dest)->type_num == NPY_OBJECT)) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot set NumPy array values to NA values without first "
+                "enabling NA support in the array");
         Py_DECREF(src_object);
         return -1;
     }
@@ -165,19 +200,61 @@ PyArray_CopyObject(PyArrayObject *dest, PyObject *src_object)
     if (src == NULL) {
         /* If the input is scalar */
         if (ndim == 0) {
+            NpyNA *na = NULL;
+
+            /* Get an NpyNA if src is NA, but don't raise an error */
+            if (PyArray_HasNASupport(dest)) {
+                na = NpyNA_FromObject(src_object, 1);
+            }
+
             /* If there's one dest element and src is a Python scalar */
             if (PyArray_IsScalar(src_object, Generic)) {
-                src = (PyArrayObject *)PyArray_FromScalar(src_object, dtype);
-                if (src == NULL) {
+                PyArray_Descr *dtype;
+                char *value;
+                int retcode;
+
+                dtype = PyArray_DescrFromScalar(src_object);
+                if (dtype == NULL) {
+                    Py_DECREF(src_object);
+                    return -1;
+                }
+                value = scalar_value(src_object, dtype);
+                if (value == NULL) {
+                    Py_DECREF(dtype);
                     Py_DECREF(src_object);
                     return -1;
                 }
+
+                /* TODO: switch to SAME_KIND casting */
+                retcode = PyArray_AssignRawScalar(dest, dtype, value,
+                                        NULL, NPY_UNSAFE_CASTING, 0, NULL);
+                Py_DECREF(dtype);
+                Py_DECREF(src_object);
+                return retcode;
             }
+            /* Assigning NA affects the mask if it exists */
+            else if (na != NULL) {
+                if (PyArray_AssignNA(dest, na, NULL, 0, NULL) < 0) {
+                    Py_DECREF(na);
+                    Py_DECREF(src_object);
+                    return -1;
+                }
+
+                Py_DECREF(na);
+                Py_DECREF(src_object);
+                return 0;
+            }
+            /* Otherwise use the dtype's setitem function */
             else {
                 if (PyArray_SIZE(dest) == 1) {
                     Py_DECREF(dtype);
-                    return PyArray_DESCR(dest)->f->setitem(src_object,
-                                                    PyArray_DATA(dest), dest);
+                    ret = PyArray_DESCR(dest)->f->setitem(src_object,
+                                                PyArray_DATA(dest), dest);
+                    /* Unmask the value if necessary */
+                    if (ret == 0 && PyArray_HASMASKNA(dest)) {
+                        PyArray_MASKNA_DATA(dest)[0] = 1;
+                    }
+                    return ret;
                 }
                 else {
                     src = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type,
@@ -216,6 +293,12 @@ PyArray_CopyObject(PyArrayObject *dest, PyObject *src_object)
                 Py_DECREF(src_object);
                 return -1;
             }
+            if (PyArray_HASMASKNA(dest)) {
+                if (PyArray_AllocateMaskNA(dest, 1, 0, 1) < 0) {
+                    Py_DECREF(src_object);
+                    return -1;
+                }
+            }
             if (PyArray_AssignFromSequence(src, src_object) < 0) {
                 Py_DECREF(src);
                 Py_DECREF(src_object);
@@ -274,7 +357,7 @@ PyArray_TypeNumFromName(char *str)
 static void
 array_dealloc(PyArrayObject *self)
 {
-    PyArrayObject_fieldaccess *fa = (PyArrayObject_fieldaccess *)self;
+    PyArrayObject_fields *fa = (PyArrayObject_fields *)self;
 
     _array_dealloc_buffer_info(self);
 
@@ -323,6 +406,14 @@ array_dealloc(PyArrayObject *self)
         PyDataMem_FREE(fa->data);
     }
 
+    /* If the array has an NA mask, free its associated data */
+    if (fa->flags & NPY_ARRAY_MASKNA) {
+        Py_XDECREF(fa->maskna_dtype);
+        if (fa->flags & NPY_ARRAY_OWNMASKNA) {
+            PyDataMem_FREE(fa->maskna_data);
+        }
+    }
+
     PyDimMem_FREE(fa->dimensions);
     Py_DECREF(fa->descr);
     Py_TYPE(self)->tp_free((PyObject *)self);
@@ -387,6 +478,84 @@ dump_data(char **string, int *n, int *max_n, char *data, int nd,
 #undef CHECK_MEMORY
 }
 
+/*NUMPY_API
+ * Dumps an ndarray's raw fields (shape, dtype, data, strides, base, flags,
+ * NA mask) to stdout for debugging low-level C issues. 'obj' may be NULL.
+ */
+NPY_NO_EXPORT void
+PyArray_DebugPrint(PyArrayObject *obj)
+{
+    int i;
+    PyArrayObject_fields *fobj = (PyArrayObject_fields *)obj;
+    /* Only a cast: 'fobj' is not dereferenced until after the NULL check */
+    printf("-------------------------------------------------------\n");
+    printf(" Dump of NumPy ndarray at address %p\n", (void *)obj);
+    if (obj == NULL) {
+        printf(" It's NULL!\n");
+        printf("-------------------------------------------------------\n");
+        fflush(stdout);
+        return;
+    }
+    printf(" ndim   : %d\n", fobj->nd);
+    printf(" shape  :");
+    for (i = 0; i < fobj->nd; ++i) {
+        printf(" %" NPY_INTP_FMT, (npy_intp)fobj->dimensions[i]);
+    }
+    printf("\n");
+
+    printf(" dtype  : ");
+    PyObject_Print((PyObject *)fobj->descr, stdout, 0);
+    printf("\n");
+    printf(" data   : %p\n", (void *)fobj->data);
+    printf(" strides:");
+    for (i = 0; i < fobj->nd; ++i) {
+        printf(" %" NPY_INTP_FMT, (npy_intp)fobj->strides[i]);
+    }
+    printf("\n");
+
+    printf(" base   : %p\n", (void *)fobj->base);
+
+    printf(" flags :");
+    if (fobj->flags & NPY_ARRAY_C_CONTIGUOUS)
+        printf(" C_CONTIGUOUS");
+    if (fobj->flags & NPY_ARRAY_F_CONTIGUOUS)
+        printf(" F_CONTIGUOUS");
+    if (fobj->flags & NPY_ARRAY_OWNDATA)
+        printf(" OWNDATA");
+    if (fobj->flags & NPY_ARRAY_ALIGNED)
+        printf(" ALIGNED");
+    if (fobj->flags & NPY_ARRAY_WRITEABLE)
+        printf(" WRITEABLE");
+    if (fobj->flags & NPY_ARRAY_UPDATEIFCOPY)
+        printf(" UPDATEIFCOPY");
+    if (fobj->flags & NPY_ARRAY_MASKNA)
+        printf(" MASKNA");
+    if (fobj->flags & NPY_ARRAY_OWNMASKNA)
+        printf(" OWNMASKNA");
+    printf("\n");
+
+    if (fobj->flags & NPY_ARRAY_MASKNA) {
+        printf(" maskna dtype  : ");
+        PyObject_Print((PyObject *)fobj->maskna_dtype, stdout, 0);
+        printf("\n");
+        printf(" maskna data   : %p\n", (void *)fobj->maskna_data);
+        printf(" maskna strides:");
+        for (i = 0; i < fobj->nd; ++i) {
+            printf(" %" NPY_INTP_FMT, (npy_intp)fobj->maskna_strides[i]);
+        }
+        printf("\n");
+    }
+    /* When the base is itself an ndarray, dump it too (recursively) */
+    if (fobj->base != NULL && PyArray_Check(fobj->base)) {
+        printf("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n");
+        printf("Dump of array's BASE:\n");
+        PyArray_DebugPrint((PyArrayObject *)fobj->base);
+        printf(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n");
+    }
+    printf("-------------------------------------------------------\n");
+    fflush(stdout);
+}
+
 static PyObject *
 array_repr_builtin(PyArrayObject *self, int repr)
 {
@@ -850,7 +1019,7 @@ _strings_richcompare(PyArrayObject *self, PyArrayObject *other, int cmp_op,
     int val;
 
     /* Cast arrays to a common type */
-    if (PyArray_DESCR(self)->type_num != PyArray_DESCR(other)->type_num) {
+    if (PyArray_TYPE(self) != PyArray_DESCR(other)->type_num) {
 #if defined(NPY_PY3K)
         /*
          * Comparison between Bytes and Unicode is not defined in Py3K;
@@ -860,7 +1029,7 @@ _strings_richcompare(PyArrayObject *self, PyArrayObject *other, int cmp_op,
         return Py_NotImplemented;
 #else
         PyObject *new;
-        if (PyArray_DESCR(self)->type_num == PyArray_STRING &&
+        if (PyArray_TYPE(self) == PyArray_STRING &&
             PyArray_DESCR(other)->type_num == PyArray_UNICODE) {
             PyArray_Descr* unicode = PyArray_DescrNew(PyArray_DESCR(other));
             unicode->elsize = PyArray_DESCR(self)->elsize << 2;
@@ -872,7 +1041,7 @@ _strings_richcompare(PyArrayObject *self, PyArrayObject *other, int cmp_op,
             Py_INCREF(other);
             self = (PyArrayObject *)new;
         }
-        else if (PyArray_DESCR(self)->type_num == PyArray_UNICODE &&
+        else if (PyArray_TYPE(self) == PyArray_UNICODE &&
                  PyArray_DESCR(other)->type_num == PyArray_STRING) {
             PyArray_Descr* unicode = PyArray_DescrNew(PyArray_DESCR(self));
             unicode->elsize = PyArray_DESCR(other)->elsize << 2;
@@ -915,7 +1084,7 @@ _strings_richcompare(PyArrayObject *self, PyArrayObject *other, int cmp_op,
         goto finish;
     }
 
-    if (PyArray_DESCR(self)->type_num == NPY_UNICODE) {
+    if (PyArray_TYPE(self) == NPY_UNICODE) {
         val = _compare_strings(result, mit, cmp_op, _myunincmp, rstrip);
     }
     else {
@@ -1055,7 +1224,7 @@ array_richcompare(PyArrayObject *self, PyObject *other, int cmp_op)
 {
     PyArrayObject *array_other;
     PyObject *result = NULL;
-    int typenum;
+    PyArray_Descr *dtype = NULL;
 
     switch (cmp_op) {
     case Py_LT:
@@ -1072,33 +1241,27 @@ array_richcompare(PyArrayObject *self, PyObject *other, int cmp_op)
             return Py_False;
         }
         /* Make sure 'other' is an array */
-        if (PyArray_Check(other)) {
-            Py_INCREF(other);
-            array_other = (PyArrayObject *)other;
+        if (PyArray_TYPE(self) == NPY_OBJECT) {
+            dtype = PyArray_DTYPE(self);
+            Py_INCREF(dtype);
         }
-        else {
-            typenum = PyArray_DESCR(self)->type_num;
-            if (typenum != PyArray_OBJECT) {
-                typenum = PyArray_NOTYPE;
-            }
-            array_other = (PyArrayObject *)PyArray_FromObject(other,
-                                                        typenum, 0, 0);
-            /*
-             * If not successful, indicate that the items cannot be compared
-             * this way.
-             */
-            if (array_other == NULL) {
-                Py_XDECREF(array_other);
-                PyErr_Clear();
-                Py_INCREF(Py_NotImplemented);
-                return Py_NotImplemented;
-            }
+        array_other = (PyArrayObject *)PyArray_FromAny(other, dtype, 0, 0,
+                                                    NPY_ARRAY_ALLOWNA, NULL);
+        /*
+         * If not successful, indicate that the items cannot be compared
+         * this way.
+         */
+        if (array_other == NULL) {
+            PyErr_Clear();
+            Py_INCREF(Py_NotImplemented);
+            return Py_NotImplemented;
         }
+
         result = PyArray_GenericBinaryFunction(self,
                 (PyObject *)array_other,
                 n_ops.equal);
         if ((result == Py_NotImplemented) &&
-                (PyArray_DESCR(self)->type_num == PyArray_VOID)) {
+                (PyArray_TYPE(self) == NPY_VOID)) {
             int _res;
 
             _res = PyObject_RichCompareBool
@@ -1135,32 +1298,26 @@ array_richcompare(PyArrayObject *self, PyObject *other, int cmp_op)
             return Py_True;
         }
         /* Make sure 'other' is an array */
-        if (PyArray_Check(other)) {
-            Py_INCREF(other);
-            array_other = (PyArrayObject *)other;
+        if (PyArray_TYPE(self) == NPY_OBJECT) {
+            dtype = PyArray_DTYPE(self);
+            Py_INCREF(dtype);
         }
-        else {
-            typenum = PyArray_DESCR(self)->type_num;
-            if (typenum != PyArray_OBJECT) {
-                typenum = PyArray_NOTYPE;
-            }
-            array_other = (PyArrayObject *)PyArray_FromObject(other,
-                                                            typenum, 0, 0);
-            /*
-             * If not successful, then objects cannot be
-             * compared this way
-             */
-            if (array_other == NULL || (PyObject *)array_other == Py_None) {
-                Py_XDECREF(array_other);
-                PyErr_Clear();
-                Py_INCREF(Py_NotImplemented);
-                return Py_NotImplemented;
-            }
+        array_other = (PyArrayObject *)PyArray_FromAny(other, dtype, 0, 0,
+                                                    NPY_ARRAY_ALLOWNA, NULL);
+        /*
+         * If not successful, indicate that the items cannot be compared
+         * this way.
+         */
+        if (array_other == NULL) {
+            PyErr_Clear();
+            Py_INCREF(Py_NotImplemented);
+            return Py_NotImplemented;
         }
+
         result = PyArray_GenericBinaryFunction(self, (PyObject *)array_other,
                 n_ops.not_equal);
         if ((result == Py_NotImplemented) &&
-                (PyArray_DESCR(self)->type_num == PyArray_VOID)) {
+                (PyArray_TYPE(self) == NPY_VOID)) {
             int _res;
 
             _res = PyObject_RichCompareBool(
@@ -1201,7 +1358,7 @@ array_richcompare(PyArrayObject *self, PyObject *other, int cmp_op)
     }
     if (result == Py_NotImplemented) {
         /* Try to handle string comparisons */
-        if (PyArray_DESCR(self)->type_num == PyArray_OBJECT) {
+        if (PyArray_TYPE(self) == PyArray_OBJECT) {
             return result;
         }
         array_other = (PyArrayObject *)PyArray_FromObject(other,
@@ -1504,7 +1661,7 @@ NPY_NO_EXPORT PyTypeObject PyArray_Type = {
     (traverseproc)0,                            /* tp_traverse */
     (inquiry)0,                                 /* tp_clear */
     (richcmpfunc)array_richcompare,             /* tp_richcompare */
-    offsetof(PyArrayObject_fieldaccess, weakreflist), /* tp_weaklistoffset */
+    offsetof(PyArrayObject_fields, weakreflist), /* tp_weaklistoffset */
     (getiterfunc)array_iter,                    /* tp_iter */
     (iternextfunc)0,                            /* tp_iternext */
     array_methods,                              /* tp_methods */
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index cb76d5af9..423298afd 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -18,6 +18,7 @@
 #include "ctors.h"
 #include "usertypes.h"
 #include "_datetime.h"
+#include "na_object.h"
 
 #include "numpyos.h"
 
@@ -142,6 +143,12 @@ static int
     if (PyArray_IsScalar(op, @kind@)) {
         temp = ((Py@kind@ScalarObject *)op)->obval;
     }
+    else if (NpyNA_Check(op) || NpyNA_IsZeroDimArrayNA(op)) {
+        PyErr_SetString(PyExc_ValueError,
+                    "Cannot assign NA to an array which "
+                    "does not support NAs");
+        return -1;
+    }
     else {
         temp = (@type@)@func2@(op);
     }
@@ -204,7 +211,16 @@ static int
     c@type@ temp;
     int rsize;
 
-    if (!(PyArray_IsScalar(op, @kind@))) {
+    if (PyArray_IsScalar(op, @kind@)){
+        temp = ((Py@kind@ScalarObject *)op)->obval;
+    }
+    else if (NpyNA_Check(op) || NpyNA_IsZeroDimArrayNA(op)) {
+        PyErr_SetString(PyExc_ValueError,
+                    "Cannot assign NA to an array which "
+                    "does not support NAs");
+        return -1;
+    }
+    else {
         if (PyArray_Check(op) && (PyArray_NDIM((PyArrayObject *)op) == 0)) {
             op2 = PyArray_DESCR((PyArrayObject *)op)->f->getitem(
                                     PyArray_BYTES((PyArrayObject *)op),
@@ -227,9 +243,7 @@ static int
         temp.real = (@type@) oop.real;
         temp.imag = (@type@) oop.imag;
     }
-    else {
-        temp = ((Py@kind@ScalarObject *)op)->obval;
-    }
+
     memcpy(ov, &temp, PyArray_DESCR(ap)->elsize);
     if (!PyArray_ISNOTSWAPPED(ap)) {
         byte_swap_vector(ov, 2, sizeof(@type@));
@@ -259,6 +273,12 @@ LONGDOUBLE_setitem(PyObject *op, char *ov, PyArrayObject *ap) {
     if (PyArray_IsScalar(op, LongDouble)) {
         temp = ((PyLongDoubleScalarObject *)op)->obval;
     }
+    else if (NpyNA_Check(op) || NpyNA_IsZeroDimArrayNA(op)) {
+        PyErr_SetString(PyExc_ValueError,
+                    "Cannot assign NA to an array which "
+                    "does not support NAs");
+        return -1;
+    }
     else {
         temp = (longdouble) MyPyFloat_AsDouble(op);
     }
@@ -353,6 +373,12 @@ UNICODE_setitem(PyObject *op, char *ov, PyArrayObject *ap)
                 "setting an array element with a sequence");
         return -1;
     }
+    if (NpyNA_Check(op) || NpyNA_IsZeroDimArrayNA(op)) {
+        PyErr_SetString(PyExc_ValueError,
+                    "Cannot assign NA to an array which "
+                    "does not support NAs");
+        return -1;
+    }
     /* Sequence_Size might have returned an error */
     if (PyErr_Occurred()) {
         PyErr_Clear();
@@ -457,6 +483,12 @@ STRING_setitem(PyObject *op, char *ov, PyArrayObject *ap)
                 "cannot set an array element with a sequence");
         return -1;
     }
+    if (NpyNA_Check(op) || NpyNA_IsZeroDimArrayNA(op)) {
+        PyErr_SetString(PyExc_ValueError,
+                    "Cannot assign NA to an array which "
+                    "does not support NAs");
+        return -1;
+    }
 #if defined(NPY_PY3K)
     if (PyUnicode_Check(op)) {
         /* Assume ASCII codec -- function similarly as Python 2 */
@@ -567,14 +599,14 @@ VOID_getitem(char *ip, PyArrayObject *ap)
             tup = PyDict_GetItem(descr->fields, key);
             if (!PyArg_ParseTuple(tup, "Oi|O", &new, &offset, &title)) {
                 Py_DECREF(ret);
-                ((PyArrayObject_fieldaccess *)ap)->descr = descr;
+                ((PyArrayObject_fields *)ap)->descr = descr;
                 return NULL;
             }
             /*
              * TODO: temporarily modifying the array like this
              *       is bad coding style, should be changed.
              */
-            ((PyArrayObject_fieldaccess *)ap)->descr = new;
+            ((PyArrayObject_fields *)ap)->descr = new;
             /* update alignment based on offset */
             if ((new->alignment > 1)
                     && ((((intp)(ip+offset)) % new->alignment) != 0)) {
@@ -584,9 +616,9 @@ VOID_getitem(char *ip, PyArrayObject *ap)
                 PyArray_ENABLEFLAGS(ap, NPY_ARRAY_ALIGNED);
             }
             PyTuple_SET_ITEM(ret, i, new->f->getitem(ip+offset, ap));
-            ((PyArrayObject_fieldaccess *)ap)->flags = savedflags;
+            ((PyArrayObject_fields *)ap)->flags = savedflags;
         }
-        ((PyArrayObject_fieldaccess *)ap)->descr = descr;
+        ((PyArrayObject_fields *)ap)->descr = descr;
         return ret;
     }
 
@@ -697,14 +729,14 @@ VOID_setitem(PyObject *op, char *ip, PyArrayObject *ap)
             key = PyTuple_GET_ITEM(names, i);
             tup = PyDict_GetItem(descr->fields, key);
             if (!PyArg_ParseTuple(tup, "Oi|O", &new, &offset, &title)) {
-                ((PyArrayObject_fieldaccess *)ap)->descr = descr;
+                ((PyArrayObject_fields *)ap)->descr = descr;
                 return -1;
             }
             /*
              * TODO: temporarily modifying the array like this
              *       is bad coding style, should be changed.
              */
-            ((PyArrayObject_fieldaccess *)ap)->descr = new;
+            ((PyArrayObject_fields *)ap)->descr = new;
             /* remember to update alignment flags */
             if ((new->alignment > 1)
                     && ((((intp)(ip+offset)) % new->alignment) != 0)) {
@@ -714,12 +746,12 @@ VOID_setitem(PyObject *op, char *ip, PyArrayObject *ap)
                 PyArray_ENABLEFLAGS(ap, NPY_ARRAY_ALIGNED);
             }
             res = new->f->setitem(PyTuple_GET_ITEM(op, i), ip+offset, ap);
-            ((PyArrayObject_fieldaccess *)ap)->flags = savedflags;
+            ((PyArrayObject_fields *)ap)->flags = savedflags;
             if (res < 0) {
                 break;
             }
         }
-        ((PyArrayObject_fieldaccess *)ap)->descr = descr;
+        ((PyArrayObject_fields *)ap)->descr = descr;
         return res;
     }
 
@@ -752,6 +784,13 @@ VOID_setitem(PyObject *op, char *ip, PyArrayObject *ap)
         return res;
     }
 
+    if (NpyNA_Check(op) || NpyNA_IsZeroDimArrayNA(op)) {
+        PyErr_SetString(PyExc_ValueError,
+                    "Cannot assign NA to an array which "
+                    "does not support NAs");
+        return -1;
+    }
+
     /* Default is to use buffer interface to set item */
     {
         const void *buffer;
@@ -1844,19 +1883,19 @@ VOID_copyswapn (char *dst, intp dstride, char *src, intp sstride,
                 continue;
             }
             if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset, &title)) {
-                ((PyArrayObject_fieldaccess *)arr)->descr = descr;
+                ((PyArrayObject_fields *)arr)->descr = descr;
                 return;
             }
             /*
              * TODO: temporarily modifying the array like this
              *       is bad coding style, should be changed.
              */
-            ((PyArrayObject_fieldaccess *)arr)->descr = new;
+            ((PyArrayObject_fields *)arr)->descr = new;
             new->f->copyswapn(dst+offset, dstride,
                     (src != NULL ? src+offset : NULL),
                     sstride, n, swap, arr);
         }
-        ((PyArrayObject_fieldaccess *)arr)->descr = descr;
+        ((PyArrayObject_fields *)arr)->descr = descr;
         return;
     }
     if (swap && PyArray_DESCR(arr)->subarray != NULL) {
@@ -1872,7 +1911,7 @@ VOID_copyswapn (char *dst, intp dstride, char *src, intp sstride,
          * TODO: temporarily modifying the array like this
          *       is bad coding style, should be changed.
          */
-        ((PyArrayObject_fieldaccess *)arr)->descr = new;
+        ((PyArrayObject_fields *)arr)->descr = new;
         dstptr = dst;
         srcptr = src;
         subitemsize = new->elsize;
@@ -1885,7 +1924,7 @@ VOID_copyswapn (char *dst, intp dstride, char *src, intp sstride,
                 srcptr += sstride;
             }
         }
-        ((PyArrayObject_fieldaccess *)arr)->descr = descr;
+        ((PyArrayObject_fields *)arr)->descr = descr;
         return;
     }
     if (src != NULL) {
@@ -1912,19 +1951,19 @@ VOID_copyswap (char *dst, char *src, int swap, PyArrayObject *arr)
                 continue;
             }
             if (!PyArg_ParseTuple(value, "Oi|O", &new, &offset, &title)) {
-                ((PyArrayObject_fieldaccess *)arr)->descr = descr;
+                ((PyArrayObject_fields *)arr)->descr = descr;
                 return;
             }
             /*
              * TODO: temporarily modifying the array like this
              *       is bad coding style, should be changed.
              */
-            ((PyArrayObject_fieldaccess *)arr)->descr = new;
+            ((PyArrayObject_fields *)arr)->descr = new;
             new->f->copyswap(dst+offset,
                     (src != NULL ? src+offset : NULL),
                     swap, arr);
         }
-        ((PyArrayObject_fieldaccess *)arr)->descr = descr;
+        ((PyArrayObject_fields *)arr)->descr = descr;
         return;
     }
     if (swap && PyArray_DESCR(arr)->subarray != NULL) {
@@ -1938,12 +1977,12 @@ VOID_copyswap (char *dst, char *src, int swap, PyArrayObject *arr)
          * TODO: temporarily modifying the array like this
          *       is bad coding style, should be changed.
          */
-        ((PyArrayObject_fieldaccess *)arr)->descr = new;
+        ((PyArrayObject_fields *)arr)->descr = new;
         itemsize = new->elsize;
         num = descr->elsize / itemsize;
         new->f->copyswapn(dst, itemsize, src,
                 itemsize, num, swap, arr);
-        ((PyArrayObject_fieldaccess *)arr)->descr = descr;
+        ((PyArrayObject_fields *)arr)->descr = descr;
         return;
     }
     if (src != NULL) {
@@ -2219,8 +2258,8 @@ VOID_nonzero (char *ip, PyArrayObject *ap)
              * TODO: temporarily modifying the array like this
              *       is bad coding style, should be changed.
              */
-            ((PyArrayObject_fieldaccess *)ap)->descr = descr;
-            ((PyArrayObject_fieldaccess *)ap)->flags = savedflags;
+            ((PyArrayObject_fields *)ap)->descr = descr;
+            ((PyArrayObject_fields *)ap)->flags = savedflags;
             if ((new->alignment > 1) && !__ALIGNED(ip+offset, new->alignment)) {
                 PyArray_CLEARFLAGS(ap, NPY_ARRAY_ALIGNED);
             }
@@ -2232,8 +2271,8 @@ VOID_nonzero (char *ip, PyArrayObject *ap)
                 break;
             }
         }
-        ((PyArrayObject_fieldaccess *)ap)->descr = descr;
-        ((PyArrayObject_fieldaccess *)ap)->flags = savedflags;
+        ((PyArrayObject_fields *)ap)->descr = descr;
+        ((PyArrayObject_fields *)ap)->flags = savedflags;
         return nonz;
     }
     len = PyArray_DESCR(ap)->elsize;
@@ -2523,7 +2562,7 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap)
          * TODO: temporarily modifying the array like this
          *       is bad coding style, should be changed.
          */
-        ((PyArrayObject_fieldaccess *)ap)->descr = new;
+        ((PyArrayObject_fields *)ap)->descr = new;
         swap = PyArray_ISBYTESWAPPED(ap);
         nip1 = ip1+offset;
         nip2 = ip2+offset;
@@ -2567,7 +2606,7 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap)
     }
 
 finish:
-    ((PyArrayObject_fieldaccess *)ap)->descr = descr;
+    ((PyArrayObject_fields *)ap)->descr = descr;
     return res;
 }
 
diff --git a/numpy/core/src/multiarray/boolean_ops.c.src b/numpy/core/src/multiarray/boolean_ops.c.src
new file mode 100644
index 000000000..f3d87cc34
--- /dev/null
+++ b/numpy/core/src/multiarray/boolean_ops.c.src
@@ -0,0 +1,405 @@
+/*
+ * This file implements some boolean methods which have special NA
+ * interactions, including np.any and np.all. It could later gain
+ * np.logical_or and np.logical_and, as well as np.bitwise_or and
+ * np.bitwise_and, because they are often used for boolean operations.
+ *
+ * NOTE: These functions assume that the input boolean data is valid,
+ *       i.e. that each boolean is either 0 or 1. Any deviation from this
+ *       may produce incorrect answers.
+ *
+ * Written by Mark Wiebe (mwwiebe@gmail.com)
+ * Copyright (c) 2011 by Enthought, Inc.
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API
+#define _MULTIARRAYMODULE
+#include <numpy/arrayobject.h>
+
+#include "npy_config.h"
+#include "numpy/npy_3kcompat.h"
+
+#include "reduction.h"
+
+/* Typedef for the reduction inner loop */
+typedef void (reduce_inner_loop)(char **, npy_intp *, npy_intp);
+
+#define ANY_COMBINE_MASK(val, mask) ((val) & (mask))
+#define ALL_COMBINE_MASK(val, mask) ((val) | ((mask) ^ 0x01))
+
+/**begin repeat
+ * #oper = any, all#
+ * #Oper = Any, All#
+ * #Identity = Zero, One#
+ * #combineop = |=, &=#
+ * #combinemask = ANY_COMBINE_MASK, ALL_COMBINE_MASK#
+ * #shortcircuit = !value, value#
+ * #idval = 0, 1#
+ */
+
+static int
+assign_identity_@oper@(PyArrayObject *result, int preservena, void *data)
+{
+    return PyArray_Assign@Identity@(result, NULL, preservena, NULL);
+}
+
+static void
+@oper@_inner_gen_gen(char **dataptr, npy_intp *strides, npy_intp count)
+{
+    char *data0 = dataptr[0], *data1 = dataptr[1];
+    npy_intp stride0 = strides[0], stride1 = strides[1];
+    npy_intp i;
+
+    for (i = 0; i < count; ++i) {
+        *data0 @combineop@ *data1;
+
+        data0 += stride0;
+        data1 += stride1;
+    }
+}
+
+static void
+@oper@_inner_0stride_gen(char **dataptr, npy_intp *strides, npy_intp count)
+{
+    char *data0 = dataptr[0], *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+    npy_intp i;
+    char value = *data0;
+
+    for (i = 0; i < count && @shortcircuit@; ++i) {
+        value @combineop@ *data1;
+
+        data1 += stride1;
+    }
+
+    *(npy_bool *)data0 = value;
+}
+
+static void
+@oper@_inner_0stride_contig(char **dataptr, npy_intp *strides, npy_intp count)
+{
+    char *data0 = dataptr[0], *data1 = dataptr[1];
+    char value = *data0;
+
+    if (@shortcircuit@ && memchr(data1, 1-@idval@, count) != NULL) {
+        *(npy_bool *)data0 = 1-@idval@;
+    }
+}
+
+static void
+@oper@_inner_contig_contig(char **dataptr, npy_intp *strides, npy_intp count)
+{
+    char *data0 = dataptr[0], *data1 = dataptr[1];
+    npy_intp i;
+
+    for (i = 0; i < count; ++i) {
+        *data0 @combineop@ *data1;
+
+        ++data0;
+        ++data1;
+    }
+}
+
+static int
+reduce_@oper@_loop(NpyIter *iter,
+                char **dataptr,
+                npy_intp *strides,
+                npy_intp *countptr,
+                NpyIter_IterNextFunc *iternext,
+                int needs_api,
+                npy_intp skip_first_count,
+                void *data)
+{
+    npy_intp fixed_strides[2];
+    reduce_inner_loop *inner_loop;
+    NPY_BEGIN_THREADS_DEF;
+
+    NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+
+    if (!needs_api) {
+        NPY_BEGIN_THREADS;
+    }
+
+    /* Choose a loop specialized based on the strides */
+    if (fixed_strides[0] == 0) {
+        if (fixed_strides[1] == 1) {
+            inner_loop = &@oper@_inner_0stride_contig;
+        }
+        else {
+            inner_loop = &@oper@_inner_0stride_gen;
+        }
+    }
+    else {
+        if (fixed_strides[0] == 1 && fixed_strides[1] == 1) {
+            inner_loop = &@oper@_inner_contig_contig;
+        }
+        else {
+            inner_loop = &@oper@_inner_gen_gen;
+        }
+    }
+
+    /*
+     * 'skip_first_count' will always be 0 because we are doing a reduction
+     * with an identity.
+     */
+
+    do {
+        inner_loop(dataptr, strides, *countptr);
+    } while (iternext(iter));
+
+    if (!needs_api) {
+        NPY_END_THREADS;
+    }
+
+    return (needs_api && PyErr_Occurred()) ? -1 : 0;
+}
+
+static void
+@oper@_masked_inner_gen_gen_gen(char **dataptr,
+                                npy_intp *strides, npy_intp count)
+{
+    char *data0 = dataptr[0], *data1 = dataptr[1], *data2 = dataptr[2];
+    npy_intp stride0 = strides[0], stride1 = strides[1], stride2 = strides[2];
+    npy_intp i;
+
+    for (i = 0; i < count; ++i) {
+        *data0 @combineop@ @combinemask@(*data1, *data2);
+
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+    }
+}
+
+static void
+@oper@_masked_inner_0stride_gen_gen(char **dataptr,
+                                npy_intp *strides, npy_intp count)
+{
+    char *data0 = dataptr[0], *data1 = dataptr[1], *data2 = dataptr[2];
+    npy_intp stride1 = strides[1], stride2 = strides[2];
+    npy_intp i;
+    char value = *data0;
+
+    for (i = 0; i < count && @shortcircuit@; ++i) {
+        value @combineop@ @combinemask@(*data1, *data2);
+
+        data1 += stride1;
+        data2 += stride2;
+    }
+
+    *(npy_bool *)data0 = value;
+}
+
+static void
+@oper@_masked_inner_0stride_gen_0stride(char **dataptr,
+                                npy_intp *strides, npy_intp count)
+{
+    char *data0 = dataptr[0], *data1 = dataptr[1], *data2 = dataptr[2];
+    npy_intp stride1 = strides[1];
+    npy_intp i;
+    char maskvalue = *data2;
+
+    if (maskvalue) {
+        char value = *data0;
+
+        for (i = 0; i < count && @shortcircuit@; ++i) {
+            value @combineop@ *data1;
+
+            data1 += stride1;
+        }
+
+        *(npy_bool *)data0 = value;
+    }
+}
+
+static int
+reduce_@oper@_masked_loop(NpyIter *iter,
+                char **dataptr,
+                npy_intp *strides,
+                npy_intp *countptr,
+                NpyIter_IterNextFunc *iternext,
+                int needs_api,
+                npy_intp skip_first_count,
+                void *data)
+{
+    npy_intp fixed_strides[3];
+    reduce_inner_loop *inner_loop;
+    NPY_BEGIN_THREADS_DEF;
+
+    NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+
+    if (!needs_api) {
+        NPY_BEGIN_THREADS;
+    }
+
+    /* Choose a loop specialized based on the strides */
+    if (fixed_strides[0] == 0) {
+        if (fixed_strides[2] == 0) {
+            inner_loop = &@oper@_masked_inner_0stride_gen_0stride;
+        }
+        else {
+            inner_loop = &@oper@_masked_inner_0stride_gen_gen;
+        }
+    }
+    else {
+        inner_loop = &@oper@_masked_inner_gen_gen_gen;
+    }
+
+    /*
+     * 'skip_first_count' will always be 0 because we are doing a reduction
+     * with an identity.
+     */
+
+    do {
+        inner_loop(dataptr, strides, *countptr);
+    } while (iternext(iter));
+
+    if (!needs_api) {
+        NPY_END_THREADS;
+    }
+
+    return (needs_api && PyErr_Occurred()) ? -1 : 0;
+}
+
+static void
+@oper@_adv_masked_inner_gen_gen_gen_gen(char **dataptr,
+                                npy_intp *strides, npy_intp count)
+{
+    char *data0 = dataptr[0], *data1 = dataptr[1];
+    char *data2 = dataptr[2], *data3 = dataptr[3];
+    npy_intp stride0 = strides[0], stride1 = strides[1];
+    npy_intp stride2 = strides[2], stride3 = strides[3];
+    npy_intp i;
+
+    for (i = 0; i < count; ++i) {
+        /* Normal case */
+        if (*data2) {
+            *data0 @combineop@ *data1;
+        }
+        /*
+         * If the value is an exposed True (for any) or False (for all),
+         * expose the result as well
+         */
+        else if (@combinemask@(*data1, *data3) == 1-@idval@) {
+            *data0 = 1-@idval@;
+            *data2 = 1;
+        }
+
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data3 += stride3;
+    }
+}
+
+static void
+@oper@_adv_masked_inner_0stride_gen_0stride_gen(char **dataptr,
+                                npy_intp *strides, npy_intp count)
+{
+    char *data0 = dataptr[0], *data1 = dataptr[1];
+    char *data2 = dataptr[2], *data3 = dataptr[3];
+    npy_intp stride1 = strides[1], stride3 = strides[3];
+    npy_intp i;
+    char maskvalue = *data2;
+    char value = maskvalue ? *data0 : @idval@;
+
+    for (i = 0; i < count && @shortcircuit@; ++i) {
+        /* Normal case */
+        if (maskvalue) {
+            value @combineop@ *data1;
+        }
+        /*
+         * If the value is an exposed True (for any) or False (for all),
+         * expose the result as well
+         */
+        else if (@combinemask@(*data1, *data3) == 1-@idval@) {
+            value = 1-@idval@;
+            maskvalue = 1;
+            break;
+        }
+
+        data1 += stride1;
+        data3 += stride3;
+    }
+
+    if (maskvalue) {
+        *data0 = value;
+        *data2 = maskvalue;
+    }
+}
+
+static int
+reduce_@oper@_advanced_masked_loop(NpyIter *iter,
+                char **dataptr,
+                npy_intp *strides,
+                npy_intp *countptr,
+                NpyIter_IterNextFunc *iternext,
+                int needs_api,
+                npy_intp skip_first_count,
+                void *data)
+{
+    npy_intp fixed_strides[4];
+    reduce_inner_loop *inner_loop;
+    NPY_BEGIN_THREADS_DEF;
+
+    NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+
+    if (!needs_api) {
+        NPY_BEGIN_THREADS;
+    }
+
+    /* Choose a loop specialized based on the strides */
+    if (fixed_strides[0] == 0 && fixed_strides[2] == 0) {
+        inner_loop = &@oper@_adv_masked_inner_0stride_gen_0stride_gen;
+    }
+    else {
+        inner_loop = &@oper@_adv_masked_inner_gen_gen_gen_gen;
+    }
+
+    /*
+     * 'skip_first_count' will always be 0 because we are doing a reduction
+     * with an identity.
+     */
+
+    do {
+        inner_loop(dataptr, strides, *countptr);
+    } while (iternext(iter));
+
+    if (!needs_api) {
+        NPY_END_THREADS;
+    }
+
+    return (needs_api && PyErr_Occurred()) ? -1 : 0;
+}
+
+NPY_NO_EXPORT PyArrayObject *
+PyArray_Reduce@Oper@(PyArrayObject *arr, PyArrayObject *out,
+            npy_bool *axis_flags, int skipna, int keepdims)
+{
+    PyArrayObject *result;
+    PyArray_Descr *bool_dtype;
+
+    bool_dtype = PyArray_DescrFromType(NPY_BOOL);
+    if (bool_dtype == NULL) {
+        return NULL;
+    }
+
+    result = PyArray_ReduceWrapper(arr, out, NULL,
+                        bool_dtype, bool_dtype,
+                        NPY_UNSAFE_CASTING,
+                        axis_flags, 1, skipna, NULL, keepdims, 1,
+                        &assign_identity_@oper@,
+                        &reduce_@oper@_loop,
+                        &reduce_@oper@_masked_loop,
+                        &reduce_@oper@_advanced_masked_loop,
+                        NULL, 0, "@oper@");
+    Py_DECREF(bool_dtype);
+    return result;
+}
+
+/**end repeat**/
diff --git a/numpy/core/src/multiarray/boolean_ops.h b/numpy/core/src/multiarray/boolean_ops.h
new file mode 100644
index 000000000..a6674e2aa
--- /dev/null
+++ b/numpy/core/src/multiarray/boolean_ops.h
@@ -0,0 +1,13 @@
+#ifndef _NPY_PRIVATE__BOOLEAN_OPS_H_
+#define _NPY_PRIVATE__BOOLEAN_OPS_H_
+
+NPY_NO_EXPORT PyArrayObject *
+PyArray_ReduceAny(PyArrayObject *arr, PyArrayObject *out,
+            npy_bool *axis_flags, int skipna, int keepdims);
+
+NPY_NO_EXPORT PyArrayObject *
+PyArray_ReduceAll(PyArrayObject *arr, PyArrayObject *out,
+            npy_bool *axis_flags, int skipna, int keepdims);
+
+
+#endif
diff --git a/numpy/core/src/multiarray/calculation.c b/numpy/core/src/multiarray/calculation.c
index a6cb82ecb..9cc2b1509 100644
--- a/numpy/core/src/multiarray/calculation.c
+++ b/numpy/core/src/multiarray/calculation.c
@@ -152,7 +152,7 @@ PyArray_ArgMin(PyArrayObject *ap, int axis, PyArrayObject *out)
 
     if (PyArray_ISFLEXIBLE(ap)) {
         PyErr_SetString(PyExc_TypeError,
-                        "argmax is unsupported for this type");
+                        "argmin is unsupported for this type");
         return NULL;
     }
     else if (PyArray_ISUNSIGNED(ap)) {
@@ -183,7 +183,8 @@ PyArray_Max(PyArrayObject *ap, int axis, PyArrayObject *out)
     PyArrayObject *arr;
     PyObject *ret;
 
-    if ((arr=(PyArrayObject *)PyArray_CheckAxis(ap, &axis, 0)) == NULL) {
+    arr = (PyArrayObject *)PyArray_CheckAxis(ap, &axis, NPY_ARRAY_ALLOWNA);
+    if (arr == NULL) {
         return NULL;
     }
     ret = PyArray_GenericReduceFunction(arr, n_ops.maximum, axis,
@@ -201,7 +202,8 @@ PyArray_Min(PyArrayObject *ap, int axis, PyArrayObject *out)
     PyArrayObject *arr;
     PyObject *ret;
 
-    if ((arr=(PyArrayObject *)PyArray_CheckAxis(ap, &axis, 0)) == NULL) {
+    arr=(PyArrayObject *)PyArray_CheckAxis(ap, &axis, NPY_ARRAY_ALLOWNA);
+    if (arr == NULL) {
         return NULL;
     }
     ret = PyArray_GenericReduceFunction(arr, n_ops.minimum, axis,
@@ -220,7 +222,8 @@ PyArray_Ptp(PyArrayObject *ap, int axis, PyArrayObject *out)
     PyObject *ret;
     PyObject *obj1 = NULL, *obj2 = NULL;
 
-    if ((arr=(PyArrayObject *)PyArray_CheckAxis(ap, &axis, 0)) == NULL) {
+    arr=(PyArrayObject *)PyArray_CheckAxis(ap, &axis, NPY_ARRAY_ALLOWNA);
+    if (arr == NULL) {
         return NULL;
     }
     obj1 = PyArray_Max(arr, axis, out);
@@ -267,26 +270,27 @@ __New_PyArray_Std(PyArrayObject *self, int axis, int rtype, PyArrayObject *out,
                   int variance, int num)
 {
     PyObject *obj1 = NULL, *obj2 = NULL, *obj3 = NULL;
-    PyArrayObject *arr1 = NULL, *arr2 = NULL, *new = NULL;
+    PyArrayObject *arr1 = NULL, *arr2 = NULL, *arrnew = NULL;
     PyObject *ret = NULL, *newshape = NULL;
     int i, n;
     intp val;
 
-    if ((new = (PyArrayObject *)PyArray_CheckAxis(self, &axis, 0)) == NULL) {
+    arrnew = (PyArrayObject *)PyArray_CheckAxis(self, &axis, NPY_ARRAY_ALLOWNA);
+    if (arrnew == NULL) {
         return NULL;
     }
     /* Compute and reshape mean */
     arr1 = (PyArrayObject *)PyArray_EnsureAnyArray(
-                    PyArray_Mean(new, axis, rtype, NULL));
+                    PyArray_Mean(arrnew, axis, rtype, NULL));
     if (arr1 == NULL) {
-        Py_DECREF(new);
+        Py_DECREF(arrnew);
         return NULL;
     }
-    n = PyArray_NDIM(new);
+    n = PyArray_NDIM(arrnew);
     newshape = PyTuple_New(n);
     if (newshape == NULL) {
         Py_DECREF(arr1);
-        Py_DECREF(new);
+        Py_DECREF(arrnew);
         return NULL;
     }
     for (i = 0; i < n; i++) {
@@ -294,7 +298,7 @@ __New_PyArray_Std(PyArrayObject *self, int axis, int rtype, PyArrayObject *out,
             val = 1;
         }
         else {
-            val = PyArray_DIM(new,i);
+            val = PyArray_DIM(arrnew,i);
         }
         PyTuple_SET_ITEM(newshape, i, PyInt_FromLong((long)val));
     }
@@ -302,16 +306,16 @@ __New_PyArray_Std(PyArrayObject *self, int axis, int rtype, PyArrayObject *out,
     Py_DECREF(arr1);
     Py_DECREF(newshape);
     if (arr2 == NULL) {
-        Py_DECREF(new);
+        Py_DECREF(arrnew);
         return NULL;
     }
 
     /* Compute x = x - mx */
     arr1 = (PyArrayObject *)PyArray_EnsureAnyArray(
-                PyNumber_Subtract((PyObject *)new, (PyObject *)arr2));
+                PyNumber_Subtract((PyObject *)arrnew, (PyObject *)arr2));
     Py_DECREF(arr2);
     if (arr1 == NULL) {
-        Py_DECREF(new);
+        Py_DECREF(arrnew);
         return NULL;
     }
     /* Compute x * x */
@@ -323,7 +327,7 @@ __New_PyArray_Std(PyArrayObject *self, int axis, int rtype, PyArrayObject *out,
         Py_INCREF(arr1);
     }
     if (obj3 == NULL) {
-        Py_DECREF(new);
+        Py_DECREF(arrnew);
         return NULL;
     }
     arr2 = (PyArrayObject *)PyArray_EnsureAnyArray(
@@ -331,7 +335,7 @@ __New_PyArray_Std(PyArrayObject *self, int axis, int rtype, PyArrayObject *out,
     Py_DECREF(arr1);
     Py_DECREF(obj3);
     if (arr2 == NULL) {
-        Py_DECREF(new);
+        Py_DECREF(arrnew);
         return NULL;
     }
     if (PyArray_ISCOMPLEX(arr2)) {
@@ -353,7 +357,7 @@ __New_PyArray_Std(PyArrayObject *self, int axis, int rtype, PyArrayObject *out,
         Py_INCREF(arr2);
     }
     if (obj3 == NULL) {
-        Py_DECREF(new);
+        Py_DECREF(arrnew);
         return NULL;
     }
     /* Compute add.reduce(x*x,axis) */
@@ -362,11 +366,11 @@ __New_PyArray_Std(PyArrayObject *self, int axis, int rtype, PyArrayObject *out,
     Py_DECREF(obj3);
     Py_DECREF(arr2);
     if (obj1 == NULL) {
-        Py_DECREF(new);
+        Py_DECREF(arrnew);
         return NULL;
     }
-    n = PyArray_DIM(new,axis);
-    Py_DECREF(new);
+    n = PyArray_DIM(arrnew,axis);
+    Py_DECREF(arrnew);
     n = (n-num);
     if (n == 0) {
         n = 1;
@@ -404,7 +408,9 @@ __New_PyArray_Std(PyArrayObject *self, int axis, int rtype, PyArrayObject *out,
 
 finish:
     if (out) {
-        if (PyArray_CopyAnyInto(out, (PyArrayObject *)ret) < 0) {
+        if (PyArray_AssignArray(out, (PyArrayObject *)ret,
+                    NULL, NPY_DEFAULT_ASSIGN_CASTING,
+                    0, NULL) < 0) {
             Py_DECREF(ret);
             return NULL;
         }
@@ -422,14 +428,15 @@ finish:
 NPY_NO_EXPORT PyObject *
 PyArray_Sum(PyArrayObject *self, int axis, int rtype, PyArrayObject *out)
 {
-    PyObject *new, *ret;
+    PyObject *arr, *ret;
 
-    if ((new = PyArray_CheckAxis(self, &axis, 0)) == NULL) {
+    arr = PyArray_CheckAxis(self, &axis, NPY_ARRAY_ALLOWNA);
+    if (arr == NULL) {
         return NULL;
     }
-    ret = PyArray_GenericReduceFunction((PyArrayObject *)new, n_ops.add, axis,
+    ret = PyArray_GenericReduceFunction((PyArrayObject *)arr, n_ops.add, axis,
                                         rtype, out);
-    Py_DECREF(new);
+    Py_DECREF(arr);
     return ret;
 }
 
@@ -439,14 +446,16 @@ PyArray_Sum(PyArrayObject *self, int axis, int rtype, PyArrayObject *out)
 NPY_NO_EXPORT PyObject *
 PyArray_Prod(PyArrayObject *self, int axis, int rtype, PyArrayObject *out)
 {
-    PyObject *new, *ret;
+    PyObject *arr, *ret;
 
-    if ((new = PyArray_CheckAxis(self, &axis, 0)) == NULL) {
+    arr = PyArray_CheckAxis(self, &axis, NPY_ARRAY_ALLOWNA);
+    if (arr == NULL) {
         return NULL;
     }
-    ret = PyArray_GenericReduceFunction((PyArrayObject *)new, n_ops.multiply, axis,
+    ret = PyArray_GenericReduceFunction((PyArrayObject *)arr,
+                                        n_ops.multiply, axis,
                                         rtype, out);
-    Py_DECREF(new);
+    Py_DECREF(arr);
     return ret;
 }
 
@@ -456,14 +465,16 @@ PyArray_Prod(PyArrayObject *self, int axis, int rtype, PyArrayObject *out)
 NPY_NO_EXPORT PyObject *
 PyArray_CumSum(PyArrayObject *self, int axis, int rtype, PyArrayObject *out)
 {
-    PyObject *new, *ret;
+    PyObject *arr, *ret;
 
-    if ((new = PyArray_CheckAxis(self, &axis, 0)) == NULL) {
+    arr = PyArray_CheckAxis(self, &axis, NPY_ARRAY_ALLOWNA);
+    if (arr == NULL) {
         return NULL;
     }
-    ret = PyArray_GenericAccumulateFunction((PyArrayObject *)new, n_ops.add, axis,
+    ret = PyArray_GenericAccumulateFunction((PyArrayObject *)arr,
+                                            n_ops.add, axis,
                                             rtype, out);
-    Py_DECREF(new);
+    Py_DECREF(arr);
     return ret;
 }
 
@@ -473,16 +484,17 @@ PyArray_CumSum(PyArrayObject *self, int axis, int rtype, PyArrayObject *out)
 NPY_NO_EXPORT PyObject *
 PyArray_CumProd(PyArrayObject *self, int axis, int rtype, PyArrayObject *out)
 {
-    PyObject *new, *ret;
+    PyObject *arr, *ret;
 
-    if ((new = PyArray_CheckAxis(self, &axis, 0)) == NULL) {
+    arr = PyArray_CheckAxis(self, &axis, NPY_ARRAY_ALLOWNA);
+    if (arr == NULL) {
         return NULL;
     }
 
-    ret = PyArray_GenericAccumulateFunction((PyArrayObject *)new,
+    ret = PyArray_GenericAccumulateFunction((PyArrayObject *)arr,
                                             n_ops.multiply, axis,
                                             rtype, out);
-    Py_DECREF(new);
+    Py_DECREF(arr);
     return ret;
 }
 
@@ -503,24 +515,24 @@ PyArray_Round(PyArrayObject *a, int decimals, PyArrayObject *out)
     if (PyArray_ISCOMPLEX(a)) {
         PyObject *part;
         PyObject *round_part;
-        PyObject *new;
+        PyObject *arr;
         int res;
 
         if (out) {
-            new = (PyObject *)out;
-            Py_INCREF(new);
+            arr = (PyObject *)out;
+            Py_INCREF(arr);
         }
         else {
-            new = PyArray_Copy(a);
-            if (new == NULL) {
+            arr = PyArray_Copy(a);
+            if (arr == NULL) {
                 return NULL;
             }
         }
 
-        /* new.real = a.real.round(decimals) */
-        part = PyObject_GetAttrString(new, "real");
+        /* arr.real = a.real.round(decimals) */
+        part = PyObject_GetAttrString(arr, "real");
         if (part == NULL) {
-            Py_DECREF(new);
+            Py_DECREF(arr);
             return NULL;
         }
         part = PyArray_EnsureAnyArray(part);
@@ -528,20 +540,20 @@ PyArray_Round(PyArrayObject *a, int decimals, PyArrayObject *out)
                                    decimals, NULL);
         Py_DECREF(part);
         if (round_part == NULL) {
-            Py_DECREF(new);
+            Py_DECREF(arr);
             return NULL;
         }
-        res = PyObject_SetAttrString(new, "real", round_part);
+        res = PyObject_SetAttrString(arr, "real", round_part);
         Py_DECREF(round_part);
         if (res < 0) {
-            Py_DECREF(new);
+            Py_DECREF(arr);
             return NULL;
         }
 
-        /* new.imag = a.imag.round(decimals) */
-        part = PyObject_GetAttrString(new, "imag");
+        /* arr.imag = a.imag.round(decimals) */
+        part = PyObject_GetAttrString(arr, "imag");
         if (part == NULL) {
-            Py_DECREF(new);
+            Py_DECREF(arr);
             return NULL;
         }
         part = PyArray_EnsureAnyArray(part);
@@ -549,22 +561,24 @@ PyArray_Round(PyArrayObject *a, int decimals, PyArrayObject *out)
                                    decimals, NULL);
         Py_DECREF(part);
         if (round_part == NULL) {
-            Py_DECREF(new);
+            Py_DECREF(arr);
             return NULL;
         }
-        res = PyObject_SetAttrString(new, "imag", round_part);
+        res = PyObject_SetAttrString(arr, "imag", round_part);
         Py_DECREF(round_part);
         if (res < 0) {
-            Py_DECREF(new);
+            Py_DECREF(arr);
             return NULL;
         }
-        return new;
+        return arr;
     }
     /* do the most common case first */
     if (decimals >= 0) {
         if (PyArray_ISINTEGER(a)) {
             if (out) {
-                if (PyArray_CopyAnyInto(out, a) < 0) {
+                if (PyArray_AssignArray(out, a,
+                            NULL, NPY_DEFAULT_ASSIGN_CASTING,
+                            0, NULL) < 0) {
                     return NULL;
                 }
                 Py_INCREF(out);
@@ -652,15 +666,16 @@ NPY_NO_EXPORT PyObject *
 PyArray_Mean(PyArrayObject *self, int axis, int rtype, PyArrayObject *out)
 {
     PyObject *obj1 = NULL, *obj2 = NULL, *ret;
-    PyArrayObject *new;
+    PyArrayObject *arr;
 
-    if ((new = (PyArrayObject *)PyArray_CheckAxis(self, &axis, 0)) == NULL) {
+    arr = (PyArrayObject *)PyArray_CheckAxis(self, &axis, NPY_ARRAY_ALLOWNA);
+    if (arr == NULL) {
         return NULL;
     }
-    obj1 = PyArray_GenericReduceFunction(new, n_ops.add, axis,
+    obj1 = PyArray_GenericReduceFunction(arr, n_ops.add, axis,
                                          rtype, out);
-    obj2 = PyFloat_FromDouble((double) PyArray_DIM(new,axis));
-    Py_DECREF(new);
+    obj2 = PyFloat_FromDouble((double)PyArray_DIM(arr,axis));
+    Py_DECREF(arr);
     if (obj1 == NULL || obj2 == NULL) {
         Py_XDECREF(obj1);
         Py_XDECREF(obj2);
@@ -687,15 +702,16 @@ PyArray_Mean(PyArrayObject *self, int axis, int rtype, PyArrayObject *out)
 NPY_NO_EXPORT PyObject *
 PyArray_Any(PyArrayObject *self, int axis, PyArrayObject *out)
 {
-    PyObject *new, *ret;
+    PyObject *arr, *ret;
 
-    if ((new = PyArray_CheckAxis(self, &axis, 0)) == NULL) {
+    arr = PyArray_CheckAxis(self, &axis, NPY_ARRAY_ALLOWNA);
+    if (arr == NULL) {
         return NULL;
     }
-    ret = PyArray_GenericReduceFunction((PyArrayObject *)new,
+    ret = PyArray_GenericReduceFunction((PyArrayObject *)arr,
                                         n_ops.logical_or, axis,
                                         PyArray_BOOL, out);
-    Py_DECREF(new);
+    Py_DECREF(arr);
     return ret;
 }
 
@@ -705,15 +721,16 @@ PyArray_Any(PyArrayObject *self, int axis, PyArrayObject *out)
 NPY_NO_EXPORT PyObject *
 PyArray_All(PyArrayObject *self, int axis, PyArrayObject *out)
 {
-    PyObject *new, *ret;
+    PyObject *arr, *ret;
 
-    if ((new = PyArray_CheckAxis(self, &axis, 0)) == NULL) {
+    arr = PyArray_CheckAxis(self, &axis, NPY_ARRAY_ALLOWNA);
+    if (arr == NULL) {
         return NULL;
     }
-    ret = PyArray_GenericReduceFunction((PyArrayObject *)new,
+    ret = PyArray_GenericReduceFunction((PyArrayObject *)arr,
                                         n_ops.logical_and, axis,
                                         PyArray_BOOL, out);
-    Py_DECREF(new);
+    Py_DECREF(arr);
     return ret;
 }
 
@@ -787,6 +804,11 @@ _slow_array_clip(PyArrayObject *self, PyObject *min, PyObject *max, PyArrayObjec
 
 /*NUMPY_API
  * Clip
+ *
+ * TODO: To add NA support, a Clip ufunc should be created, and this
+ *       function should then call that ufunc. 'min' and 'max' can
+ *       default to -inf/+inf, or to the smallest/largest representable
+ *       values of the dtype, respectively.
  */
 NPY_NO_EXPORT PyObject *
 PyArray_Clip(PyArrayObject *self, PyObject *min, PyObject *max, PyArrayObject *out)
@@ -800,15 +822,32 @@ PyArray_Clip(PyArrayObject *self, PyObject *min, PyObject *max, PyArrayObject *o
     char *max_data, *min_data;
     PyObject *zero;
 
+    /* Treat None the same as NULL */
+    if (min == Py_None) {
+        min = NULL;
+    }
+    if (max == Py_None) {
+        max = NULL;
+    }
+
     if ((max == NULL) && (min == NULL)) {
-        PyErr_SetString(PyExc_ValueError, "array_clip: must set either max "\
-                        "or min");
+        PyErr_SetString(PyExc_ValueError,
+                        "array_clip: must set either max or min");
         return NULL;
     }
 
     func = PyArray_DESCR(self)->f->fastclip;
-    if (func == NULL || (min != NULL && !PyArray_CheckAnyScalar(min)) ||
-        (max != NULL && !PyArray_CheckAnyScalar(max))) {
+    /* Trigger the slow array clip for NA support as well */
+    if (func == NULL ||
+            PyArray_HASMASKNA(self) ||
+            (min != NULL &&
+                (!PyArray_CheckAnyScalar(min) ||
+                 (PyArray_Check(min) &&
+                  PyArray_HASMASKNA((PyArrayObject *)min)))) ||
+            (max != NULL &&
+                (!PyArray_CheckAnyScalar(max) ||
+                 (PyArray_Check(max) &&
+                  PyArray_HASMASKNA((PyArrayObject *)max))))) {
         return _slow_array_clip(self, min, max, out);
     }
     /* Use the fast scalar clip function */
@@ -866,7 +905,7 @@ PyArray_Clip(PyArrayObject *self, PyObject *min, PyObject *max, PyArrayObject *o
     /* Convert max to an array */
     if (max != NULL) {
         maxa = (PyArrayObject *)PyArray_FromAny(max, indescr, 0, 0,
-                                         NPY_ARRAY_DEFAULT, NULL);
+                                 NPY_ARRAY_DEFAULT, NULL);
         if (maxa == NULL) {
             return NULL;
         }
@@ -908,7 +947,7 @@ PyArray_Clip(PyArrayObject *self, PyObject *min, PyObject *max, PyArrayObject *o
         /* Convert min to an array */
         Py_INCREF(indescr);
         mina = (PyArrayObject *)PyArray_FromAny(min, indescr, 0, 0,
-                                         NPY_ARRAY_DEFAULT, NULL);
+                                 NPY_ARRAY_DEFAULT, NULL);
         Py_DECREF(min);
         if (mina == NULL) {
             goto fail;
@@ -978,6 +1017,13 @@ PyArray_Clip(PyArrayObject *self, PyObject *min, PyObject *max, PyArrayObject *o
         if (out == NULL) {
             goto fail;
         }
+
+        if ((maxa != NULL && PyArray_HASMASKNA(maxa)) ||
+                                (mina != NULL && PyArray_HASMASKNA(mina))) {
+            if (PyArray_AllocateMaskNA(out, 1, 0, 1) < 0) {
+                goto fail;
+            }
+        }
         outgood = 1;
     }
     else Py_INCREF(out);
@@ -1021,7 +1067,11 @@ PyArray_Clip(PyArrayObject *self, PyObject *min, PyObject *max, PyArrayObject *o
         goto fail;
     }
     if (PyArray_DATA(newout) != PyArray_DATA(newin)) {
-        memcpy(PyArray_DATA(newout), PyArray_DATA(newin), PyArray_NBYTES(newin));
+        if (PyArray_AssignArray(newout, newin,
+                    NULL, NPY_DEFAULT_ASSIGN_CASTING,
+                    0, NULL) < 0) {
+            goto fail;
+        }
     }
 
     /* Now we can call the fast-clip function */
@@ -1071,7 +1121,9 @@ PyArray_Conjugate(PyArrayObject *self, PyArrayObject *out)
     else {
         PyArrayObject *ret;
         if (out) {
-            if (PyArray_CopyAnyInto(out, self) < 0) {
+            if (PyArray_AssignArray(out, self,
+                        NULL, NPY_DEFAULT_ASSIGN_CASTING,
+                        0, NULL) < 0) {
                 return NULL;
             }
             ret = out;
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index 546883ddd..d146b2a51 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -53,132 +53,180 @@ _use_default_type(PyObject *op)
     typenum = -1;
     l = 0;
     type = (PyObject *)Py_TYPE(op);
-    while (l < PyArray_NUMUSERTYPES) {
+    while (l < NPY_NUMUSERTYPES) {
         if (type == (PyObject *)(userdescrs[l]->typeobj)) {
-            typenum = l + PyArray_USERDEF;
+            typenum = l + NPY_USERDEF;
             break;
         }
         l++;
     }
     if (typenum == -1) {
-        typenum = PyArray_OBJECT;
+        typenum = NPY_OBJECT;
     }
     return PyArray_DescrFromType(typenum);
 }
 
-
 /*
- * op is an object to be converted to an ndarray.
+ * Recursively examines the object to determine an appropriate dtype
+ * to use for converting to an ndarray.
+ *
+ * 'obj' is the object to be converted to an ndarray.
+ *
+ * 'maxdims' is the maximum recursion depth.
+ *
+ * 'out_contains_na' gets set to 1 if an np.NA object is encountered.
+ * The NA does not affect the dtype produced, so if this is set to 1
+ * and the result is for an array without NA support, the dtype should
+ * be switched to NPY_OBJECT. When adding multi-NA support, this should
+ * also signal whether just regular NAs or NAs with payloads were seen.
  *
- * minitype is the minimum type-descriptor needed.
+ * 'out_dtype' should be either NULL or a minimal starting dtype when
+ * the function is called. It is updated with the results of type
+ * promotion. This dtype does not get updated when processing NA objects.
+ * This is reset to NULL on failure.
  *
- * max is the maximum number of dimensions -- used for recursive call
- * to avoid infinite recursion...
+ * Returns 0 on success, -1 on failure.
  */
-NPY_NO_EXPORT PyArray_Descr *
-_array_find_type(PyObject *op, PyArray_Descr *minitype, int max)
+NPY_NO_EXPORT int
+PyArray_DTypeFromObject(PyObject *obj, int maxdims, int *out_contains_na,
+                        PyArray_Descr **out_dtype)
 {
-    int l;
+    int i, size;
+    PyArray_Descr *dtype = NULL;
     PyObject *ip;
-    PyArray_Descr *chktype = NULL;
-    PyArray_Descr *outtype;
 #if PY_VERSION_HEX >= 0x02060000
     Py_buffer buffer_view;
 #endif
 
-    /*
-     * These need to come first because if op already carries
-     * a descr structure, then we want it to be the result if minitype
-     * is NULL.
-     */
-    if (PyArray_Check(op)) {
-        chktype = PyArray_DESCR((PyArrayObject *)op);
-        Py_INCREF(chktype);
-        if (minitype == NULL) {
-            return chktype;
+    /* Check if it's an ndarray */
+    if (PyArray_Check(obj)) {
+        /* Check for any NAs in the array */
+        int containsna = PyArray_ContainsNA((PyArrayObject *)obj, NULL, NULL);
+        if (containsna == -1) {
+            goto fail;
+        }
+        else if (containsna) {
+            *out_contains_na = 1;
         }
-        Py_INCREF(minitype);
-        goto finish;
+        dtype = PyArray_DESCR((PyArrayObject *)obj);
+        Py_INCREF(dtype);
+        goto promote_types;
     }
 
-    if (PyArray_IsScalar(op, Generic)) {
-        chktype = PyArray_DescrFromScalar(op);
-        if (minitype == NULL) {
-            return chktype;
+    /* Check if it's a NumPy scalar */
+    if (PyArray_IsScalar(obj, Generic)) {
+        dtype = PyArray_DescrFromScalar(obj);
+        if (dtype == NULL) {
+            goto fail;
         }
-        Py_INCREF(minitype);
-        goto finish;
+        goto promote_types;
     }
 
-    Py_XINCREF(minitype);
-
-    if (max < 0) {
-        goto deflt;
+    /* Check if it's a Python scalar */
+    dtype = _array_find_python_scalar_type(obj);
+    if (dtype != NULL) {
+        goto promote_types;
     }
-    chktype = _array_find_python_scalar_type(op);
-    if (chktype) {
-        goto finish;
+
+    /* Check if it's an NA */
+    if (NpyNA_Check(obj)) {
+        *out_contains_na = 1;
+        return 0;
     }
 
-    if (PyBytes_Check(op)) {
-        chktype = PyArray_DescrNewFromType(PyArray_STRING);
-        chktype->elsize = PyString_GET_SIZE(op);
-        goto finish;
+    /* Check if it's an ASCII string */
+    if (PyBytes_Check(obj)) {
+        int itemsize = PyString_GET_SIZE(obj);
+
+        /* If it's already a big enough string, don't bother type promoting */
+        if (*out_dtype != NULL &&
+                        (*out_dtype)->type_num == NPY_STRING &&
+                        (*out_dtype)->elsize >= itemsize) {
+            return 0;
+        }
+        dtype = PyArray_DescrNewFromType(NPY_STRING);
+        if (dtype == NULL) {
+            goto fail;
+        }
+        dtype->elsize = itemsize;
+        goto promote_types;
     }
 
-    if (PyUnicode_Check(op)) {
-        chktype = PyArray_DescrNewFromType(PyArray_UNICODE);
-        chktype->elsize = PyUnicode_GET_DATA_SIZE(op);
+    /* Check if it's a Unicode string */
+    if (PyUnicode_Check(obj)) {
+        int itemsize = PyUnicode_GET_DATA_SIZE(obj);
 #ifndef Py_UNICODE_WIDE
-        chktype->elsize <<= 1;
+        itemsize <<= 1;
 #endif
-        goto finish;
+
+        /*
+         * If it's already a big enough unicode object,
+         * don't bother type promoting
+         */
+        if (*out_dtype != NULL &&
+                        (*out_dtype)->type_num == NPY_UNICODE &&
+                        (*out_dtype)->elsize >= itemsize) {
+            return 0;
+        }
+        dtype = PyArray_DescrNewFromType(NPY_UNICODE);
+        if (dtype == NULL) {
+            goto fail;
+        }
+        dtype->elsize = itemsize;
+        goto promote_types;
     }
 
 #if PY_VERSION_HEX >= 0x02060000
     /* PEP 3118 buffer interface */
     memset(&buffer_view, 0, sizeof(Py_buffer));
-    if (PyObject_GetBuffer(op, &buffer_view, PyBUF_FORMAT|PyBUF_STRIDES) == 0 ||
-        PyObject_GetBuffer(op, &buffer_view, PyBUF_FORMAT) == 0) {
+    if (PyObject_GetBuffer(obj, &buffer_view, PyBUF_FORMAT|PyBUF_STRIDES) == 0 ||
+        PyObject_GetBuffer(obj, &buffer_view, PyBUF_FORMAT) == 0) {
 
         PyErr_Clear();
-        chktype = _descriptor_from_pep3118_format(buffer_view.format);
+        dtype = _descriptor_from_pep3118_format(buffer_view.format);
         PyBuffer_Release(&buffer_view);
-        if (chktype) {
-            goto finish;
+        if (dtype) {
+            goto promote_types;
         }
     }
-    else if (PyObject_GetBuffer(op, &buffer_view, PyBUF_STRIDES) == 0 ||
-             PyObject_GetBuffer(op, &buffer_view, PyBUF_SIMPLE) == 0) {
+    else if (PyObject_GetBuffer(obj, &buffer_view, PyBUF_STRIDES) == 0 ||
+             PyObject_GetBuffer(obj, &buffer_view, PyBUF_SIMPLE) == 0) {
 
         PyErr_Clear();
-        chktype = PyArray_DescrNewFromType(PyArray_VOID);
-        chktype->elsize = buffer_view.itemsize;
+        dtype = PyArray_DescrNewFromType(NPY_VOID);
+        dtype->elsize = buffer_view.itemsize;
         PyBuffer_Release(&buffer_view);
-        goto finish;
+        goto promote_types;
     }
     else {
         PyErr_Clear();
     }
 #endif
 
-    if ((ip=PyObject_GetAttrString(op, "__array_interface__"))!=NULL) {
+    /* The array interface */
+    ip = PyObject_GetAttrString(obj, "__array_interface__");
+    if (ip != NULL) {
         if (PyDict_Check(ip)) {
-            PyObject *new;
-            new = PyDict_GetItemString(ip, "typestr");
-            if (new && PyString_Check(new)) {
-                chktype =_array_typedescr_fromstr(PyString_AS_STRING(new));
+            PyObject *typestr;
+            typestr = PyDict_GetItemString(ip, "typestr");
+            if (typestr && PyString_Check(typestr)) {
+                dtype =_array_typedescr_fromstr(PyString_AS_STRING(typestr));
+                Py_DECREF(ip);
+                if (dtype == NULL) {
+                    goto fail;
+                }
+                goto promote_types;
             }
         }
         Py_DECREF(ip);
-        if (chktype) {
-            goto finish;
-        }
     }
     else {
         PyErr_Clear();
     }
-    if ((ip=PyObject_GetAttrString(op, "__array_struct__")) != NULL) {
+
+    /* The array struct interface */
+    ip = PyObject_GetAttrString(obj, "__array_struct__");
+    if (ip != NULL) {
         PyArrayInterface *inter;
         char buf[40];
 
@@ -187,112 +235,119 @@ _array_find_type(PyObject *op, PyArray_Descr *minitype, int max)
             if (inter->two == 2) {
                 PyOS_snprintf(buf, sizeof(buf),
                         "|%c%d", inter->typekind, inter->itemsize);
-                chktype = _array_typedescr_fromstr(buf);
+                dtype = _array_typedescr_fromstr(buf);
+                Py_DECREF(ip);
+                if (dtype == NULL) {
+                    goto fail;
+                }
+                goto promote_types;
             }
         }
         Py_DECREF(ip);
-        if (chktype) {
-            goto finish;
-        }
     }
     else {
         PyErr_Clear();
     }
 
+    /* The old buffer interface */
 #if !defined(NPY_PY3K)
-    if (PyBuffer_Check(op)) {
-        chktype = PyArray_DescrNewFromType(PyArray_VOID);
-        chktype->elsize = Py_TYPE(op)->tp_as_sequence->sq_length(op);
+    if (PyBuffer_Check(obj)) {
+        dtype = PyArray_DescrNewFromType(NPY_VOID);
+        if (dtype == NULL) {
+            goto fail;
+        }
+        dtype->elsize = Py_TYPE(obj)->tp_as_sequence->sq_length(obj);
         PyErr_Clear();
-        goto finish;
+        goto promote_types;
     }
 #endif
 
-    if (PyObject_HasAttrString(op, "__array__")) {
-        ip = PyObject_CallMethod(op, "__array__", NULL);
+    /* The __array__ attribute */
+    if (PyObject_HasAttrString(obj, "__array__")) {
+        ip = PyObject_CallMethod(obj, "__array__", NULL);
         if(ip && PyArray_Check(ip)) {
-            chktype = PyArray_DESCR((PyArrayObject *)ip);
-            Py_INCREF(chktype);
+            dtype = PyArray_DESCR((PyArrayObject *)ip);
+            Py_INCREF(dtype);
             Py_DECREF(ip);
-            goto finish;
+            goto promote_types;
         }
         Py_XDECREF(ip);
-        if (PyErr_Occurred()) PyErr_Clear();
+        if (PyErr_Occurred()) {
+            goto fail;
+        }
     }
 
-#if defined(NPY_PY3K)
-    /* FIXME: XXX -- what is the correct thing to do here? */
-#else
-    if (PyInstance_Check(op)) {
-        goto deflt;
+    /* Old-style class instances (Python 2 only) get the default dtype */
+#if !defined(NPY_PY3K)
+    if (PyInstance_Check(obj)) {
+        dtype = _use_default_type(obj);
+        if (dtype == NULL) {
+            goto fail;
+        }
+        else {
+            goto promote_types;
+        }
     }
 #endif
-    if (PySequence_Check(op)) {
-        l = PyObject_Length(op);
-        if (l < 0 && PyErr_Occurred()) {
-            PyErr_Clear();
-            goto deflt;
-        }
-        if (l == 0 && minitype == NULL) {
-            minitype = PyArray_DescrFromType(NPY_DEFAULT_TYPE);
-            if (minitype == NULL) {
-                return NULL;
+
+    /*
+     * If we reached the maximum recursion depth without hitting one
+     * of the above cases, the output dtype should be OBJECT
+     */
+    if (maxdims == 0 || !PySequence_Check(obj)) {
+        if (*out_dtype == NULL || (*out_dtype)->type_num != NPY_OBJECT) {
+            Py_XDECREF(*out_dtype);
+            *out_dtype = PyArray_DescrFromType(NPY_OBJECT);
+            if (*out_dtype == NULL) {
+                return -1;
             }
         }
-        while (--l >= 0) {
-            PyArray_Descr *newtype;
-            ip = PySequence_GetItem(op, l);
-            if (ip==NULL) {
-                PyErr_Clear();
-                goto deflt;
-            }
-            chktype = _array_find_type(ip, minitype, max-1);
-            if (chktype == NULL) {
-                Py_XDECREF(minitype);
-                return NULL;
-            }
-            if (minitype == NULL) {
-                minitype = chktype;
-            }
-            else {
-                newtype = PyArray_PromoteTypes(chktype, minitype);
-                Py_DECREF(minitype);
-                minitype = newtype;
-                Py_DECREF(chktype);
-            }
-            Py_DECREF(ip);
+        return 0;
+    }
+
+    /* Recursive case */
+    size = PySequence_Size(obj);
+    if (size < 0) {
+        goto fail;
+    }
+    /* Recursive call for each sequence item */
+    for (i = 0; i < size; ++i) {
+        ip = PySequence_GetItem(obj, i);
+        if (ip==NULL) {
+            goto fail;
+        }
+        if (PyArray_DTypeFromObject(ip, maxdims - 1,
+                            out_contains_na, out_dtype) < 0) {
+            goto fail;
         }
-        chktype = minitype;
-        minitype = NULL;
-        goto finish;
+        Py_DECREF(ip);
     }
 
+    return 0;
 
- deflt:
-    chktype = _use_default_type(op);
 
- finish:
-    if (minitype == NULL) {
-        outtype = chktype;
+promote_types:
+    /* Set 'out_dtype' if it's NULL */
+    if (*out_dtype == NULL) {
+        *out_dtype = dtype;
+        return 0;
     }
+    /* Do type promotion with 'out_dtype' */
     else {
-        outtype = PyArray_PromoteTypes(chktype, minitype);
-        Py_DECREF(chktype);
-        Py_DECREF(minitype);
-    }
-    if (outtype == NULL) {
-        return NULL;
-    }
-    /*
-     * VOID Arrays should not occur by "default"
-     * unless input was already a VOID
-     */
-    if (outtype->type_num == PyArray_VOID &&
-            (minitype == NULL || minitype->type_num != PyArray_VOID)) {
-        Py_DECREF(outtype);
-        return PyArray_DescrFromType(NPY_OBJECT);
+        PyArray_Descr *res_dtype = PyArray_PromoteTypes(dtype, *out_dtype);
+        Py_DECREF(dtype);
+        if (res_dtype == NULL) {
+            return -1;
+        }
+        Py_DECREF(*out_dtype);
+        *out_dtype = res_dtype;
+        return 0;
     }
-    return outtype;
+
+fail:
+    Py_XDECREF(*out_dtype);
+    *out_dtype = NULL;
+    return -1;
 }
 
 /* new reference */
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 8242a0d18..248d752f6 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -3,9 +3,34 @@
 
 #define error_converting(x)  (((x) == -1) && PyErr_Occurred())
 
-NPY_NO_EXPORT PyArray_Descr *
-_array_find_type(PyObject *op, PyArray_Descr *minitype, int max);
+/*
+ * Recursively examines the object to determine an appropriate dtype
+ * to use for converting to an ndarray.
+ *
+ * 'obj' is the object to be converted to an ndarray.
+ *
+ * 'maxdims' is the maximum recursion depth.
+ *
+ * 'out_contains_na' gets set to 1 if an np.NA object is encountered.
+ * The NA does not affect the dtype produced, so if this is set to 1
+ * and the result is for an array without NA support, the dtype should
+ * be switched to NPY_OBJECT. When adding multi-NA support, this should
+ * also signal whether just regular NAs or NAs with payloads were seen.
+ *
+ * 'out_dtype' should be either NULL or a minimal starting dtype when
+ * the function is called. It is updated with the results of type
+ * promotion. This dtype does not get updated when processing NA objects.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_DTypeFromObject(PyObject *obj, int maxdims, int *out_contains_na,
+                        PyArray_Descr **out_dtype);
 
+/*
+ * Returns NULL without setting an exception if no scalar is matched, a
+ * new dtype reference otherwise.
+ */
 NPY_NO_EXPORT PyArray_Descr *
 _array_find_python_scalar_type(PyObject *op);
 
@@ -13,7 +38,7 @@ NPY_NO_EXPORT PyArray_Descr *
 _array_typedescr_fromstr(char *str);
 
 NPY_NO_EXPORT char *
-index2ptr(PyArrayObject *mp, intp i);
+index2ptr(PyArrayObject *mp, npy_intp i);
 
 NPY_NO_EXPORT int
 _zerofill(PyArrayObject *ret);
@@ -21,7 +46,7 @@ _zerofill(PyArrayObject *ret);
 NPY_NO_EXPORT int
 _IsAligned(PyArrayObject *ap);
 
-NPY_NO_EXPORT Bool
+NPY_NO_EXPORT npy_bool
 _IsWriteable(PyArrayObject *ap);
 
 #ifndef Py_UNICODE_WIDE
diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c
index 74d7e1192..7823b6960 100644
--- a/numpy/core/src/multiarray/conversion_utils.c
+++ b/numpy/core/src/multiarray/conversion_utils.c
@@ -29,19 +29,54 @@
  * PyArg_ParseTuple.  It will immediately return an object of array type
  * or will convert to a NPY_ARRAY_CARRAY any other object.
  *
+ * This function does not let arrays with NA support pass through, so that
+ * code which doesn't support NA continues to work as is.
+ *
  * If you use PyArray_Converter, you must DECREF the array when finished
  * as you get a new reference to it.
  */
 NPY_NO_EXPORT int
 PyArray_Converter(PyObject *object, PyObject **address)
 {
+    if (PyArray_Check(object) && !PyArray_HASMASKNA((PyArrayObject *)object)) {
+        *address = object;
+        Py_INCREF(object);
+        return PY_SUCCEED;
+    }
+    else {
+        *address = PyArray_FromAny(object, NULL, 0, 0,
+                                NPY_ARRAY_CARRAY, NULL);
+        if (*address == NULL) {
+            return PY_FAIL;
+        }
+        return PY_SUCCEED;
+    }
+}
+
+/*NUMPY_API
+ *
+ * Useful to pass as converter function for O& processing in PyArg_ParseTuple.
+ *
+ * This conversion function can be used with the "O&" argument for
+ * PyArg_ParseTuple.  It will immediately return an object of array type
+ * or will convert to a NPY_ARRAY_CARRAY any other object.
+ *
+ * This function allows NA-arrays through.
+ *
+ * If you use PyArray_AllowNAConverter, you must DECREF the array when finished
+ * as you get a new reference to it.
+ */
+NPY_NO_EXPORT int
+PyArray_AllowNAConverter(PyObject *object, PyObject **address)
+{
     if (PyArray_Check(object)) {
         *address = object;
         Py_INCREF(object);
         return PY_SUCCEED;
     }
     else {
-        *address = PyArray_FromAny(object, NULL, 0, 0, NPY_ARRAY_CARRAY, NULL);
+        *address = PyArray_FromAny(object, NULL, 0, 0,
+                                NPY_ARRAY_CARRAY | NPY_ARRAY_ALLOWNA, NULL);
         if (*address == NULL) {
             return PY_FAIL;
         }
@@ -61,6 +96,36 @@ PyArray_OutputConverter(PyObject *object, PyArrayObject **address)
         return PY_SUCCEED;
     }
     if (PyArray_Check(object)) {
+        if (PyArray_HASMASKNA((PyArrayObject *)object)) {
+            PyErr_SetString(PyExc_TypeError,
+                            "this operation does not yet support output "
+                            "arrays with NA support");
+            *address = NULL;
+            return PY_FAIL;
+        }
+        *address = (PyArrayObject *)object;
+        return PY_SUCCEED;
+    }
+    else {
+        PyErr_SetString(PyExc_TypeError,
+                        "output must be an array");
+        *address = NULL;
+        return PY_FAIL;
+    }
+}
+
+/*NUMPY_API
+ * Useful to pass as converter function for O& processing in
+ * PyArg_ParseTuple for output arrays
+ */
+NPY_NO_EXPORT int
+PyArray_OutputAllowNAConverter(PyObject *object, PyArrayObject **address)
+{
+    if (object == NULL || object == Py_None) {
+        *address = NULL;
+        return PY_SUCCEED;
+    }
+    if (PyArray_Check(object)) {
         *address = (PyArrayObject *)object;
         return PY_SUCCEED;
     }
@@ -104,9 +169,9 @@ PyArray_IntpConverter(PyObject *obj, PyArray_Dims *seq)
                         "expected sequence object with len >= 0");
         return PY_FAIL;
     }
-    if (len > MAX_DIMS) {
-        PyErr_Format(PyExc_ValueError, "sequence too large; "   \
-                     "must be smaller than %d", MAX_DIMS);
+    if (len > NPY_MAXDIMS) {
+        PyErr_Format(PyExc_ValueError, "sequence too large; "
+                     "must be smaller than %d", NPY_MAXDIMS);
         return PY_FAIL;
     }
     if (len > 0) {
@@ -177,12 +242,14 @@ PyArray_BufferConverter(PyObject *obj, PyArray_Chunk *buf)
 
 /*NUMPY_API
  * Get axis from an object (possibly None) -- a converter function,
+ *
+ * See also PyArray_ConvertMultiAxis, which also handles a tuple of axes.
  */
 NPY_NO_EXPORT int
 PyArray_AxisConverter(PyObject *obj, int *axis)
 {
     if (obj == Py_None) {
-        *axis = MAX_DIMS;
+        *axis = NPY_MAXDIMS;
     }
     else {
         *axis = (int) PyInt_AsLong(obj);
@@ -193,6 +260,94 @@ PyArray_AxisConverter(PyObject *obj, int *axis)
     return PY_SUCCEED;
 }
 
+/*
+ * Converts an axis parameter into an ndim-length C-array of
+ * boolean flags, True for each axis specified.
+ *
+ * If axis_in is None or NULL, every flag is set to True. If axis_in is a
+ * tuple, the flag for each axis in the tuple is set to True. If axis_in is
+ * an integer, just the flag for that axis is set to True.
+ */
+NPY_NO_EXPORT int
+PyArray_ConvertMultiAxis(PyObject *axis_in, int ndim, npy_bool *out_axis_flags)
+{
+    /* None means all of the axes */
+    if (axis_in == Py_None || axis_in == NULL) {
+        memset(out_axis_flags, 1, ndim);
+        return NPY_SUCCEED;
+    }
+    /* A tuple of which axes */
+    else if (PyTuple_Check(axis_in)) {
+        int i, naxes;
+
+        memset(out_axis_flags, 0, ndim);
+
+        naxes = PyTuple_Size(axis_in);
+        if (naxes < 0) {
+            return NPY_FAIL;
+        }
+        for (i = 0; i < naxes; ++i) {
+            PyObject *tmp = PyTuple_GET_ITEM(axis_in, i);
+            long axis = PyInt_AsLong(tmp);
+            long axis_orig = axis;
+            if (axis == -1 && PyErr_Occurred()) {
+                return NPY_FAIL;
+            }
+            if (axis < 0) {
+                axis += ndim;
+            }
+            if (axis < 0 || axis >= ndim) {
+                PyErr_Format(PyExc_ValueError,
+                        "'axis' entry %ld is out of bounds [-%d, %d)",
+                        axis_orig, ndim, ndim);
+                return NPY_FAIL;
+            }
+            if (out_axis_flags[axis]) {
+                PyErr_SetString(PyExc_ValueError,
+                        "duplicate value in 'axis'");
+                return NPY_FAIL;
+            }
+            out_axis_flags[axis] = 1;
+        }
+
+        return NPY_SUCCEED;
+    }
+    /* Try to interpret axis as an integer */
+    else {
+        long axis, axis_orig;
+
+        memset(out_axis_flags, 0, ndim);
+
+        axis = PyInt_AsLong(axis_in);
+        axis_orig = axis;
+        /* TODO: PyNumber_Index would be good to use here */
+        if (axis == -1 && PyErr_Occurred()) {
+            return NPY_FAIL;
+        }
+        if (axis < 0) {
+            axis += ndim;
+        }
+        /*
+         * Special case letting axis={-1,0} slip through for scalars,
+         * for backwards compatibility reasons.
+         */
+        if (ndim == 0 && (axis == 0 || axis == -1)) {
+            return NPY_SUCCEED;
+        }
+
+        if (axis < 0 || axis >= ndim) {
+            PyErr_Format(PyExc_ValueError,
+                    "'axis' entry %ld is out of bounds [-%d, %d)",
+                    axis_orig, ndim, ndim);
+            return NPY_FAIL;
+        }
+
+        out_axis_flags[axis] = 1;
+
+        return NPY_SUCCEED;
+    }
+}
+
 /*NUMPY_API
  * Convert an object to true / false
  */
@@ -349,6 +504,223 @@ PyArray_SearchsideConverter(PyObject *obj, void *addr)
     return PY_SUCCEED;
 }
 
+/*NUMPY_API
+ * Convert an object to FORTRAN / C / ANY / KEEP
+ */
+NPY_NO_EXPORT int
+PyArray_OrderConverter(PyObject *object, NPY_ORDER *val)
+{
+    char *str;
+    /* Leave the desired default from the caller for NULL/Py_None */
+    if (object == NULL || object == Py_None) {
+        return PY_SUCCEED;
+    }
+    else if (PyUnicode_Check(object)) {
+        PyObject *tmp;
+        int ret;
+        tmp = PyUnicode_AsASCIIString(object);
+        ret = PyArray_OrderConverter(tmp, val);
+        Py_DECREF(tmp);
+        return ret;
+    }
+    else if (!PyBytes_Check(object) || PyBytes_GET_SIZE(object) < 1) {
+        if (PyObject_IsTrue(object)) {
+            *val = NPY_FORTRANORDER;
+        }
+        else {
+            *val = NPY_CORDER;
+        }
+        if (PyErr_Occurred()) {
+            return PY_FAIL;
+        }
+        return PY_SUCCEED;
+    }
+    else {
+        str = PyBytes_AS_STRING(object);
+        if (str[0] == 'C' || str[0] == 'c') {
+            *val = NPY_CORDER;
+        }
+        else if (str[0] == 'F' || str[0] == 'f') {
+            *val = NPY_FORTRANORDER;
+        }
+        else if (str[0] == 'A' || str[0] == 'a') {
+            *val = NPY_ANYORDER;
+        }
+        else if (str[0] == 'K' || str[0] == 'k') {
+            *val = NPY_KEEPORDER;
+        }
+        else {
+            PyErr_SetString(PyExc_TypeError,
+                            "order not understood");
+            return PY_FAIL;
+        }
+    }
+    return PY_SUCCEED;
+}
+
+/*NUMPY_API
+ * Convert an object to NPY_RAISE / NPY_CLIP / NPY_WRAP
+ */
+NPY_NO_EXPORT int
+PyArray_ClipmodeConverter(PyObject *object, NPY_CLIPMODE *val)
+{
+    if (object == NULL || object == Py_None) {
+        *val = NPY_RAISE;
+    }
+    else if (PyBytes_Check(object)) {
+        char *str;
+        str = PyBytes_AS_STRING(object);
+        if (str[0] == 'C' || str[0] == 'c') {
+            *val = NPY_CLIP;
+        }
+        else if (str[0] == 'W' || str[0] == 'w') {
+            *val = NPY_WRAP;
+        }
+        else if (str[0] == 'R' || str[0] == 'r') {
+            *val = NPY_RAISE;
+        }
+        else {
+            PyErr_SetString(PyExc_TypeError,
+                            "clipmode not understood");
+            return PY_FAIL;
+        }
+    }
+    else if (PyUnicode_Check(object)) {
+        PyObject *tmp;
+        int ret;
+        tmp = PyUnicode_AsASCIIString(object);
+        ret = PyArray_ClipmodeConverter(tmp, val);
+        Py_DECREF(tmp);
+        return ret;
+    }
+    else {
+        int number = PyInt_AsLong(object);
+        if (number == -1 && PyErr_Occurred()) {
+            goto fail;
+        }
+        if (number <= (int) NPY_RAISE
+                && number >= (int) NPY_CLIP) {
+            *val = (NPY_CLIPMODE) number;
+        }
+        else {
+            goto fail;
+        }
+    }
+    return PY_SUCCEED;
+
+ fail:
+    PyErr_SetString(PyExc_TypeError,
+                    "clipmode not understood");
+    return PY_FAIL;
+}
+
+/*NUMPY_API
+ * Convert an object to an array of n NPY_CLIPMODE values.
+ * This is intended to be used in functions where a different mode
+ * could be applied to each axis, like in ravel_multi_index.
+ */
+NPY_NO_EXPORT int
+PyArray_ConvertClipmodeSequence(PyObject *object, NPY_CLIPMODE *modes, int n)
+{
+    int i;
+    /* Get the clip mode(s) */
+    if (object && (PyTuple_Check(object) || PyList_Check(object))) {
+        if (PySequence_Size(object) != n) {
+            PyErr_Format(PyExc_ValueError,
+                    "list of clipmodes has wrong length (%d instead of %d)",
+                    (int)PySequence_Size(object), n);
+            return PY_FAIL;
+        }
+
+        for (i = 0; i < n; ++i) {
+            PyObject *item = PySequence_GetItem(object, i);
+            if(item == NULL) {
+                return PY_FAIL;
+            }
+
+            if(PyArray_ClipmodeConverter(item, &modes[i]) != PY_SUCCEED) {
+                Py_DECREF(item);
+                return PY_FAIL;
+            }
+
+            Py_DECREF(item);
+        }
+    }
+    else if (PyArray_ClipmodeConverter(object, &modes[0]) == PY_SUCCEED) {
+        for (i = 1; i < n; ++i) {
+            modes[i] = modes[0];
+        }
+    }
+    else {
+        return PY_FAIL;
+    }
+    return PY_SUCCEED;
+}
+
+/*NUMPY_API
+ * Convert any Python object, *obj*, to an NPY_CASTING enum.
+ */
+NPY_NO_EXPORT int
+PyArray_CastingConverter(PyObject *obj, NPY_CASTING *casting)
+{
+    char *str = NULL;
+    Py_ssize_t length = 0;
+
+    if (PyUnicode_Check(obj)) {
+        PyObject *str_obj;
+        int ret;
+        str_obj = PyUnicode_AsASCIIString(obj);
+        if (str_obj == NULL) {
+            return 0;
+        }
+        ret = PyArray_CastingConverter(str_obj, casting);
+        Py_DECREF(str_obj);
+        return ret;
+    }
+
+    if (PyBytes_AsStringAndSize(obj, &str, &length) == -1) {
+        return 0;
+    }
+
+    if (length >= 2) switch (str[2]) {
+        case 0:
+            if (strcmp(str, "no") == 0) {
+                *casting = NPY_NO_CASTING;
+                return 1;
+            }
+            break;
+        case 'u':
+            if (strcmp(str, "equiv") == 0) {
+                *casting = NPY_EQUIV_CASTING;
+                return 1;
+            }
+            break;
+        case 'f':
+            if (strcmp(str, "safe") == 0) {
+                *casting = NPY_SAFE_CASTING;
+                return 1;
+            }
+            break;
+        case 'm':
+            if (strcmp(str, "same_kind") == 0) {
+                *casting = NPY_SAME_KIND_CASTING;
+                return 1;
+            }
+            break;
+        case 's':
+            if (strcmp(str, "unsafe") == 0) {
+                *casting = NPY_UNSAFE_CASTING;
+                return 1;
+            }
+            break;
+    }
+
+    PyErr_SetString(PyExc_ValueError,
+            "casting must be one of 'no', 'equiv', 'safe', "
+            "'same_kind', or 'unsafe'");
+    return 0;
+}
+
 /*****************************
 * Other conversion functions
 *****************************/
@@ -405,7 +777,7 @@ PyArray_PyIntAsInt(PyObject *o)
         goto finish;
     }
 #endif
-    if (Py_TYPE(o)->tp_as_number != NULL &&         \
+    if (Py_TYPE(o)->tp_as_number != NULL &&
         Py_TYPE(o)->tp_as_number->nb_int != NULL) {
         obj = Py_TYPE(o)->tp_as_number->nb_int(o);
         if (obj == NULL) {
@@ -415,7 +787,7 @@ PyArray_PyIntAsInt(PyObject *o)
         Py_DECREF(obj);
     }
 #if !defined(NPY_PY3K)
-    else if (Py_TYPE(o)->tp_as_number != NULL &&                    \
+    else if (Py_TYPE(o)->tp_as_number != NULL &&
              Py_TYPE(o)->tp_as_number->nb_long != NULL) {
         obj = Py_TYPE(o)->tp_as_number->nb_long(o);
         if (obj == NULL) {
diff --git a/numpy/core/src/multiarray/conversion_utils.h b/numpy/core/src/multiarray/conversion_utils.h
index 64b26b23e..e344a19b0 100644
--- a/numpy/core/src/multiarray/conversion_utils.h
+++ b/numpy/core/src/multiarray/conversion_utils.h
@@ -2,12 +2,6 @@
 #define _NPY_PRIVATE_CONVERSION_UTILS_H_
 
 NPY_NO_EXPORT int
-PyArray_Converter(PyObject *object, PyObject **address);
-
-NPY_NO_EXPORT int
-PyArray_OutputConverter(PyObject *object, PyArrayObject **address);
-
-NPY_NO_EXPORT int
 PyArray_IntpConverter(PyObject *obj, PyArray_Dims *seq);
 
 NPY_NO_EXPORT int
@@ -40,4 +34,15 @@ PyArray_TypestrConvert(int itemsize, int gentype);
 NPY_NO_EXPORT PyObject *
 PyArray_IntTupleFromIntp(int len, intp *vals);
 
+/*
+ * Converts an axis parameter into an ndim-length C-array of
+ * boolean flags, True for each axis specified.
+ *
+ * If axis_in is None or NULL, every flag is set to True. If axis_in is a
+ * tuple, the flag for each axis in the tuple is set to True. If axis_in is
+ * an integer, just the flag for that axis is set to True.
+ */
+NPY_NO_EXPORT int
+PyArray_ConvertMultiAxis(PyObject *axis_in, int ndim, npy_bool *out_axis_flags);
+
 #endif
diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c
index d399912a6..e69714088 100644
--- a/numpy/core/src/multiarray/convert.c
+++ b/numpy/core/src/multiarray/convert.c
@@ -15,6 +15,7 @@
 #include "arrayobject.h"
 #include "mapping.h"
 #include "lowlevel_strided_loops.h"
+#include "scalartypes.h"
 
 #include "convert.h"
 
@@ -308,174 +309,227 @@ PyArray_ToString(PyArrayObject *self, NPY_ORDER order)
 NPY_NO_EXPORT int
 PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj)
 {
-    PyArrayObject *newarr;
-    int itemsize, swap;
-    void *fromptr;
-    PyArray_Descr *descr;
-    intp size;
-    PyArray_CopySwapFunc *copyswap;
-
-    itemsize = PyArray_DESCR(arr)->elsize;
-    if (PyArray_ISOBJECT(arr)) {
-        fromptr = &obj;
-        swap = 0;
-        newarr = NULL;
-    }
-    else {
-        descr = PyArray_DESCR(arr);
-        Py_INCREF(descr);
-        newarr = (PyArrayObject *)PyArray_FromAny(obj, descr,
-                                        0,0, NPY_ARRAY_ALIGNED, NULL);
-        if (newarr == NULL) {
+    PyArray_Descr *dtype = NULL;
+    npy_longlong value_buffer[4];
+    char *value = NULL;
+    int retcode = 0;
+
+    /*
+     * If 'arr' is an object array, copy the object as is unless
+     * 'obj' is a zero-dimensional array, in which case we copy
+     * the element in that array instead.
+     */
+    if (PyArray_DESCR(arr)->type_num == NPY_OBJECT &&
+                        !(PyArray_Check(obj) &&
+                          PyArray_NDIM((PyArrayObject *)obj) == 0)) {
+        value = (char *)&obj;
+
+        dtype = PyArray_DescrFromType(NPY_OBJECT);
+        if (dtype == NULL) {
             return -1;
         }
-        fromptr = PyArray_DATA(newarr);
-        swap = (PyArray_ISNOTSWAPPED(arr) != PyArray_ISNOTSWAPPED(newarr));
     }
-    size=PyArray_SIZE(arr);
-    copyswap = PyArray_DESCR(arr)->f->copyswap;
-    if (PyArray_ISONESEGMENT(arr)) {
-        char *toptr=PyArray_DATA(arr);
-        PyArray_FillWithScalarFunc* fillwithscalar =
-            PyArray_DESCR(arr)->f->fillwithscalar;
-        if (fillwithscalar && PyArray_ISALIGNED(arr)) {
-            copyswap(fromptr, NULL, swap, newarr);
-            fillwithscalar(toptr, size, fromptr, arr);
+    /* NumPy scalar */
+    else if (PyArray_IsScalar(obj, Generic)) {
+        dtype = PyArray_DescrFromScalar(obj);
+        if (dtype == NULL) {
+            return -1;
         }
-        else {
-            while (size--) {
-                copyswap(toptr, fromptr, swap, arr);
-                toptr += itemsize;
-            }
+        value = scalar_value(obj, dtype);
+        if (value == NULL) {
+            Py_DECREF(dtype);
+            return -1;
         }
     }
-    else {
-        PyArrayIterObject *iter;
+    /* Python boolean */
+    else if (PyBool_Check(obj)) {
+        value = (char *)value_buffer;
+        *value = (obj == Py_True);
 
-        iter = (PyArrayIterObject *)\
-            PyArray_IterNew((PyObject *)arr);
-        if (iter == NULL) {
-            Py_XDECREF(newarr);
+        dtype = PyArray_DescrFromType(NPY_BOOL);
+        if (dtype == NULL) {
             return -1;
         }
-        while (size--) {
-            copyswap(iter->dataptr, fromptr, swap, arr);
-            PyArray_ITER_NEXT(iter);
-        }
-        Py_DECREF(iter);
     }
-    Py_XDECREF(newarr);
-    return 0;
-}
+    /* Python integer */
+    else if (PyLong_Check(obj) || PyInt_Check(obj)) {
+        npy_longlong v = PyLong_AsLongLong(obj);
+        if (v == -1 && PyErr_Occurred()) {
+            return -1;
+        }
+        value = (char *)value_buffer;
+        *(npy_longlong *)value = v;
 
-/*
- * Fills an array with zeros.
- *
- * Returns 0 on success, -1 on failure.
- */
-NPY_NO_EXPORT int
-PyArray_FillWithZero(PyArrayObject *a)
-{
-    PyArray_StridedTransferFn *stransfer = NULL;
-    NpyAuxData *transferdata = NULL;
-    PyArray_Descr *dtype = PyArray_DESCR(a);
-    NpyIter *iter;
+        dtype = PyArray_DescrFromType(NPY_LONGLONG);
+        if (dtype == NULL) {
+            return -1;
+        }
+    }
+    /* Python float */
+    else if (PyFloat_Check(obj)) {
+        npy_double v = PyFloat_AsDouble(obj);
+        if (v == -1 && PyErr_Occurred()) {
+            return -1;
+        }
+        value = (char *)value_buffer;
+        *(npy_double *)value = v;
 
-    NpyIter_IterNextFunc *iternext;
-    char **dataptr;
-    npy_intp stride, *countptr;
-    int needs_api;
+        dtype = PyArray_DescrFromType(NPY_DOUBLE);
+        if (dtype == NULL) {
+            return -1;
+        }
+    }
+    /* Python complex */
+    else if (PyComplex_Check(obj)) {
+        npy_double re, im;
 
-    NPY_BEGIN_THREADS_DEF;
+        re = PyComplex_RealAsDouble(obj);
+        if (re == -1 && PyErr_Occurred()) {
+            return -1;
+        }
+        im = PyComplex_ImagAsDouble(obj);
+        if (im == -1 && PyErr_Occurred()) {
+            return -1;
+        }
+        value = (char *)value_buffer;
+        ((npy_double *)value)[0] = re;
+        ((npy_double *)value)[1] = im;
 
-    if (!PyArray_ISWRITEABLE(a)) {
-        PyErr_SetString(PyExc_RuntimeError, "cannot write to array");
-        return -1;
+        dtype = PyArray_DescrFromType(NPY_CDOUBLE);
+        if (dtype == NULL) {
+            return -1;
+        }
     }
 
-    /* A zero-sized array needs no zeroing */
-    if (PyArray_SIZE(a) == 0) {
-        return 0;
+    /* Use the value pointer we got if possible */
+    if (value != NULL) {
+        /* TODO: switch to SAME_KIND casting */
+        retcode = PyArray_AssignRawScalar(arr, dtype, value,
+                                NULL, NPY_UNSAFE_CASTING, 0, NULL);
+        Py_DECREF(dtype);
+        return retcode;
     }
+    /* Otherwise convert to an array to do the assignment */
+    else {
+        PyArrayObject *src_arr;
 
-    /* If it's possible to do a simple memset, do so */
-    if (!PyDataType_REFCHK(dtype) && (PyArray_ISCONTIGUOUS(a) ||
-                                      PyArray_ISFORTRAN(a))) {
-        memset(PyArray_DATA(a), 0, PyArray_NBYTES(a));
-        return 0;
-    }
+        src_arr = (PyArrayObject *)PyArray_FromAny(obj, NULL, 0, 0,
+                                            NPY_ARRAY_ALLOWNA, NULL);
+        if (src_arr == NULL) {
+            return -1;
+        }
 
-    /* Use an iterator to go through all the data */
-    iter = NpyIter_New(a, NPY_ITER_WRITEONLY|NPY_ITER_EXTERNAL_LOOP,
-                    NPY_KEEPORDER, NPY_NO_CASTING, NULL);
+        if (PyArray_NDIM(src_arr) != 0) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Input object to FillWithScalar is not a scalar");
+            Py_DECREF(src_arr);
+            return -1;
+        }
 
-    if (iter == NULL) {
-        return -1;
-    }
+        retcode = PyArray_CopyInto(arr, src_arr);
 
-    iternext = NpyIter_GetIterNext(iter, NULL);
-    if (iternext == NULL) {
-        NpyIter_Deallocate(iter);
-        return -1;
+        Py_DECREF(src_arr);
+        return retcode;
     }
-    dataptr = NpyIter_GetDataPtrArray(iter);
-    stride = NpyIter_GetInnerStrideArray(iter)[0];
-    countptr = NpyIter_GetInnerLoopSizePtr(iter);
+}
 
-    needs_api = NpyIter_IterationNeedsAPI(iter);
+/*NUMPY_API
+ *
+ * Fills an array with zeros.
+ *
+ * dst: The destination array.
+ * wheremask: If non-NULL, a boolean mask specifying where to set the values.
+ * preservena: If 0, overwrites everything in 'dst', if 1, it
+ *              preserves elements in 'dst' which are NA.
+ * preservewhichna: Must be NULL. When multi-NA support is implemented,
+ *                   this will be an array of flags for 'preservena=True',
+ *                   indicating which NA payload values to preserve.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_AssignZero(PyArrayObject *dst,
+                    PyArrayObject *wheremask,
+                    npy_bool preservena, npy_bool *preservewhichna)
+{
+    npy_bool value;
+    PyArray_Descr *bool_dtype;
+    int retcode;
 
-    /*
-     * Because buffering is disabled in the iterator, the inner loop
-     * strides will be the same throughout the iteration loop.  Thus,
-     * we can pass them to this function to take advantage of
-     * contiguous strides, etc.
-     *
-     * By setting the src_dtype to NULL, we get a function which sets
-     * the destination to zeros.
-     */
-    if (PyArray_GetDTypeTransferFunction(
-                    PyArray_ISALIGNED(a),
-                    0, stride,
-                    NULL, PyArray_DESCR(a),
-                    0,
-                    &stransfer, &transferdata,
-                    &needs_api) != NPY_SUCCEED) {
-        NpyIter_Deallocate(iter);
+    /* Create a raw bool scalar with the value False */
+    bool_dtype = PyArray_DescrFromType(NPY_BOOL);
+    if (bool_dtype == NULL) {
         return -1;
     }
+    value = 0;
 
-    if (!needs_api) {
-        NPY_BEGIN_THREADS;
-    }
+    retcode = PyArray_AssignRawScalar(dst, bool_dtype, (char *)&value,
+                                wheremask, NPY_SAFE_CASTING,
+                                preservena, preservewhichna);
 
-    do {
-        stransfer(*dataptr, stride, NULL, 0,
-                    *countptr, 0, transferdata);
-    } while(iternext(iter));
+    Py_DECREF(bool_dtype);
+    return retcode;
+}
+
+/*NUMPY_API
+ *
+ * Fills an array with ones.
+ *
+ * dst: The destination array.
+ * wheremask: If non-NULL, a boolean mask specifying where to set the values.
+ * preservena: If 0, overwrites everything in 'dst', if 1, it
+ *              preserves elements in 'dst' which are NA.
+ * preservewhichna: Must be NULL. When multi-NA support is implemented,
+ *                   this will be an array of flags for 'preservena=True',
+ *                   indicating which NA payload values to preserve.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_AssignOne(PyArrayObject *dst,
+                    PyArrayObject *wheremask,
+                    npy_bool preservena, npy_bool *preservewhichna)
+{
+    npy_bool value;
+    PyArray_Descr *bool_dtype;
+    int retcode;
 
-    if (!needs_api) {
-        NPY_END_THREADS;
+    /* Create a raw bool scalar with the value True */
+    bool_dtype = PyArray_DescrFromType(NPY_BOOL);
+    if (bool_dtype == NULL) {
+        return -1;
     }
+    value = 1;
 
-    NPY_AUXDATA_FREE(transferdata);
-    NpyIter_Deallocate(iter);
+    retcode = PyArray_AssignRawScalar(dst, bool_dtype, (char *)&value,
+                                wheremask, NPY_SAFE_CASTING,
+                                preservena, preservewhichna);
 
-    return 0;
+    Py_DECREF(bool_dtype);
+    return retcode;
 }
 
 /*NUMPY_API
  * Copy an array.
  */
 NPY_NO_EXPORT PyObject *
-PyArray_NewCopy(PyArrayObject *m1, NPY_ORDER order)
+PyArray_NewCopy(PyArrayObject *obj, NPY_ORDER order)
 {
-    PyArrayObject *ret = (PyArrayObject *)PyArray_NewLikeArray(
-                                                    m1, order, NULL, 1);
+    PyArrayObject *ret;
+
+    ret = (PyArrayObject *)PyArray_NewLikeArray(obj, order, NULL, 1);
     if (ret == NULL) {
         return NULL;
     }
 
-    if (PyArray_CopyInto(ret, m1) == -1) {
+    if (PyArray_HASMASKNA(obj)) {
+        if (PyArray_AllocateMaskNA(ret, 1, 0, 1) < 0) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+    }
+
+    if (PyArray_AssignArray(ret, obj, NULL, NPY_UNSAFE_CASTING, 0, NULL) < 0) {
         Py_DECREF(ret);
         return NULL;
     }
@@ -490,9 +544,10 @@ PyArray_NewCopy(PyArrayObject *m1, NPY_ORDER order)
 NPY_NO_EXPORT PyObject *
 PyArray_View(PyArrayObject *self, PyArray_Descr *type, PyTypeObject *pytype)
 {
-    PyArrayObject *new = NULL;
+    PyArrayObject *ret = NULL;
     PyArray_Descr *dtype;
     PyTypeObject *subtype;
+    int flags;
 
     if (pytype) {
         subtype = pytype;
@@ -500,32 +555,64 @@ PyArray_View(PyArrayObject *self, PyArray_Descr *type, PyTypeObject *pytype)
     else {
         subtype = Py_TYPE(self);
     }
+
+    /* Clear the NA-mask flags; MASKNA is re-added below with the mask view */
+    flags = PyArray_FLAGS(self);
+    flags &= ~(NPY_ARRAY_MASKNA|NPY_ARRAY_OWNMASKNA);
+
     dtype = PyArray_DESCR(self);
     Py_INCREF(dtype);
-    new = (PyArrayObject *)PyArray_NewFromDescr(subtype,
+    ret = (PyArrayObject *)PyArray_NewFromDescr(subtype,
                                dtype,
                                PyArray_NDIM(self), PyArray_DIMS(self),
                                PyArray_STRIDES(self),
                                PyArray_DATA(self),
-                               PyArray_FLAGS(self), (PyObject *)self);
-    if (new == NULL) {
+                               flags,
+                               (PyObject *)self);
+    if (ret == NULL) {
+        /* 'type' may be NULL; release it like the other exit paths do */
+        Py_XDECREF(type);
         return NULL;
     }
+
+    /* Take a view of the mask if it exists */
+    if (PyArray_HASMASKNA(self)) {
+        PyArrayObject_fields *fa = (PyArrayObject_fields *)ret;
+
+        if (PyArray_HASFIELDS(self)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                    "NA masks with fields are not supported yet");
+            Py_DECREF(ret);
+            Py_XDECREF(type);
+            return NULL;
+        }
+
+        fa->maskna_dtype = PyArray_MASKNA_DTYPE(self);
+        Py_INCREF(fa->maskna_dtype);
+        fa->maskna_data = PyArray_MASKNA_DATA(self);
+        if (fa->nd > 0) {
+            memcpy(fa->maskna_strides, PyArray_MASKNA_STRIDES(self),
+                                            fa->nd * sizeof(npy_intp));
+        }
+        fa->flags |= NPY_ARRAY_MASKNA;
+    }
+
+    /* Set the base object */
     Py_INCREF(self);
-    if (PyArray_SetBaseObject(new, (PyObject *)self) < 0) {
-        Py_DECREF(new);
-        Py_DECREF(type);
+    if (PyArray_SetBaseObject(ret, (PyObject *)self) < 0) {
+        Py_DECREF(ret);
+        Py_XDECREF(type);
         return NULL;
     }
 
     if (type != NULL) {
-        if (PyObject_SetAttrString((PyObject *)new, "dtype",
+        if (PyObject_SetAttrString((PyObject *)ret, "dtype",
                                    (PyObject *)type) < 0) {
-            Py_DECREF(new);
+            Py_DECREF(ret);
             Py_DECREF(type);
             return NULL;
         }
         Py_DECREF(type);
     }
-    return (PyObject *)new;
+    return (PyObject *)ret;
 }
diff --git a/numpy/core/src/multiarray/convert.h b/numpy/core/src/multiarray/convert.h
index 1a34cfc52..de24e27cf 100644
--- a/numpy/core/src/multiarray/convert.h
+++ b/numpy/core/src/multiarray/convert.h
@@ -1,7 +1,4 @@
 #ifndef _NPY_ARRAYOBJECT_CONVERT_H_
 #define _NPY_ARRAYOBJECT_CONVERT_H_
 
-NPY_NO_EXPORT int
-PyArray_FillWithZero(PyArrayObject *a);
-
 #endif
diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c
index f4d3e8c57..818d558aa 100644
--- a/numpy/core/src/multiarray/convert_datatype.c
+++ b/numpy/core/src/multiarray/convert_datatype.c
@@ -639,6 +639,62 @@ PyArray_CanCastTypeTo(PyArray_Descr *from, PyArray_Descr *to,
 static int min_scalar_type_num(char *valueptr, int type_num,
                                             int *is_small_unsigned);
 
+NPY_NO_EXPORT npy_bool
+can_cast_scalar_to(PyArray_Descr *scal_type, char *scal_data,
+                    PyArray_Descr *to, NPY_CASTING casting)
+{
+    int swap;
+    int is_small_unsigned = 0, type_num;
+    npy_bool ret;
+    PyArray_Descr *dtype;
+
+    /* An aligned memory buffer large enough to hold any type */
+    npy_longlong value[4];
+
+    if (casting == NPY_UNSAFE_CASTING) {
+        return 1;
+    }
+
+    /*
+     * If the scalar isn't a number, or the rule is stricter than
+     * NPY_SAFE_CASTING, use the straight type-based rules
+     */
+    if (!PyTypeNum_ISNUMBER(scal_type->type_num) ||
+                            casting < NPY_SAFE_CASTING) {
+        return PyArray_CanCastTypeTo(scal_type, to, casting);
+    }
+
+    swap = !PyArray_ISNBO(scal_type->byteorder);
+    scal_type->f->copyswap(&value, scal_data, swap, NULL);
+
+    type_num = min_scalar_type_num((char *)&value, scal_type->type_num,
+                                    &is_small_unsigned);
+
+    /*
+     * If we've got a small unsigned scalar, and the 'to' type
+     * is not unsigned, then make it signed to allow the value
+     * to be cast more appropriately.
+     */
+    if (is_small_unsigned && !(PyTypeNum_ISUNSIGNED(to->type_num))) {
+        type_num = type_num_unsigned_to_signed(type_num);
+    }
+
+    dtype = PyArray_DescrFromType(type_num);
+    if (dtype == NULL) {
+        return 0;
+    }
+#if 0
+    printf("min scalar cast ");
+    PyObject_Print(dtype, stdout, 0);
+    printf(" to ");
+    PyObject_Print(to, stdout, 0);
+    printf("\n");
+#endif
+    ret = PyArray_CanCastTypeTo(dtype, to, casting);
+    Py_DECREF(dtype);
+    return ret;
+}
+
 /*NUMPY_API
  * Returns 1 if the array object may be cast to the given data type using
  * the casting rule, 0 otherwise.  This differs from PyArray_CanCastTo in
@@ -651,49 +707,17 @@ PyArray_CanCastArrayTo(PyArrayObject *arr, PyArray_Descr *to,
 {
     PyArray_Descr *from = PyArray_DESCR(arr);
 
-    /* If it's not a scalar, use the standard rules */
-    if (PyArray_NDIM(arr) > 0 || !PyTypeNum_ISNUMBER(from->type_num)) {
-        return PyArray_CanCastTypeTo(from, to, casting);
-    }
-    /* Otherwise, check the value */
-    else {
-        int swap = !PyArray_ISNBO(from->byteorder);
-        int is_small_unsigned = 0, type_num;
-        npy_bool ret;
-        PyArray_Descr *dtype;
-
-        /* An aligned memory buffer large enough to hold any type */
-        npy_longlong value[4];
-
-        from->f->copyswap(&value, PyArray_BYTES(arr), swap, NULL);
-
-        type_num = min_scalar_type_num((char *)&value, from->type_num,
-                                        &is_small_unsigned);
-
-        /*
-         * If we've got a small unsigned scalar, and the 'to' type
-         * is not unsigned, then make it signed to allow the value
-         * to be cast more appropriately.
-         */
-        if (is_small_unsigned && !(PyTypeNum_ISUNSIGNED(to->type_num))) {
-            type_num = type_num_unsigned_to_signed(type_num);
+    /* If it's a scalar, check the value */
+    if (PyArray_NDIM(arr) == 0 && !PyArray_HASFIELDS(arr)) {
+        /* Only check the value if it's not masked */
+        if (!PyArray_HASMASKNA(arr) ||
+                NpyMaskValue_IsExposed((npy_mask)*PyArray_MASKNA_DATA(arr))) {
+            return can_cast_scalar_to(from, PyArray_DATA(arr), to, casting);
         }
-
-        dtype = PyArray_DescrFromType(type_num);
-        if (dtype == NULL) {
-            return 0;
-        }
-#if 0
-        printf("min scalar cast ");
-        PyObject_Print(dtype, stdout, 0);
-        printf(" to ");
-        PyObject_Print(to, stdout, 0);
-        printf("\n");
-#endif
-        ret = PyArray_CanCastTypeTo(dtype, to, casting);
-        Py_DECREF(dtype);
-        return ret;
     }
+
+    /* Otherwise, use the standard rules */
+    return PyArray_CanCastTypeTo(from, to, casting);
 }
 
 /*NUMPY_API
@@ -1307,7 +1331,13 @@ NPY_NO_EXPORT PyArray_Descr *
 PyArray_MinScalarType(PyArrayObject *arr)
 {
     PyArray_Descr *dtype = PyArray_DESCR(arr);
-    if (PyArray_NDIM(arr) > 0 || !PyTypeNum_ISNUMBER(dtype->type_num)) {
+    /*
+     * If the array isn't a numeric scalar or is a scalar but with
+     * its value masked out, just return the array's dtype.
+     */
+    if (PyArray_NDIM(arr) > 0 || !PyTypeNum_ISNUMBER(dtype->type_num) ||
+                    (PyArray_HASMASKNA(arr) && !NpyMaskValue_IsExposed(
+                                    (npy_mask)*PyArray_MASKNA_DATA(arr)))) {
         Py_INCREF(dtype);
         return dtype;
     }
@@ -1651,7 +1681,7 @@ PyArray_Zero(PyArrayObject *arr)
     storeflags = PyArray_FLAGS(arr);
     PyArray_ENABLEFLAGS(arr, NPY_ARRAY_BEHAVED);
     ret = PyArray_DESCR(arr)->f->setitem(obj, zeroval, arr);
-    ((PyArrayObject_fieldaccess *)arr)->flags = storeflags;
+    ((PyArrayObject_fields *)arr)->flags = storeflags;
     Py_DECREF(obj);
     if (ret < 0) {
         PyDataMem_FREE(zeroval);
@@ -1689,7 +1719,7 @@ PyArray_One(PyArrayObject *arr)
     storeflags = PyArray_FLAGS(arr);
     PyArray_ENABLEFLAGS(arr, NPY_ARRAY_BEHAVED);
     ret = PyArray_DESCR(arr)->f->setitem(obj, oneval, arr);
-    ((PyArrayObject_fieldaccess *)arr)->flags = storeflags;
+    ((PyArrayObject_fields *)arr)->flags = storeflags;
     Py_DECREF(obj);
     if (ret < 0) {
         PyDataMem_FREE(oneval);
@@ -1702,22 +1732,38 @@ PyArray_One(PyArrayObject *arr)
 
 /*NUMPY_API
  * Return the typecode of the array a Python object would be converted to
+ *
+ * Returns the type number the result should have, or NPY_NOTYPE on error.
  */
 NPY_NO_EXPORT int
 PyArray_ObjectType(PyObject *op, int minimum_type)
 {
-    PyArray_Descr *intype;
-    PyArray_Descr *outtype;
-    int ret;
+    PyArray_Descr *dtype = NULL;
+    int ret, contains_na = 0;
+
+    if (minimum_type != NPY_NOTYPE && minimum_type >= 0) {
+        dtype = PyArray_DescrFromType(minimum_type);
+        if (dtype == NULL) {
+            return NPY_NOTYPE;
+        }
+    }
 
-    intype = PyArray_DescrFromType(minimum_type);
-    if (intype == NULL) {
-        PyErr_Clear();
+    if (PyArray_DTypeFromObject(op, NPY_MAXDIMS, &contains_na, &dtype) < 0) {
+        return NPY_NOTYPE;
     }
-    outtype = _array_find_type(op, intype, MAX_DIMS);
-    ret = outtype->type_num;
-    Py_DECREF(outtype);
-    Py_XDECREF(intype);
+
+    if (contains_na) {
+        ret = NPY_OBJECT;
+    }
+    else if (dtype == NULL) {
+        ret = NPY_DEFAULT_TYPE;
+    }
+    else {
+        ret = dtype->type_num;
+    }
+
+    Py_XDECREF(dtype);
+
     return ret;
 }
 
@@ -1820,7 +1866,7 @@ PyArray_ConvertToCommonType(PyObject *op, int *retn)
 
     /* Make sure all arrays are actual array objects. */
     for (i = 0; i < n; i++) {
-        int flags = NPY_ARRAY_CARRAY;
+        int flags = NPY_ARRAY_CARRAY | NPY_ARRAY_ALLOWNA;
 
         if ((otmp = PySequence_GetItem(op, i)) == NULL) {
             goto fail;
diff --git a/numpy/core/src/multiarray/convert_datatype.h b/numpy/core/src/multiarray/convert_datatype.h
index 71001b1c4..bf77d699a 100644
--- a/numpy/core/src/multiarray/convert_datatype.h
+++ b/numpy/core/src/multiarray/convert_datatype.h
@@ -13,6 +13,11 @@ PyArray_ConvertToCommonType(PyObject *op, int *retn);
 NPY_NO_EXPORT int
 PyArray_ValidType(int type);
 
+/* Like PyArray_CanCastArrayTo, but for a raw scalar (dtype + data pointer) */
+NPY_NO_EXPORT npy_bool
+can_cast_scalar_to(PyArray_Descr *scal_type, char *scal_data,
+                    PyArray_Descr *to, NPY_CASTING casting);
+
 /*
  * This function calls Py_DECREF on flex_dtype, and replaces it with
  * a new dtype that has been adapted based on the values in data_dtype
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index b527a1074..c37ce5ad6 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -24,6 +24,7 @@
 #include "methods.h"
 #include "_datetime.h"
 #include "datetime_strings.h"
+#include "na_object.h"
 
 /*
  * Reading from a file or a string.
@@ -235,7 +236,7 @@ _update_descr_and_dimensions(PyArray_Descr **des, npy_intp *newdims,
 
 
     newnd = oldnd + numnew;
-    if (newnd > MAX_DIMS) {
+    if (newnd > NPY_MAXDIMS) {
         goto finish;
     }
     if (tuple) {
@@ -275,12 +276,12 @@ _unaligned_strided_byte_copy(char *dst, npy_intp outstrides, char *src,
     char *tout = dst;
     char *tin = src;
 
-#define _COPY_N_SIZE(size)                      \
-    for(i=0; i<N; i++) {                       \
-        memcpy(tout, tin, size);                \
-        tin += instrides;                       \
-        tout += outstrides;                     \
-    }                                           \
+#define _COPY_N_SIZE(size) \
+    for(i=0; i<N; i++) { \
+        memcpy(tout, tin, size); \
+        tin += instrides; \
+        tout += outstrides; \
+    } \
     return
 
     switch(elsize) {
@@ -377,228 +378,14 @@ copy_and_swap(void *dst, void *src, int itemsize, npy_intp numitems,
     }
 }
 
-/* Gets a half-open range [start, end) which contains the array data */
-NPY_NO_EXPORT void
-_get_array_memory_extents(PyArrayObject *arr,
-                    npy_uintp *out_start, npy_uintp *out_end)
-{
-    npy_uintp start, end;
-    npy_intp idim, ndim = PyArray_NDIM(arr);
-    npy_intp *dimensions = PyArray_DIMS(arr),
-            *strides = PyArray_STRIDES(arr);
-
-    /* Calculate with a closed range [start, end] */
-    start = end = (npy_uintp)PyArray_DATA(arr);
-    for (idim = 0; idim < ndim; ++idim) {
-        npy_intp stride = strides[idim], dim = dimensions[idim];
-        /* If the array size is zero, return an empty range */
-        if (dim == 0) {
-            *out_start = *out_end = (npy_uintp)PyArray_DATA(arr);
-            return;
-        }
-        /* Expand either upwards or downwards depending on stride */
-        else {
-            if (stride > 0) {
-                end += stride*(dim-1);
-            }
-            else if (stride < 0) {
-                start += stride*(dim-1);
-            }
-        }
-    }
-
-    /* Return a half-open range */
-    *out_start = start;
-    *out_end = end + PyArray_DESCR(arr)->elsize;
-}
-
-/* Returns 1 if the arrays have overlapping data, 0 otherwise */
-NPY_NO_EXPORT int
-_arrays_overlap(PyArrayObject *arr1, PyArrayObject *arr2)
-{
-    npy_uintp start1 = 0, start2 = 0, end1 = 0, end2 = 0;
-
-    _get_array_memory_extents(arr1, &start1, &end1);
-    _get_array_memory_extents(arr2, &start2, &end2);
-
-    return (start1 < end2) && (start2 < end1);
-}
-
-/*NUMPY_API
- * Move the memory of one array into another, allowing for overlapping data.
- *
- * This is in general a difficult problem to solve efficiently, because
- * strides can be negative.  Consider "a = np.arange(3); a[::-1] = a", which
- * previously produced the incorrect [0, 1, 0].
- *
- * Instead of trying to be fancy, we simply check for overlap and make
- * a temporary copy when one exists.
- *
- * Returns 0 on success, negative on failure.
- */
-NPY_NO_EXPORT int
-PyArray_MoveInto(PyArrayObject *dst, PyArrayObject *src)
-{
-    /*
-     * Performance fix for expresions like "a[1000:6000] += x".  In this
-     * case, first an in-place add is done, followed by an assignment,
-     * equivalently expressed like this:
-     *
-     *   tmp = a[1000:6000]   # Calls array_subscript_nice in mapping.c
-     *   np.add(tmp, x, tmp)
-     *   a[1000:6000] = tmp   # Calls array_ass_sub in mapping.c
-     *
-     * In the assignment the underlying data type, shape, strides, and
-     * data pointers are identical, but src != dst because they are separately
-     * generated slices.  By detecting this and skipping the redundant
-     * copy of values to themselves, we potentially give a big speed boost.
-     *
-     * Note that we don't call EquivTypes, because usually the exact same
-     * dtype object will appear, and we don't want to slow things down
-     * with a complicated comparison.  The comparisons are ordered to
-     * try and reject this with as little work as possible.
-     */
-    if (PyArray_DATA(src) == PyArray_DATA(dst) &&
-                        PyArray_DESCR(src) == PyArray_DESCR(dst) &&
-                        PyArray_NDIM(src) == PyArray_NDIM(dst) &&
-                        PyArray_CompareLists(PyArray_DIMS(src),
-                                             PyArray_DIMS(dst),
-                                             PyArray_NDIM(src)) &&
-                        PyArray_CompareLists(PyArray_STRIDES(src),
-                                             PyArray_STRIDES(dst),
-                                             PyArray_NDIM(src))) {
-        /*printf("Redundant copy operation detected\n");*/
-        return 0;
-    }
-
-    /*
-     * A special case is when there is just one dimension with positive
-     * strides, and we pass that to CopyInto, which correctly handles
-     * it for most cases.  It may still incorrectly handle copying of
-     * partially-overlapping data elements, where the data pointer was offset
-     * by a fraction of the element size.
-     */
-    if ((PyArray_NDIM(dst) == 1 &&
-                        PyArray_NDIM(src) == 1 &&
-                        PyArray_STRIDE(dst, 0) > 0 &&
-                        PyArray_STRIDE(src, 0) > 0) ||
-                        !_arrays_overlap(dst, src)) {
-        return PyArray_CopyInto(dst, src);
-    }
-    else {
-        PyArrayObject *tmp;
-        int ret;
-
-        /*
-         * Allocate a temporary copy array.
-         */
-        tmp = (PyArrayObject *)PyArray_NewLikeArray(dst,
-                                        NPY_KEEPORDER, NULL, 0);
-        if (tmp == NULL) {
-            return -1;
-        }
-        ret = PyArray_CopyInto(tmp, src);
-        if (ret == 0) {
-            ret = PyArray_CopyInto(dst, tmp);
-        }
-        Py_DECREF(tmp);
-        return ret;
-    }
-}
-
-/*NUMPY_API
- * Copy the memory of one array into another, allowing for overlapping data
- * and selecting which elements to move based on a mask.
- *
- * Precisely handling the overlapping data is in general a difficult
- * problem to solve efficiently, because strides can be negative.
- * Consider "a = np.arange(3); a[::-1] = a", which previously produced
- * the incorrect [0, 1, 0].
- *
- * Instead of trying to be fancy, we simply check for overlap and make
- * a temporary copy when one exists.
- *
- * Returns 0 on success, negative on failure.
- */
-NPY_NO_EXPORT int
-PyArray_MaskedMoveInto(PyArrayObject *dst, PyArrayObject *src,
-                            PyArrayObject *mask, NPY_CASTING casting)
-{
-    /*
-     * Performance fix for expresions like "a[1000:6000] += x".  In this
-     * case, first an in-place add is done, followed by an assignment,
-     * equivalently expressed like this:
-     *
-     *   tmp = a[1000:6000]   # Calls array_subscript_nice in mapping.c
-     *   np.add(tmp, x, tmp)
-     *   a[1000:6000] = tmp   # Calls array_ass_sub in mapping.c
-     *
-     * In the assignment the underlying data type, shape, strides, and
-     * data pointers are identical, but src != dst because they are separately
-     * generated slices.  By detecting this and skipping the redundant
-     * copy of values to themselves, we potentially give a big speed boost.
-     *
-     * Note that we don't call EquivTypes, because usually the exact same
-     * dtype object will appear, and we don't want to slow things down
-     * with a complicated comparison.  The comparisons are ordered to
-     * try and reject this with as little work as possible.
-     */
-    if (PyArray_DATA(src) == PyArray_DATA(dst) &&
-                        PyArray_DESCR(src) == PyArray_DESCR(dst) &&
-                        PyArray_NDIM(src) == PyArray_NDIM(dst) &&
-                        PyArray_CompareLists(PyArray_DIMS(src),
-                                             PyArray_DIMS(dst),
-                                             PyArray_NDIM(src)) &&
-                        PyArray_CompareLists(PyArray_STRIDES(src),
-                                             PyArray_STRIDES(dst),
-                                             PyArray_NDIM(src))) {
-        /*printf("Redundant copy operation detected\n");*/
-        return 0;
-    }
-
-    /*
-     * A special case is when there is just one dimension with positive
-     * strides, and we pass that to CopyInto, which correctly handles
-     * it for most cases.  It may still incorrectly handle copying of
-     * partially-overlapping data elements, where the data pointer was offset
-     * by a fraction of the element size.
-     */
-    if ((PyArray_NDIM(dst) == 1 &&
-                        PyArray_NDIM(src) == 1 &&
-                        PyArray_STRIDE(dst, 0) > 0 &&
-                        PyArray_STRIDE(src, 0) > 0) ||
-                        !_arrays_overlap(dst, src)) {
-        return PyArray_MaskedCopyInto(dst, src, mask, casting);
-    }
-    else {
-        PyArrayObject *tmp;
-        int ret;
-
-        /*
-         * Allocate a temporary copy array.
-         */
-        tmp = (PyArrayObject *)PyArray_NewLikeArray(dst,
-                                        NPY_KEEPORDER, NULL, 0);
-        if (tmp == NULL) {
-            return -1;
-        }
-        ret = PyArray_CopyInto(tmp, src);
-        if (ret == 0) {
-            ret = PyArray_MaskedCopyInto(dst, tmp, mask, casting);
-        }
-        Py_DECREF(tmp);
-        return ret;
-    }
-}
-
-
-
 /* adapted from Numarray */
 static int
-setArrayFromSequence(PyArrayObject *a, PyObject *s, int dim, npy_intp offset)
+setArrayFromSequence(PyArrayObject *a, PyObject *s,
+                        int dim, npy_intp offset, npy_intp maskoffset)
 {
     Py_ssize_t i, slen;
-    int res = -1;
+    int res = 0;
+    int a_has_maskna = PyArray_HASMASKNA(a);
 
     /*
      * This code is to ensure that the sequence access below will
@@ -645,25 +432,50 @@ setArrayFromSequence(PyArrayObject *a, PyObject *s, int dim, npy_intp offset)
     /* Broadcast the one element from the sequence to all the outputs */
     if (slen == 1) {
         PyObject *o;
-        npy_intp alen = PyArray_DIMS(a)[dim];
+        NpyNA *na = NULL;
+        char maskvalue = 0;
+        npy_intp alen = PyArray_DIM(a, dim);
 
         o = PySequence_GetItem(s, 0);
         if (o == NULL) {
             goto fail;
         }
+
+        /* Check if the value being assigned is NA */
+        if (a_has_maskna) {
+            na = NpyNA_FromObject(o, 1);
+            if (na != NULL) {
+                maskvalue = (char)NpyNA_AsMaskValue(na);
+            }
+            else {
+                maskvalue = 1;
+            }
+        }
+
         for (i = 0; i < alen; i++) {
             if ((PyArray_NDIM(a) - dim) > 1) {
-                res = setArrayFromSequence(a, o, dim+1, offset);
+                res = setArrayFromSequence(a, o, dim+1, offset, maskoffset);
             }
             else {
-                res = PyArray_DESCR(a)->f->setitem(o, (PyArray_DATA(a) + offset), a);
+                /* Assign a value if it isn't NA */
+                if (na == NULL) {
+                    res = PyArray_DESCR(a)->f->setitem(o,
+                                        (PyArray_DATA(a) + offset), a);
+                }
+                /* Assign to the mask if a supports MASKNA */
+                if (a_has_maskna) {
+                    *(PyArray_MASKNA_DATA(a) + maskoffset) = maskvalue;
+                }
             }
             if (res < 0) {
                 Py_DECREF(o);
+                Py_XDECREF(na);
                 goto fail;
             }
             offset += PyArray_STRIDES(a)[dim];
+            maskoffset += PyArray_MASKNA_STRIDES(a)[dim];
         }
+        Py_XDECREF(na);
         Py_DECREF(o);
     }
     /* Copy element by element */
@@ -674,16 +486,38 @@ setArrayFromSequence(PyArrayObject *a, PyObject *s, int dim, npy_intp offset)
                 goto fail;
             }
             if ((PyArray_NDIM(a) - dim) > 1) {
-                res = setArrayFromSequence(a, o, dim+1, offset);
+                res = setArrayFromSequence(a, o, dim+1, offset, maskoffset);
             }
             else {
-                res = PyArray_DESCR(a)->f->setitem(o, (PyArray_DATA(a) + offset), a);
+
+                /* Assignment without an NA mask */
+                if (!a_has_maskna) {
+                    res = PyArray_DESCR(a)->f->setitem(o,
+                                            (PyArray_DATA(a) + offset), a);
+                }
+                /* Assignment with an NA mask */
+                else {
+                    NpyNA *na = NpyNA_FromObject(o, 1);
+                    char maskvalue;
+                    if (na != NULL) {
+                        maskvalue = (char)NpyNA_AsMaskValue(na);
+                        res = 0;
+                    }
+                    else {
+                        maskvalue = 1;
+                        res = PyArray_DESCR(a)->f->setitem(o,
+                                            (PyArray_DATA(a) + offset), a);
+                    }
+
+                    *(PyArray_MASKNA_DATA(a) + maskoffset) = maskvalue;
+                }
             }
             Py_DECREF(o);
             if (res < 0) {
                 goto fail;
             }
             offset += PyArray_STRIDES(a)[dim];
+            maskoffset += PyArray_MASKNA_STRIDES(a)[dim];
         }
     }
 
@@ -708,7 +542,7 @@ PyArray_AssignFromSequence(PyArrayObject *self, PyObject *v)
                         "assignment to 0-d array");
         return -1;
     }
-    return setArrayFromSequence(self, v, 0, 0);
+    return setArrayFromSequence(self, v, 0, 0, 0);
 }
 
 /*
@@ -802,6 +636,12 @@ discover_dimensions(PyObject *obj, int *maxndim, npy_intp *d, int check_it,
         return 0;
     }
 
+    /* obj is an NA */
+    if (NpyNA_Check(obj)) {
+        *maxndim = 0;
+        return 0;
+    }
+
     /* obj is not a Sequence */
     if (!PySequence_Check(obj) ||
 #if defined(NPY_PY3K)
@@ -1030,7 +870,7 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd,
                      npy_intp *dims, npy_intp *strides, void *data,
                      int flags, PyObject *obj)
 {
-    PyArrayObject_fieldaccess *fa;
+    PyArrayObject_fields *fa;
     int i;
     size_t sd;
     npy_intp largest;
@@ -1118,7 +958,7 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd,
         largest /= dim;
     }
 
-    fa = (PyArrayObject_fieldaccess *) subtype->tp_alloc(subtype, 0);
+    fa = (PyArrayObject_fields *) subtype->tp_alloc(subtype, 0);
     if (fa == NULL) {
         Py_DECREF(descr);
         return NULL;
@@ -1142,14 +982,17 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd,
     fa->descr = descr;
     fa->base = (PyObject *)NULL;
     fa->weakreflist = (PyObject *)NULL;
+    fa->maskna_dtype = NULL;
+    fa->maskna_data = NULL;
 
     if (nd > 0) {
-        fa->dimensions = PyDimMem_NEW(2*nd);
+        fa->dimensions = PyDimMem_NEW(3*nd);
         if (fa->dimensions == NULL) {
             PyErr_NoMemory();
             goto fail;
         }
         fa->strides = fa->dimensions + nd;
+        fa->maskna_strides = fa->dimensions + 2 * nd;
         memcpy(fa->dimensions, dims, sizeof(npy_intp)*nd);
         if (strides == NULL) { /* fill it in */
             sd = _array_fill_strides(fa->strides, dims, nd, sd,
@@ -1200,6 +1043,14 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd,
          * Caller must arrange for this to be reset if truly desired
          */
         fa->flags &= ~NPY_ARRAY_OWNDATA;
+
+        /* Flagging MASKNA is incompatible with providing the data pointer */
+        if (fa->flags & NPY_ARRAY_MASKNA) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Cannot construct a view of data together with the "
+                    "NPY_ARRAY_MASKNA flag, the NA mask must be added later");
+            goto fail;
+        }
     }
     fa->data = data;
 
@@ -1320,21 +1171,24 @@ PyArray_NewLikeArray(PyArrayObject *prototype, NPY_ORDER order,
     else {
         npy_intp strides[NPY_MAXDIMS], stride;
         npy_intp *shape = PyArray_DIMS(prototype);
-        _npy_stride_sort_item strideperm[NPY_MAXDIMS];
-        int i;
+        npy_stride_sort_item strideperm[NPY_MAXDIMS];
+        int idim;
 
-        PyArray_CreateSortedStridePerm(prototype, strideperm);
+        PyArray_CreateSortedStridePerm(PyArray_NDIM(prototype),
+                                        PyArray_SHAPE(prototype),
+                                        PyArray_STRIDES(prototype),
+                                        strideperm);
 
         /* Build the new strides */
         stride = dtype->elsize;
-        for (i = ndim-1; i >= 0; --i) {
-            npy_intp i_perm = strideperm[i].perm;
+        for (idim = ndim-1; idim >= 0; --idim) {
+            npy_intp i_perm = strideperm[idim].perm;
             strides[i_perm] = stride;
             stride *= shape[i_perm];
         }
 
         /* Finally, allocate the array */
-        ret = PyArray_NewFromDescr( subok ? Py_TYPE(prototype) : &PyArray_Type,
+        ret = PyArray_NewFromDescr(subok ? Py_TYPE(prototype) : &PyArray_Type,
                                         dtype,
                                         ndim,
                                         shape,
@@ -1457,7 +1311,7 @@ _array_from_buffer_3118(PyObject *obj, PyObject **out)
     r = PyArray_NewFromDescr(&PyArray_Type, descr,
                              nd, shape, strides, view->buf,
                              flags, NULL);
-    ((PyArrayObject_fieldaccess *)r)->base = memoryview;
+    ((PyArrayObject_fields *)r)->base = memoryview;
     PyArray_UpdateFlags((PyArrayObject *)r, NPY_ARRAY_UPDATE_ALL);
 
     *out = r;
@@ -1473,72 +1327,26 @@ fail:
 #endif
 }
 
-/*NUMPY_API
- * Retrieves the array parameters for viewing/converting an arbitrary
- * PyObject* to a NumPy array. This allows the "innate type and shape"
- * of Python list-of-lists to be discovered without
- * actually converting to an array.
- *
- * In some cases, such as structured arrays and the __array__ interface,
- * a data type needs to be used to make sense of the object.  When
- * this is needed, provide a Descr for 'requested_dtype', otherwise
- * provide NULL. This reference is not stolen. Also, if the requested
- * dtype doesn't modify the interpretation of the input, out_dtype will
- * still get the "innate" dtype of the object, not the dtype passed
- * in 'requested_dtype'.
- *
- * If writing to the value in 'op' is desired, set the boolean
- * 'writeable' to 1.  This raises an error when 'op' is a scalar, list
- * of lists, or other non-writeable 'op'.
- *
- * Result: When success (0 return value) is returned, either out_arr
- *         is filled with a non-NULL PyArrayObject and
- *         the rest of the parameters are untouched, or out_arr is
- *         filled with NULL, and the rest of the parameters are
- *         filled.
- *
- * Typical usage:
- *
- *      PyArrayObject *arr = NULL;
- *      PyArray_Descr *dtype = NULL;
- *      int ndim = 0;
- *      npy_intp dims[NPY_MAXDIMS];
+/*
+ * A slight generalization of PyArray_GetArrayParamsFromObject,
+ * which also returns whether the input data contains any numpy.NA
+ * values.
  *
- *      if (PyArray_GetArrayParamsFromObject(op, NULL, 1, &dtype,
- *                                          &ndim, &dims, &arr, NULL) < 0) {
- *          return NULL;
- *      }
- *      if (arr == NULL) {
- *          ... validate/change dtype, validate flags, ndim, etc ...
- *          // Could make custom strides here too
- *          arr = PyArray_NewFromDescr(&PyArray_Type, dtype, ndim,
- *                                      dims, NULL,
- *                                      is_f_order ? NPY_ARRAY_F_CONTIGUOUS : 0,
- *                                      NULL);
- *          if (arr == NULL) {
- *              return NULL;
- *          }
- *          if (PyArray_CopyObject(arr, op) < 0) {
- *              Py_DECREF(arr);
- *              return NULL;
- *          }
- *      }
- *      else {
- *          ... in this case the other parameters weren't filled, just
- *              validate and possibly copy arr itself ...
- *      }
- *      ... use arr ...
+ * This isn't exposed in the public API.
  */
 NPY_NO_EXPORT int
-PyArray_GetArrayParamsFromObject(PyObject *op,
+PyArray_GetArrayParamsFromObjectEx(PyObject *op,
                         PyArray_Descr *requested_dtype,
                         npy_bool writeable,
                         PyArray_Descr **out_dtype,
                         int *out_ndim, npy_intp *out_dims,
+                        int *out_contains_na,
                         PyArrayObject **out_arr, PyObject *context)
 {
     PyObject *tmp;
 
+    *out_contains_na = 0;
+
     /* If op is an array */
     if (PyArray_Check(op)) {
         if (writeable && !PyArray_ISWRITEABLE((PyArrayObject *)op)) {
@@ -1581,6 +1389,34 @@ PyArray_GetArrayParamsFromObject(PyObject *op,
         return 0;
     }
 
+    /* If op is a numpy.NA */
+    if (NpyNA_Check(op)) {
+        NpyNA_fields *fna = (NpyNA_fields *)op;
+
+        if (writeable) {
+            PyErr_SetString(PyExc_RuntimeError,
+                                "cannot write to numpy.NA");
+            return -1;
+        }
+        /* Use the NA's dtype if available */
+        if (fna->dtype != NULL) {
+            *out_dtype = fna->dtype;
+            Py_INCREF(*out_dtype);
+        }
+        /* Otherwise use the default NumPy dtype */
+        else {
+            *out_dtype = PyArray_DescrFromType(NPY_DEFAULT_TYPE);
+            if (*out_dtype == NULL) {
+                return -1;
+            }
+        }
+        *out_ndim = 0;
+        *out_arr = NULL;
+        *out_contains_na = 1;
+        return 0;
+
+    }
+
     /* If op supports the PEP 3118 buffer interface */
     if (!PyBytes_Check(op) && !PyUnicode_Check(op) &&
              _array_from_buffer_3118(op, (PyObject **)out_arr) == 0) {
@@ -1658,16 +1494,23 @@ PyArray_GetArrayParamsFromObject(PyObject *op,
             *out_dtype = requested_dtype;
         }
         else {
-            *out_dtype = _array_find_type(op, NULL, MAX_DIMS);
-            if (*out_dtype == NULL) {
-                if (PyErr_Occurred() &&
-                        PyErr_GivenExceptionMatches(PyErr_Occurred(),
-                                                PyExc_MemoryError)) {
+            *out_dtype = NULL;
+            if (PyArray_DTypeFromObject(op, NPY_MAXDIMS,
+                                    out_contains_na, out_dtype) < 0) {
+                if (PyErr_ExceptionMatches(PyExc_MemoryError)) {
                     return -1;
                 }
-                /* Say it's an OBJECT array if there's an error */
-                PyErr_Clear();
-                *out_dtype = PyArray_DescrFromType(NPY_OBJECT);
+                /* Return NPY_OBJECT for most exceptions */
+                else {
+                    PyErr_Clear();
+                    *out_dtype = PyArray_DescrFromType(NPY_OBJECT);
+                    if (*out_dtype == NULL) {
+                        return -1;
+                    }
+                }
+            }
+            if (*out_dtype == NULL) {
+                *out_dtype = PyArray_DescrFromType(NPY_DEFAULT_TYPE);
                 if (*out_dtype == NULL) {
                     return -1;
                 }
@@ -1763,6 +1606,89 @@ PyArray_GetArrayParamsFromObject(PyObject *op,
 }
 
 /*NUMPY_API
+ * Retrieves the array parameters for viewing/converting an arbitrary
+ * PyObject* to a NumPy array. This allows the "innate type and shape"
+ * of Python list-of-lists to be discovered without
+ * actually converting to an array.
+ *
+ * In some cases, such as structured arrays and the __array__ interface,
+ * a data type needs to be used to make sense of the object.  When
+ * this is needed, provide a Descr for 'requested_dtype', otherwise
+ * provide NULL. This reference is not stolen. Also, if the requested
+ * dtype doesn't modify the interpretation of the input, out_dtype will
+ * still get the "innate" dtype of the object, not the dtype passed
+ * in 'requested_dtype'.
+ *
+ * If writing to the value in 'op' is desired, set the boolean
+ * 'writeable' to 1.  This raises an error when 'op' is a scalar, list
+ * of lists, or other non-writeable 'op'.
+ *
+ * Result: When success (0 return value) is returned, either out_arr
+ *         is filled with a non-NULL PyArrayObject and
+ *         the rest of the parameters are untouched, or out_arr is
+ *         filled with NULL, and the rest of the parameters are
+ *         filled.
+ *
+ * Typical usage:
+ *
+ *      PyArrayObject *arr = NULL;
+ *      PyArray_Descr *dtype = NULL;
+ *      int ndim = 0;
+ *      npy_intp dims[NPY_MAXDIMS];
+ *
+ *      if (PyArray_GetArrayParamsFromObject(op, NULL, 1, &dtype,
+ *                                          &ndim, dims, &arr, NULL) < 0) {
+ *          return NULL;
+ *      }
+ *      if (arr == NULL) {
+ *          ... validate/change dtype, validate flags, ndim, etc ...
+ *          // Could make custom strides here too
+ *          arr = PyArray_NewFromDescr(&PyArray_Type, dtype, ndim,
+ *                                      dims, NULL, NULL,
+ *                                      is_f_order ? NPY_ARRAY_F_CONTIGUOUS : 0,
+ *                                      NULL);
+ *          if (arr == NULL) {
+ *              return NULL;
+ *          }
+ *          if (PyArray_CopyObject(arr, op) < 0) {
+ *              Py_DECREF(arr);
+ *              return NULL;
+ *          }
+ *      }
+ *      else {
+ *          ... in this case the other parameters weren't filled, just
+ *              validate and possibly copy arr itself ...
+ *      }
+ *      ... use arr ...
+ */
+NPY_NO_EXPORT int
+PyArray_GetArrayParamsFromObject(PyObject *op,
+                        PyArray_Descr *requested_dtype,
+                        npy_bool writeable,
+                        PyArray_Descr **out_dtype,
+                        int *out_ndim, npy_intp *out_dims,
+                        PyArrayObject **out_arr, PyObject *context)
+{
+    int contains_na = 0, retcode;
+    retcode = PyArray_GetArrayParamsFromObjectEx(op, requested_dtype,
+                        writeable, out_dtype, out_ndim, out_dims,
+                        &contains_na, out_arr, context);
+
+    /* If NAs were detected, switch to an NPY_OBJECT dtype */
+    if (retcode == 0 && *out_arr == NULL && contains_na) {
+        if ((*out_dtype)->type_num != NPY_OBJECT) {
+            Py_DECREF(*out_dtype);
+            *out_dtype = PyArray_DescrFromType(NPY_OBJECT);
+            if (*out_dtype == NULL) {
+                retcode = -1;
+            }
+        }
+    }
+
+    return retcode;
+}
+
+/*NUMPY_API
  * Does not check for NPY_ARRAY_ENSURECOPY and NPY_ARRAY_NOTSWAPPED in flags
  * Steals a reference to newtype --- which can be NULL
  */
@@ -1776,13 +1702,13 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
      */
     PyArrayObject *arr = NULL, *ret;
     PyArray_Descr *dtype = NULL;
-    int ndim = 0;
+    int ndim = 0, contains_na = 0;
     npy_intp dims[NPY_MAXDIMS];
 
     /* Get either the array or its parameters if it isn't an array */
-    if (PyArray_GetArrayParamsFromObject(op, newtype,
+    if (PyArray_GetArrayParamsFromObjectEx(op, newtype,
                         0, &dtype,
-                        &ndim, dims, &arr, context) < 0) {
+                        &ndim, dims, &contains_na, &arr, context) < 0) {
         Py_XDECREF(newtype);
         return NULL;
     }
@@ -1796,6 +1722,14 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
 
     /* If we got dimensions and dtype instead of an array */
     if (arr == NULL) {
+        /*
+         * If the input data contains any NAs, and the ALLOWNA flag is
+         * enabled, produce an array with an NA mask.
+         */
+        if (contains_na && (flags & NPY_ARRAY_ALLOWNA) != 0) {
+            flags |= NPY_ARRAY_MASKNA;
+        }
+
         if (flags & NPY_ARRAY_UPDATEIFCOPY) {
             Py_XDECREF(newtype);
             PyErr_SetString(PyExc_TypeError,
@@ -1849,26 +1783,63 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
                 Py_DECREF(dtype);
             }
 
+            /*
+             * If there are NAs, but no requested NA support,
+             * switch to NPY_OBJECT. Alternatively - raise an error?
+             */
+            if (contains_na &&
+                    (flags & (NPY_ARRAY_MASKNA | NPY_ARRAY_OWNMASKNA)) == 0) {
+                Py_DECREF(newtype);
+                newtype = PyArray_DescrFromType(NPY_OBJECT);
+                if (newtype == NULL) {
+                    return NULL;
+                }
+            }
+
             /* Create an array and copy the data */
             ret = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, newtype,
-                                                 ndim, dims,
-                                                 NULL, NULL,
-                                                 flags&NPY_ARRAY_F_CONTIGUOUS, NULL);
-            if (ret != NULL) {
-                if (ndim > 0) {
-                    if (PyArray_AssignFromSequence(ret, op) < 0) {
-                        Py_DECREF(ret);
-                        ret = NULL;
-                    }
+                                         ndim, dims,
+                                         NULL, NULL,
+                                         flags&NPY_ARRAY_F_CONTIGUOUS, NULL);
+            if (ret == NULL) {
+                return NULL;
+            }
+
+            /*
+             * Add an NA mask if requested, or if allowed and the data
+             * has NAs
+             */
+            if ((flags & (NPY_ARRAY_MASKNA | NPY_ARRAY_OWNMASKNA)) != 0) {
+                if (PyArray_AllocateMaskNA(ret,
+                                (flags&NPY_ARRAY_OWNMASKNA) != 0, 0, 1) < 0) {
+                    Py_DECREF(ret);
+                    return NULL;
                 }
-                else {
-                    if (PyArray_DESCR(ret)->f->setitem(op,
-                                                PyArray_DATA(ret), ret) < 0) {
-                        Py_DECREF(ret);
-                        ret = NULL;
+
+                /* Special case assigning a single NA */
+                if (ndim == 0) {
+                    NpyNA *na = NpyNA_FromObject(op, 1);
+                    if (na != NULL) {
+                        PyArray_MASKNA_DATA(ret)[0] =
+                                        (char)NpyNA_AsMaskValue(na);
+                        return (PyObject *)ret;
                     }
                 }
             }
+
+            if (ndim > 0) {
+                if (PyArray_AssignFromSequence(ret, op) < 0) {
+                    Py_DECREF(ret);
+                    ret = NULL;
+                }
+            }
+            else {
+                if (PyArray_DESCR(ret)->f->setitem(op,
+                                            PyArray_DATA(ret), ret) < 0) {
+                    Py_DECREF(ret);
+                    ret = NULL;
+                }
+            }
         }
     }
     else {
@@ -1952,7 +1923,7 @@ PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int min_depth,
             PyArray_DESCR_REPLACE(descr);
         }
         if (descr) {
-            descr->byteorder = PyArray_NATIVE;
+            descr->byteorder = NPY_NATIVE;
         }
     }
 
@@ -1962,10 +1933,10 @@ PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int min_depth,
     }
     if ((requires & NPY_ARRAY_ELEMENTSTRIDES) &&
         !PyArray_ElementStrides(obj)) {
-        PyObject *new;
-        new = PyArray_NewCopy((PyArrayObject *)obj, NPY_ANYORDER);
+        PyObject *ret;
+        ret = PyArray_NewCopy((PyArrayObject *)obj, NPY_ANYORDER);
         Py_DECREF(obj);
-        obj = new;
+        obj = ret;
     }
     return obj;
 }
@@ -1982,13 +1953,12 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags)
     int copy = 0;
     int arrflags;
     PyArray_Descr *oldtype;
-    PyTypeObject *subtype;
     NPY_CASTING casting = NPY_SAFE_CASTING;
 
     oldtype = PyArray_DESCR(arr);
-    subtype = Py_TYPE(arr);
     if (newtype == NULL) {
-        newtype = oldtype; Py_INCREF(oldtype);
+        newtype = oldtype;
+        Py_INCREF(oldtype);
     }
     itemsize = newtype->elsize;
     if (itemsize == 0) {
@@ -2025,133 +1995,141 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags)
         return NULL;
     }
 
-    /* Don't copy if sizes are compatible */
-    if ((flags & NPY_ARRAY_ENSURECOPY) ||
-                            PyArray_EquivTypes(oldtype, newtype)) {
-        arrflags = PyArray_FLAGS(arr);
-        if (PyArray_NDIM(arr) <= 1 && (flags & NPY_ARRAY_F_CONTIGUOUS)) {
-            flags |= NPY_ARRAY_C_CONTIGUOUS;
-        }
-        copy = (flags & NPY_ARRAY_ENSURECOPY) ||
-            ((flags & NPY_ARRAY_C_CONTIGUOUS) &&
-                    (!(arrflags & NPY_ARRAY_C_CONTIGUOUS)))
-            || ((flags & NPY_ARRAY_ALIGNED) &&
-                    (!(arrflags & NPY_ARRAY_ALIGNED)))
-            || (PyArray_NDIM(arr) > 1 &&
-                    ((flags & NPY_ARRAY_F_CONTIGUOUS) &&
-                    (!(arrflags & NPY_ARRAY_F_CONTIGUOUS))))
-            || ((flags & NPY_ARRAY_WRITEABLE) &&
-                    (!(arrflags & NPY_ARRAY_WRITEABLE)));
-
-        if (copy) {
-            if ((flags & NPY_ARRAY_UPDATEIFCOPY) &&
-                                (!PyArray_ISWRITEABLE(arr))) {
-                Py_DECREF(newtype);
-                PyErr_SetString(PyExc_ValueError,
-                        "cannot copy back to a read-only array");
-                return NULL;
-            }
-            if ((flags & NPY_ARRAY_ENSUREARRAY)) {
-                subtype = &PyArray_Type;
-            }
-            ret = (PyArrayObject *)
-                PyArray_NewFromDescr(subtype, newtype,
-                                     PyArray_NDIM(arr),
-                                     PyArray_DIMS(arr),
-                                     NULL, NULL,
-                                     flags & NPY_ARRAY_F_CONTIGUOUS,
-                                     (PyObject *)arr);
-            if (ret == NULL) {
-                return NULL;
-            }
-            if (PyArray_CopyInto(ret, arr) < 0) {
-                Py_DECREF(ret);
-                return NULL;
-            }
-            if (flags & NPY_ARRAY_UPDATEIFCOPY)  {
-                /*
-                 * Don't use PyArray_SetBaseObject, because that compresses
-                 * the chain of bases.
-                 */
-                Py_INCREF(arr);
-                ((PyArrayObject_fieldaccess *)ret)->base = (PyObject *)arr;
-                PyArray_ENABLEFLAGS(ret, NPY_ARRAY_UPDATEIFCOPY);
-                PyArray_CLEARFLAGS(arr, NPY_ARRAY_WRITEABLE);
-            }
-        }
-        /*
-         * If no copy then just increase the reference
-         * count and return the input
-         */
-        else {
-            Py_DECREF(newtype);
-            if ((flags & NPY_ARRAY_ENSUREARRAY) &&
-                                    !PyArray_CheckExact(arr)) {
-                PyArray_Descr *dtype = PyArray_DESCR(arr);
-                Py_INCREF(dtype);
-                ret = (PyArrayObject *)
-                    PyArray_NewFromDescr(&PyArray_Type,
-                                         dtype,
-                                         PyArray_NDIM(arr),
-                                         PyArray_DIMS(arr),
-                                         PyArray_STRIDES(arr),
-                                         PyArray_DATA(arr),
-                                         PyArray_FLAGS(arr),
-                                         NULL);
-                if (ret == NULL) {
-                    return NULL;
-                }
-                if (PyArray_SetBaseObject(ret, (PyObject *)arr)) {
-                    Py_DECREF(ret);
-                    return NULL;
-                }
-            }
-            else {
-                ret = arr;
-            }
-            Py_INCREF(arr);
+    arrflags = PyArray_FLAGS(arr);
+    if (PyArray_NDIM(arr) <= 1 && (flags & NPY_ARRAY_F_CONTIGUOUS)) {
+        flags |= NPY_ARRAY_C_CONTIGUOUS;
+    }
+           /* If a guaranteed copy was requested */
+    copy = (flags & NPY_ARRAY_ENSURECOPY) ||
+           /* If C contiguous was requested, and arr is not */
+           ((flags & NPY_ARRAY_C_CONTIGUOUS) &&
+                   (!(arrflags & NPY_ARRAY_C_CONTIGUOUS))) ||
+           /* If an aligned array was requested, and arr is not */
+           ((flags & NPY_ARRAY_ALIGNED) &&
+                   (!(arrflags & NPY_ARRAY_ALIGNED))) ||
+           /* If a Fortran contiguous array was requested, and arr is not */
+           (PyArray_NDIM(arr) > 1 &&
+                   ((flags & NPY_ARRAY_F_CONTIGUOUS) &&
+                   (!(arrflags & NPY_ARRAY_F_CONTIGUOUS)))) ||
+           /* If a writeable array was requested, and arr is not */
+           ((flags & NPY_ARRAY_WRITEABLE) &&
+                   (!(arrflags & NPY_ARRAY_WRITEABLE))) ||
+           /* If an array with no NA mask was requested, and arr has one */
+           ((flags & (NPY_ARRAY_ALLOWNA |
+                      NPY_ARRAY_MASKNA |
+                      NPY_ARRAY_OWNMASKNA)) == 0 &&
+                   (arrflags & NPY_ARRAY_MASKNA)) ||
+           !PyArray_EquivTypes(oldtype, newtype);
+
+    if (copy) {
+        NPY_ORDER order = NPY_KEEPORDER;
+        int subok = 1;
+
+        /* Set the order for the copy being made based on the flags */
+        if (flags & NPY_ARRAY_F_CONTIGUOUS) {
+            order = NPY_FORTRANORDER;
+        }
+        else if (flags & NPY_ARRAY_C_CONTIGUOUS) {
+            order = NPY_CORDER;
         }
-    }
 
-    /*
-     * The desired output type is different than the input
-     * array type and copy was not specified
-     */
-    else {
         if ((flags & NPY_ARRAY_UPDATEIFCOPY) &&
                             (!PyArray_ISWRITEABLE(arr))) {
             Py_DECREF(newtype);
             PyErr_SetString(PyExc_ValueError,
-                    "cannot copy back to a read-only array B");
+                    "cannot copy back to a read-only array");
             return NULL;
         }
         if ((flags & NPY_ARRAY_ENSUREARRAY)) {
-            subtype = &PyArray_Type;
+            subok = 0;
         }
-        ret = (PyArrayObject *)
-            PyArray_NewFromDescr(subtype, newtype,
-                                 PyArray_NDIM(arr), PyArray_DIMS(arr),
-                                 NULL, NULL,
-                                 flags & NPY_ARRAY_F_CONTIGUOUS,
-                                 (PyObject *)arr);
+        ret = (PyArrayObject *)PyArray_NewLikeArray(arr, order,
+                                                    newtype, subok);
         if (ret == NULL) {
             return NULL;
         }
-        if (PyArray_CastTo(ret, arr) < 0) {
+
+        /*
+         * Allocate an NA mask if necessary from the input,
+         * if NAs are being allowed.
+         */
+        if ((arrflags & NPY_ARRAY_MASKNA) && (flags & NPY_ARRAY_ALLOWNA)) {
+            if (PyArray_AllocateMaskNA(ret, 1, 0, 1) < 0) {
+                Py_DECREF(ret);
+                return NULL;
+            }
+        }
+
+        /*
+         * If ALLOWNA was not enabled, and 'arr' has an NA mask,
+         * this will raise an error if 'arr' contains any NA values.
+         */
+        if (PyArray_CopyInto(ret, arr) < 0) {
             Py_DECREF(ret);
             return NULL;
         }
+
+        /* Allocate an NA mask if requested but wasn't from the input */
+        if ((flags & (NPY_ARRAY_MASKNA | NPY_ARRAY_OWNMASKNA)) != 0 &&
+                            !PyArray_HASMASKNA(ret)) {
+            if (PyArray_AllocateMaskNA(ret, 1, 0, 1) < 0) {
+                Py_DECREF(ret);
+                return NULL;
+            }
+        }
+
         if (flags & NPY_ARRAY_UPDATEIFCOPY)  {
             /*
              * Don't use PyArray_SetBaseObject, because that compresses
              * the chain of bases.
              */
             Py_INCREF(arr);
-            ((PyArrayObject_fieldaccess *)ret)->base = (PyObject *)arr;
+            ((PyArrayObject_fields *)ret)->base = (PyObject *)arr;
             PyArray_ENABLEFLAGS(ret, NPY_ARRAY_UPDATEIFCOPY);
             PyArray_CLEARFLAGS(arr, NPY_ARRAY_WRITEABLE);
         }
     }
+    /*
+     * If no copy then take an appropriate view if necessary, or
+     * just return a reference to arr itself.
+     */
+    else {
+        int needview = ((flags & NPY_ARRAY_ENSUREARRAY) &&
+                            !PyArray_CheckExact(arr)) ||
+                       ((flags & NPY_ARRAY_MASKNA) &&
+                            !(arrflags & NPY_ARRAY_MASKNA)) ||
+                       ((flags & NPY_ARRAY_OWNMASKNA) &&
+                            !(arrflags & NPY_ARRAY_OWNMASKNA));
+
+        Py_DECREF(newtype);
+        if (needview) {
+            PyArray_Descr *dtype = PyArray_DESCR(arr);
+            PyTypeObject *subtype = NULL;
+
+            if (flags & NPY_ARRAY_ENSUREARRAY) {
+                subtype = &PyArray_Type;
+            }
+
+            Py_INCREF(dtype);
+            ret = (PyArrayObject *)PyArray_View(arr, NULL, subtype);
+            if (ret == NULL) {
+                return NULL;
+            }
+
+            if (flags & (NPY_ARRAY_MASKNA | NPY_ARRAY_OWNMASKNA)) {
+                int ownmaskna = (flags & NPY_ARRAY_OWNMASKNA) != 0;
+                if (PyArray_AllocateMaskNA(ret, ownmaskna, 0, 1) < 0) {
+                    Py_DECREF(ret);
+                    return NULL;
+                }
+            }
+        }
+        else {
+            Py_INCREF(arr);
+            ret = arr;
+        }
+    }
+
     return (PyObject *)ret;
 }
 
@@ -2233,7 +2211,7 @@ PyArray_FromInterface(PyObject *input)
     char *data;
     Py_ssize_t buffer_len;
     int res, i, n;
-    intp dims[MAX_DIMS], strides[MAX_DIMS];
+    intp dims[NPY_MAXDIMS], strides[NPY_MAXDIMS];
     int dataflags = NPY_ARRAY_BEHAVED;
 
     /* Get the memory from __array_data__ and __array_offset__ */
@@ -2474,7 +2452,26 @@ PyArray_FromArrayAttr(PyObject *op, PyArray_Descr *typecode, PyObject *context)
 NPY_NO_EXPORT PyArray_Descr *
 PyArray_DescrFromObject(PyObject *op, PyArray_Descr *mintype)
 {
-    return _array_find_type(op, mintype, MAX_DIMS);
+    PyArray_Descr *dtype;
+    int contains_na = 0;
+
+    dtype = mintype;
+    Py_XINCREF(dtype);
+
+    if (PyArray_DTypeFromObject(op, NPY_MAXDIMS, &contains_na, &dtype) < 0) {
+        return NULL;
+    }
+
+    if (contains_na) {
+        Py_XDECREF(dtype);
+        return PyArray_DescrFromType(NPY_OBJECT);
+    }
+    else if (dtype == NULL) {
+        return PyArray_DescrFromType(NPY_DEFAULT_TYPE);
+    }
+    else {
+        return dtype;
+    }
 }
 
 /* These are also old calls (should use PyArray_NewFromDescr) */
@@ -2493,7 +2490,7 @@ PyArray_FromDimsAndDataAndDescr(int nd, int *d,
 {
     PyObject *ret;
     int i;
-    npy_intp newd[MAX_DIMS];
+    npy_intp newd[NPY_MAXDIMS];
     char msg[] = "PyArray_FromDimsAndDataAndDescr: use PyArray_NewFromDescr.";
 
     if (DEPRECATE(msg) < 0) {
@@ -2581,23 +2578,28 @@ PyArray_EnsureAnyArray(PyObject *op)
 
 /* TODO: Put the order parameter in PyArray_CopyAnyInto and remove this */
 NPY_NO_EXPORT int
-PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src,
-                                NPY_ORDER order)
+PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
 {
-    PyArray_StridedTransferFn *stransfer = NULL;
+    PyArray_StridedUnaryOp *stransfer = NULL;
+    PyArray_MaskedStridedUnaryOp *maskedstransfer = NULL;
     NpyAuxData *transferdata = NULL;
+    PyArray_StridedUnaryOp *maskna_stransfer = NULL;
+    NpyAuxData *maskna_transferdata = NULL;
     NpyIter *dst_iter, *src_iter;
 
     NpyIter_IterNextFunc *dst_iternext, *src_iternext;
     char **dst_dataptr, **src_dataptr;
     npy_intp dst_stride, src_stride;
+    npy_intp maskna_src_stride = 0, maskna_dst_stride = 0;
     npy_intp *dst_countptr, *src_countptr;
+    npy_uint32 baseflags;
 
     char *dst_data, *src_data;
+    char *maskna_dst_data = NULL, *maskna_src_data = NULL;
     npy_intp dst_count, src_count, count;
-    npy_intp src_itemsize;
+    npy_intp src_itemsize, maskna_src_itemsize = 0;
     npy_intp dst_size, src_size;
-    int needs_api;
+    int needs_api, use_maskna = 0;
 
     NPY_BEGIN_THREADS_DEF;
 
@@ -2632,26 +2634,58 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src,
         return 0;
     }
 
+    baseflags = NPY_ITER_EXTERNAL_LOOP |
+                NPY_ITER_DONT_NEGATE_STRIDES |
+                NPY_ITER_REFS_OK;
+
+    /*
+     * If 'src' has a mask and 'dst' doesn't, we need to validate that
+     * 'src' has everything exposed. If both 'src' and 'dst' have masks,
+     * the mask needs to be copied as well.
+     */
+    if (PyArray_HASMASKNA(src)) {
+        if (PyArray_HASMASKNA(dst)) {
+            use_maskna = 1;
+            baseflags |= NPY_ITER_USE_MASKNA;
+        }
+        else {
+            int containsna = PyArray_ContainsNA(src, NULL, NULL);
+            if (containsna == -1) {
+                return -1;
+            }
+            else if (containsna) {
+                PyErr_SetString(PyExc_ValueError,
+                        "Cannot assign NA to an array which "
+                        "does not support NAs");
+                return -1;
+            }
+            baseflags |= NPY_ITER_IGNORE_MASKNA;
+        }
+    }
+    /*
+     * If 'dst' has a mask but 'src' doesn't, set all of 'dst'
+     * to be exposed, then proceed without worrying about the mask.
+     */
+    else if (PyArray_HASMASKNA(dst)) {
+        if (PyArray_AssignMaskNA(dst, 1, NULL, 0, NULL) < 0) {
+            return -1;
+        }
+        baseflags |= NPY_ITER_IGNORE_MASKNA;
+    }
 
     /*
      * This copy is based on matching C-order traversals of src and dst.
      * By using two iterators, we can find maximal sub-chunks that
      * can be processed at once.
      */
-    dst_iter = NpyIter_New(dst, NPY_ITER_WRITEONLY|
-                                NPY_ITER_EXTERNAL_LOOP|
-                                NPY_ITER_DONT_NEGATE_STRIDES|
-                                NPY_ITER_REFS_OK,
+    dst_iter = NpyIter_New(dst, NPY_ITER_WRITEONLY | baseflags,
                                 order,
                                 NPY_NO_CASTING,
                                 NULL);
     if (dst_iter == NULL) {
         return -1;
     }
-    src_iter = NpyIter_New(src, NPY_ITER_READONLY|
-                                NPY_ITER_EXTERNAL_LOOP|
-                                NPY_ITER_DONT_NEGATE_STRIDES|
-                                NPY_ITER_REFS_OK,
+    src_iter = NpyIter_New(src, NPY_ITER_READONLY | baseflags,
                                 order,
                                 NPY_NO_CASTING,
                                 NULL);
@@ -2664,14 +2698,21 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src,
     dst_iternext = NpyIter_GetIterNext(dst_iter, NULL);
     dst_dataptr = NpyIter_GetDataPtrArray(dst_iter);
     /* Since buffering is disabled, we can cache the stride */
-    dst_stride = *NpyIter_GetInnerStrideArray(dst_iter);
+    dst_stride = NpyIter_GetInnerStrideArray(dst_iter)[0];
     dst_countptr = NpyIter_GetInnerLoopSizePtr(dst_iter);
 
     src_iternext = NpyIter_GetIterNext(src_iter, NULL);
     src_dataptr = NpyIter_GetDataPtrArray(src_iter);
     /* Since buffering is disabled, we can cache the stride */
-    src_stride = *NpyIter_GetInnerStrideArray(src_iter);
+    src_stride = NpyIter_GetInnerStrideArray(src_iter)[0];
     src_countptr = NpyIter_GetInnerLoopSizePtr(src_iter);
+    src_itemsize = PyArray_DESCR(src)->elsize;
+
+    if (use_maskna) {
+        maskna_src_stride = NpyIter_GetInnerStrideArray(src_iter)[1];
+        maskna_dst_stride = NpyIter_GetInnerStrideArray(dst_iter)[1];
+        maskna_src_itemsize = PyArray_MASKNA_DTYPE(src)->elsize;
+    }
 
     if (dst_iternext == NULL || src_iternext == NULL) {
         NpyIter_Deallocate(dst_iter);
@@ -2679,8 +2720,6 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src,
         return -1;
     }
 
-    src_itemsize = PyArray_DESCR(src)->elsize;
-
     needs_api = NpyIter_IterationNeedsAPI(dst_iter) ||
                 NpyIter_IterationNeedsAPI(src_iter);
 
@@ -2690,18 +2729,49 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src,
      * we can pass them to this function to take advantage of
      * contiguous strides, etc.
      */
-    if (PyArray_GetDTypeTransferFunction(
-                    PyArray_ISALIGNED(src) && PyArray_ISALIGNED(dst),
-                    src_stride, dst_stride,
-                    PyArray_DESCR(src), PyArray_DESCR(dst),
-                    0,
-                    &stransfer, &transferdata,
-                    &needs_api) != NPY_SUCCEED) {
-        NpyIter_Deallocate(dst_iter);
-        NpyIter_Deallocate(src_iter);
-        return -1;
+    if (!use_maskna) {
+        if (PyArray_GetDTypeTransferFunction(
+                        PyArray_ISALIGNED(src) && PyArray_ISALIGNED(dst),
+                        src_stride, dst_stride,
+                        PyArray_DESCR(src), PyArray_DESCR(dst),
+                        0,
+                        &stransfer, &transferdata,
+                        &needs_api) != NPY_SUCCEED) {
+            NpyIter_Deallocate(dst_iter);
+            NpyIter_Deallocate(src_iter);
+            return -1;
+        }
     }
+    else {
+        if (PyArray_GetMaskedDTypeTransferFunction(
+                        PyArray_ISALIGNED(src) && PyArray_ISALIGNED(dst),
+                        src_stride,
+                        dst_stride,
+                        maskna_src_stride,
+                        PyArray_DESCR(src),
+                        PyArray_DESCR(dst),
+                        PyArray_MASKNA_DTYPE(src),
+                        0,
+                        &maskedstransfer, &transferdata,
+                        &needs_api) != NPY_SUCCEED) {
+            NpyIter_Deallocate(dst_iter);
+            NpyIter_Deallocate(src_iter);
+            return -1;
+        }
 
+        /* Also need a transfer function for the mask itself */
+        if (PyArray_GetDTypeTransferFunction(1,
+                        maskna_src_stride, maskna_dst_stride,
+                        PyArray_MASKNA_DTYPE(src), PyArray_MASKNA_DTYPE(dst),
+                        0,
+                        &maskna_stransfer, &maskna_transferdata,
+                        &needs_api) != NPY_SUCCEED) {
+            NPY_AUXDATA_FREE(transferdata);
+            NpyIter_Deallocate(dst_iter);
+            NpyIter_Deallocate(src_iter);
+            return -1;
+        }
+    }
 
     if (!needs_api) {
         NPY_BEGIN_THREADS;
@@ -2709,43 +2779,90 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src,
 
     dst_count = *dst_countptr;
     src_count = *src_countptr;
-    dst_data = *dst_dataptr;
-    src_data = *src_dataptr;
+    dst_data = dst_dataptr[0];
+    src_data = src_dataptr[0];
     /*
      * The tests did not trigger this code, so added a new function
      * ndarray.setasflat to the Python exposure in order to test it.
      */
-    for(;;) {
-        /* Transfer the biggest amount that fits both */
-        count = (src_count < dst_count) ? src_count : dst_count;
-        stransfer(dst_data, dst_stride,
-                    src_data, src_stride,
-                    count, src_itemsize, transferdata);
-
-        /* If we exhausted the dst block, refresh it */
-        if (dst_count == count) {
-            if (!dst_iternext(dst_iter)) {
-                break;
+    if (!use_maskna) {
+        for(;;) {
+            /* Transfer the biggest amount that fits both */
+            count = (src_count < dst_count) ? src_count : dst_count;
+            stransfer(dst_data, dst_stride,
+                        src_data, src_stride,
+                        count, src_itemsize, transferdata);
+
+            /* If we exhausted the dst block, refresh it */
+            if (dst_count == count) {
+                if (!dst_iternext(dst_iter)) {
+                    break;
+                }
+                dst_count = *dst_countptr;
+                dst_data = dst_dataptr[0];
+            }
+            else {
+                dst_count -= count;
+                dst_data += count*dst_stride;
             }
-            dst_count = *dst_countptr;
-            dst_data = *dst_dataptr;
-        }
-        else {
-            dst_count -= count;
-            dst_data += count*dst_stride;
-        }
 
-        /* If we exhausted the src block, refresh it */
-        if (src_count == count) {
-            if (!src_iternext(src_iter)) {
-                break;
+            /* If we exhausted the src block, refresh it */
+            if (src_count == count) {
+                if (!src_iternext(src_iter)) {
+                    break;
+                }
+                src_count = *src_countptr;
+                src_data = src_dataptr[0];
+            }
+            else {
+                src_count -= count;
+                src_data += count*src_stride;
             }
-            src_count = *src_countptr;
-            src_data = *src_dataptr;
         }
-        else {
-            src_count -= count;
-            src_data += count*src_stride;
+    }
+    else {
+        maskna_src_data = src_dataptr[1];
+        maskna_dst_data = dst_dataptr[1];
+        for(;;) {
+            /* Transfer the biggest amount that fits both */
+            count = (src_count < dst_count) ? src_count : dst_count;
+            maskedstransfer(dst_data, dst_stride,
+                        src_data, src_stride,
+                        (npy_mask *)maskna_src_data, maskna_src_stride,
+                        count, src_itemsize, transferdata);
+            maskna_stransfer(maskna_dst_data, maskna_dst_stride,
+                        maskna_src_data, maskna_src_stride,
+                        count, maskna_src_itemsize, maskna_transferdata);
+
+            /* If we exhausted the dst block, refresh it */
+            if (dst_count == count) {
+                if (!dst_iternext(dst_iter)) {
+                    break;
+                }
+                dst_count = *dst_countptr;
+                dst_data = dst_dataptr[0];
+                maskna_dst_data = dst_dataptr[1];
+            }
+            else {
+                dst_count -= count;
+                dst_data += count*dst_stride;
+                maskna_dst_data += count*maskna_dst_stride;
+            }
+
+            /* If we exhausted the src block, refresh it */
+            if (src_count == count) {
+                if (!src_iternext(src_iter)) {
+                    break;
+                }
+                src_count = *src_countptr;
+                src_data = src_dataptr[0];
+                maskna_src_data = src_dataptr[1];
+            }
+            else {
+                src_count -= count;
+                src_data += count*src_stride;
+                maskna_src_data += count*maskna_src_stride;
+            }
         }
     }
 
@@ -2754,6 +2871,7 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src,
     }
 
     NPY_AUXDATA_FREE(transferdata);
+    NPY_AUXDATA_FREE(maskna_transferdata);
     NpyIter_Deallocate(dst_iter);
     NpyIter_Deallocate(src_iter);
 
@@ -2778,7 +2896,7 @@ PyArray_CopyAnyInto(PyArrayObject *dst, PyArrayObject *src)
 }
 
 /*NUMPY_API
- * Copy an Array into another array -- memory must not overlap.
+ * Copy an Array into another array.
  * Broadcast to the destination shape if necessary.
  *
  * Returns 0 on success, -1 on failure.
@@ -2786,381 +2904,20 @@ PyArray_CopyAnyInto(PyArrayObject *dst, PyArrayObject *src)
 NPY_NO_EXPORT int
 PyArray_CopyInto(PyArrayObject *dst, PyArrayObject *src)
 {
-    PyArray_StridedTransferFn *stransfer = NULL;
-    NpyAuxData *transferdata = NULL;
-    NPY_BEGIN_THREADS_DEF;
-
-    if (!PyArray_ISWRITEABLE(dst)) {
-        PyErr_SetString(PyExc_RuntimeError,
-                "cannot write to array");
-        return -1;
-    }
-
-    if (PyArray_NDIM(dst) >= PyArray_NDIM(src) &&
-                            PyArray_TRIVIALLY_ITERABLE_PAIR(dst, src)) {
-        char *dst_data, *src_data;
-        npy_intp count, dst_stride, src_stride, src_itemsize;
-
-        int needs_api = 0;
-
-        PyArray_PREPARE_TRIVIAL_PAIR_ITERATION(dst, src, count,
-                              dst_data, src_data, dst_stride, src_stride);
-
-        /*
-         * Check for overlap with positive strides, and if found,
-         * possibly reverse the order
-         */
-        if (dst_data > src_data && src_stride > 0 && dst_stride > 0 &&
-                        (dst_data < src_data+src_stride*count) &&
-                        (src_data < dst_data+dst_stride*count)) {
-            dst_data += dst_stride*(count-1);
-            src_data += src_stride*(count-1);
-            dst_stride = -dst_stride;
-            src_stride = -src_stride;
-        }
-
-        if (PyArray_GetDTypeTransferFunction(
-                        PyArray_ISALIGNED(src) && PyArray_ISALIGNED(dst),
-                        src_stride, dst_stride,
-                        PyArray_DESCR(src), PyArray_DESCR(dst),
-                        0,
-                        &stransfer, &transferdata,
-                        &needs_api) != NPY_SUCCEED) {
-            return -1;
-        }
-
-        src_itemsize = PyArray_DESCR(src)->elsize;
-
-        if (!needs_api) {
-            NPY_BEGIN_THREADS;
-        }
-
-        stransfer(dst_data, dst_stride, src_data, src_stride,
-                    count, src_itemsize, transferdata);
-
-        if (!needs_api) {
-            NPY_END_THREADS;
-        }
-
-        NPY_AUXDATA_FREE(transferdata);
-
-        return PyErr_Occurred() ? -1 : 0;
-    }
-    else {
-        PyArrayObject *op[2];
-        npy_uint32 op_flags[2];
-        PyArray_Descr *op_dtypes_values[2], **op_dtypes = NULL;
-        NpyIter *iter;
-        npy_intp src_size;
-
-        NpyIter_IterNextFunc *iternext;
-        char **dataptr;
-        npy_intp *stride;
-        npy_intp *countptr;
-        npy_intp src_itemsize;
-        int needs_api;
-
-        op[0] = dst;
-        op[1] = src;
-        /*
-         * TODO: In NumPy 2.0, reenable NPY_ITER_NO_BROADCAST. This
-         *       was removed during NumPy 1.6 testing for compatibility
-         *       with NumPy 1.5, as per Travis's -10 veto power.
-         */
-        /*op_flags[0] = NPY_ITER_WRITEONLY|NPY_ITER_NO_BROADCAST;*/
-        op_flags[0] = NPY_ITER_WRITEONLY;
-        op_flags[1] = NPY_ITER_READONLY;
-
-        /*
-         * If 'src' is being broadcast to 'dst', and it is smaller
-         * than the default NumPy buffer size, allow the iterator to
-         * make a copy of 'src' with the 'dst' dtype if necessary.
-         *
-         * This is a performance operation, to allow fewer casts followed
-         * by more plain copies.
-         */
-        src_size = PyArray_SIZE(src);
-        if (src_size <= NPY_BUFSIZE && src_size < PyArray_SIZE(dst)) {
-            op_flags[1] |= NPY_ITER_COPY;
-            op_dtypes = op_dtypes_values;
-            op_dtypes_values[0] = NULL;
-            op_dtypes_values[1] = PyArray_DESCR(dst);
-        }
-
-        iter = NpyIter_MultiNew(2, op,
-                            NPY_ITER_EXTERNAL_LOOP|
-                            NPY_ITER_REFS_OK|
-                            NPY_ITER_ZEROSIZE_OK,
-                            NPY_KEEPORDER,
-                            NPY_UNSAFE_CASTING,
-                            op_flags,
-                            op_dtypes);
-        if (iter == NULL) {
-            return -1;
-        }
-
-        iternext = NpyIter_GetIterNext(iter, NULL);
-        if (iternext == NULL) {
-            NpyIter_Deallocate(iter);
-            return -1;
-        }
-        dataptr = NpyIter_GetDataPtrArray(iter);
-        stride = NpyIter_GetInnerStrideArray(iter);
-        countptr = NpyIter_GetInnerLoopSizePtr(iter);
-        src_itemsize = PyArray_DESCR(src)->elsize;
-
-        needs_api = NpyIter_IterationNeedsAPI(iter);
-
-        /*
-         * Because buffering is disabled in the iterator, the inner loop
-         * strides will be the same throughout the iteration loop.  Thus,
-         * we can pass them to this function to take advantage of
-         * contiguous strides, etc.
-         */
-        if (PyArray_GetDTypeTransferFunction(
-                        PyArray_ISALIGNED(src) && PyArray_ISALIGNED(dst),
-                        stride[1], stride[0],
-                        NpyIter_GetDescrArray(iter)[1], PyArray_DESCR(dst),
-                        0,
-                        &stransfer, &transferdata,
-                        &needs_api) != NPY_SUCCEED) {
-            NpyIter_Deallocate(iter);
-            return -1;
-        }
-
-
-        if (NpyIter_GetIterSize(iter) != 0) {
-            if (!needs_api) {
-                NPY_BEGIN_THREADS;
-            }
-
-            do {
-                stransfer(dataptr[0], stride[0],
-                            dataptr[1], stride[1],
-                            *countptr, src_itemsize, transferdata);
-            } while(iternext(iter));
-
-            if (!needs_api) {
-                NPY_END_THREADS;
-            }
-        }
-
-        NPY_AUXDATA_FREE(transferdata);
-        NpyIter_Deallocate(iter);
-
-        return PyErr_Occurred() ? -1 : 0;
-    }
+    return PyArray_AssignArray(dst, src, NULL, NPY_UNSAFE_CASTING, 0, NULL);
 }
 
 /*NUMPY_API
- * Copy an Array into another array, wherever the mask specifies.
- * The memory of src and dst must not overlap.
- *
- * Broadcast to the destination shape if necessary.
+ * Move the memory of one array into another, allowing for overlapping data.
  *
- * Returns 0 on success, -1 on failure.
+ * Returns 0 on success, negative on failure.
  */
 NPY_NO_EXPORT int
-PyArray_MaskedCopyInto(PyArrayObject *dst, PyArrayObject *src,
-                        PyArrayObject *mask, NPY_CASTING casting)
+PyArray_MoveInto(PyArrayObject *dst, PyArrayObject *src)
 {
-    PyArray_MaskedStridedTransferFn *stransfer = NULL;
-    NpyAuxData *transferdata = NULL;
-    NPY_BEGIN_THREADS_DEF;
-
-    if (!PyArray_ISWRITEABLE(dst)) {
-        PyErr_SetString(PyExc_RuntimeError,
-                "cannot write to array");
-        return -1;
-    }
-
-    if (!PyArray_CanCastArrayTo(src, PyArray_DESCR(dst), casting)) {
-        PyObject *errmsg;
-        errmsg = PyUString_FromString("Cannot cast array data from ");
-        PyUString_ConcatAndDel(&errmsg,
-                PyObject_Repr((PyObject *)PyArray_DESCR(src)));
-        PyUString_ConcatAndDel(&errmsg,
-                PyUString_FromString(" to "));
-        PyUString_ConcatAndDel(&errmsg,
-                PyObject_Repr((PyObject *)PyArray_DESCR(dst)));
-        PyUString_ConcatAndDel(&errmsg,
-                PyUString_FromFormat(" according to the rule %s",
-                        npy_casting_to_string(casting)));
-        PyErr_SetObject(PyExc_TypeError, errmsg);
-        return -1;
-    }
-
-
-    if (PyArray_NDIM(dst) >= PyArray_NDIM(src) &&
-                        PyArray_NDIM(dst) >= PyArray_NDIM(mask) &&
-                        PyArray_TRIVIALLY_ITERABLE_TRIPLE(dst, src, mask)) {
-        char *dst_data, *src_data, *mask_data;
-        npy_intp count, dst_stride, src_stride, src_itemsize, mask_stride;
-
-        int needs_api = 0;
-
-        PyArray_PREPARE_TRIVIAL_TRIPLE_ITERATION(dst, src, mask, count,
-                              dst_data, src_data, mask_data,
-                              dst_stride, src_stride, mask_stride);
-
-        /*
-         * Check for overlap with positive strides, and if found,
-         * possibly reverse the order
-         */
-        if (dst_data > src_data && src_stride > 0 && dst_stride > 0 &&
-                        (dst_data < src_data+src_stride*count) &&
-                        (src_data < dst_data+dst_stride*count)) {
-            dst_data += dst_stride*(count-1);
-            src_data += src_stride*(count-1);
-            mask_data += mask_stride*(count-1);
-            dst_stride = -dst_stride;
-            src_stride = -src_stride;
-            mask_stride = -mask_stride;
-        }
-
-        if (PyArray_GetMaskedDTypeTransferFunction(
-                        PyArray_ISALIGNED(src) && PyArray_ISALIGNED(dst),
-                        src_stride, dst_stride, mask_stride,
-                        PyArray_DESCR(src),
-                        PyArray_DESCR(dst),
-                        PyArray_DESCR(mask),
-                        0,
-                        &stransfer, &transferdata,
-                        &needs_api) != NPY_SUCCEED) {
-            return -1;
-        }
-
-        src_itemsize = PyArray_DESCR(src)->elsize;
-
-        if (!needs_api) {
-            NPY_BEGIN_THREADS;
-        }
-
-        stransfer(dst_data, dst_stride, src_data, src_stride,
-                    (npy_uint8 *)mask_data, mask_stride,
-                    count, src_itemsize, transferdata);
-
-        if (!needs_api) {
-            NPY_END_THREADS;
-        }
-
-        NPY_AUXDATA_FREE(transferdata);
-
-        return PyErr_Occurred() ? -1 : 0;
-    }
-    else {
-        PyArrayObject *op[3];
-        npy_uint32 op_flags[3];
-        PyArray_Descr *op_dtypes_values[3], **op_dtypes = NULL;
-        NpyIter *iter;
-        npy_intp src_size;
-
-        NpyIter_IterNextFunc *iternext;
-        char **dataptr;
-        npy_intp *stride;
-        npy_intp *countptr;
-        npy_intp src_itemsize;
-        int needs_api;
-
-        op[0] = dst;
-        op[1] = src;
-        op[2] = mask;
-        /*
-         * TODO: In NumPy 2.0, renable NPY_ITER_NO_BROADCAST. This
-         *       was removed during NumPy 1.6 testing for compatibility
-         *       with NumPy 1.5, as per Travis's -10 veto power.
-         */
-        /*op_flags[0] = NPY_ITER_WRITEONLY|NPY_ITER_NO_BROADCAST;*/
-        op_flags[0] = NPY_ITER_WRITEONLY;
-        op_flags[1] = NPY_ITER_READONLY;
-        op_flags[2] = NPY_ITER_READONLY;
-
-        /*
-         * If 'src' is being broadcast to 'dst', and it is smaller
-         * than the default NumPy buffer size, allow the iterator to
-         * make a copy of 'src' with the 'dst' dtype if necessary.
-         *
-         * This is a performance operation, to allow fewer casts followed
-         * by more plain copies.
-         */
-        src_size = PyArray_SIZE(src);
-        if (src_size <= NPY_BUFSIZE && src_size < PyArray_SIZE(dst)) {
-            op_flags[1] |= NPY_ITER_COPY;
-            op_dtypes = op_dtypes_values;
-            op_dtypes_values[0] = NULL;
-            op_dtypes_values[1] = PyArray_DESCR(dst);
-            op_dtypes_values[2] = NULL;
-        }
-
-        iter = NpyIter_MultiNew(3, op,
-                            NPY_ITER_EXTERNAL_LOOP|
-                            NPY_ITER_REFS_OK|
-                            NPY_ITER_ZEROSIZE_OK,
-                            NPY_KEEPORDER,
-                            NPY_UNSAFE_CASTING,
-                            op_flags,
-                            op_dtypes);
-        if (iter == NULL) {
-            return -1;
-        }
-
-        iternext = NpyIter_GetIterNext(iter, NULL);
-        if (iternext == NULL) {
-            NpyIter_Deallocate(iter);
-            return -1;
-        }
-        dataptr = NpyIter_GetDataPtrArray(iter);
-        stride = NpyIter_GetInnerStrideArray(iter);
-        countptr = NpyIter_GetInnerLoopSizePtr(iter);
-        src_itemsize = PyArray_DESCR(src)->elsize;
-
-        needs_api = NpyIter_IterationNeedsAPI(iter);
-
-        /*
-         * Because buffering is disabled in the iterator, the inner loop
-         * strides will be the same throughout the iteration loop.  Thus,
-         * we can pass them to this function to take advantage of
-         * contiguous strides, etc.
-         */
-        if (PyArray_GetMaskedDTypeTransferFunction(
-                        PyArray_ISALIGNED(src) && PyArray_ISALIGNED(dst),
-                        stride[1], stride[0], stride[2],
-                        NpyIter_GetDescrArray(iter)[1],
-                        PyArray_DESCR(dst),
-                        PyArray_DESCR(mask),
-                        0,
-                        &stransfer, &transferdata,
-                        &needs_api) != NPY_SUCCEED) {
-            NpyIter_Deallocate(iter);
-            return -1;
-        }
-
-
-        if (NpyIter_GetIterSize(iter) != 0) {
-            if (!needs_api) {
-                NPY_BEGIN_THREADS;
-            }
-
-            do {
-                stransfer(dataptr[0], stride[0],
-                            dataptr[1], stride[1],
-                            (npy_uint8 *)dataptr[2], stride[2],
-                            *countptr, src_itemsize, transferdata);
-            } while(iternext(iter));
-
-            if (!needs_api) {
-                NPY_END_THREADS;
-            }
-        }
-
-        NPY_AUXDATA_FREE(transferdata);
-        NpyIter_Deallocate(iter);
-
-        return PyErr_Occurred() ? -1 : 0;
-    }
+    return PyArray_AssignArray(dst, src, NULL, NPY_UNSAFE_CASTING, 0, NULL);
 }
 
-
 /*NUMPY_API
  * PyArray_CheckAxis
  *
@@ -3173,14 +2930,14 @@ PyArray_CheckAxis(PyArrayObject *arr, int *axis, int flags)
     PyObject *temp1, *temp2;
     int n = PyArray_NDIM(arr);
 
-    if (*axis == MAX_DIMS || n == 0) {
+    if (*axis == NPY_MAXDIMS || n == 0) {
         if (n != 1) {
             temp1 = PyArray_Ravel(arr,0);
             if (temp1 == NULL) {
                 *axis = 0;
                 return NULL;
             }
-            if (*axis == MAX_DIMS) {
+            if (*axis == NPY_MAXDIMS) {
                 *axis = PyArray_NDIM((PyArrayObject *)temp1)-1;
             }
         }
@@ -3571,7 +3328,7 @@ PyArray_ArangeObj(PyObject *start, PyObject *stop, PyObject *step, PyArray_Descr
         Py_DECREF(new);
         Py_DECREF(PyArray_DESCR(range));
         /* steals the reference */
-        ((PyArrayObject_fieldaccess *)range)->descr = dtype;
+        ((PyArrayObject_fields *)range)->descr = dtype;
     }
     Py_DECREF(start);
     Py_DECREF(step);
@@ -3696,7 +3453,7 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char *sep, size_t *nread,
                 err = 1;
                 break;
             }
-            ((PyArrayObject_fieldaccess *)r)->data = tmp;
+            ((PyArrayObject_fields *)r)->data = tmp;
             dptr = tmp + (totalbytes - bytes);
             thisbuf = 0;
         }
@@ -3711,7 +3468,7 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char *sep, size_t *nread,
         }
         else {
             PyArray_DIMS(r)[0] = *nread;
-            ((PyArrayObject_fieldaccess *)r)->data = tmp;
+            ((PyArrayObject_fields *)r)->data = tmp;
         }
     }
     NPY_END_ALLOW_THREADS;
@@ -3791,7 +3548,7 @@ PyArray_FromFile(FILE *fp, PyArray_Descr *dtype, npy_intp num, char *sep)
             Py_DECREF(ret);
             return PyErr_NoMemory();
         }
-        ((PyArrayObject_fieldaccess *)ret)->data = tmp;
+        ((PyArrayObject_fields *)ret)->data = tmp;
         PyArray_DIMS(ret)[0] = nread;
     }
     return (PyObject *)ret;
@@ -4075,7 +3832,7 @@ PyArray_FromIter(PyObject *obj, PyArray_Descr *dtype, npy_intp count)
                 Py_DECREF(value);
                 goto done;
             }
-            ((PyArrayObject_fieldaccess *)ret)->data = new_data;
+            ((PyArrayObject_fields *)ret)->data = new_data;
         }
         PyArray_DIMS(ret)[0] = i + 1;
 
@@ -4104,7 +3861,7 @@ PyArray_FromIter(PyObject *obj, PyArray_Descr *dtype, npy_intp count)
         PyErr_SetString(PyExc_MemoryError, "cannot allocate array memory");
         goto done;
     }
-    ((PyArrayObject_fieldaccess *)ret)->data = new_data;
+    ((PyArrayObject_fields *)ret)->data = new_data;
 
  done:
     Py_XDECREF(iter);
@@ -4168,3 +3925,26 @@ _array_fill_strides(npy_intp *strides, npy_intp *dims, int nd, size_t itemsize,
     }
     return itemsize;
 }
+
+/*
+ * Calls arr_of_subclass.__array_wrap__(towrap), in order to make 'towrap'
+ * have the same ndarray subclass as 'arr_of_subclass'.
+ */
+NPY_NO_EXPORT PyArrayObject *
+PyArray_SubclassWrap(PyArrayObject *arr_of_subclass, PyArrayObject *towrap)
+{
+    PyObject *wrapped = PyObject_CallMethod((PyObject *)arr_of_subclass,
+                                        "__array_wrap__", "O", towrap);
+    if (wrapped == NULL) {
+        return NULL;
+    }
+    if (!PyArray_Check(wrapped)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "ndarray subclass __array_wrap__ method returned an "
+                "object which was not an instance of an ndarray subclass");
+        Py_DECREF(wrapped);
+        return NULL;
+    }
+
+    return (PyArrayObject *)wrapped;
+}
diff --git a/numpy/core/src/multiarray/ctors.h b/numpy/core/src/multiarray/ctors.h
index ed7b72980..e12d153ca 100644
--- a/numpy/core/src/multiarray/ctors.h
+++ b/numpy/core/src/multiarray/ctors.h
@@ -72,4 +72,32 @@ byte_swap_vector(void *p, intp n, int size);
 NPY_NO_EXPORT int
 PyArray_AssignFromSequence(PyArrayObject *self, PyObject *v);
 
+/*
+ * A slight generalization of PyArray_GetArrayParamsFromObject,
+ * which also returns whether the input data contains any numpy.NA
+ * values.
+ *
+ * This isn't exposed in the public API.
+ */
+NPY_NO_EXPORT int
+PyArray_GetArrayParamsFromObjectEx(PyObject *op,
+                        PyArray_Descr *requested_dtype,
+                        npy_bool writeable,
+                        PyArray_Descr **out_dtype,
+                        int *out_ndim, npy_intp *out_dims,
+                        int *out_contains_na,
+                        PyArrayObject **out_arr, PyObject *context);
+
+/* Returns 1 if the arrays have overlapping data, 0 otherwise */
+NPY_NO_EXPORT int
+_arrays_overlap(PyArrayObject *arr1, PyArrayObject *arr2);
+
+/*
+ * Calls arr_of_subclass.__array_wrap__(towrap), in order to make 'towrap'
+ * have the same ndarray subclass as 'arr_of_subclass'.
+ */
+NPY_NO_EXPORT PyArrayObject *
+PyArray_SubclassWrap(PyArrayObject *arr_of_subclass, PyArrayObject *towrap);
+
+
 #endif
diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c
index cc4fc9d63..4111070f9 100644
--- a/numpy/core/src/multiarray/datetime.c
+++ b/numpy/core/src/multiarray/datetime.c
@@ -24,6 +24,7 @@
 #include "methods.h"
 #include "_datetime.h"
 #include "datetime_strings.h"
+#include "na_object.h"
 
 /*
  * Imports the PyDateTime functions so we can create these objects.
@@ -2655,6 +2656,13 @@ convert_pyobject_to_datetime(PyArray_DatetimeMetaData *meta, PyObject *obj,
         *out = NPY_DATETIME_NAT;
         return 0;
     }
+    /* Check for NA */
+    else if (NpyNA_Check(obj) || NpyNA_IsZeroDimArrayNA(obj)) {
+        PyErr_SetString(PyExc_ValueError,
+                    "Cannot assign NA to an array which "
+                    "does not support NAs");
+        return -1;
+    }
     else {
         PyErr_SetString(PyExc_ValueError,
                 "Could not convert object to NumPy datetime");
@@ -2916,6 +2924,13 @@ convert_pyobject_to_timedelta(PyArray_DatetimeMetaData *meta, PyObject *obj,
         *out = NPY_DATETIME_NAT;
         return 0;
     }
+    /* Check for NA */
+    else if (NpyNA_Check(obj) || NpyNA_IsZeroDimArrayNA(obj)) {
+        PyErr_SetString(PyExc_ValueError,
+                    "Cannot assign NA to an array which "
+                    "does not support NAs");
+        return -1;
+    }
     else {
         PyErr_SetString(PyExc_ValueError,
                 "Could not convert object to NumPy timedelta");
diff --git a/numpy/core/src/multiarray/datetime_strings.c b/numpy/core/src/multiarray/datetime_strings.c
index 423528b72..39d2372f2 100644
--- a/numpy/core/src/multiarray/datetime_strings.c
+++ b/numpy/core/src/multiarray/datetime_strings.c
@@ -1373,7 +1373,7 @@ array_datetime_as_string(PyObject *NPY_UNUSED(self), PyObject *args,
         }
         Py_DECREF(strobj);
 
-        if (!can_cast_datetime64_units(meta->base, unit, casting)) {
+        if (unit != -1 && !can_cast_datetime64_units(meta->base, unit, casting)) {
             PyErr_Format(PyExc_TypeError, "Cannot create a datetime "
                         "string as units '%s' from a NumPy datetime "
                         "with units '%s' according to the rule %s",
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index 4f832bd12..7e7daa724 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -120,6 +120,7 @@ static int
 _check_for_commastring(char *type, Py_ssize_t len)
 {
     Py_ssize_t i;
+    int sqbracket;
 
     /* Check for ints at start of string */
     if ((type[0] >= '0'
@@ -140,10 +141,21 @@ _check_for_commastring(char *type, Py_ssize_t len)
                 && type[2] == ')'))) {
         return 1;
     }
-    /* Check for presence of commas */
+    /* Check for presence of commas outside square [] brackets */
+    sqbracket = 0;
     for (i = 1; i < len; i++) {
-        if (type[i] == ',') {
-            return 1;
+        switch (type[i]) {
+            case ',':
+                if (sqbracket == 0) {
+                    return 1;
+                }
+                break;
+            case '[':
+                ++sqbracket;
+                break;
+            case ']':
+                --sqbracket;
+                break;
         }
     }
     return 0;
@@ -582,6 +594,9 @@ _convert_from_list(PyObject *obj, int align)
  * found in the _internal.py file patterned after that one -- the approach is
  * to try to convert to a list (with tuples if any repeat information is
  * present) and then call the _convert_from_list)
+ *
+ * TODO: Calling Python from C like this in critical-path code is not
+ *       a good idea. This should all be converted to C code.
  */
 static PyArray_Descr *
 _convert_from_commastring(PyObject *obj, int align)
@@ -714,13 +729,12 @@ validate_object_field_overlap(PyArray_Descr *dtype)
     PyObject *names, *fields, *key, *tup, *title;
     Py_ssize_t i, j, names_size;
     PyArray_Descr *fld_dtype, *fld2_dtype;
-    int fld_offset, fld2_offset, align;
+    int fld_offset, fld2_offset;
 
     /* Get some properties from the dtype */
     names = dtype->names;
     names_size = PyTuple_GET_SIZE(names);
     fields = dtype->fields;
-    align = PyDataType_FLAGCHK(dtype, NPY_ALIGNED_STRUCT);
 
     for (i = 0; i < names_size; ++i) {
         key = PyTuple_GET_ITEM(names, i);
@@ -875,16 +889,16 @@ _convert_from_dict(PyObject *obj, int align)
 
     totalsize = 0;
     for (i = 0; i < n; i++) {
-        PyObject *tup, *descr, *index, *title, *name, *off;
+        PyObject *tup, *descr, *ind, *title, *name, *off;
         int len, ret, _align = 1;
         PyArray_Descr *newdescr;
 
         /* Build item to insert (descr, offset, [title])*/
         len = 2;
         title = NULL;
-        index = PyInt_FromLong(i);
+        ind = PyInt_FromLong(i);
         if (titles) {
-            title=PyObject_GetItem(titles, index);
+            title=PyObject_GetItem(titles, ind);
             if (title && title != Py_None) {
                 len = 3;
             }
@@ -894,7 +908,7 @@ _convert_from_dict(PyObject *obj, int align)
             PyErr_Clear();
         }
         tup = PyTuple_New(len);
-        descr = PyObject_GetItem(descrs, index);
+        descr = PyObject_GetItem(descrs, ind);
         if (!descr) {
             goto fail;
         }
@@ -907,7 +921,7 @@ _convert_from_dict(PyObject *obj, int align)
         Py_DECREF(descr);
         if (ret == PY_FAIL) {
             Py_DECREF(tup);
-            Py_DECREF(index);
+            Py_DECREF(ind);
             goto fail;
         }
         PyTuple_SET_ITEM(tup, 0, (PyObject *)newdescr);
@@ -917,7 +931,7 @@ _convert_from_dict(PyObject *obj, int align)
         }
         if (offsets) {
             long offset;
-            off = PyObject_GetItem(offsets, index);
+            off = PyObject_GetItem(offsets, ind);
             if (!off) {
                 goto fail;
             }
@@ -950,11 +964,11 @@ _convert_from_dict(PyObject *obj, int align)
         if (len == 3) {
             PyTuple_SET_ITEM(tup, 2, title);
         }
-        name = PyObject_GetItem(names, index);
+        name = PyObject_GetItem(names, ind);
         if (!name) {
             goto fail;
         }
-        Py_DECREF(index);
+        Py_DECREF(ind);
 #if defined(NPY_PY3K)
         if (!PyUString_Check(name)) {
 #else
@@ -2765,15 +2779,15 @@ arraydescr_struct_list_str(PyArray_Descr *dtype)
         }
         /* Special case subarray handling here */
         if (PyDataType_HASSUBARRAY(fld_dtype)) {
-            tmp = arraydescr_short_construction_repr(
-                            fld_dtype->subarray->base, 0);
+            tmp = arraydescr_construction_repr(
+                            fld_dtype->subarray->base, 0, 1);
             PyUString_ConcatAndDel(&ret, tmp);
             PyUString_ConcatAndDel(&ret, PyUString_FromString(", "));
             PyUString_ConcatAndDel(&ret,
                             PyObject_Str(fld_dtype->subarray->shape));
         }
         else {
-            tmp = arraydescr_short_construction_repr(fld_dtype, 0);
+            tmp = arraydescr_construction_repr(fld_dtype, 0, 1);
             PyUString_ConcatAndDel(&ret, tmp);
         }
         PyUString_ConcatAndDel(&ret, PyUString_FromString(")"));
@@ -2831,7 +2845,7 @@ arraydescr_struct_dict_str(PyArray_Descr *dtype, int includealignedflag)
         if (title != NULL && title != Py_None) {
             has_titles = 1;
         }
-        tmp = arraydescr_short_construction_repr(fld_dtype, 0);
+        tmp = arraydescr_construction_repr(fld_dtype, 0, 1);
         PyUString_ConcatAndDel(&ret, tmp);
         if (i != names_size - 1) {
             PyUString_ConcatAndDel(&ret, PyUString_FromString(","));
@@ -2915,7 +2929,7 @@ arraydescr_subarray_str(PyArray_Descr *dtype)
     PyObject *p, *ret;
 
     ret = PyUString_FromString("(");
-    p = arraydescr_short_construction_repr(dtype->subarray->base, 0);
+    p = arraydescr_construction_repr(dtype->subarray->base, 0, 1);
     PyUString_ConcatAndDel(&ret, p);
     PyUString_ConcatAndDel(&ret, PyUString_FromString(", "));
     PyUString_ConcatAndDel(&ret, PyObject_Str(dtype->subarray->shape));
@@ -2969,26 +2983,10 @@ arraydescr_struct_repr(PyArray_Descr *dtype)
     return s;
 }
 
-/*
- * This creates a shorter repr using 'kind' and 'itemsize',
- * instead of the longer type name. This is the object passed
- * as the first parameter to the dtype constructor, and if no
- * additional constructor parameters are given, will reproduce
- * the exact memory layout.
- *
- * If 'includealignflag' is true, this includes the 'align=True' parameter
- * inside the struct dtype construction dict when needed. Use this flag
- * if you want a proper repr string without the 'dtype()' part around it.
- *
- * If 'includealignflag' is false, this does not preserve the
- * 'align=True' parameter or sticky NPY_ALIGNED_STRUCT flag for
- * struct arrays like the regular repr does, because the 'align'
- * flag is not part of first dtype constructor parameter. This
- * mode is intended for a full 'repr', where the 'align=True' is
- * provided as the second parameter.
- */
+/* See descriptor.h for documentation */
 NPY_NO_EXPORT PyObject *
-arraydescr_short_construction_repr(PyArray_Descr *dtype, int includealignflag)
+arraydescr_construction_repr(PyArray_Descr *dtype, int includealignflag,
+                                int shortrepr)
 {
     PyObject *ret;
     PyArray_DatetimeMetaData *meta;
@@ -3020,11 +3018,44 @@ arraydescr_short_construction_repr(PyArray_Descr *dtype, int includealignflag)
 
     /* Handle booleans, numbers, and custom dtypes */
     if (dtype->type_num == NPY_BOOL) {
-        return PyUString_FromString("'?'");
+        if (shortrepr) {
+            return PyUString_FromString("'?'");
+        }
+        else {
+            return PyUString_FromString("'bool'");
+        }
     }
     else if (PyTypeNum_ISNUMBER(dtype->type_num)) {
-        return PyUString_FromFormat("'%s%c%d'", byteorder, (int)dtype->kind,
-                                                dtype->elsize);
+        /* Short repr with endianness, like '<f8' */
+        if (shortrepr || (dtype->byteorder != NPY_NATIVE &&
+                          dtype->byteorder != NPY_IGNORE)) {
+            return PyUString_FromFormat("'%s%c%d'", byteorder,
+                                        (int)dtype->kind, dtype->elsize);
+        }
+        /* Longer repr, like 'float64' */
+        else {
+            char *kindstr;
+            switch (dtype->kind) {
+                case 'u':
+                    kindstr = "uint";
+                    break;
+                case 'i':
+                    kindstr = "int";
+                    break;
+                case 'f':
+                    kindstr = "float";
+                    break;
+                case 'c':
+                    kindstr = "complex";
+                    break;
+                default:
+                    PyErr_Format(PyExc_RuntimeError,
+                            "internal dtype repr error, unknown kind '%c'",
+                            (int)dtype->kind);
+                    return NULL;
+            }
+            return PyUString_FromFormat("'%s%d'", kindstr, 8*dtype->elsize);
+        }
     }
     else if (PyTypeNum_ISUSERDEF(dtype->type_num)) {
         char *s = strrchr(dtype->typeobj->tp_name, '.');
@@ -3103,27 +3134,25 @@ arraydescr_short_construction_repr(PyArray_Descr *dtype, int includealignflag)
 static PyObject *
 arraydescr_repr(PyArray_Descr *dtype)
 {
-    PyObject *sub, *s;
+    PyObject *ret, *sub;
 
     if (PyDataType_HASFIELDS(dtype)) {
         return arraydescr_struct_repr(dtype);
     }
     else {
-        s = PyUString_FromString("dtype(");
-        sub = arraydescr_str(dtype);
-        if (sub == NULL) {
-            return NULL;
-        }
-        if (!PyDataType_HASSUBARRAY(dtype)) {
-            PyObject *t=PyUString_FromString("'");
-            PyUString_Concat(&sub, t);
-            PyUString_ConcatAndDel(&t, sub);
-            sub = t;
-        }
-        PyUString_ConcatAndDel(&s, sub);
-        sub = PyUString_FromString(")");
-        PyUString_ConcatAndDel(&s, sub);
-        return s;
+        /*
+         * Build the inner repr first and return early on failure, so a
+         * NULL is never handed to PyUString_ConcatAndDel and the
+         * exception set by the failing call is preserved.
+         */
+        sub = arraydescr_construction_repr(dtype, 1, 0);
+        if (sub == NULL) {
+            return NULL;
+        }
+        ret = PyUString_FromString("dtype(");
+        PyUString_ConcatAndDel(&ret, sub);
+        PyUString_ConcatAndDel(&ret, PyUString_FromString(")"));
+        return ret;
     }
 }
 
diff --git a/numpy/core/src/multiarray/descriptor.h b/numpy/core/src/multiarray/descriptor.h
index d936d0b31..01a778954 100644
--- a/numpy/core/src/multiarray/descriptor.h
+++ b/numpy/core/src/multiarray/descriptor.h
@@ -11,11 +11,15 @@ NPY_NO_EXPORT PyArray_Descr *
 _arraydescr_fromobj(PyObject *obj);
 
 /*
- * This creates a shorter repr using 'kind' and 'itemsize',
- * instead of the longer type name. This is the object passed
- * as the first parameter to the dtype constructor, and if no
- * additional constructor parameters are given, will reproduce
- * the exact memory layout.
+ * Creates a string repr of the dtype, excluding the 'dtype()' part
+ * surrounding the object. The object this string represents may be a
+ * string, a list, or a dict depending on the nature of the dtype. It
+ * is the object passed as the first parameter to the dtype
+ * constructor, and if no additional constructor parameters are
+ * given, will reproduce the exact memory layout.
+ *
+ * If 'shortrepr' is non-zero, this creates a shorter repr using
+ * 'kind' and 'itemsize', instead of the longer type name.
  *
  * If 'includealignflag' is true, this includes the 'align=True' parameter
  * inside the struct dtype construction dict when needed. Use this flag
@@ -29,7 +33,8 @@ _arraydescr_fromobj(PyObject *obj);
  * provided as the second parameter.
  */
 NPY_NO_EXPORT PyObject *
-arraydescr_short_construction_repr(PyArray_Descr *dtype, int includealignflag);
+arraydescr_construction_repr(PyArray_Descr *dtype, int includealignflag,
+                                int shortrepr);
 
 #ifdef NPY_ENABLE_SEPARATE_COMPILATION
 extern NPY_NO_EXPORT char *_datetime_strings[];
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
index ce688efd5..943859ae5 100644
--- a/numpy/core/src/multiarray/dtype_transfer.c
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -25,6 +25,7 @@
 #include "_datetime.h"
 #include "datetime_strings.h"
 
+#include "shape.h"
 #include "lowlevel_strided_loops.h"
 
 #define NPY_LOWLEVEL_BUFFER_BLOCKSIZE  128
@@ -56,7 +57,7 @@ static int
 get_decsrcref_transfer_function(int aligned,
                             npy_intp src_stride,
                             PyArray_Descr *src_dtype,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api);
 
@@ -69,7 +70,7 @@ static int
 get_setdstzero_transfer_function(int aligned,
                             npy_intp dst_stride,
                             PyArray_Descr *dst_dtype,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api);
 
@@ -80,7 +81,7 @@ get_setdstzero_transfer_function(int aligned,
  */
 NPY_NO_EXPORT int
 get_bool_setdstone_transfer_function(npy_intp dst_stride,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *NPY_UNUSED(out_needs_api));
 
@@ -212,7 +213,7 @@ NPY_NO_EXPORT int
 PyArray_GetStridedZeroPadCopyFn(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             npy_intp src_itemsize, npy_intp dst_itemsize,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata)
 {
     if (src_itemsize == dst_itemsize) {
@@ -249,7 +250,7 @@ PyArray_GetStridedZeroPadCopyFn(int aligned,
 /* Wraps a transfer function + data in alignment code */
 typedef struct {
     NpyAuxData base;
-    PyArray_StridedTransferFn *wrapped,
+    PyArray_StridedUnaryOp *wrapped,
                 *tobuffer, *frombuffer;
     NpyAuxData *wrappeddata, *todata, *fromdata;
     npy_intp src_itemsize, dst_itemsize;
@@ -324,7 +325,7 @@ _strided_to_strided_contig_align_wrap(char *dst, npy_intp dst_stride,
                         NpyAuxData *data)
 {
     _align_wrap_data *d = (_align_wrap_data *)data;
-    PyArray_StridedTransferFn *wrapped = d->wrapped,
+    PyArray_StridedUnaryOp *wrapped = d->wrapped,
             *tobuffer = d->tobuffer,
             *frombuffer = d->frombuffer;
     npy_intp inner_src_itemsize = d->src_itemsize,
@@ -368,7 +369,7 @@ _strided_to_strided_contig_align_wrap_init_dest(char *dst, npy_intp dst_stride,
                         NpyAuxData *data)
 {
     _align_wrap_data *d = (_align_wrap_data *)data;
-    PyArray_StridedTransferFn *wrapped = d->wrapped,
+    PyArray_StridedUnaryOp *wrapped = d->wrapped,
             *tobuffer = d->tobuffer,
             *frombuffer = d->frombuffer;
     npy_intp inner_src_itemsize = d->src_itemsize,
@@ -425,11 +426,11 @@ _strided_to_strided_contig_align_wrap_init_dest(char *dst, npy_intp dst_stride,
 NPY_NO_EXPORT int
 wrap_aligned_contig_transfer_function(
             npy_intp src_itemsize, npy_intp dst_itemsize,
-            PyArray_StridedTransferFn *tobuffer, NpyAuxData *todata,
-            PyArray_StridedTransferFn *frombuffer, NpyAuxData *fromdata,
-            PyArray_StridedTransferFn *wrapped, NpyAuxData *wrappeddata,
+            PyArray_StridedUnaryOp *tobuffer, NpyAuxData *todata,
+            PyArray_StridedUnaryOp *frombuffer, NpyAuxData *fromdata,
+            PyArray_StridedUnaryOp *wrapped, NpyAuxData *wrappeddata,
             int init_dest,
-            PyArray_StridedTransferFn **out_stransfer,
+            PyArray_StridedUnaryOp **out_stransfer,
             NpyAuxData **out_transferdata)
 {
     _align_wrap_data *data;
@@ -523,7 +524,7 @@ wrap_copy_swap_function(int aligned,
                 npy_intp src_stride, npy_intp dst_stride,
                 PyArray_Descr *dtype,
                 int should_swap,
-                PyArray_StridedTransferFn **out_stransfer,
+                PyArray_StridedUnaryOp **out_stransfer,
                 NpyAuxData **out_transferdata)
 {
     _wrap_copy_swap_data *data;
@@ -654,7 +655,7 @@ static int
 get_nbo_cast_numeric_transfer_function(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             int src_type_num, int dst_type_num,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata)
 {
     /* Emit a warning if complex imaginary is being cast away */
@@ -942,7 +943,7 @@ static int
 get_nbo_cast_datetime_transfer_function(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata)
 {
     PyArray_DatetimeMetaData *src_meta, *dst_meta;
@@ -1018,7 +1019,7 @@ static int
 get_nbo_datetime_to_string_transfer_function(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata)
 {
     PyArray_DatetimeMetaData *src_meta;
@@ -1063,12 +1064,12 @@ static int
 get_datetime_to_unicode_transfer_function(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api)
 {
     NpyAuxData *castdata = NULL, *todata = NULL, *fromdata = NULL;
-    PyArray_StridedTransferFn *caststransfer, *tobuffer, *frombuffer;
+    PyArray_StridedUnaryOp *caststransfer, *tobuffer, *frombuffer;
     PyArray_Descr *str_dtype;
 
     /* Get an ASCII string data type, adapted to match the UNICODE one */
@@ -1133,7 +1134,7 @@ static int
 get_nbo_string_to_datetime_transfer_function(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata)
 {
     PyArray_DatetimeMetaData *dst_meta;
@@ -1185,12 +1186,12 @@ static int
 get_unicode_to_datetime_transfer_function(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api)
 {
     NpyAuxData *castdata = NULL, *todata = NULL, *fromdata = NULL;
-    PyArray_StridedTransferFn *caststransfer, *tobuffer, *frombuffer;
+    PyArray_StridedUnaryOp *caststransfer, *tobuffer, *frombuffer;
     PyArray_Descr *str_dtype;
 
     /* Get an ASCII string data type, adapted to match the UNICODE one */
@@ -1257,7 +1258,7 @@ get_nbo_cast_transfer_function(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
                             int move_references,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api,
                             int *out_needs_wrap)
@@ -1464,11 +1465,11 @@ get_cast_transfer_function(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
                             int move_references,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api)
 {
-    PyArray_StridedTransferFn *caststransfer;
+    PyArray_StridedUnaryOp *caststransfer;
     NpyAuxData *castdata, *todata = NULL, *fromdata = NULL;
     int needs_wrap = 0;
     npy_intp src_itemsize = src_dtype->elsize,
@@ -1505,7 +1506,7 @@ get_cast_transfer_function(int aligned,
     }
     /* Otherwise, we have to copy and/or swap to aligned temporaries */
     else {
-        PyArray_StridedTransferFn *tobuffer, *frombuffer;
+        PyArray_StridedUnaryOp *tobuffer, *frombuffer;
 
         /* Get the copy/swap operation from src */
         PyArray_GetDTypeCopySwapFn(aligned,
@@ -1552,11 +1553,11 @@ get_cast_transfer_function(int aligned,
 /* Copies 1 element to N contiguous elements */
 typedef struct {
     NpyAuxData base;
-    PyArray_StridedTransferFn *stransfer;
+    PyArray_StridedUnaryOp *stransfer;
     NpyAuxData *data;
     npy_intp N, dst_itemsize;
     /* If this is non-NULL the source type has references needing a decref */
-    PyArray_StridedTransferFn *stransfer_finish_src;
+    PyArray_StridedUnaryOp *stransfer_finish_src;
     NpyAuxData *data_finish_src;
 } _one_to_n_data;
 
@@ -1607,7 +1608,7 @@ _strided_to_strided_one_to_n(char *dst, npy_intp dst_stride,
                         NpyAuxData *data)
 {
     _one_to_n_data *d = (_one_to_n_data *)data;
-    PyArray_StridedTransferFn *subtransfer = d->stransfer;
+    PyArray_StridedUnaryOp *subtransfer = d->stransfer;
     NpyAuxData *subdata = d->data;
     npy_intp subN = d->N, dst_itemsize = d->dst_itemsize;
 
@@ -1630,7 +1631,7 @@ _strided_to_strided_one_to_n_with_finish(char *dst, npy_intp dst_stride,
                         NpyAuxData *data)
 {
     _one_to_n_data *d = (_one_to_n_data *)data;
-    PyArray_StridedTransferFn *subtransfer = d->stransfer,
+    PyArray_StridedUnaryOp *subtransfer = d->stransfer,
                 *stransfer_finish_src = d->stransfer_finish_src;
-    NpyAuxData *subdata = d->data, *data_finish_src = data_finish_src;
+    NpyAuxData *subdata = d->data, *data_finish_src = d->data_finish_src;
     npy_intp subN = d->N, dst_itemsize = d->dst_itemsize;
@@ -1661,13 +1662,13 @@ _strided_to_strided_one_to_n_with_finish(char *dst, npy_intp dst_stride,
  */
 static int
 wrap_transfer_function_one_to_n(
-                            PyArray_StridedTransferFn *stransfer_inner,
+                            PyArray_StridedUnaryOp *stransfer_inner,
                             NpyAuxData *data_inner,
-                            PyArray_StridedTransferFn *stransfer_finish_src,
+                            PyArray_StridedUnaryOp *stransfer_finish_src,
                             NpyAuxData *data_finish_src,
                             npy_intp dst_itemsize,
                             npy_intp N,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata)
 {
     _one_to_n_data *data;
@@ -1705,11 +1706,11 @@ get_one_to_n_transfer_function(int aligned,
                             PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
                             int move_references,
                             npy_intp N,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api)
 {
-    PyArray_StridedTransferFn *stransfer, *stransfer_finish_src = NULL;
+    PyArray_StridedUnaryOp *stransfer, *stransfer_finish_src = NULL;
     NpyAuxData *data, *data_finish_src = NULL;
 
     /*
@@ -1758,7 +1759,7 @@ get_one_to_n_transfer_function(int aligned,
 /* Copies N contiguous elements to N contiguous elements */
 typedef struct {
     NpyAuxData base;
-    PyArray_StridedTransferFn *stransfer;
+    PyArray_StridedUnaryOp *stransfer;
     NpyAuxData *data;
     npy_intp N, src_itemsize, dst_itemsize;
 } _n_to_n_data;
@@ -1801,7 +1802,7 @@ _strided_to_strided_n_to_n(char *dst, npy_intp dst_stride,
                         NpyAuxData *data)
 {
     _n_to_n_data *d = (_n_to_n_data *)data;
-    PyArray_StridedTransferFn *subtransfer = d->stransfer;
+    PyArray_StridedUnaryOp *subtransfer = d->stransfer;
     NpyAuxData *subdata = d->data;
     npy_intp subN = d->N, src_subitemsize = d->src_itemsize,
                 dst_subitemsize = d->dst_itemsize;
@@ -1825,7 +1826,7 @@ _contig_to_contig_n_to_n(char *dst, npy_intp NPY_UNUSED(dst_stride),
                         NpyAuxData *data)
 {
     _n_to_n_data *d = (_n_to_n_data *)data;
-    PyArray_StridedTransferFn *subtransfer = d->stransfer;
+    PyArray_StridedUnaryOp *subtransfer = d->stransfer;
     NpyAuxData *subdata = d->data;
     npy_intp subN = d->N, src_subitemsize = d->src_itemsize,
                 dst_subitemsize = d->dst_itemsize;
@@ -1842,12 +1843,12 @@ _contig_to_contig_n_to_n(char *dst, npy_intp NPY_UNUSED(dst_stride),
  */
 static int
 wrap_transfer_function_n_to_n(
-                            PyArray_StridedTransferFn *stransfer_inner,
+                            PyArray_StridedUnaryOp *stransfer_inner,
                             NpyAuxData *data_inner,
                             npy_intp src_stride, npy_intp dst_stride,
                             npy_intp src_itemsize, npy_intp dst_itemsize,
                             npy_intp N,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata)
 {
     _n_to_n_data *data;
@@ -1888,11 +1889,11 @@ get_n_to_n_transfer_function(int aligned,
                             PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
                             int move_references,
                             npy_intp N,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api)
 {
-    PyArray_StridedTransferFn *stransfer;
+    PyArray_StridedUnaryOp *stransfer;
     NpyAuxData *data;
 
     /*
@@ -1930,12 +1931,12 @@ typedef struct {
 /* Copies element with subarray broadcasting */
 typedef struct {
     NpyAuxData base;
-    PyArray_StridedTransferFn *stransfer;
+    PyArray_StridedUnaryOp *stransfer;
     NpyAuxData *data;
     npy_intp src_N, dst_N, src_itemsize, dst_itemsize;
-    PyArray_StridedTransferFn *stransfer_decsrcref;
+    PyArray_StridedUnaryOp *stransfer_decsrcref;
     NpyAuxData *data_decsrcref;
-    PyArray_StridedTransferFn *stransfer_decdstref;
+    PyArray_StridedUnaryOp *stransfer_decdstref;
     NpyAuxData *data_decdstref;
     /* This gets a run-length encoded representation of the transfer */
     npy_intp run_count;
@@ -2003,7 +2004,7 @@ _strided_to_strided_subarray_broadcast(char *dst, npy_intp dst_stride,
                         NpyAuxData *data)
 {
     _subarray_broadcast_data *d = (_subarray_broadcast_data *)data;
-    PyArray_StridedTransferFn *subtransfer = d->stransfer;
+    PyArray_StridedUnaryOp *subtransfer = d->stransfer;
     NpyAuxData *subdata = d->data;
     npy_intp run, run_count = d->run_count,
             src_subitemsize = d->src_itemsize,
@@ -2044,11 +2045,11 @@ _strided_to_strided_subarray_broadcast_withrefs(char *dst, npy_intp dst_stride,
                         NpyAuxData *data)
 {
     _subarray_broadcast_data *d = (_subarray_broadcast_data *)data;
-    PyArray_StridedTransferFn *subtransfer = d->stransfer;
+    PyArray_StridedUnaryOp *subtransfer = d->stransfer;
     NpyAuxData *subdata = d->data;
-    PyArray_StridedTransferFn *stransfer_decsrcref = d->stransfer_decsrcref;
+    PyArray_StridedUnaryOp *stransfer_decsrcref = d->stransfer_decsrcref;
     NpyAuxData *data_decsrcref = d->data_decsrcref;
-    PyArray_StridedTransferFn *stransfer_decdstref = d->stransfer_decdstref;
+    PyArray_StridedUnaryOp *stransfer_decdstref = d->stransfer_decdstref;
     NpyAuxData *data_decdstref = d->data_decdstref;
     npy_intp run, run_count = d->run_count,
             src_subitemsize = d->src_itemsize,
@@ -2101,7 +2102,7 @@ get_subarray_broadcast_transfer_function(int aligned,
                             npy_intp src_size, npy_intp dst_size,
                             PyArray_Dims src_shape, PyArray_Dims dst_shape,
                             int move_references,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api)
 {
@@ -2288,7 +2289,7 @@ get_subarray_transfer_function(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
                             int move_references,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api)
 {
@@ -2385,7 +2386,7 @@ get_subarray_transfer_function(int aligned,
 /**************************** COPY FIELDS *******************************/
 typedef struct {
     npy_intp src_offset, dst_offset, src_itemsize;
-    PyArray_StridedTransferFn *stransfer;
+    PyArray_StridedUnaryOp *stransfer;
     NpyAuxData *data;
 } _single_field_transfer;
 
@@ -2496,7 +2497,7 @@ get_fields_transfer_function(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
                             int move_references,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api)
 {
@@ -2868,7 +2869,7 @@ static int
 get_decsrcref_fields_transfer_function(int aligned,
                             npy_intp src_stride,
                             PyArray_Descr *src_dtype,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api)
 {
@@ -2939,7 +2940,7 @@ static int
 get_setdestzero_fields_transfer_function(int aligned,
                             npy_intp dst_stride,
                             PyArray_Descr *dst_dtype,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api)
 {
@@ -3004,11 +3005,11 @@ get_setdestzero_fields_transfer_function(int aligned,
 typedef struct {
     NpyAuxData base;
     /* The transfer function being wrapped */
-    PyArray_StridedTransferFn *stransfer;
+    PyArray_StridedUnaryOp *stransfer;
     NpyAuxData *transferdata;
 
     /* The src decref function if necessary */
-    PyArray_StridedTransferFn *decsrcref_stransfer;
+    PyArray_StridedUnaryOp *decsrcref_stransfer;
     NpyAuxData *decsrcref_transferdata;
 } _masked_wrapper_transfer_data;
 
@@ -3066,7 +3067,7 @@ void _strided_masked_wrapper_decsrcref_transfer_function(
     _masked_wrapper_transfer_data *d =
                         (_masked_wrapper_transfer_data *)transferdata;
     npy_intp subloopsize;
-    PyArray_StridedTransferFn *unmasked_stransfer, *decsrcref_stransfer;
+    PyArray_StridedUnaryOp *unmasked_stransfer, *decsrcref_stransfer;
     NpyAuxData *unmasked_transferdata, *decsrcref_transferdata;
 
     unmasked_stransfer = d->stransfer;
@@ -3077,7 +3078,7 @@ void _strided_masked_wrapper_decsrcref_transfer_function(
     while (N > 0) {
         /* Skip masked values, still calling decsrcref for move_references */
         subloopsize = 0;
-        while (subloopsize < N && !NpyMask_IsExposed(*mask)) {
+        while (subloopsize < N && !NpyMaskValue_IsExposed(*mask)) {
             ++subloopsize;
             mask += mask_stride;
         }
@@ -3088,7 +3089,7 @@ void _strided_masked_wrapper_decsrcref_transfer_function(
         N -= subloopsize;
         /* Process unmasked values */
         subloopsize = 0;
-        while (subloopsize < N && NpyMask_IsExposed(*mask)) {
+        while (subloopsize < N && NpyMaskValue_IsExposed(*mask)) {
             ++subloopsize;
             mask += mask_stride;
         }
@@ -3111,7 +3112,7 @@ void _strided_masked_wrapper_transfer_function(
     _masked_wrapper_transfer_data *d =
                             (_masked_wrapper_transfer_data *)transferdata;
     npy_intp subloopsize;
-    PyArray_StridedTransferFn *unmasked_stransfer;
+    PyArray_StridedUnaryOp *unmasked_stransfer;
     NpyAuxData *unmasked_transferdata;
 
     unmasked_stransfer = d->stransfer;
@@ -3120,7 +3121,7 @@ void _strided_masked_wrapper_transfer_function(
     while (N > 0) {
         /* Skip masked values */
         subloopsize = 0;
-        while (subloopsize < N && !NpyMask_IsExposed(*mask)) {
+        while (subloopsize < N && !NpyMaskValue_IsExposed(*mask)) {
             ++subloopsize;
             mask += mask_stride;
         }
@@ -3129,7 +3130,7 @@ void _strided_masked_wrapper_transfer_function(
         N -= subloopsize;
         /* Process unmasked values */
         subloopsize = 0;
-        while (subloopsize < N && NpyMask_IsExposed(*mask)) {
+        while (subloopsize < N && NpyMaskValue_IsExposed(*mask)) {
             ++subloopsize;
             mask += mask_stride;
         }
@@ -3176,7 +3177,7 @@ _null_to_contig_set_bool_one(char *dst,
 /* Only for the bool type, sets the destination to 1 */
 NPY_NO_EXPORT int
 get_bool_setdstone_transfer_function(npy_intp dst_stride,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *NPY_UNUSED(out_needs_api))
 {
@@ -3273,7 +3274,7 @@ NPY_NO_EXPORT int
 get_setdstzero_transfer_function(int aligned,
                             npy_intp dst_stride,
                             PyArray_Descr *dst_dtype,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api)
 {
@@ -3313,7 +3314,7 @@ get_setdstzero_transfer_function(int aligned,
     else if (PyDataType_HASSUBARRAY(dst_dtype)) {
         PyArray_Dims dst_shape = {NULL, -1};
         npy_intp dst_size = 1;
-        PyArray_StridedTransferFn *contig_stransfer;
+        PyArray_StridedUnaryOp *contig_stransfer;
         NpyAuxData *contig_data;
 
         if (out_needs_api) {
@@ -3400,7 +3401,7 @@ NPY_NO_EXPORT int
 get_decsrcref_transfer_function(int aligned,
                             npy_intp src_stride,
                             PyArray_Descr *src_dtype,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api)
 {
@@ -3426,7 +3427,7 @@ get_decsrcref_transfer_function(int aligned,
     else if (PyDataType_HASSUBARRAY(src_dtype)) {
         PyArray_Dims src_shape = {NULL, -1};
         npy_intp src_size = 1;
-        PyArray_StridedTransferFn *stransfer;
+        PyArray_StridedUnaryOp *stransfer;
         NpyAuxData *data;
 
         if (out_needs_api) {
@@ -3482,7 +3483,7 @@ NPY_NO_EXPORT int
 PyArray_GetDTypeCopySwapFn(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             PyArray_Descr *dtype,
-                            PyArray_StridedTransferFn **outstransfer,
+                            PyArray_StridedUnaryOp **outstransfer,
                             NpyAuxData **outtransferdata)
 {
     npy_intp itemsize = dtype->elsize;
@@ -3528,7 +3529,7 @@ PyArray_GetDTypeTransferFunction(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
                             int move_references,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api)
 {
@@ -3754,11 +3755,11 @@ PyArray_GetMaskedDTypeTransferFunction(int aligned,
                             PyArray_Descr *dst_dtype,
                             PyArray_Descr *mask_dtype,
                             int move_references,
-                            PyArray_MaskedStridedTransferFn **out_stransfer,
+                            PyArray_MaskedStridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api)
 {
-    PyArray_StridedTransferFn *stransfer = NULL;
+    PyArray_StridedUnaryOp *stransfer = NULL;
     NpyAuxData *transferdata = NULL;
     _masked_wrapper_transfer_data *data;
 
@@ -3830,7 +3831,7 @@ PyArray_CastRawArrays(npy_intp count,
                       PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
                       int move_references)
 {
-    PyArray_StridedTransferFn *stransfer = NULL;
+    PyArray_StridedUnaryOp *stransfer = NULL;
     NpyAuxData *transferdata = NULL;
     int aligned = 1, needs_api = 0;
 
@@ -3870,3 +3871,610 @@ PyArray_CastRawArrays(npy_intp count,
     /* If needs_api was set to 1, it may have raised a Python exception */
     return (needs_api && PyErr_Occurred()) ? NPY_FAIL : NPY_SUCCEED;
 }
+
+/*
+ * Prepares shape and strides for a simple raw array iteration.
+ * This sorts the strides into FORTRAN order, reverses any negative
+ * strides (adjusting the data pointer), then coalesces axes where
+ * possible. Results go to the out_* parameters; *out_ndim is >= 1.
+ *
+ * This is intended for simple, lightweight iteration over arrays
+ * where no buffering of any kind is needed, and the array may
+ * not be stored as a PyArrayObject. For example, to iterate over
+ * the NA mask of an array.
+ *
+ * The arrays shape, out_shape, strides, and out_strides must all
+ * point to different data.
+ *
+ * Returns 0 on success, -1 on failure (this code always returns 0).
+ */
+NPY_NO_EXPORT int
+PyArray_PrepareOneRawArrayIter(int ndim, npy_intp *shape,
+                            char *data, npy_intp *strides,
+                            int *out_ndim, npy_intp *out_shape,
+                            char **out_data, npy_intp *out_strides)
+{
+    npy_stride_sort_item strideperm[NPY_MAXDIMS];
+    int i, j;
+
+    /* Special case 0 and 1 dimensions (0-d becomes one axis of length 1) */
+    if (ndim == 0) {
+        *out_ndim = 1;
+        *out_data = data;
+        out_shape[0] = 1;
+        out_strides[0] = 0;
+        return 0;
+    }
+    else if (ndim == 1) {
+        npy_intp stride_entry = strides[0], shape_entry = shape[0];
+        *out_ndim = 1;
+        out_shape[0] = shape[0];
+        /* Always make a positive stride, starting from the other end */
+        if (stride_entry >= 0) {
+            *out_data = data;
+            out_strides[0] = stride_entry;
+        }
+        else {
+            *out_data = data + stride_entry * (shape_entry - 1);
+            out_strides[0] = -stride_entry;
+        }
+        return 0;
+    }
+
+    /* Sort the axes by stride, reading the permutation in reverse order */
+    PyArray_CreateSortedStridePerm(ndim, shape, strides, strideperm);
+    for (i = 0; i < ndim; ++i) {
+        int iperm = strideperm[ndim - i - 1].perm;
+        out_shape[i] = shape[iperm];
+        out_strides[i] = strides[iperm];
+    }
+
+    /* Reverse any negative strides, moving data to the lowest address */
+    for (i = 0; i < ndim; ++i) {
+        npy_intp stride_entry = out_strides[i], shape_entry = out_shape[i];
+
+        if (stride_entry < 0) {
+            data += stride_entry * (shape_entry - 1);
+            out_strides[i] = -stride_entry;
+        }
+        /* Detect 0-size arrays here; report them as one axis of length 0 */
+        if (shape_entry == 0) {
+            *out_ndim = 1;
+            *out_data = data;
+            out_shape[0] = 0;
+            out_strides[0] = 0;
+            return 0;
+        }
+    }
+
+    /* Coalesce any dimensions where possible (i: kept axis, j: scanned) */
+    i = 0;
+    for (j = 1; j < ndim; ++j) {
+        if (out_shape[i] == 1) {
+            /* Drop axis i (length 1), replacing it with axis j */
+            out_shape[i] = out_shape[j];
+            out_strides[i] = out_strides[j];
+        }
+        else if (out_shape[j] == 1) {
+            /* Drop axis j */
+        }
+        else if (out_strides[i] * out_shape[i] == out_strides[j]) {
+            /* Coalesce axes i and j: j starts exactly where i ends */
+            out_shape[i] *= out_shape[j];
+        }
+        else {
+            /* Can't coalesce, go to next i */
+            ++i;
+            out_shape[i] = out_shape[j];
+            out_strides[i] = out_strides[j];
+        }
+    }
+    ndim = i+1;
+
+#if 0
+    /* DEBUG */
+    {
+        printf("raw iter ndim %d\n", ndim);
+        printf("shape: ");
+        for (i = 0; i < ndim; ++i) {
+            printf("%d ", (int)out_shape[i]);
+        }
+        printf("\n");
+        printf("strides: ");
+        for (i = 0; i < ndim; ++i) {
+            printf("%d ", (int)out_strides[i]);
+        }
+        printf("\n");
+    }
+#endif
+
+    *out_data = data;
+    *out_ndim = ndim;
+    return 0;
+}
+
+/*
+ * The same as PyArray_PrepareOneRawArrayIter, but for two
+ * operands instead of one. Any broadcasting of the two operands
+ * should have already been done before calling this function,
+ * as the ndim and shape is only specified once for both operands.
+ *
+ * Only the strides of the first operand (A) are used to reorder and
+ * reverse the dimensions, so out_stridesB can still be negative; the
+ * strides are not considered together as in the NpyIter object.
+ *
+ * You can use this together with NPY_RAW_ITER_START and
+ * NPY_RAW_ITER_TWO_NEXT to handle the looping boilerplate of everything
+ * but the innermost loop (which is for idim == 0).
+ *
+ * Returns 0 on success, -1 on failure (this code always returns 0).
+ */
+NPY_NO_EXPORT int
+PyArray_PrepareTwoRawArrayIter(int ndim, npy_intp *shape,
+                            char *dataA, npy_intp *stridesA,
+                            char *dataB, npy_intp *stridesB,
+                            int *out_ndim, npy_intp *out_shape,
+                            char **out_dataA, npy_intp *out_stridesA,
+                            char **out_dataB, npy_intp *out_stridesB)
+{
+    npy_stride_sort_item strideperm[NPY_MAXDIMS];
+    int i, j;
+
+    /* Special case 0 and 1 dimensions (0-d becomes one axis of length 1) */
+    if (ndim == 0) {
+        *out_ndim = 1;
+        *out_dataA = dataA;
+        *out_dataB = dataB;
+        out_shape[0] = 1;
+        out_stridesA[0] = 0;
+        out_stridesB[0] = 0;
+        return 0;
+    }
+    else if (ndim == 1) {
+        npy_intp stride_entryA = stridesA[0], stride_entryB = stridesB[0];
+        npy_intp shape_entry = shape[0];
+        *out_ndim = 1;
+        out_shape[0] = shape[0];
+        /* Make operand A's stride positive, flipping B along with it */
+        if (stride_entryA >= 0) {
+            *out_dataA = dataA;
+            *out_dataB = dataB;
+            out_stridesA[0] = stride_entryA;
+            out_stridesB[0] = stride_entryB;
+        }
+        else {
+            *out_dataA = dataA + stride_entryA * (shape_entry - 1);
+            *out_dataB = dataB + stride_entryB * (shape_entry - 1);
+            out_stridesA[0] = -stride_entryA;
+            out_stridesB[0] = -stride_entryB;
+        }
+        return 0;
+    }
+
+    /* Sort the axes by operand A's strides, reading the perm in reverse */
+    PyArray_CreateSortedStridePerm(ndim, shape, stridesA, strideperm);
+    for (i = 0; i < ndim; ++i) {
+        int iperm = strideperm[ndim - i - 1].perm;
+        out_shape[i] = shape[iperm];
+        out_stridesA[i] = stridesA[iperm];
+        out_stridesB[i] = stridesB[iperm];
+    }
+
+    /* Reverse any negative strides of operand A (B follows the same flip) */
+    for (i = 0; i < ndim; ++i) {
+        npy_intp stride_entryA = out_stridesA[i];
+        npy_intp stride_entryB = out_stridesB[i];
+        npy_intp shape_entry = out_shape[i];
+
+        if (stride_entryA < 0) {
+            dataA += stride_entryA * (shape_entry - 1);
+            dataB += stride_entryB * (shape_entry - 1);
+            out_stridesA[i] = -stride_entryA;
+            out_stridesB[i] = -stride_entryB;
+        }
+        /* Detect 0-size arrays here; report them as one axis of length 0 */
+        if (shape_entry == 0) {
+            *out_ndim = 1;
+            *out_dataA = dataA;
+            *out_dataB = dataB;
+            out_shape[0] = 0;
+            out_stridesA[0] = 0;
+            out_stridesB[0] = 0;
+            return 0;
+        }
+    }
+
+    /* Coalesce any dimensions where possible (i: kept axis, j: scanned) */
+    i = 0;
+    for (j = 1; j < ndim; ++j) {
+        if (out_shape[i] == 1) {
+            /* Drop axis i (length 1), replacing it with axis j */
+            out_shape[i] = out_shape[j];
+            out_stridesA[i] = out_stridesA[j];
+            out_stridesB[i] = out_stridesB[j];
+        }
+        else if (out_shape[j] == 1) {
+            /* Drop axis j */
+        }
+        else if (out_stridesA[i] * out_shape[i] == out_stridesA[j] &&
+                    out_stridesB[i] * out_shape[i] == out_stridesB[j]) {
+            /* Coalesce axes i and j: valid only if it holds for A and B */
+            out_shape[i] *= out_shape[j];
+        }
+        else {
+            /* Can't coalesce, go to next i */
+            ++i;
+            out_shape[i] = out_shape[j];
+            out_stridesA[i] = out_stridesA[j];
+            out_stridesB[i] = out_stridesB[j];
+        }
+    }
+    ndim = i+1;
+
+    *out_dataA = dataA;
+    *out_dataB = dataB;
+    *out_ndim = ndim;
+    return 0;
+}
+
+/*
+ * The same as PyArray_PrepareOneRawArrayIter, but for three
+ * operands instead of one. Any broadcasting of the three operands
+ * should have already been done before calling this function,
+ * as the ndim and shape is only specified once for all operands.
+ *
+ * Only the strides of the first operand (A) are used to reorder and
+ * reverse the dimensions, so the B and C strides can still be negative;
+ * the strides are not considered together as in the NpyIter object.
+ *
+ * You can use this together with NPY_RAW_ITER_START and
+ * NPY_RAW_ITER_THREE_NEXT to handle the looping boilerplate of everything
+ * but the innermost loop (which is for idim == 0).
+ *
+ * Returns 0 on success, -1 on failure (this code always returns 0).
+ */
+NPY_NO_EXPORT int
+PyArray_PrepareThreeRawArrayIter(int ndim, npy_intp *shape,
+                            char *dataA, npy_intp *stridesA,
+                            char *dataB, npy_intp *stridesB,
+                            char *dataC, npy_intp *stridesC,
+                            int *out_ndim, npy_intp *out_shape,
+                            char **out_dataA, npy_intp *out_stridesA,
+                            char **out_dataB, npy_intp *out_stridesB,
+                            char **out_dataC, npy_intp *out_stridesC)
+{
+    npy_stride_sort_item strideperm[NPY_MAXDIMS];
+    int i, j;
+
+    /* Special case 0 and 1 dimensions (0-d becomes one axis of length 1) */
+    if (ndim == 0) {
+        *out_ndim = 1;
+        *out_dataA = dataA;
+        *out_dataB = dataB;
+        *out_dataC = dataC;
+        out_shape[0] = 1;
+        out_stridesA[0] = 0;
+        out_stridesB[0] = 0;
+        out_stridesC[0] = 0;
+        return 0;
+    }
+    else if (ndim == 1) {
+        npy_intp stride_entryA = stridesA[0];
+        npy_intp stride_entryB = stridesB[0];
+        npy_intp stride_entryC = stridesC[0];
+        npy_intp shape_entry = shape[0];
+        *out_ndim = 1;
+        out_shape[0] = shape[0];
+        /* Make operand A's stride positive, flipping B and C along with it */
+        if (stride_entryA >= 0) {
+            *out_dataA = dataA;
+            *out_dataB = dataB;
+            *out_dataC = dataC;
+            out_stridesA[0] = stride_entryA;
+            out_stridesB[0] = stride_entryB;
+            out_stridesC[0] = stride_entryC;
+        }
+        else {
+            *out_dataA = dataA + stride_entryA * (shape_entry - 1);
+            *out_dataB = dataB + stride_entryB * (shape_entry - 1);
+            *out_dataC = dataC + stride_entryC * (shape_entry - 1);
+            out_stridesA[0] = -stride_entryA;
+            out_stridesB[0] = -stride_entryB;
+            out_stridesC[0] = -stride_entryC;
+        }
+        return 0;
+    }
+
+    /* Sort the axes by operand A's strides, reading the perm in reverse */
+    PyArray_CreateSortedStridePerm(ndim, shape, stridesA, strideperm);
+    for (i = 0; i < ndim; ++i) {
+        int iperm = strideperm[ndim - i - 1].perm;
+        out_shape[i] = shape[iperm];
+        out_stridesA[i] = stridesA[iperm];
+        out_stridesB[i] = stridesB[iperm];
+        out_stridesC[i] = stridesC[iperm];
+    }
+
+    /* Reverse any negative strides of operand A (B, C follow the same flip) */
+    for (i = 0; i < ndim; ++i) {
+        npy_intp stride_entryA = out_stridesA[i];
+        npy_intp stride_entryB = out_stridesB[i];
+        npy_intp stride_entryC = out_stridesC[i];
+        npy_intp shape_entry = out_shape[i];
+
+        if (stride_entryA < 0) {
+            dataA += stride_entryA * (shape_entry - 1);
+            dataB += stride_entryB * (shape_entry - 1);
+            dataC += stride_entryC * (shape_entry - 1);
+            out_stridesA[i] = -stride_entryA;
+            out_stridesB[i] = -stride_entryB;
+            out_stridesC[i] = -stride_entryC;
+        }
+        /* Detect 0-size arrays here; report them as one axis of length 0 */
+        if (shape_entry == 0) {
+            *out_ndim = 1;
+            *out_dataA = dataA;
+            *out_dataB = dataB;
+            *out_dataC = dataC;
+            out_shape[0] = 0;
+            out_stridesA[0] = 0;
+            out_stridesB[0] = 0;
+            out_stridesC[0] = 0;
+            return 0;
+        }
+    }
+
+    /* Coalesce any dimensions where possible (i: kept axis, j: scanned) */
+    i = 0;
+    for (j = 1; j < ndim; ++j) {
+        if (out_shape[i] == 1) {
+            /* Drop axis i (length 1), replacing it with axis j */
+            out_shape[i] = out_shape[j];
+            out_stridesA[i] = out_stridesA[j];
+            out_stridesB[i] = out_stridesB[j];
+            out_stridesC[i] = out_stridesC[j];
+        }
+        else if (out_shape[j] == 1) {
+            /* Drop axis j */
+        }
+        else if (out_stridesA[i] * out_shape[i] == out_stridesA[j] &&
+                    out_stridesB[i] * out_shape[i] == out_stridesB[j] &&
+                    out_stridesC[i] * out_shape[i] == out_stridesC[j]) {
+            /* Coalesce axes i and j: valid only if it holds for A, B and C */
+            out_shape[i] *= out_shape[j];
+        }
+        else {
+            /* Can't coalesce, go to next i */
+            ++i;
+            out_shape[i] = out_shape[j];
+            out_stridesA[i] = out_stridesA[j];
+            out_stridesB[i] = out_stridesB[j];
+            out_stridesC[i] = out_stridesC[j];
+        }
+    }
+    ndim = i+1;
+
+    *out_dataA = dataA;
+    *out_dataB = dataB;
+    *out_dataC = dataC;
+    *out_ndim = ndim;
+    return 0;
+}
+
+/* Four-operand variant; see lowlevel_strided_loops.h for parameter docs. */
+NPY_NO_EXPORT int
+PyArray_PrepareFourRawArrayIter(int ndim, npy_intp *shape,
+                            char *dataA, npy_intp *stridesA,
+                            char *dataB, npy_intp *stridesB,
+                            char *dataC, npy_intp *stridesC,
+                            char *dataD, npy_intp *stridesD,
+                            int *out_ndim, npy_intp *out_shape,
+                            char **out_dataA, npy_intp *out_stridesA,
+                            char **out_dataB, npy_intp *out_stridesB,
+                            char **out_dataC, npy_intp *out_stridesC,
+                            char **out_dataD, npy_intp *out_stridesD)
+{
+    npy_stride_sort_item strideperm[NPY_MAXDIMS];
+    int i, j;
+
+    /* Special case 0 and 1 dimensions (0-d becomes one axis of length 1) */
+    if (ndim == 0) {
+        *out_ndim = 1;
+        *out_dataA = dataA;
+        *out_dataB = dataB;
+        *out_dataC = dataC;
+        *out_dataD = dataD;
+        out_shape[0] = 1;
+        out_stridesA[0] = 0;
+        out_stridesB[0] = 0;
+        out_stridesC[0] = 0;
+        out_stridesD[0] = 0;
+        return 0;
+    }
+    else if (ndim == 1) {
+        npy_intp stride_entryA = stridesA[0];
+        npy_intp stride_entryB = stridesB[0];
+        npy_intp stride_entryC = stridesC[0];
+        npy_intp stride_entryD = stridesD[0];
+        npy_intp shape_entry = shape[0];
+        *out_ndim = 1;
+        out_shape[0] = shape[0];
+        /* Make operand A's stride positive, flipping B, C, D along with it */
+        if (stride_entryA >= 0) {
+            *out_dataA = dataA;
+            *out_dataB = dataB;
+            *out_dataC = dataC;
+            *out_dataD = dataD;
+            out_stridesA[0] = stride_entryA;
+            out_stridesB[0] = stride_entryB;
+            out_stridesC[0] = stride_entryC;
+            out_stridesD[0] = stride_entryD;
+        }
+        else {
+            *out_dataA = dataA + stride_entryA * (shape_entry - 1);
+            *out_dataB = dataB + stride_entryB * (shape_entry - 1);
+            *out_dataC = dataC + stride_entryC * (shape_entry - 1);
+            *out_dataD = dataD + stride_entryD * (shape_entry - 1);
+            out_stridesA[0] = -stride_entryA;
+            out_stridesB[0] = -stride_entryB;
+            out_stridesC[0] = -stride_entryC;
+            out_stridesD[0] = -stride_entryD;
+        }
+        return 0;
+    }
+
+    /* Sort the axes by operand A's strides, reading the perm in reverse */
+    PyArray_CreateSortedStridePerm(ndim, shape, stridesA, strideperm);
+    for (i = 0; i < ndim; ++i) {
+        int iperm = strideperm[ndim - i - 1].perm;
+        out_shape[i] = shape[iperm];
+        out_stridesA[i] = stridesA[iperm];
+        out_stridesB[i] = stridesB[iperm];
+        out_stridesC[i] = stridesC[iperm];
+        out_stridesD[i] = stridesD[iperm];
+    }
+
+    /* Reverse any negative strides of operand A (B, C, D follow the flip) */
+    for (i = 0; i < ndim; ++i) {
+        npy_intp stride_entryA = out_stridesA[i];
+        npy_intp stride_entryB = out_stridesB[i];
+        npy_intp stride_entryC = out_stridesC[i];
+        npy_intp stride_entryD = out_stridesD[i];
+        npy_intp shape_entry = out_shape[i];
+
+        if (stride_entryA < 0) {
+            dataA += stride_entryA * (shape_entry - 1);
+            dataB += stride_entryB * (shape_entry - 1);
+            dataC += stride_entryC * (shape_entry - 1);
+            dataD += stride_entryD * (shape_entry - 1);
+            out_stridesA[i] = -stride_entryA;
+            out_stridesB[i] = -stride_entryB;
+            out_stridesC[i] = -stride_entryC;
+            out_stridesD[i] = -stride_entryD;
+        }
+        /* Detect 0-size arrays here; report them as one axis of length 0 */
+        if (shape_entry == 0) {
+            *out_ndim = 1;
+            *out_dataA = dataA;
+            *out_dataB = dataB;
+            *out_dataC = dataC;
+            *out_dataD = dataD;
+            out_shape[0] = 0;
+            out_stridesA[0] = 0;
+            out_stridesB[0] = 0;
+            out_stridesC[0] = 0;
+            out_stridesD[0] = 0;
+            return 0;
+        }
+    }
+
+    /* Coalesce any dimensions where possible (i: kept axis, j: scanned) */
+    i = 0;
+    for (j = 1; j < ndim; ++j) {
+        if (out_shape[i] == 1) {
+            /* Drop axis i (length 1), replacing it with axis j */
+            out_shape[i] = out_shape[j];
+            out_stridesA[i] = out_stridesA[j];
+            out_stridesB[i] = out_stridesB[j];
+            out_stridesC[i] = out_stridesC[j];
+            out_stridesD[i] = out_stridesD[j];
+        }
+        else if (out_shape[j] == 1) {
+            /* Drop axis j */
+        }
+        else if (out_stridesA[i] * out_shape[i] == out_stridesA[j] &&
+                    out_stridesB[i] * out_shape[i] == out_stridesB[j] &&
+                    out_stridesC[i] * out_shape[i] == out_stridesC[j] &&
+                    out_stridesD[i] * out_shape[i] == out_stridesD[j]) {
+            /* Coalesce axes i and j: valid only if it holds for all four */
+            out_shape[i] *= out_shape[j];
+        }
+        else {
+            /* Can't coalesce, go to next i */
+            ++i;
+            out_shape[i] = out_shape[j];
+            out_stridesA[i] = out_stridesA[j];
+            out_stridesB[i] = out_stridesB[j];
+            out_stridesC[i] = out_stridesC[j];
+            out_stridesD[i] = out_stridesD[j];
+        }
+    }
+    ndim = i+1;
+
+    *out_dataA = dataA;
+    *out_dataB = dataB;
+    *out_dataC = dataC;
+    *out_dataD = dataD;
+    *out_ndim = ndim;
+    return 0;
+}
+
+/*
+ * Casts 'src' into 'dst', two raw n-dimensional arrays sharing 'shape' but
+ * with independent strides and dtypes. Overlap between them is not handled.
+ * 'move_references' is passed through to PyArray_GetDTypeTransferFunction.
+ *
+ * Returns NPY_SUCCEED or NPY_FAIL.
+ */
+NPY_NO_EXPORT int
+PyArray_CastRawNDimArrays(int ndim, npy_intp *shape,
+                      char *src, char *dst,
+                      npy_intp *src_strides, npy_intp *dst_strides,
+                      PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+                      int move_references)
+{
+    PyArray_StridedUnaryOp *stransfer = NULL;
+    NpyAuxData *transferdata = NULL;
+    int idim;
+    npy_intp src_align, dst_align;
+    int aligned, needs_api = 0;
+    npy_intp coord[NPY_MAXDIMS];
+    npy_intp shape_it[NPY_MAXDIMS];
+    npy_intp src_strides_it[NPY_MAXDIMS];
+    npy_intp dst_strides_it[NPY_MAXDIMS];
+
+    /* Determine data alignment from the base pointers and every stride */
+    src_align = (npy_intp)src;
+    for (idim = 0; idim < ndim; ++idim) {
+        src_align |= src_strides[idim];
+    }
+    dst_align = (npy_intp)dst;
+    for (idim = 0; idim < ndim; ++idim) {
+        dst_align |= dst_strides[idim];
+    }
+    aligned = (src_align & (src_dtype->alignment - 1)) == 0 &&
+              (dst_align & (dst_dtype->alignment - 1)) == 0;
+
+    if (PyArray_PrepareTwoRawArrayIter(ndim, shape,
+                                    dst, dst_strides,
+                                    src, src_strides,
+                                    &ndim, shape_it,
+                                    &dst, dst_strides_it,
+                                    &src, src_strides_it) < 0) {
+        return NPY_FAIL;
+    }
+
+    /* Get the casting function for the inner-loop (prepared) strides */
+    if (PyArray_GetDTypeTransferFunction(aligned,
+                        src_strides_it[0], dst_strides_it[0],
+                        src_dtype, dst_dtype,
+                        move_references,
+                        &stransfer, &transferdata,
+                        &needs_api) != NPY_SUCCEED) {
+        return NPY_FAIL;
+    }
+
+    NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+        stransfer(dst, dst_strides_it[0],
+                    src, src_strides_it[0], shape_it[0],
+                    src_dtype->elsize, transferdata);
+    } NPY_RAW_ITER_TWO_NEXT(idim, ndim, coord, shape_it,
+                                src, src_strides_it,
+                                dst, dst_strides_it);
+
+    /* Cleanup */
+    NPY_AUXDATA_FREE(transferdata);
+
+    /* If needs_api was set to 1, it may have raised a Python exception */
+    return (needs_api && PyErr_Occurred()) ? NPY_FAIL : NPY_SUCCEED;
+}
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index 6447e9843..cf4895d80 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -3013,7 +3013,7 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
     /* Initialize the output to all zeros and reset the iterator */
     ret = NpyIter_GetOperandArray(iter)[nop];
     Py_INCREF(ret);
-    PyArray_FillWithZero(ret);
+    PyArray_AssignZero(ret, NULL, 0, NULL);
 
 
     /***************************/
diff --git a/numpy/core/src/multiarray/flagsobject.c b/numpy/core/src/multiarray/flagsobject.c
index 31a7d041e..3de49b74c 100644
--- a/numpy/core/src/multiarray/flagsobject.c
+++ b/numpy/core/src/multiarray/flagsobject.c
@@ -181,14 +181,14 @@ arrayflags_dealloc(PyArrayFlagsObject *self)
 }
 
 
-#define _define_get(UPPER, lower)                                       \
-    static PyObject *                                                   \
-    arrayflags_ ## lower ## _get(PyArrayFlagsObject *self)              \
-    {                                                                   \
-        PyObject *item;                                                 \
+#define _define_get(UPPER, lower) \
+    static PyObject * \
+    arrayflags_ ## lower ## _get(PyArrayFlagsObject *self) \
+    { \
+        PyObject *item; \
         item = ((self->flags & (UPPER)) == (UPPER)) ? Py_True : Py_False; \
-        Py_INCREF(item);                                                \
-        return item;                                                    \
+        Py_INCREF(item); \
+        return item; \
     }
 
 _define_get(NPY_ARRAY_C_CONTIGUOUS, contiguous)
@@ -260,6 +260,80 @@ arrayflags_num_get(PyArrayFlagsObject *self)
     return PyInt_FromLong(self->flags);
 }
 
+/*
+ * Getter for the 'maskna' flag: returns a new reference to Py_True when
+ * NPY_ARRAY_MASKNA is set in self->flags, and to Py_False otherwise.
+ */
+static PyObject *
+arrayflags_maskna_get(PyArrayFlagsObject *self)
+{
+    PyObject *result = (self->flags & NPY_ARRAY_MASKNA) ? Py_True : Py_False;
+
+    /* Hand the caller its own reference to the bool object */
+    Py_INCREF(result);
+    return result;
+}
+
+/*
+ * Setter for the 'maskna' flag.
+ *
+ * A true value calls PyArray_AllocateMaskNA on the underlying array and
+ * returns its result. A false value succeeds only when the flags object
+ * does not have NPY_ARRAY_MASKNA set, because an existing mask is never
+ * removed here.
+ *
+ * Returns 0 on success, or -1 with a Python exception set on failure.
+ */
+static int
+arrayflags_maskna_set(PyArrayFlagsObject *self, PyObject *obj)
+{
+    int istrue;
+
+    /*
+     * Python passes NULL as the value when the attribute/item is being
+     * deleted; PyObject_IsTrue must not be called on it.
+     */
+    if (obj == NULL) {
+        PyErr_SetString(PyExc_AttributeError,
+                    "Cannot delete flags maskna attribute");
+        return -1;
+    }
+
+    if (self->arr == NULL) {
+        PyErr_SetString(PyExc_ValueError, "Cannot set flags on array scalars.");
+        return -1;
+    }
+
+    /* PyObject_IsTrue returns -1 if evaluating the truth value raised */
+    istrue = PyObject_IsTrue(obj);
+    if (istrue < 0) {
+        return -1;
+    }
+
+    if (istrue) {
+        return PyArray_AllocateMaskNA((PyArrayObject *)self->arr, 0, 0, 1);
+    }
+
+    if (self->flags & NPY_ARRAY_MASKNA) {
+        PyErr_SetString(PyExc_ValueError,
+                    "Cannot remove a NumPy array's NA mask");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Getter for the 'ownmaskna' flag: returns a new reference to Py_True when
+ * NPY_ARRAY_OWNMASKNA is set in self->flags, and to Py_False otherwise.
+ */
+static PyObject *
+arrayflags_ownmaskna_get(PyArrayFlagsObject *self)
+{
+    PyObject *result = (self->flags & NPY_ARRAY_OWNMASKNA) ? Py_True : Py_False;
+
+    /* Hand the caller its own reference to the bool object */
+    Py_INCREF(result);
+    return result;
+}
+
+/*
+ * Setter for the 'ownmaskna' flag.
+ *
+ * A true value calls PyArray_AllocateMaskNA on the underlying array with
+ * its second argument set to 1 and returns the result. A false value
+ * succeeds only when the flags object does not have NPY_ARRAY_OWNMASKNA
+ * set, because an existing mask is never removed here.
+ *
+ * Returns 0 on success, or -1 with a Python exception set on failure.
+ */
+static int
+arrayflags_ownmaskna_set(PyArrayFlagsObject *self, PyObject *obj)
+{
+    int istrue;
+
+    /*
+     * Python passes NULL as the value when the attribute/item is being
+     * deleted; PyObject_IsTrue must not be called on it.
+     */
+    if (obj == NULL) {
+        PyErr_SetString(PyExc_AttributeError,
+                    "Cannot delete flags ownmaskna attribute");
+        return -1;
+    }
+
+    if (self->arr == NULL) {
+        PyErr_SetString(PyExc_ValueError, "Cannot set flags on array scalars.");
+        return -1;
+    }
+
+    /* PyObject_IsTrue returns -1 if evaluating the truth value raised */
+    istrue = PyObject_IsTrue(obj);
+    if (istrue < 0) {
+        return -1;
+    }
+
+    if (istrue) {
+        return PyArray_AllocateMaskNA((PyArrayObject *)self->arr, 1, 0, 1);
+    }
+
+    if (self->flags & NPY_ARRAY_OWNMASKNA) {
+        PyErr_SetString(PyExc_ValueError,
+                    "Cannot remove a NumPy array's NA mask");
+        return -1;
+    }
+
+    return 0;
+}
+
 /* relies on setflags order being write, align, uic */
 static int
 arrayflags_updateifcopy_set(PyArrayFlagsObject *self, PyObject *obj)
@@ -348,6 +422,14 @@ static PyGetSetDef arrayflags_getsets[] = {
         (getter)arrayflags_writeable_get,
         (setter)arrayflags_writeable_set,
         NULL, NULL},
+    {"maskna",
+        (getter)arrayflags_maskna_get,
+        (setter)arrayflags_maskna_set,
+        NULL, NULL},
+    {"ownmaskna",
+        (getter)arrayflags_ownmaskna_get,
+        (setter)arrayflags_ownmaskna_set,
+        NULL, NULL},
     {"fnc",
         (getter)arrayflags_fnc_get,
         NULL,
@@ -450,6 +532,9 @@ arrayflags_getitem(PyArrayFlagsObject *self, PyObject *ind)
         if (strncmp(key, "FARRAY", n) == 0) {
             return arrayflags_farray_get(self);
         }
+        if (strncmp(key, "MASKNA", n) == 0) {
+            return arrayflags_maskna_get(self);
+        }
         break;
     case 7:
         if (strncmp(key,"FORTRAN",n) == 0) {
@@ -469,6 +554,9 @@ arrayflags_getitem(PyArrayFlagsObject *self, PyObject *ind)
         if (strncmp(key,"WRITEABLE",n) == 0) {
             return arrayflags_writeable_get(self);
         }
+        if (strncmp(key, "OWNMASKNA", n) == 0) {
+            return arrayflags_ownmaskna_get(self);
+        }
         break;
     case 10:
         if (strncmp(key,"CONTIGUOUS",n) == 0) {
@@ -528,6 +616,12 @@ arrayflags_setitem(PyArrayFlagsObject *self, PyObject *ind, PyObject *item)
              ((n==1) && (strncmp(key, "U", n) == 0))) {
         return arrayflags_updateifcopy_set(self, item);
     }
+    else if ((n==6) && (strncmp(key, "MASKNA", n) == 0)) {
+        return arrayflags_maskna_set(self, item);
+    }
+    else if ((n==9) && (strncmp(key, "OWNMASKNA", n) == 0)) {
+        return arrayflags_ownmaskna_set(self, item);
+    }
 
  fail:
     PyErr_SetString(PyExc_KeyError, "Unknown flag");
@@ -550,14 +644,19 @@ arrayflags_print(PyArrayFlagsObject *self)
 {
     int fl = self->flags;
 
-    return PyUString_FromFormat("  %s : %s\n  %s : %s\n  %s : %s\n"\
-                           "  %s : %s\n  %s : %s\n  %s : %s",
-                           "C_CONTIGUOUS", _torf_(fl, NPY_ARRAY_C_CONTIGUOUS),
-                           "F_CONTIGUOUS", _torf_(fl, NPY_ARRAY_F_CONTIGUOUS),
-                           "OWNDATA",      _torf_(fl, NPY_ARRAY_OWNDATA),
-                           "WRITEABLE",    _torf_(fl, NPY_ARRAY_WRITEABLE),
-                           "ALIGNED",      _torf_(fl, NPY_ARRAY_ALIGNED),
-                           "UPDATEIFCOPY", _torf_(fl, NPY_ARRAY_UPDATEIFCOPY));
+    return PyUString_FromFormat(
+                        "  %s : %s\n  %s : %s\n"
+                        "  %s : %s\n  %s : %s\n"
+                        "  %s : %s\n  %s : %s\n"
+                        "  %s : %s\n  %s : %s",
+                        "C_CONTIGUOUS", _torf_(fl, NPY_ARRAY_C_CONTIGUOUS),
+                        "F_CONTIGUOUS", _torf_(fl, NPY_ARRAY_F_CONTIGUOUS),
+                        "OWNDATA",      _torf_(fl, NPY_ARRAY_OWNDATA),
+                        "MASKNA",       _torf_(fl, NPY_ARRAY_MASKNA),
+                        "OWNMASKNA",    _torf_(fl, NPY_ARRAY_OWNMASKNA),
+                        "WRITEABLE",    _torf_(fl, NPY_ARRAY_WRITEABLE),
+                        "ALIGNED",      _torf_(fl, NPY_ARRAY_ALIGNED),
+                        "UPDATEIFCOPY", _torf_(fl, NPY_ARRAY_UPDATEIFCOPY));
 }
 
 
diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c
index e9e052683..145e0dbfa 100644
--- a/numpy/core/src/multiarray/getset.c
+++ b/numpy/core/src/multiarray/getset.c
@@ -61,22 +61,24 @@ array_shape_set(PyArrayObject *self, PyObject *val)
     /* Free old dimensions and strides */
     PyDimMem_FREE(PyArray_DIMS(self));
     nd = PyArray_NDIM(ret);
-    ((PyArrayObject_fieldaccess *)self)->nd = nd;
+    ((PyArrayObject_fields *)self)->nd = nd;
     if (nd > 0) {
         /* create new dimensions and strides */
-        ((PyArrayObject_fieldaccess *)self)->dimensions = PyDimMem_NEW(2*nd);
+        ((PyArrayObject_fields *)self)->dimensions = PyDimMem_NEW(3*nd);
         if (PyArray_DIMS(self) == NULL) {
             Py_DECREF(ret);
             PyErr_SetString(PyExc_MemoryError,"");
             return -1;
         }
-        ((PyArrayObject_fieldaccess *)self)->strides = PyArray_DIMS(self) + nd;
+        ((PyArrayObject_fields *)self)->strides = PyArray_DIMS(self) + nd;
+        ((PyArrayObject_fields *)self)->maskna_strides = PyArray_DIMS(self) + 2*nd;
         memcpy(PyArray_DIMS(self), PyArray_DIMS(ret), nd*sizeof(intp));
         memcpy(PyArray_STRIDES(self), PyArray_STRIDES(ret), nd*sizeof(intp));
+        memcpy(PyArray_MASKNA_STRIDES(self), PyArray_MASKNA_STRIDES(ret), nd*sizeof(intp));
     }
     else {
-        ((PyArrayObject_fieldaccess *)self)->dimensions = NULL;
-        ((PyArrayObject_fieldaccess *)self)->strides = NULL;
+        ((PyArrayObject_fields *)self)->dimensions = NULL;
+        ((PyArrayObject_fields *)self)->strides = NULL;
     }
     Py_DECREF(ret);
     PyArray_UpdateFlags(self, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS);
@@ -341,9 +343,9 @@ array_data_set(PyArrayObject *self, PyObject *op)
         Py_DECREF(PyArray_BASE(self));
     }
     Py_INCREF(op);
-    ((PyArrayObject_fieldaccess *)self)->base = op;
-    ((PyArrayObject_fieldaccess *)self)->data = buf;
-    ((PyArrayObject_fieldaccess *)self)->flags = NPY_ARRAY_CARRAY;
+    ((PyArrayObject_fields *)self)->base = op;
+    ((PyArrayObject_fields *)self)->data = buf;
+    ((PyArrayObject_fields *)self)->flags = NPY_ARRAY_CARRAY;
     if (!writeable) {
         PyArray_CLEARFLAGS(self, ~NPY_ARRAY_WRITEABLE);
     }
@@ -498,18 +500,18 @@ array_descr_set(PyArrayObject *self, PyObject *arg)
             return -1;
         }
         PyDimMem_FREE(PyArray_DIMS(self));
-        ((PyArrayObject_fieldaccess *)self)->dimensions = PyArray_DIMS(temp);
-        ((PyArrayObject_fieldaccess *)self)->nd = PyArray_NDIM(temp);
-        ((PyArrayObject_fieldaccess *)self)->strides = PyArray_STRIDES(temp);
+        ((PyArrayObject_fields *)self)->dimensions = PyArray_DIMS(temp);
+        ((PyArrayObject_fields *)self)->nd = PyArray_NDIM(temp);
+        ((PyArrayObject_fields *)self)->strides = PyArray_STRIDES(temp);
         newtype = PyArray_DESCR(temp);
         Py_INCREF(PyArray_DESCR(temp));
         /* Fool deallocator not to delete these*/
-        ((PyArrayObject_fieldaccess *)temp)->nd = 0;
-        ((PyArrayObject_fieldaccess *)temp)->dimensions = NULL;
+        ((PyArrayObject_fields *)temp)->nd = 0;
+        ((PyArrayObject_fields *)temp)->dimensions = NULL;
         Py_DECREF(temp);
     }
 
-    ((PyArrayObject_fieldaccess *)self)->descr = newtype;
+    ((PyArrayObject_fields *)self)->descr = newtype;
     PyArray_UpdateFlags(self, NPY_ARRAY_UPDATE_ALL);
     return 0;
 
@@ -775,7 +777,7 @@ array_flat_set(PyArrayObject *self, PyObject *val)
     typecode = PyArray_DESCR(self);
     Py_INCREF(typecode);
     arr = (PyArrayObject *)PyArray_FromAny(val, typecode,
-                          0, 0, NPY_ARRAY_FORCECAST | FORTRAN_IF(self), NULL);
+                  0, 0, NPY_ARRAY_FORCECAST | PyArray_FORTRAN_IF(self), NULL);
     if (arr == NULL) {
         return -1;
     }
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 54de27e05..11f506c8c 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -15,10 +15,13 @@
 #include "numpy/npy_3kcompat.h"
 
 #include "common.h"
+#include "arrayobject.h"
 #include "ctors.h"
 #include "lowlevel_strided_loops.h"
+#include "na_object.h"
+#include "reduction.h"
 
-#define _check_axis PyArray_CheckAxis
+#include "item_selection.h"
 
 /*NUMPY_API
  * Take
@@ -33,19 +36,23 @@ PyArray_TakeFrom(PyArrayObject *self0, PyObject *indices0, int axis,
     intp nd, i, j, n, m, max_item, tmp, chunk, nelem;
     intp shape[MAX_DIMS];
     char *src, *dest;
-    int err;
+    int err, use_maskna = 0;
 
     indices = NULL;
-    self = (PyArrayObject *)_check_axis(self0, &axis, NPY_ARRAY_CARRAY);
+    self = (PyArrayObject *)PyArray_CheckAxis(self0, &axis,
+                                    NPY_ARRAY_CARRAY | NPY_ARRAY_ALLOWNA);
     if (self == NULL) {
         return NULL;
     }
     indices = (PyArrayObject *)PyArray_ContiguousFromAny(indices0,
-                                                         PyArray_INTP,
+                                                         NPY_INTP,
                                                          1, 0);
     if (indices == NULL) {
         goto fail;
     }
+
+
+
     n = m = chunk = 1;
     nd = PyArray_NDIM(self) + PyArray_NDIM(indices) - 1;
     for (i = 0; i < nd; i++) {
@@ -76,14 +83,24 @@ PyArray_TakeFrom(PyArrayObject *self0, PyObject *indices0, int axis,
         if (obj == NULL) {
             goto fail;
         }
+
+        /* Allocate an NA mask if necessary */
+        if (PyArray_HASMASKNA(self)) {
+            if (PyArray_AllocateMaskNA(obj, 1, 0, 1) < 0) {
+                goto fail;
+            }
+            use_maskna = 1;
+        }
     }
     else {
-        int flags = NPY_ARRAY_CARRAY | NPY_ARRAY_UPDATEIFCOPY;
+        int flags = NPY_ARRAY_CARRAY |
+                    NPY_ARRAY_UPDATEIFCOPY |
+                    NPY_ARRAY_ALLOWNA;
 
         if ((PyArray_NDIM(out) != nd) ||
             !PyArray_CompareLists(PyArray_DIMS(out), shape, nd)) {
             PyErr_SetString(PyExc_ValueError,
-                            "bad shape in output array");
+                        "output array does not match result of ndarray.take");
             goto fail;
         }
 
@@ -101,6 +118,24 @@ PyArray_TakeFrom(PyArrayObject *self0, PyObject *indices0, int axis,
         if (obj == NULL) {
             goto fail;
         }
+
+        if (PyArray_HASMASKNA(self)) {
+            if (PyArray_HASMASKNA(obj)) {
+                use_maskna = 1;
+            }
+            else {
+                int containsna = PyArray_ContainsNA(self, NULL, NULL);
+                if (containsna == -1) {
+                    goto fail;
+                }
+                else if (containsna) {
+                    PyErr_SetString(PyExc_ValueError,
+                            "Cannot assign NA to an array which "
+                            "does not support NAs");
+                    goto fail;
+                }
+            }
+        }
     }
 
     max_item = PyArray_DIMS(self)[axis];
@@ -110,7 +145,109 @@ PyArray_TakeFrom(PyArrayObject *self0, PyObject *indices0, int axis,
     dest = PyArray_DATA(obj);
 
     func = PyArray_DESCR(self)->f->fasttake;
-    if (func == NULL) {
+    if (use_maskna) {
+        char *dst_maskna = NULL, *src_maskna = NULL;
+        npy_intp itemsize = PyArray_DESCR(obj)->elsize;
+        PyArray_MaskedStridedUnaryOp *maskedstransfer = NULL;
+        NpyAuxData *transferdata = NULL;
+        int needs_api = 0;
+
+        if (PyArray_GetMaskedDTypeTransferFunction(
+                        1,
+                        itemsize,
+                        itemsize,
+                        1,
+                        PyArray_DESCR(obj),
+                        PyArray_DESCR(obj),
+                        PyArray_MASKNA_DTYPE(obj),
+                        0,
+                        &maskedstransfer, &transferdata,
+                        &needs_api) != NPY_SUCCEED) {
+            goto fail;
+        }
+
+
+        src_maskna = PyArray_MASKNA_DATA(self);
+        dst_maskna = PyArray_MASKNA_DATA(obj);
+
+        switch(clipmode) {
+        case NPY_RAISE:
+            for (i = 0; i < n; i++) {
+                for (j = 0; j < m; j++) {
+                    tmp = ((intp *)(PyArray_DATA(indices)))[j];
+                    if (tmp < 0) {
+                        tmp = tmp + max_item;
+                    }
+                    if ((tmp < 0) || (tmp >= max_item)) {
+                        PyErr_SetString(PyExc_IndexError,
+                                "index out of range for array");
+                        NPY_AUXDATA_FREE(transferdata);
+                        goto fail;
+                    }
+                    maskedstransfer(dest, itemsize,
+                                    src + tmp*chunk, itemsize,
+                                    (npy_mask *)(src_maskna + tmp*nelem), 1,
+                                    nelem, itemsize, transferdata);
+                    dest += chunk;
+                    memmove(dst_maskna, src_maskna + tmp*nelem, nelem);
+                    dst_maskna += nelem;
+                }
+                src += chunk*max_item;
+                src_maskna += nelem*max_item;
+            }
+            break;
+        case NPY_WRAP:
+            for (i = 0; i < n; i++) {
+                for (j = 0; j < m; j++) {
+                    tmp = ((intp *)(PyArray_DATA(indices)))[j];
+                    if (tmp < 0) {
+                        while (tmp < 0) {
+                            tmp += max_item;
+                        }
+                    }
+                    else if (tmp >= max_item) {
+                        while (tmp >= max_item) {
+                            tmp -= max_item;
+                        }
+                    }
+                    maskedstransfer(dest, itemsize,
+                                    src + tmp*chunk, itemsize,
+                                    (npy_mask *)(src_maskna + tmp*nelem), 1,
+                                    nelem, itemsize, transferdata);
+                    dest += chunk;
+                    memmove(dst_maskna, src_maskna + tmp*nelem, nelem);
+                    dst_maskna += nelem;
+                }
+                src += chunk*max_item;
+                src_maskna += nelem*max_item;
+            }
+            break;
+        case NPY_CLIP:
+            for (i = 0; i < n; i++) {
+                for (j = 0; j < m; j++) {
+                    tmp = ((intp *)(PyArray_DATA(indices)))[j];
+                    if (tmp < 0) {
+                        tmp = 0;
+                    }
+                    else if (tmp >= max_item) {
+                        tmp = max_item - 1;
+                    }
+                    maskedstransfer(dest, itemsize,
+                                    src + tmp*chunk, itemsize,
+                                    (npy_mask *)(src_maskna + tmp*nelem), 1,
+                                    nelem, itemsize, transferdata);
+                    dest += chunk;
+                    memmove(dst_maskna, src_maskna + tmp*nelem, nelem);
+                    dst_maskna += nelem;
+                }
+                src += chunk*max_item;
+                src_maskna += nelem*max_item;
+            }
+            break;
+        }
+        NPY_AUXDATA_FREE(transferdata);
+    }
+    else if (func == NULL) {
         switch(clipmode) {
         case NPY_RAISE:
             for (i = 0; i < n; i++) {
@@ -232,7 +369,7 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
     dest = PyArray_DATA(self);
     chunk = PyArray_DESCR(self)->elsize;
     indices = (PyArrayObject *)PyArray_ContiguousFromAny(indices0,
-                                                         PyArray_INTP, 0, 0);
+                                                         NPY_INTP, 0, 0);
     if (indices == NULL) {
         goto fail;
     }
@@ -499,14 +636,14 @@ PyArray_Repeat(PyArrayObject *aop, PyObject *op, int axis)
     PyArrayObject *ret = NULL;
     char *new_data, *old_data;
 
-    repeats = (PyArrayObject *)PyArray_ContiguousFromAny(op, PyArray_INTP, 0, 1);
+    repeats = (PyArrayObject *)PyArray_ContiguousFromAny(op, NPY_INTP, 0, 1);
     if (repeats == NULL) {
         return NULL;
     }
     nd = PyArray_NDIM(repeats);
     counts = (npy_intp *)PyArray_DATA(repeats);
 
-    if ((ap=_check_axis(aop, &axis, NPY_ARRAY_CARRAY))==NULL) {
+    if ((ap=PyArray_CheckAxis(aop, &axis, NPY_ARRAY_CARRAY))==NULL) {
         Py_DECREF(repeats);
         return NULL;
     }
@@ -1068,7 +1205,7 @@ PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which)
     if ((n == 0) || (PyArray_SIZE(op) == 1)) {
         ret = (PyArrayObject *)PyArray_New(Py_TYPE(op), PyArray_NDIM(op),
                                            PyArray_DIMS(op),
-                                           PyArray_INTP,
+                                           NPY_INTP,
                                            NULL, NULL, 0, 0,
                                            (PyObject *)op);
         if (ret == NULL) {
@@ -1079,7 +1216,7 @@ PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which)
     }
 
     /* Creates new reference op2 */
-    if ((op2=(PyArrayObject *)_check_axis(op, &axis, 0)) == NULL) {
+    if ((op2=(PyArrayObject *)PyArray_CheckAxis(op, &axis, 0)) == NULL) {
         return NULL;
     }
     /* Determine if we should use new algorithm or not */
@@ -1107,7 +1244,7 @@ PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which)
         return NULL;
     }
     ret = (PyArrayObject *)PyArray_New(Py_TYPE(op), PyArray_NDIM(op),
-                                       PyArray_DIMS(op), PyArray_INTP,
+                                       PyArray_DIMS(op), NPY_INTP,
                                        NULL, NULL, 0, 0, (PyObject *)op);
     if (ret == NULL) {
         goto fail;
@@ -1230,7 +1367,7 @@ PyArray_LexSort(PyObject *sort_keys, int axis)
         /* single element case */
         ret = (PyArrayObject *)PyArray_New(&PyArray_Type, PyArray_NDIM(mps[0]),
                                            PyArray_DIMS(mps[0]),
-                                           PyArray_INTP,
+                                           NPY_INTP,
                                            NULL, NULL, 0, 0, NULL);
 
         if (ret == NULL) {
@@ -1250,7 +1387,7 @@ PyArray_LexSort(PyObject *sort_keys, int axis)
 
     /* Now do the sorting */
     ret = (PyArrayObject *)PyArray_New(&PyArray_Type, PyArray_NDIM(mps[0]),
-                                       PyArray_DIMS(mps[0]), PyArray_INTP,
+                                       PyArray_DIMS(mps[0]), NPY_INTP,
                                        NULL, NULL, 0, 0, NULL);
     if (ret == NULL) {
         goto fail;
@@ -1481,7 +1618,7 @@ PyArray_SearchSorted(PyArrayObject *op1, PyObject *op2, NPY_SEARCHSIDE side)
     }
     /* ret is a contiguous array of intp type to hold returned indices */
     ret = (PyArrayObject *)PyArray_New(Py_TYPE(ap2), PyArray_NDIM(ap2),
-                                       PyArray_DIMS(ap2), PyArray_INTP,
+                                       PyArray_DIMS(ap2), NPY_INTP,
                                        NULL, NULL, 0, 0, (PyObject *)ap2);
     if (ret == NULL) {
         goto fail;
@@ -1516,146 +1653,153 @@ PyArray_SearchSorted(PyArrayObject *op1, PyObject *op2, NPY_SEARCHSIDE side)
 
 /*NUMPY_API
  * Diagonal
+ *
+ * As of NumPy 1.7, this function always returns a view into 'self'.
  */
 NPY_NO_EXPORT PyObject *
 PyArray_Diagonal(PyArrayObject *self, int offset, int axis1, int axis2)
 {
-    int n = PyArray_NDIM(self);
-    PyObject *new;
-    PyArray_Dims newaxes;
-    intp dims[MAX_DIMS];
-    int i, pos;
-
-    newaxes.ptr = dims;
-    if (n < 2) {
+    int i, idim, ndim = PyArray_NDIM(self);
+    npy_intp *strides, *maskna_strides = NULL;
+    npy_intp stride1, stride2, maskna_stride1 = 0, maskna_stride2 = 0;
+    npy_intp *shape, dim1, dim2;
+    int self_has_maskna = PyArray_HASMASKNA(self);
+
+    char *data, *maskna_data;
+    npy_intp diag_size;
+    PyArrayObject *ret;
+    PyArray_Descr *dtype;
+    npy_intp ret_shape[NPY_MAXDIMS], ret_strides[NPY_MAXDIMS];
+
+    if (ndim < 2) {
         PyErr_SetString(PyExc_ValueError,
-                        "array.ndim must be >= 2");
+                        "diag requires an array of at least two dimensions");
         return NULL;
     }
+
+    /* Handle negative axes with standard Python indexing rules */
     if (axis1 < 0) {
-        axis1 += n;
+        axis1 += ndim;
     }
     if (axis2 < 0) {
-        axis2 += n;
-    }
-    if ((axis1 == axis2) || (axis1 < 0) || (axis1 >= n) ||
-        (axis2 < 0) || (axis2 >= n)) {
-        PyErr_Format(PyExc_ValueError, "axis1(=%d) and axis2(=%d) "\
-                     "must be different and within range (nd=%d)",
-                     axis1, axis2, n);
-        return NULL;
+        axis2 += ndim;
     }
 
-    newaxes.len = n;
-    /* insert at the end */
-    newaxes.ptr[n-2] = axis1;
-    newaxes.ptr[n-1] = axis2;
-    pos = 0;
-    for (i = 0; i < n; i++) {
-        if ((i==axis1) || (i==axis2)) {
-            continue;
-        }
-        newaxes.ptr[pos++] = i;
+    /* Error check the two axes */
+    if (axis1 == axis2) {
+        PyErr_SetString(PyExc_ValueError,
+                    "axis1 and axis2 cannot be the same");
+        return NULL;
     }
-    new = PyArray_Transpose(self, &newaxes);
-    if (new == NULL) {
+    else if (axis1 < 0 || axis1 >= ndim || axis2 < 0 || axis2 >= ndim) {
+        PyErr_Format(PyExc_ValueError,
+                    "axis1(=%d) and axis2(=%d) "
+                    "must be within range (ndim=%d)",
+                    axis1, axis2, ndim);
         return NULL;
     }
-    self = (PyArrayObject *)new;
-
-    if (n == 2) {
-        PyObject *a = NULL, *ret = NULL;
-        PyArrayObject *indices = NULL;
-        intp n1, n2, start, stop, step, count;
-        intp *dptr;
 
-        n1 = PyArray_DIMS(self)[0];
-        n2 = PyArray_DIMS(self)[1];
-        step = n2 + 1;
-        if (offset < 0) {
-            start = -n2 * offset;
-            stop = MIN(n2, n1+offset)*(n2+1) - n2*offset;
+    /* Get the shape and strides of the two axes */
+    shape = PyArray_SHAPE(self);
+    dim1 = shape[axis1];
+    dim2 = shape[axis2];
+    strides = PyArray_STRIDES(self);
+    stride1 = strides[axis1];
+    stride2 = strides[axis2];
+    if (self_has_maskna) {
+        maskna_strides = PyArray_MASKNA_STRIDES(self);
+        maskna_stride1 = maskna_strides[axis1];
+        maskna_stride2 = maskna_strides[axis2];
+    }
+
+    /* Compute the data pointers and diag_size for the view */
+    data = PyArray_DATA(self);
+    maskna_data = PyArray_MASKNA_DATA(self);
+    if (offset > 0) {
+        if (offset >= dim2) {
+            diag_size = 0;
         }
         else {
-            start = offset;
-            stop = MIN(n1, n2-offset)*(n2+1) + offset;
-        }
+            data += offset * stride2;
+            maskna_data += offset * maskna_stride2;
 
-        /* count = ceil((stop-start)/step) */
-        count = ((stop-start) / step) + (((stop-start) % step) != 0);
-        indices = (PyArrayObject *)PyArray_New(&PyArray_Type, 1, &count,
-                              PyArray_INTP, NULL, NULL, 0, 0, NULL);
-        if (indices == NULL) {
-            Py_DECREF(self);
-            return NULL;
+            diag_size = dim2 - offset;
+            if (dim1 < diag_size) {
+                diag_size = dim1;
+            }
         }
-        dptr = (intp *)PyArray_DATA(indices);
-        for (n1 = start; n1 < stop; n1 += step) {
-            *dptr++ = n1;
+    }
+    else if (offset < 0) {
+        offset = -offset;
+        if (offset >= dim1) {
+            diag_size = 0;
         }
-        a = PyArray_IterNew((PyObject *)self);
-        Py_DECREF(self);
-        if (a == NULL) {
-            Py_DECREF(indices);
-            return NULL;
+        else {
+            data += offset * stride1;
+            maskna_data += offset * maskna_stride1;
+
+            diag_size = dim1 - offset;
+            if (dim2 < diag_size) {
+                diag_size = dim2;
+            }
         }
-        ret = PyObject_GetItem(a, (PyObject *)indices);
-        Py_DECREF(a);
-        Py_DECREF(indices);
-        return ret;
     }
-
     else {
-        /*
-         * my_diagonal = []
-         * for i in range (s [0]) :
-         * my_diagonal.append (diagonal (a [i], offset))
-         * return array (my_diagonal)
-         */
-        PyObject *mydiagonal = NULL, *ret = NULL, *sel = NULL;
-        intp n1;
-        int res;
-        PyArray_Descr *typecode;
-
-        new = NULL;
-
-        typecode = PyArray_DESCR(self);
-        mydiagonal = PyList_New(0);
-        if (mydiagonal == NULL) {
-            Py_DECREF(self);
-            return NULL;
+        diag_size = dim1 < dim2 ? dim1 : dim2;
+    }
+
+    /* Build the new shape and strides for the main data */
+    i = 0;
+    for (idim = 0; idim < ndim; ++idim) {
+        if (idim != axis1 && idim != axis2) {
+            ret_shape[i] = shape[idim];
+            ret_strides[i] = strides[idim];
+            ++i;
         }
-        n1 = PyArray_DIMS(self)[0];
-        for (i = 0; i < n1; i++) {
-            new = PyInt_FromLong((long) i);
-            sel = PyArray_EnsureAnyArray(PyObject_GetItem((PyObject *)self, new));
-            Py_DECREF(new);
-            if (sel == NULL) {
-                Py_DECREF(self);
-                Py_DECREF(mydiagonal);
-                return NULL;
-            }
-            new = PyArray_Diagonal((PyArrayObject *)sel, offset, n-3, n-2);
-            Py_DECREF(sel);
-            if (new == NULL) {
-                Py_DECREF(self);
-                Py_DECREF(mydiagonal);
-                return NULL;
-            }
-            res = PyList_Append(mydiagonal, new);
-            Py_DECREF(new);
-            if (res < 0) {
-                Py_DECREF(self);
-                Py_DECREF(mydiagonal);
-                return NULL;
+    }
+    ret_shape[ndim-2] = diag_size;
+    ret_strides[ndim-2] = stride1 + stride2;
+
+    /* Create the diagonal view */
+    dtype = PyArray_DTYPE(self);
+    Py_INCREF(dtype);
+    ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(self),
+                               dtype,
+                               ndim-1, ret_shape,
+                               ret_strides,
+                               data,
+               PyArray_FLAGS(self) & ~(NPY_ARRAY_MASKNA | NPY_ARRAY_OWNMASKNA),
+                               (PyObject *)self);
+    if (ret == NULL) {
+        return NULL;
+    }
+    Py_INCREF(self);
+    if (PyArray_SetBaseObject(ret, (PyObject *)self) < 0) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+
+    /* Take a view of the mask if it exists */
+    if (self_has_maskna) {
+        PyArrayObject_fields *fret = (PyArrayObject_fields *)ret;
+        npy_intp *maskna_strides = PyArray_MASKNA_STRIDES(self);
+
+        fret->maskna_dtype = PyArray_MASKNA_DTYPE(self);
+        Py_INCREF(fret->maskna_dtype);
+        fret->maskna_data = maskna_data;
+        /* Build the strides for the mask */
+        i = 0;
+        for (idim = 0; idim < ndim; ++idim) {
+            if (idim != axis1 && idim != axis2) {
+                fret->maskna_strides[i] = maskna_strides[idim];
+                ++i;
             }
         }
-        Py_DECREF(self);
-        Py_INCREF(typecode);
-        ret =  PyArray_FromAny(mydiagonal, typecode, 0, 0, 0, NULL);
-        Py_DECREF(mydiagonal);
-        return ret;
+        fret->maskna_strides[ndim-2] = maskna_stride1 + maskna_stride2;
+        fret->flags |= NPY_ARRAY_MASKNA;
     }
+
+    return (PyObject *)ret;
 }
 
 /*NUMPY_API
@@ -1668,14 +1812,26 @@ PyArray_Compress(PyArrayObject *self, PyObject *condition, int axis,
     PyArrayObject *cond;
     PyObject *res, *ret;
 
-    cond = (PyArrayObject *)PyArray_FROM_O(condition);
-    if (cond == NULL) {
-        return NULL;
+    if (PyArray_Check(condition)) {
+        cond = (PyArrayObject *)condition;
+        Py_INCREF(cond);
     }
+    else {
+        PyArray_Descr *dtype = PyArray_DescrFromType(NPY_BOOL);
+        if (dtype == NULL) {
+            return NULL;
+        }
+        cond = (PyArrayObject *)PyArray_FromAny(condition, dtype,
+                                    0, 0, NPY_ARRAY_ALLOWNA, NULL);
+        if (cond == NULL) {
+            return NULL;
+        }
+    }
+
     if (PyArray_NDIM(cond) != 1) {
         Py_DECREF(cond);
         PyErr_SetString(PyExc_ValueError,
-                        "condition must be 1-d array");
+                        "condition must be a 1-d array");
         return NULL;
     }
 
@@ -1690,15 +1846,213 @@ PyArray_Compress(PyArrayObject *self, PyObject *condition, int axis,
     return ret;
 }
 
+/*
+ * Counts the number of True values in a raw boolean array. This
+ * is a low-overhead function which does no heap allocations.
+ *
+ * Returns -1 on error.
+ */
+NPY_NO_EXPORT npy_intp
+count_boolean_trues(int ndim, char *data, npy_intp *ashape, npy_intp *astrides)
+{
+    int idim;
+    npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
+    npy_intp i, coord[NPY_MAXDIMS];
+    npy_intp count = 0;
+
+    /* Use raw iteration with no heap memory allocation */
+    if (PyArray_PrepareOneRawArrayIter(
+                    ndim, ashape,
+                    data, astrides,
+                    &ndim, shape,
+                    &data, strides) < 0) {
+        return -1;
+    }
+
+    /* Handle zero-sized array */
+    if (shape[0] == 0) {
+        return 0;
+    }
+
+    /* Special case for contiguous inner loop */
+    if (strides[0] == 1) {
+        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
+            char *d = data;
+            /* Process the innermost dimension */
+            for (i = 0; i < shape[0]; ++i, ++d) {
+                count += (*d != 0);
+            }
+        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
+    }
+    /* General inner loop */
+    else {
+        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
+            char *d = data;
+            /* Process the innermost dimension */
+            for (i = 0; i < shape[0]; ++i, d += strides[0]) {
+                count += (*d != 0);
+            }
+        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
+    }
+
+    return count;
+}
+
+static int
+assign_reduce_identity_zero(PyArrayObject *result, int preservena, void *data)
+{
+    return PyArray_AssignZero(result, NULL, preservena, NULL);
+}
+
+static int
+reduce_count_nonzero_loop(NpyIter *iter,
+                                            char **dataptr,
+                                            npy_intp *strides,
+                                            npy_intp *countptr,
+                                            NpyIter_IterNextFunc *iternext,
+                                            int needs_api,
+                                            npy_intp skip_first_count,
+                                            void *data)
+{
+    PyArray_NonzeroFunc *nonzero = (PyArray_NonzeroFunc *)data;
+    PyArrayObject *arr = NpyIter_GetOperandArray(iter)[1];
+
+    NPY_BEGIN_THREADS_DEF;
+
+    if (!needs_api) {
+        NPY_BEGIN_THREADS;
+    }
+
+    /*
+     * 'skip_first_count' will always be 0 because we are doing a reduction
+     * with an identity.
+     */
+
+    do {
+        char *data0 = dataptr[0], *data1 = dataptr[1];
+        npy_intp stride0 = strides[0], stride1 = strides[1];
+        npy_intp count = *countptr;
+
+        while (count--) {
+            if (nonzero(data1, arr)) {
+                ++(*(npy_intp *)data0);
+            }
+            data0 += stride0;
+            data1 += stride1;
+        }
+    } while (iternext(iter));
+
+    if (!needs_api) {
+        NPY_END_THREADS;
+    }
+
+    return (needs_api && PyErr_Occurred()) ? -1 : 0;
+}
+
+static int
+reduce_count_nonzero_masked_loop(NpyIter *iter,
+                                            char **dataptr,
+                                            npy_intp *strides,
+                                            npy_intp *countptr,
+                                            NpyIter_IterNextFunc *iternext,
+                                            int needs_api,
+                                            npy_intp skip_first_count,
+                                            void *data)
+{
+    PyArray_NonzeroFunc *nonzero = (PyArray_NonzeroFunc *)data;
+    PyArrayObject *arr = NpyIter_GetOperandArray(iter)[1];
+
+    NPY_BEGIN_THREADS_DEF;
+
+    if (!needs_api) {
+        NPY_BEGIN_THREADS;
+    }
+
+    /*
+     * 'skip_first_count' will always be 0 because we are doing a reduction
+     * with an identity.
+     */
+
+    do {
+        char *data0 = dataptr[0], *data1 = dataptr[1], *data2 = dataptr[2];
+        npy_intp stride0 = strides[0], stride1 = strides[1],
+                    stride2 = strides[2];
+        npy_intp count = *countptr;
+
+        while (count--) {
+            if (NpyMaskValue_IsExposed((npy_mask)*data2) &&
+                                        nonzero(data1, arr)) {
+                ++(*(npy_intp *)data0);
+            }
+            data0 += stride0;
+            data1 += stride1;
+            data2 += stride2;
+        }
+    } while (iternext(iter));
+
+    if (!needs_api) {
+        NPY_END_THREADS;
+    }
+
+    return (needs_api && PyErr_Occurred()) ? -1 : 0;
+}
+
+
+/*
+ * A full reduction version of PyArray_CountNonzero, supporting
+ * an 'out' parameter and doing the count as a reduction along
+ * selected axes. It also supports a 'skipna' parameter, which
+ * skips over any NA masked values in arr.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_ReduceCountNonzero(PyArrayObject *arr, PyArrayObject *out,
+                        npy_bool *axis_flags, int skipna, int keepdims)
+{
+    PyArray_NonzeroFunc *nonzero;
+    PyArrayObject *result;
+    PyArray_Descr *dtype;
+
+    nonzero = PyArray_DESCR(arr)->f->nonzero;
+    if (nonzero == NULL) {
+        PyErr_SetString(PyExc_TypeError,
+                    "Cannot count the number of non-zeros for a dtype "
+                    "which doesn't have a 'nonzero' function");
+        return NULL;
+    }
+
+    dtype = PyArray_DescrFromType(NPY_INTP);
+    if (dtype == NULL) {
+        return NULL;
+    }
+
+    result = PyArray_ReduceWrapper(arr, out, NULL,
+                            PyArray_DESCR(arr), dtype,
+                            NPY_SAME_KIND_CASTING,
+                            axis_flags, 1, skipna, NULL, keepdims, 0,
+                            &assign_reduce_identity_zero,
+                            &reduce_count_nonzero_loop,
+                            &reduce_count_nonzero_masked_loop,
+                            NULL,
+                            nonzero, 0, "count_nonzero");
+    Py_DECREF(dtype);
+    if (out == NULL && result != NULL) {
+        return PyArray_Return(result);
+    }
+    else {
+        return (PyObject *)result;
+    }
+}
+
 /*NUMPY_API
- * Counts the number of non-zero elements in the array
+ * Counts the number of non-zero elements in the array. Raises
+ * an error if the array contains an NA.
  *
  * Returns -1 on error.
  */
 NPY_NO_EXPORT npy_intp
 PyArray_CountNonzero(PyArrayObject *self)
 {
-    PyArray_NonzeroFunc *nonzero = PyArray_DESCR(self)->f->nonzero;
+    PyArray_NonzeroFunc *nonzero;
     char *data;
     npy_intp stride, count;
     npy_intp nonzero_count = 0;
@@ -1708,6 +2062,28 @@ PyArray_CountNonzero(PyArrayObject *self)
     char **dataptr;
     npy_intp *strideptr, *innersizeptr;
 
+    /* If 'self' has an NA mask, make sure it has no NA values */
+    if (PyArray_HASMASKNA(self)) {
+        int containsna = PyArray_ContainsNA(self, NULL, NULL);
+        if (containsna == -1) {
+            return -1;
+        }
+        else if (containsna) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Cannot count the number of nonzeros in an array "
+                    "which contains an NA");
+            return -1;
+        }
+    }
+
+    /* Special low-overhead version specific to the boolean type */
+    if (PyArray_DESCR(self)->type_num == NPY_BOOL) {
+        return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),
+                        PyArray_DIMS(self), PyArray_STRIDES(self));
+    }
+
+    nonzero = PyArray_DESCR(self)->f->nonzero;
+
     /* If it's a trivial one-dimensional loop, don't use an iterator */
     if (PyArray_TRIVIALLY_ITERABLE(self)) {
         PyArray_PREPARE_TRIVIAL_ITERATION(self, count, data, stride);
@@ -1730,9 +2106,14 @@ PyArray_CountNonzero(PyArrayObject *self)
         return 0;
     }
 
-    /* Otherwise create and use an iterator to count the nonzeros */
-    iter = NpyIter_New(self, NPY_ITER_READONLY|
-                             NPY_ITER_EXTERNAL_LOOP|
+    /*
+     * Otherwise create and use an iterator to count the nonzeros.
+     * Can ignore any NAs because we already checked PyArray_ContainsNA
+     * earlier.
+     */
+    iter = NpyIter_New(self, NPY_ITER_READONLY |
+                             NPY_ITER_IGNORE_MASKNA |
+                             NPY_ITER_EXTERNAL_LOOP |
                              NPY_ITER_REFS_OK,
                         NPY_KEEPORDER, NPY_NO_CASTING,
                         NULL);
@@ -1767,7 +2148,7 @@ PyArray_CountNonzero(PyArrayObject *self)
 
     NpyIter_Deallocate(iter);
 
-    return nonzero_count;
+    return PyErr_Occurred() ? -1 : nonzero_count;
 }
 
 /*NUMPY_API
@@ -1785,7 +2166,7 @@ PyArray_Nonzero(PyArrayObject *self)
     PyArray_NonzeroFunc *nonzero = PyArray_DESCR(self)->f->nonzero;
     char *data;
     npy_intp stride, count;
-    npy_intp nonzero_count = PyArray_CountNonzero(self);
+    npy_intp nonzero_count;
     npy_intp *multi_index;
 
     NpyIter *iter;
@@ -1793,6 +2174,16 @@ PyArray_Nonzero(PyArrayObject *self)
     NpyIter_GetMultiIndexFunc *get_multi_index;
     char **dataptr;
 
+    /*
+     * First count the number of non-zeros in 'self'. If 'self' contains
+     * an NA value, this will raise an error, so after this call
+     * we can assume 'self' contains no NAs.
+     */
+    nonzero_count = PyArray_CountNonzero(self);
+    if (nonzero_count < 0) {
+        return NULL;
+    }
+
     /* Allocate the result as a 2D array */
     ret_dims[0] = nonzero_count;
     ret_dims[1] = (ndim == 0) ? 1 : ndim;
@@ -1822,10 +2213,15 @@ PyArray_Nonzero(PyArrayObject *self)
         goto finish;
     }
 
-    /* Build an iterator tracking a multi-index, in C order */
-    iter = NpyIter_New(self, NPY_ITER_READONLY|
-                             NPY_ITER_MULTI_INDEX|
-                             NPY_ITER_ZEROSIZE_OK|
+    /*
+     * Build an iterator tracking a multi-index, in C order. We
+     * can ignore NAs because the PyArray_CountNonzero call checked
+     * that there were no NAs already.
+     */
+    iter = NpyIter_New(self, NPY_ITER_READONLY |
+                             NPY_ITER_IGNORE_MASKNA |
+                             NPY_ITER_MULTI_INDEX |
+                             NPY_ITER_ZEROSIZE_OK |
                              NPY_ITER_REFS_OK,
                         NPY_CORDER, NPY_NO_CASTING,
                         NULL);
@@ -1879,7 +2275,7 @@ finish:
     /* Create views into ret, one for each dimension */
     if (ndim == 1) {
         /* Directly switch to one dimensions (dimension 1 is 1 anyway) */
-        ((PyArrayObject_fieldaccess *)ret)->nd = 1;
+        ((PyArrayObject_fields *)ret)->nd = 1;
         PyTuple_SET_ITEM(ret_tuple, 0, (PyObject *)ret);
     }
     else {
@@ -1910,3 +2306,167 @@ finish:
 
     return ret_tuple;
 }
+
+/*
+ * Gets a single item from the array, based on a single multi-index
+ * array of values, which must be of length PyArray_NDIM(self).
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_MultiIndexGetItem(PyArrayObject *self, npy_intp *multi_index)
+{
+    int idim, ndim = PyArray_NDIM(self);
+    char *data = PyArray_DATA(self);
+    npy_intp *shape = PyArray_SHAPE(self);
+    npy_intp *strides = PyArray_STRIDES(self);
+
+    /* Case with an NA mask */
+    if (PyArray_HASMASKNA(self)) {
+        char *maskdata = PyArray_MASKNA_DATA(self);
+        npy_mask maskvalue;
+        npy_intp *maskstrides = PyArray_MASKNA_STRIDES(self);
+
+        if (PyArray_HASFIELDS(self)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                    "field-NA is not supported yet in MultiIndexGetItem");
+            return NULL;
+        }
+
+        /* Get the data and maskdata pointer */
+        for (idim = 0; idim < ndim; ++idim) {
+            npy_intp shapevalue = shape[idim];
+            npy_intp ind = multi_index[idim];
+
+            if (ind < 0) {
+                ind += shapevalue;
+            }
+
+            if (ind < 0 || ind >= shapevalue) {
+                PyErr_SetString(PyExc_ValueError, "index out of bounds");
+                return NULL;
+            }
+
+            data += ind * strides[idim];
+            maskdata += ind * maskstrides[idim];
+        }
+
+        maskvalue = (npy_mask)*maskdata;
+        if (NpyMaskValue_IsExposed(maskvalue)) {
+            return PyArray_DESCR(self)->f->getitem(data, self);
+        }
+        else {
+            return (PyObject *)NpyNA_FromDTypeAndPayload(
+                                                PyArray_DTYPE(self), 0, 0);
+        }
+    }
+    /* Case without an NA mask */
+    else {
+        /* Get the data pointer */
+        for (idim = 0; idim < ndim; ++idim) {
+            npy_intp shapevalue = shape[idim];
+            npy_intp ind = multi_index[idim];
+
+            if (ind < 0) {
+                ind += shapevalue;
+            }
+
+            if (ind < 0 || ind >= shapevalue) {
+                PyErr_SetString(PyExc_ValueError, "index out of bounds");
+                return NULL;
+            }
+
+            data += ind * strides[idim];
+        }
+
+        return PyArray_DESCR(self)->f->getitem(data, self);
+    }
+}
+
+/*
+ * Sets a single item in the array, based on a single multi-index
+ * array of values, which must be of length PyArray_NDIM(self).
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_MultiIndexSetItem(PyArrayObject *self, npy_intp *multi_index,
+                                                PyObject *obj)
+{
+    int idim, ndim = PyArray_NDIM(self);
+    char *data = PyArray_DATA(self);
+    npy_intp *shape = PyArray_SHAPE(self);
+    npy_intp *strides = PyArray_STRIDES(self);
+
+    /* Case with an NA mask */
+    if (PyArray_HASMASKNA(self)) {
+        char *maskdata = PyArray_MASKNA_DATA(self);
+        npy_intp *maskstrides = PyArray_MASKNA_STRIDES(self);
+        NpyNA *na = NpyNA_FromObject(obj, 1);
+
+        if (PyArray_HASFIELDS(self)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                    "field-NA is not supported yet in MultiIndexSetItem");
+            return -1;
+        }
+
+        /* Get the data and maskdata pointer */
+        for (idim = 0; idim < ndim; ++idim) {
+            npy_intp shapevalue = shape[idim];
+            npy_intp ind = multi_index[idim];
+
+            if (ind < 0) {
+                ind += shapevalue;
+            }
+
+            if (ind < 0 || ind >= shapevalue) {
+                PyErr_SetString(PyExc_ValueError, "index out of bounds");
+                return -1;
+            }
+
+            data += ind * strides[idim];
+            maskdata += ind * maskstrides[idim];
+        }
+
+        if (na == NULL) {
+            *maskdata = 1;
+            return PyArray_DESCR(self)->f->setitem(obj, data, self);
+        }
+        else {
+            char maskvalue = (char)NpyNA_AsMaskValue(na);
+
+            if (maskvalue != 0 &&
+                        PyArray_MASKNA_DTYPE(self)->type_num != NPY_MASK) {
+                /* TODO: also handle struct-NA mask dtypes */
+                PyErr_SetString(PyExc_ValueError,
+                        "Cannot assign an NA with a payload to an "
+                        "NA-array with a boolean mask, requires a "
+                        "multi-NA mask");
+                return -1;
+            }
+
+            *maskdata = maskvalue;
+
+            return 0;
+        }
+    }
+    /* Case without an NA mask */
+    else {
+        /* Get the data pointer */
+        for (idim = 0; idim < ndim; ++idim) {
+            npy_intp shapevalue = shape[idim];
+            npy_intp ind = multi_index[idim];
+
+            if (ind < 0) {
+                ind += shapevalue;
+            }
+
+            if (ind < 0 || ind >= shapevalue) {
+                PyErr_SetString(PyExc_ValueError, "index out of bounds");
+                return -1;
+            }
+
+            data += ind * strides[idim];
+        }
+
+        return PyArray_DESCR(self)->f->setitem(obj, data, self);
+    }
+}
diff --git a/numpy/core/src/multiarray/item_selection.h b/numpy/core/src/multiarray/item_selection.h
new file mode 100644
index 000000000..5c1741aaf
--- /dev/null
+++ b/numpy/core/src/multiarray/item_selection.h
@@ -0,0 +1,42 @@
+#ifndef _NPY_PRIVATE__ITEM_SELECTION_H_
+#define _NPY_PRIVATE__ITEM_SELECTION_H_
+
+/*
+ * Counts the number of True values in a raw boolean array. This
+ * is a low-overhead function which does no heap allocations.
+ *
+ * Returns -1 on error.
+ */
+NPY_NO_EXPORT npy_intp
+count_boolean_trues(int ndim, char *data, npy_intp *ashape, npy_intp *astrides);
+
+/*
+ * Gets a single item from the array, based on a single multi-index
+ * array of values, which must be of length PyArray_NDIM(self).
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_MultiIndexGetItem(PyArrayObject *self, npy_intp *multi_index);
+
+/*
+ * Sets a single item in the array, based on a single multi-index
+ * array of values, which must be of length PyArray_NDIM(self).
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_MultiIndexSetItem(PyArrayObject *self, npy_intp *multi_index,
+                                                PyObject *obj);
+
+/*
+ * A full reduction version of PyArray_CountNonzero, supporting
+ * an 'out' parameter and doing the count as a reduction along
+ * selected axes. It also supports a 'skipna' parameter, which
+ * skips over any NA masked values in arr.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_ReduceCountNonzero(PyArrayObject *arr, PyArrayObject *out,
+                        npy_bool *axis_flags, int skipna, int keepdims);
+
+
+
+#endif
diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c
index e1d44740f..e13173fa7 100644
--- a/numpy/core/src/multiarray/iterators.c
+++ b/numpy/core/src/multiarray/iterators.c
@@ -17,22 +17,31 @@
 #include "ctors.h"
 #include "common.h"
 
-#define PseudoIndex -1
-#define RubberIndex -2
-#define SingleIndex -3
+#define NEWAXIS_INDEX -1
+#define ELLIPSIS_INDEX -2
+#define SINGLE_INDEX -3
 
+static int
+slice_coerce_index(PyObject *o, npy_intp *v);
+
+/*
+ * This function converts one element of the indexing tuple
+ * into a step size and a number of steps, returning the
+ * starting index. Non-slices are signalled in 'n_steps',
+ * as NEWAXIS_INDEX, ELLIPSIS_INDEX, or SINGLE_INDEX.
+ */
 NPY_NO_EXPORT npy_intp
-parse_subindex(PyObject *op, npy_intp *step_size,
+parse_index_entry(PyObject *op, npy_intp *step_size,
                     npy_intp *n_steps, npy_intp max)
 {
     npy_intp index;
 
     if (op == Py_None) {
-        *n_steps = PseudoIndex;
+        *n_steps = NEWAXIS_INDEX;
         index = 0;
     }
     else if (op == Py_Ellipsis) {
-        *n_steps = RubberIndex;
+        *n_steps = ELLIPSIS_INDEX;
         index = 0;
     }
     else if (PySlice_Check(op)) {
@@ -52,15 +61,14 @@ parse_subindex(PyObject *op, npy_intp *step_size,
         }
     }
     else {
-        index = PyArray_PyIntAsIntp(op);
-        if (error_converting(index)) {
+        if (!slice_coerce_index(op, &index)) {
             PyErr_SetString(PyExc_IndexError,
-                            "each subindex must be either a "\
-                            "slice, an integer, Ellipsis, or "\
+                            "each index entry must be either a "
+                            "slice, an integer, Ellipsis, or "
                             "newaxis");
             goto fail;
         }
-        *n_steps = SingleIndex;
+        *n_steps = SINGLE_INDEX;
         *step_size = 0;
         if (index < 0) {
             index += max;
@@ -77,13 +85,23 @@ parse_subindex(PyObject *op, npy_intp *step_size,
 }
 
 
+/*
+ * Parses an index that has no fancy indexing. Populates
+ * out_dimensions, out_strides, and out_offset. If out_maskna_strides
+ * and out_maskna_offset aren't NULL, then 'self' must have an NA mask
+ * which is used to populate those variables as well.
+ */
 NPY_NO_EXPORT int
 parse_index(PyArrayObject *self, PyObject *op,
-            npy_intp *dimensions, npy_intp *strides, npy_intp *offset_ptr)
+            npy_intp *out_dimensions,
+            npy_intp *out_strides,
+            npy_intp *out_offset,
+            npy_intp *out_maskna_strides,
+            npy_intp *out_maskna_offset)
 {
     int i, j, n;
-    int nd_old, nd_new, n_add, n_pseudo;
-    npy_intp n_steps, start, offset, step_size;
+    int nd_old, nd_new, n_add, n_ellipsis;
+    npy_intp n_steps, start, offset, maskna_offset, step_size;
     PyObject *op1 = NULL;
     int is_slice;
 
@@ -97,7 +115,7 @@ parse_index(PyArrayObject *self, PyObject *op,
     else {
         if (!PySequence_Check(op)) {
             PyErr_SetString(PyExc_IndexError,
-                            "index must be either an int "\
+                            "index must be either an int "
                             "or a sequence");
             return -1;
         }
@@ -108,62 +126,71 @@ parse_index(PyArrayObject *self, PyObject *op,
     nd_old = nd_new = 0;
 
     offset = 0;
+    maskna_offset = 0;
     for (i = 0; i < n; i++) {
         if (!is_slice) {
-            if (!(op1=PySequence_GetItem(op, i))) {
-                PyErr_SetString(PyExc_IndexError,
-                                "invalid index");
+            op1 = PySequence_GetItem(op, i);
+            if (op1 == NULL) {
                 return -1;
             }
         }
-        start = parse_subindex(op1, &step_size, &n_steps,
+        start = parse_index_entry(op1, &step_size, &n_steps,
                                nd_old < PyArray_NDIM(self) ?
                                PyArray_DIMS(self)[nd_old] : 0);
         Py_DECREF(op1);
         if (start == -1) {
             break;
         }
-        if (n_steps == PseudoIndex) {
-            dimensions[nd_new] = 1; strides[nd_new] = 0;
+        if (n_steps == NEWAXIS_INDEX) {
+            out_dimensions[nd_new] = 1;
+            out_strides[nd_new] = 0;
+            if (out_maskna_strides != NULL) {
+                out_maskna_strides[nd_new] = 0;
+            }
             nd_new++;
         }
-        else {
-            if (n_steps == RubberIndex) {
-                for (j = i + 1, n_pseudo = 0; j < n; j++) {
-                    op1 = PySequence_GetItem(op, j);
-                    if (op1 == Py_None) {
-                        n_pseudo++;
-                    }
-                    Py_DECREF(op1);
-                }
-                n_add = PyArray_NDIM(self)-(n-i-n_pseudo-1+nd_old);
-                if (n_add < 0) {
-                    PyErr_SetString(PyExc_IndexError,
-                                    "too many indices");
-                    return -1;
-                }
-                for (j = 0; j < n_add; j++) {
-                    dimensions[nd_new] = \
-                        PyArray_DIMS(self)[nd_old];
-                    strides[nd_new] = \
-                        PyArray_STRIDES(self)[nd_old];
-                    nd_new++; nd_old++;
+        else if (n_steps == ELLIPSIS_INDEX) {
+            for (j = i + 1, n_ellipsis = 0; j < n; j++) {
+                op1 = PySequence_GetItem(op, j);
+                if (op1 == Py_None) {
+                    n_ellipsis++;
                 }
+                Py_DECREF(op1);
             }
-            else {
-                if (nd_old >= PyArray_NDIM(self)) {
-                    PyErr_SetString(PyExc_IndexError,
-                                    "too many indices");
-                    return -1;
+            n_add = PyArray_NDIM(self)-(n-i-n_ellipsis-1+nd_old);
+            if (n_add < 0) {
+                PyErr_SetString(PyExc_IndexError, "too many indices");
+                return -1;
+            }
+            for (j = 0; j < n_add; j++) {
+                out_dimensions[nd_new] = PyArray_DIMS(self)[nd_old];
+                out_strides[nd_new] = PyArray_STRIDES(self)[nd_old];
+                if (out_maskna_strides != NULL) {
+                    out_maskna_strides[nd_new] =
+                                    PyArray_MASKNA_STRIDES(self)[nd_old];
                 }
-                offset += PyArray_STRIDES(self)[nd_old]*start;
-                nd_old++;
-                if (n_steps != SingleIndex) {
-                    dimensions[nd_new] = n_steps;
-                    strides[nd_new] = step_size * \
-                        PyArray_STRIDES(self)[nd_old-1];
-                    nd_new++;
+                nd_new++; nd_old++;
+            }
+        }
+        else {
+            if (nd_old >= PyArray_NDIM(self)) {
+                PyErr_SetString(PyExc_IndexError, "too many indices");
+                return -1;
+            }
+            offset += PyArray_STRIDES(self)[nd_old]*start;
+            if (out_maskna_offset != NULL) {
+                maskna_offset += PyArray_MASKNA_STRIDES(self)[nd_old]*start;
+            }
+            nd_old++;
+            if (n_steps != SINGLE_INDEX) {
+                out_dimensions[nd_new] = n_steps;
+                out_strides[nd_new] = step_size *
+                                            PyArray_STRIDES(self)[nd_old-1];
+                if (out_maskna_strides != NULL) {
+                    out_maskna_strides[nd_new] = step_size *
+                                        PyArray_MASKNA_STRIDES(self)[nd_old-1];
                 }
+                nd_new++;
             }
         }
     }
@@ -172,20 +199,50 @@ parse_index(PyArrayObject *self, PyObject *op,
     }
     n_add = PyArray_NDIM(self)-nd_old;
     for (j = 0; j < n_add; j++) {
-        dimensions[nd_new] = PyArray_DIMS(self)[nd_old];
-        strides[nd_new] = PyArray_STRIDES(self)[nd_old];
+        out_dimensions[nd_new] = PyArray_DIMS(self)[nd_old];
+        out_strides[nd_new] = PyArray_STRIDES(self)[nd_old];
+        if (out_maskna_strides != NULL) {
+            out_maskna_strides[nd_new] = PyArray_MASKNA_STRIDES(self)[nd_old];
+        }
         nd_new++;
         nd_old++;
     }
-    *offset_ptr = offset;
+    *out_offset = offset;
+    if (out_maskna_offset != NULL) {
+        *out_maskna_offset = maskna_offset;
+    }
     return nd_new;
 }
 
+/*
+ * Tries to convert 'o' into an npy_intp interpreted as an
+ * index. Returns 1 if it was successful, 0 otherwise. Does
+ * not set an exception.
+ */
 static int
 slice_coerce_index(PyObject *o, npy_intp *v)
 {
+    /*
+     * PyNumber_Index was introduced in Python 2.5 because of NumPy.
+     * http://www.python.org/dev/peps/pep-0357/
+     * Let's use it for indexing!
+     *
+     * Unfortunately, SciPy and possibly other code seems to rely
+     * on the lenient coercion. :(
+     */
+#if 0 /*PY_VERSION_HEX >= 0x02050000*/
+    PyObject *ind = PyNumber_Index(o);
+    if (ind != NULL) {
+        *v = PyArray_PyIntAsIntp(ind);
+        Py_DECREF(ind);
+    }
+    else {
+        *v = -1;
+    }
+#else
     *v = PyArray_PyIntAsIntp(o);
-    if (error_converting(*v)) {
+#endif
+    if ((*v) == -1 && PyErr_Occurred()) {
         PyErr_Clear();
         return 0;
     }
@@ -343,14 +400,22 @@ NPY_NO_EXPORT PyObject *
 PyArray_IterNew(PyObject *obj)
 {
     PyArrayIterObject *it;
-    PyArrayObject *ao = (PyArrayObject *)obj;
+    PyArrayObject *ao;
 
-    if (!PyArray_Check(ao)) {
+    if (!PyArray_Check(obj)) {
         PyErr_BadInternalCall();
         return NULL;
     }
+    ao = (PyArrayObject *)obj;
 
-    it = (PyArrayIterObject *)_pya_malloc(sizeof(PyArrayIterObject));
+    if (PyArray_HASMASKNA(ao)) {
+        PyErr_SetString(PyExc_ValueError,
+                "Old-style NumPy iterators do not support NA masks, "
+                "use numpy.nditer instead");
+        return NULL;
+    }
+
+    it = (PyArrayIterObject *)PyArray_malloc(sizeof(PyArrayIterObject));
     PyObject_Init((PyObject *)it, &PyArrayIter_Type);
     /* it = PyObject_New(PyArrayIterObject, &PyArrayIter_Type);*/
     if (it == NULL) {
@@ -371,6 +436,13 @@ PyArray_BroadcastToShape(PyObject *obj, npy_intp *dims, int nd)
     int i, diff, j, compat, k;
     PyArrayObject *ao = (PyArrayObject *)obj;
 
+    if (PyArray_HASMASKNA(ao)) {
+        PyErr_SetString(PyExc_ValueError,
+                "Old-style NumPy iterators do not support NA masks, "
+                "use numpy.nditer instead");
+        return NULL;
+    }
+
     if (PyArray_NDIM(ao) > nd) {
         goto err;
     }
@@ -388,7 +460,7 @@ PyArray_BroadcastToShape(PyObject *obj, npy_intp *dims, int nd)
     if (!compat) {
         goto err;
     }
-    it = (PyArrayIterObject *)_pya_malloc(sizeof(PyArrayIterObject));
+    it = (PyArrayIterObject *)PyArray_malloc(sizeof(PyArrayIterObject));
     PyObject_Init((PyObject *)it, &PyArrayIter_Type);
 
     if (it == NULL) {
@@ -769,18 +841,18 @@ iter_subscript(PyArrayIterObject *self, PyObject *ind)
 
     /* Check for Integer or Slice */
     if (PyLong_Check(ind) || PyInt_Check(ind) || PySlice_Check(ind)) {
-        start = parse_subindex(ind, &step_size, &n_steps,
+        start = parse_index_entry(ind, &step_size, &n_steps,
                                self->size);
         if (start == -1) {
             goto fail;
         }
-        if (n_steps == RubberIndex || n_steps == PseudoIndex) {
+        if (n_steps == ELLIPSIS_INDEX || n_steps == NEWAXIS_INDEX) {
             PyErr_SetString(PyExc_IndexError,
                             "cannot use Ellipsis or newaxes here");
             goto fail;
         }
         PyArray_ITER_GOTO1D(self, start)
-            if (n_steps == SingleIndex) { /* Integer */
+            if (n_steps == SINGLE_INDEX) { /* Integer */
                 PyObject *tmp;
                 tmp = PyArray_ToScalar(self->dataptr, self->ao);
                 PyArray_ITER_RESET(self);
@@ -1041,17 +1113,17 @@ iter_ass_subscript(PyArrayIterObject *self, PyObject *ind, PyObject *val)
 
     /* Check Slice */
     if (PySlice_Check(ind)) {
-        start = parse_subindex(ind, &step_size, &n_steps, self->size);
+        start = parse_index_entry(ind, &step_size, &n_steps, self->size);
         if (start == -1) {
             goto finish;
         }
-        if (n_steps == RubberIndex || n_steps == PseudoIndex) {
+        if (n_steps == ELLIPSIS_INDEX || n_steps == NEWAXIS_INDEX) {
             PyErr_SetString(PyExc_IndexError,
                             "cannot use Ellipsis or newaxes here");
             goto finish;
         }
         PyArray_ITER_GOTO1D(self, start);
-        if (n_steps == SingleIndex) {
+        if (n_steps == SINGLE_INDEX) {
             /* Integer */
             copyswap(self->dataptr, PyArray_DATA(arrval), swap, arrval);
             PyArray_ITER_RESET(self);
@@ -1188,7 +1260,7 @@ iter_array(PyArrayIterObject *it, PyObject *NPY_UNUSED(op))
          * the chain of bases.
          */
         Py_INCREF(it->ao);
-        ((PyArrayObject_fieldaccess *)ret)->base = (PyObject *)it->ao;
+        ((PyArrayObject_fields *)ret)->base = (PyObject *)it->ao;
         PyArray_ENABLEFLAGS(ret, NPY_ARRAY_UPDATEIFCOPY);
         PyArray_CLEARFLAGS(it->ao, NPY_ARRAY_WRITEABLE);
     }
@@ -1452,7 +1524,7 @@ PyArray_MultiIterFromObjects(PyObject **mps, int n, int nadd, ...)
                      "array objects (inclusive).", NPY_MAXARGS);
         return NULL;
     }
-    multi = _pya_malloc(sizeof(PyArrayMultiIterObject));
+    multi = PyArray_malloc(sizeof(PyArrayMultiIterObject));
     if (multi == NULL) {
         return PyErr_NoMemory();
     }
@@ -1517,7 +1589,7 @@ PyArray_MultiIterNew(int n, ...)
 
     /* fprintf(stderr, "multi new...");*/
 
-    multi = _pya_malloc(sizeof(PyArrayMultiIterObject));
+    multi = PyArray_malloc(sizeof(PyArrayMultiIterObject));
     if (multi == NULL) {
         return PyErr_NoMemory();
     }
@@ -1580,7 +1652,7 @@ arraymultiter_new(PyTypeObject *NPY_UNUSED(subtype), PyObject *args, PyObject *k
         return NULL;
     }
 
-    multi = _pya_malloc(sizeof(PyArrayMultiIterObject));
+    multi = PyArray_malloc(sizeof(PyArrayMultiIterObject));
     if (multi == NULL) {
         return PyErr_NoMemory();
     }
@@ -1840,7 +1912,7 @@ static char* _set_constant(PyArrayNeighborhoodIterObject* iter,
         storeflags = PyArray_FLAGS(ar->ao);
         PyArray_ENABLEFLAGS(ar->ao, NPY_ARRAY_BEHAVED);
         st = PyArray_DESCR(ar->ao)->f->setitem((PyObject*)fill, ret, ar->ao);
-        ((PyArrayObject_fieldaccess *)ar->ao)->flags = storeflags;
+        ((PyArrayObject_fields *)ar->ao)->flags = storeflags;
 
         if (st < 0) {
             PyDataMem_FREE(ret);
@@ -1977,7 +2049,7 @@ PyArray_NeighborhoodIterNew(PyArrayIterObject *x, npy_intp *bounds,
     int i;
     PyArrayNeighborhoodIterObject *ret;
 
-    ret = _pya_malloc(sizeof(*ret));
+    ret = PyArray_malloc(sizeof(*ret));
     if (ret == NULL) {
         return NULL;
     }
diff --git a/numpy/core/src/multiarray/iterators.h b/numpy/core/src/multiarray/iterators.h
index 3099425c5..e877f8520 100644
--- a/numpy/core/src/multiarray/iterators.h
+++ b/numpy/core/src/multiarray/iterators.h
@@ -1,12 +1,19 @@
 #ifndef _NPY_ARRAYITERATORS_H_
 #define _NPY_ARRAYITERATORS_H_
 
-NPY_NO_EXPORT intp
-parse_subindex(PyObject *op, intp *step_size, intp *n_steps, intp max);
-
+/*
+ * Parses an index that has no fancy indexing. Populates
+ * out_dimensions, out_strides, and out_offset. If out_maskna_strides
+ * and out_maskna_offset aren't NULL, then 'self' must have an NA mask
+ * which is used to populate those variables as well.
+ */
 NPY_NO_EXPORT int
 parse_index(PyArrayObject *self, PyObject *op,
-            intp *dimensions, intp *strides, intp *offset_ptr);
+            npy_intp *out_dimensions,
+            npy_intp *out_strides,
+            npy_intp *out_offset,
+            npy_intp *out_maskna_strides,
+            npy_intp *out_maskna_offset);
 
 NPY_NO_EXPORT PyObject
 *iter_subscript(PyArrayIterObject *, PyObject *);
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index 4476e0a35..ec2173daf 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -188,9 +188,13 @@ static void
                         npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                         NpyAuxData *NPY_UNUSED(data))
 {
-#if @elsize@ != 16
-    @type@ temp = @swap@@elsize@(*((@type@ *)src));
+#if @elsize@ == 1 && @dst_contig@
+    memset(dst, *src, N);
 #else
+
+#  if @elsize@ != 16
+    @type@ temp = @swap@@elsize@(*((@type@ *)src));
+#  else
     npy_uint64 temp0, temp1;
 #    if @is_swap@ == 0
         temp0 = (*((npy_uint64 *)src));
@@ -202,24 +206,25 @@ static void
         temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
         temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
 #    endif
-#endif
+#  endif
 
     while (N > 0) {
-#if @elsize@ != 16
+#  if @elsize@ != 16
         *((@type@ *)dst) = temp;
-#else
+#  else
         *((npy_uint64 *)dst) = temp0;
         *((npy_uint64 *)dst + 1) = temp1;
-#endif
-#if @dst_contig@
+#  endif
+#  if @dst_contig@
         dst += @elsize@;
-#else
+#  else
         dst += dst_stride;
-#endif
+#  endif
         --N;
     }
+#endif/* @elsize@ == 1 && @dst_contig@ -- else */
 }
-#endif
+#endif/* (@src_contig@ == 0) && @is_aligned@ */
 
 #endif/* @elsize@ >= @minelsize@ */
 
@@ -311,7 +316,7 @@ _contig_to_contig(char *dst, npy_intp NPY_UNUSED(dst_stride),
 }
 
 
-NPY_NO_EXPORT PyArray_StridedTransferFn *
+NPY_NO_EXPORT PyArray_StridedUnaryOp *
 PyArray_GetStridedCopyFn(int aligned, npy_intp src_stride,
                          npy_intp dst_stride, npy_intp itemsize)
 {
@@ -466,7 +471,7 @@ PyArray_GetStridedCopyFn(int aligned, npy_intp src_stride,
  * #not_pair = 1, 0#
  */
 
-NPY_NO_EXPORT PyArray_StridedTransferFn *
+NPY_NO_EXPORT PyArray_StridedUnaryOp *
 @function@(int aligned, npy_intp src_stride,
                              npy_intp dst_stride, npy_intp itemsize)
 {
@@ -848,7 +853,7 @@ static void
 
 /**end repeat**/
 
-NPY_NO_EXPORT PyArray_StridedTransferFn *
+NPY_NO_EXPORT PyArray_StridedUnaryOp *
 PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
                              npy_intp dst_stride,
                              int src_type_num, int dst_type_num)
@@ -929,7 +934,7 @@ PyArray_TransferNDimToStrided(npy_intp ndim,
                 npy_intp *coords, npy_intp coords_inc,
                 npy_intp *shape, npy_intp shape_inc,
                 npy_intp count, npy_intp src_itemsize,
-                PyArray_StridedTransferFn *stransfer,
+                PyArray_StridedUnaryOp *stransfer,
                 NpyAuxData *data)
 {
     npy_intp i, M, N, coord0, shape0, src_stride0, coord1, shape1, src_stride1;
@@ -1048,7 +1053,7 @@ PyArray_TransferStridedToNDim(npy_intp ndim,
                 npy_intp *coords, npy_intp coords_inc,
                 npy_intp *shape, npy_intp shape_inc,
                 npy_intp count, npy_intp src_itemsize,
-                PyArray_StridedTransferFn *stransfer,
+                PyArray_StridedUnaryOp *stransfer,
                 NpyAuxData *data)
 {
     npy_intp i, M, N, coord0, shape0, dst_stride0, coord1, shape1, dst_stride1;
@@ -1168,7 +1173,7 @@ PyArray_TransferMaskedStridedToNDim(npy_intp ndim,
                 npy_intp *coords, npy_intp coords_inc,
                 npy_intp *shape, npy_intp shape_inc,
                 npy_intp count, npy_intp src_itemsize,
-                PyArray_MaskedStridedTransferFn *stransfer,
+                PyArray_MaskedStridedUnaryOp *stransfer,
                 NpyAuxData *data)
 {
     npy_intp i, M, N, coord0, shape0, dst_stride0, coord1, shape1, dst_stride1;
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 5cd6531d4..258123e93 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -15,6 +15,9 @@
 #include "common.h"
 #include "iterators.h"
 #include "mapping.h"
+#include "na_object.h"
+#include "lowlevel_strided_loops.h"
+#include "item_selection.h"
 
 #define SOBJ_NOTFANCY 0
 #define SOBJ_ISFANCY 1
@@ -41,66 +44,131 @@ array_length(PyArrayObject *self)
 }
 
 NPY_NO_EXPORT PyObject *
-array_big_item(PyArrayObject *self, intp i)
+array_big_item(PyArrayObject *self, npy_intp i)
 {
     char *item;
-    PyArrayObject *r;
+    PyArrayObject *ret;
+    npy_intp dim0;
 
     if(PyArray_NDIM(self) == 0) {
         PyErr_SetString(PyExc_IndexError,
                         "0-d arrays can't be indexed");
         return NULL;
     }
-    if ((item = index2ptr(self, i)) == NULL) {
+
+    /* Bounds check and get the data pointer */
+    dim0 = PyArray_DIM(self, 0);
+    if (i < 0) {
+        i += dim0;
+    }
+    if (i < 0 || i >= dim0) {
+        PyErr_SetString(PyExc_IndexError,"index out of bounds");
         return NULL;
     }
+    item = PyArray_DATA(self) + i * PyArray_STRIDE(self, 0);
+
+    /* Create the view array */
     Py_INCREF(PyArray_DESCR(self));
-    r = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(self),
+    ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(self),
                                               PyArray_DESCR(self),
                                               PyArray_NDIM(self)-1,
                                               PyArray_DIMS(self)+1,
                                               PyArray_STRIDES(self)+1, item,
-                                              PyArray_FLAGS(self),
+              PyArray_FLAGS(self) & ~(NPY_ARRAY_MASKNA | NPY_ARRAY_OWNMASKNA),
                                               (PyObject *)self);
-    if (r == NULL) {
+    if (ret == NULL) {
         return NULL;
     }
+
+    /* Take a view of the NA mask if it exists */
+    if (PyArray_HASMASKNA(self)) {
+        PyArrayObject_fields *fa = (PyArrayObject_fields *)ret;
+
+        fa->maskna_dtype = PyArray_MASKNA_DTYPE(self);
+        Py_INCREF(fa->maskna_dtype);
+        fa->maskna_data = PyArray_MASKNA_DATA(self) +
+                          i * PyArray_MASKNA_STRIDES(self)[0];
+        if (fa->nd > 0) {
+            memcpy(fa->maskna_strides, PyArray_MASKNA_STRIDES(self)+1,
+                                        fa->nd * sizeof(npy_intp));
+        }
+        fa->flags |= NPY_ARRAY_MASKNA;
+    }
+
+    /* Set the base object */
     Py_INCREF(self);
-    if (PyArray_SetBaseObject(r, (PyObject *)self) < 0) {
-        Py_DECREF(r);
+    if (PyArray_SetBaseObject(ret, (PyObject *)self) < 0) {
+        Py_DECREF(ret);
         return NULL;
     }
-    PyArray_UpdateFlags(r, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS);
-    return (PyObject *)r;
+
+    PyArray_UpdateFlags(ret, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS);
+    return (PyObject *)ret;
 }
 
 NPY_NO_EXPORT int
 _array_ass_item(PyArrayObject *self, Py_ssize_t i, PyObject *v)
 {
-    return array_ass_big_item(self, (intp) i, v);
+    return array_ass_big_item(self, (npy_intp) i, v);
 }
+
 /* contains optimization for 1-d arrays */
 NPY_NO_EXPORT PyObject *
 array_item_nice(PyArrayObject *self, Py_ssize_t i)
 {
     if (PyArray_NDIM(self) == 1) {
         char *item;
-        if ((item = index2ptr(self, i)) == NULL) {
+        npy_intp dim0;
+
+        /* Bounds check and get the data pointer */
+        dim0 = PyArray_DIM(self, 0);
+        if (i < 0) {
+            i += dim0;
+        }
+        if (i < 0 || i >= dim0) {
+            PyErr_SetString(PyExc_IndexError,"index out of bounds");
             return NULL;
         }
+        item = PyArray_DATA(self) + i * PyArray_STRIDE(self, 0);
+
+        if (PyArray_HASMASKNA(self)) {
+            npy_mask maskvalue;
+
+            maskvalue = (npy_mask)*(PyArray_MASKNA_DATA(self) +
+                                    i * PyArray_MASKNA_STRIDES(self)[0]);
+            if (!NpyMaskValue_IsExposed(maskvalue)) {
+                NpyNA_fields *fna;
+
+                fna = (NpyNA_fields *)NpyNA_Type.tp_new(&NpyNA_Type, NULL, NULL);
+                if (fna == NULL) {
+                    return NULL;
+                }
+
+                fna->dtype = PyArray_DESCR(self);
+                Py_INCREF(fna->dtype);
+
+                if (PyArray_MASKNA_DTYPE(self)->type_num == NPY_MASK) {
+                    fna->payload = NpyMaskValue_GetPayload(maskvalue);
+                }
+
+                return (PyObject *)fna;
+            }
+        }
+
         return PyArray_Scalar(item, PyArray_DESCR(self), (PyObject *)self);
     }
     else {
         return PyArray_Return(
-                (PyArrayObject *) array_big_item(self, (intp) i));
+                (PyArrayObject *) array_big_item(self, (npy_intp) i));
     }
 }
 
 NPY_NO_EXPORT int
-array_ass_big_item(PyArrayObject *self, intp i, PyObject *v)
+array_ass_big_item(PyArrayObject *self, npy_intp i, PyObject *v)
 {
     PyArrayObject *tmp;
     char *item;
+    npy_intp dim0;
     int ret;
 
     if (v == NULL) {
@@ -108,11 +176,13 @@ array_ass_big_item(PyArrayObject *self, intp i, PyObject *v)
                         "can't delete array elements");
         return -1;
     }
+
     if (!PyArray_ISWRITEABLE(self)) {
         PyErr_SetString(PyExc_RuntimeError,
                         "array is not writeable");
         return -1;
     }
+
     if (PyArray_NDIM(self) == 0) {
         PyErr_SetString(PyExc_IndexError,
                         "0-d arrays can't be indexed.");
@@ -120,8 +190,10 @@ array_ass_big_item(PyArrayObject *self, intp i, PyObject *v)
     }
 
 
-    if (PyArray_NDIM(self) > 1) {
-        if((tmp = (PyArrayObject *)array_big_item(self, i)) == NULL) {
+    /* For multi-dimensional arrays and NA masked arrays, use CopyObject */
+    if (PyArray_NDIM(self) > 1 || PyArray_HASMASKNA(self)) {
+        tmp = (PyArrayObject *)array_big_item(self, i);
+        if(tmp == NULL) {
             return -1;
         }
         ret = PyArray_CopyObject(tmp, v);
@@ -129,13 +201,18 @@ array_ass_big_item(PyArrayObject *self, intp i, PyObject *v)
         return ret;
     }
 
-    if ((item = index2ptr(self, i)) == NULL) {
-        return -1;
+    /* Bounds check and get the data pointer */
+    dim0 = PyArray_DIM(self, 0);
+    if (i < 0) {
+        i += dim0;
     }
-    if (PyArray_DESCR(self)->f->setitem(v, item, self) == -1) {
+    if (i < 0 || i >= dim0) {
+        PyErr_SetString(PyExc_IndexError,"index out of bounds");
         return -1;
     }
-    return 0;
+    item = PyArray_DATA(self) + i * PyArray_STRIDE(self, 0);
+
+    return PyArray_DESCR(self)->f->setitem(v, item, self);
 }
 
 /* -------------------------------------------------------------- */
@@ -147,7 +224,7 @@ _swap_axes(PyArrayMapIterObject *mit, PyArrayObject **ret, int getmap)
     int n1, n2, n3, val, bnd;
     int i;
     PyArray_Dims permute;
-    intp d[MAX_DIMS];
+    npy_intp d[NPY_MAXDIMS];
     PyArrayObject *arr;
 
     permute.ptr = d;
@@ -379,7 +456,7 @@ count_new_axes_0d(PyObject *tuple)
                         " as an index");
         return -1;
     }
-    if (newaxis_count > MAX_DIMS) {
+    if (newaxis_count > NPY_MAXDIMS) {
         PyErr_SetString(PyExc_IndexError, "too many dimensions");
         return -1;
     }
@@ -389,29 +466,49 @@ count_new_axes_0d(PyObject *tuple)
 NPY_NO_EXPORT PyObject *
 add_new_axes_0d(PyArrayObject *arr,  int newaxis_count)
 {
-    PyArrayObject *other;
-    intp dimensions[MAX_DIMS];
+    PyArrayObject *ret;
+    npy_intp dimensions[NPY_MAXDIMS];
     int i;
 
     for (i = 0; i < newaxis_count; ++i) {
         dimensions[i]  = 1;
     }
     Py_INCREF(PyArray_DESCR(arr));
-    other = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(arr),
+    ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(arr),
                                 PyArray_DESCR(arr),
                                 newaxis_count, dimensions,
                                 NULL, PyArray_DATA(arr),
-                                PyArray_FLAGS(arr),
+            PyArray_FLAGS(arr) & ~(NPY_ARRAY_MASKNA | NPY_ARRAY_OWNMASKNA),
                                 (PyObject *)arr);
-    if (other == NULL) {
+    if (ret == NULL) {
         return NULL;
     }
+
     Py_INCREF(arr);
-    if (PyArray_SetBaseObject(other, (PyObject *)arr) < 0) {
-        Py_DECREF(other);
+    if (PyArray_SetBaseObject(ret, (PyObject *)arr) < 0) {
+        Py_DECREF(ret);
         return NULL;
     }
-    return (PyObject *)other;
+
+    /* Take a view of the NA mask if it exists */
+    if (PyArray_HASMASKNA(arr)) {
+        PyArrayObject_fields *fret = (PyArrayObject_fields *)ret;
+
+        fret->maskna_dtype = PyArray_MASKNA_DTYPE(arr);
+        Py_INCREF(fret->maskna_dtype);
+
+        fret->maskna_data = PyArray_MASKNA_DATA(arr);
+
+        for (i = 0; i < newaxis_count; ++i) {
+            fret->maskna_strides[i]  = fret->maskna_dtype->elsize;
+        }
+
+        /* This view doesn't own the mask */
+        fret->flags |= NPY_ARRAY_MASKNA;
+        fret->flags &= ~NPY_ARRAY_OWNMASKNA;
+    }
+
+    return (PyObject *)ret;
 }
 
 
@@ -421,19 +518,19 @@ static int
 fancy_indexing_check(PyObject *args)
 {
     int i, n;
-    PyObject *obj;
     int retval = SOBJ_NOTFANCY;
 
     if (PyTuple_Check(args)) {
         n = PyTuple_GET_SIZE(args);
-        if (n >= MAX_DIMS) {
+        if (n >= NPY_MAXDIMS) {
             return SOBJ_TOOMANY;
         }
         for (i = 0; i < n; i++) {
-            obj = PyTuple_GET_ITEM(args,i);
+            PyObject *obj = PyTuple_GET_ITEM(args,i);
             if (PyArray_Check(obj)) {
-                if (PyArray_ISINTEGER((PyArrayObject *)obj) ||
-                    PyArray_ISBOOL((PyArrayObject *)obj)) {
+                int type_num = PyArray_DESCR((PyArrayObject *)obj)->type_num;
+                if (PyTypeNum_ISINTEGER(type_num) ||
+                                        PyTypeNum_ISBOOL(type_num)) {
                     retval = SOBJ_ISFANCY;
                 }
                 else {
@@ -447,8 +544,8 @@ fancy_indexing_check(PyObject *args)
         }
     }
     else if (PyArray_Check(args)) {
-        if ((PyArray_TYPE((PyArrayObject *)args)==NPY_BOOL) ||
-            (PyArray_ISINTEGER((PyArrayObject *)args))) {
+        int type_num = PyArray_DESCR((PyArrayObject *)args)->type_num;
+        if (PyTypeNum_ISINTEGER(type_num) || PyTypeNum_ISBOOL(type_num)) {
             return SOBJ_ISFANCY;
         }
         else {
@@ -457,24 +554,25 @@ fancy_indexing_check(PyObject *args)
     }
     else if (PySequence_Check(args)) {
         /*
-         * Sequences < MAX_DIMS with any slice objects
+         * Sequences < NPY_MAXDIMS with any slice objects
          * or newaxis, or Ellipsis is considered standard
          * as long as there are also no Arrays and or additional
          * sequences embedded.
          */
         retval = SOBJ_ISFANCY;
         n = PySequence_Size(args);
-        if (n < 0 || n >= MAX_DIMS) {
+        if (n < 0 || n >= NPY_MAXDIMS) {
             return SOBJ_ISFANCY;
         }
         for (i = 0; i < n; i++) {
-            obj = PySequence_GetItem(args, i);
+            PyObject *obj = PySequence_GetItem(args, i);
             if (obj == NULL) {
                 return SOBJ_ISFANCY;
             }
             if (PyArray_Check(obj)) {
-                if (PyArray_ISINTEGER((PyArrayObject *)obj) ||
-                                    PyArray_ISBOOL((PyArrayObject *)obj)) {
+                int type_num = PyArray_DESCR((PyArrayObject *)obj)->type_num;
+                if (PyTypeNum_ISINTEGER(type_num) ||
+                                            PyTypeNum_ISBOOL(type_num)) {
                     retval = SOBJ_LISTTUP;
                 }
                 else {
@@ -485,7 +583,7 @@ fancy_indexing_check(PyObject *args)
                 retval = SOBJ_LISTTUP;
             }
             else if (PySlice_Check(obj) || obj == Py_Ellipsis ||
-                    obj == Py_None) {
+                                                    obj == Py_None) {
                 retval = SOBJ_NOTFANCY;
             }
             Py_DECREF(obj);
@@ -514,13 +612,33 @@ fancy_indexing_check(PyObject *args)
 NPY_NO_EXPORT PyObject *
 array_subscript_simple(PyArrayObject *self, PyObject *op)
 {
-    npy_intp dimensions[MAX_DIMS], strides[MAX_DIMS];
-    npy_intp offset;
+    npy_intp dimensions[NPY_MAXDIMS], strides[NPY_MAXDIMS];
+    npy_intp maskna_strides[NPY_MAXDIMS];
+    npy_intp offset, maskna_offset;
     int nd;
-    PyArrayObject *other;
+    PyArrayObject *ret;
     npy_intp value;
 
+    /*
+     * PyNumber_Index was introduced in Python 2.5 because of NumPy.
+     * http://www.python.org/dev/peps/pep-0357/
+     * Let's use it for indexing!
+     *
+     * Unfortunately, SciPy and possibly other code seems to rely
+     * on the lenient coercion. :(
+     */
+#if 0 /*PY_VERSION_HEX >= 0x02050000*/
+    PyObject *ind = PyNumber_Index(op);
+    if (ind != NULL) {
+        value = PyArray_PyIntAsIntp(ind);
+        Py_DECREF(ind);
+    }
+    else {
+        value = -1;
+    }
+#else
     value = PyArray_PyIntAsIntp(op);
+#endif
     if (value == -1 && PyErr_Occurred()) {
         PyErr_Clear();
     }
@@ -529,30 +647,610 @@ array_subscript_simple(PyArrayObject *self, PyObject *op)
     }
 
     /* Standard (view-based) Indexing */
-    nd = parse_index(self, op, dimensions, strides, &offset);
+    if (PyArray_HASMASKNA(self)) {
+        nd = parse_index(self, op, dimensions,
+                        strides, &offset, maskna_strides, &maskna_offset);
+    }
+    else {
+        nd = parse_index(self, op, dimensions,
+                        strides, &offset, NULL, NULL);
+    }
     if (nd == -1) {
         return NULL;
     }
 
-    /* This will only work if new array will be a view */
+    /* Create a view using the indexing result */
     Py_INCREF(PyArray_DESCR(self));
-    other = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(self),
+    ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(self),
                                 PyArray_DESCR(self),
                                 nd, dimensions,
-                                strides, PyArray_DATA(self)+offset,
-                                PyArray_FLAGS(self),
+                                strides, PyArray_DATA(self) + offset,
+            PyArray_FLAGS(self) & ~(NPY_ARRAY_MASKNA | NPY_ARRAY_OWNMASKNA),
                                 (PyObject *)self);
-    if (other == NULL) {
+    if (ret == NULL) {
         return NULL;
     }
     Py_INCREF(self);
-    if (PyArray_SetBaseObject(other, (PyObject *)self) < 0) {
-        Py_DECREF(other);
+    if (PyArray_SetBaseObject(ret, (PyObject *)self) < 0) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArray_UpdateFlags(ret, NPY_ARRAY_UPDATE_ALL);
+
+    /* Take a view of the NA mask if it exists */
+    if (PyArray_HASMASKNA(self)) {
+        PyArrayObject_fields *fret = (PyArrayObject_fields *)ret;
+
+        fret->maskna_dtype = PyArray_MASKNA_DTYPE(self);
+        Py_INCREF(fret->maskna_dtype);
+
+        fret->maskna_data = PyArray_MASKNA_DATA(self) + maskna_offset;
+
+        if (nd > 0) {
+            memcpy(fret->maskna_strides, maskna_strides,
+                                            nd * sizeof(npy_intp));
+        }
+
+        /* This view doesn't own the mask */
+        fret->flags |= NPY_ARRAY_MASKNA;
+        fret->flags &= ~NPY_ARRAY_OWNMASKNA;
+    }
+
+    return (PyObject *)ret;
+}
+
+/*
+ * Implements boolean indexing. This produces a one-dimensional
+ * array which picks out all of the elements of 'self' for which
+ * the corresponding element of 'bmask' is True.
+ *
+ * This operation is somewhat unfortunate, because to produce
+ * a one-dimensional output array, it has to choose a particular
+ * iteration order, in the case of NumPy that is always C order even
+ * though this function allows different choices.
+ */
+NPY_NO_EXPORT PyArrayObject *
+array_boolean_subscript(PyArrayObject *self,
+                    PyArrayObject *bmask, NPY_ORDER order)
+{
+    npy_intp size, itemsize;
+    char *ret_data, *ret_maskna_data = NULL;
+    PyArray_Descr *dtype;
+    PyArrayObject *ret;
+    int self_has_maskna = PyArray_HASMASKNA(self), needs_api = 0, containsna;
+    npy_intp bmask_size;
+
+    if (PyArray_DESCR(bmask)->type_num != NPY_BOOL) {
+        PyErr_SetString(PyExc_TypeError,
+                "NumPy boolean array indexing requires a boolean index");
+        return NULL;
+    }
+
+    /*
+     * See the Boolean Indexing section of the missing data NEP.
+     */
+    containsna = PyArray_ContainsNA(bmask, NULL, NULL);
+    if (containsna == -1) {
+        return NULL;
+    }
+    else if (containsna) {
+        PyErr_SetString(PyExc_ValueError,
+                "The boolean mask indexing array "
+                "may not contain any NA values");
+        return NULL;
+    }
+
+    if (PyArray_NDIM(bmask) != PyArray_NDIM(self)) {
+        PyErr_SetString(PyExc_ValueError,
+                "The boolean mask indexing array "
+                "must have the same number of dimensions as "
+                "the array being indexed");
+        return NULL;
+    }
+
+
+    /*
+     * Since we've checked that the mask contains no NAs, we
+     * can do a straightforward count of the boolean True values
+     * in the raw mask data array.
+     */
+    size = count_boolean_trues(PyArray_NDIM(bmask), PyArray_DATA(bmask),
+                                PyArray_DIMS(bmask), PyArray_STRIDES(bmask));
+    /* Correction factor for broadcasting 'bmask' to 'self' */
+    bmask_size = PyArray_SIZE(bmask);
+    if (bmask_size > 0) {
+        size *= PyArray_SIZE(self) / bmask_size;
+    }
+
+    /* Allocate the output of the boolean indexing */
+    dtype = PyArray_DESCR(self);
+    Py_INCREF(dtype);
+    ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(self), dtype, 1, &size,
+                                NULL, NULL, 0, (PyObject *)self);
+    if (ret == NULL) {
         return NULL;
     }
-    PyArray_UpdateFlags(other, NPY_ARRAY_UPDATE_ALL);
 
-    return (PyObject *)other;
+    /* Allocate an NA mask for ret if required */
+    if (self_has_maskna) {
+        if (PyArray_AllocateMaskNA(ret, 1, 0, 1) < 0) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+        ret_maskna_data = PyArray_MASKNA_DATA(ret);
+    }
+
+    itemsize = dtype->elsize;
+    ret_data = PyArray_DATA(ret);
+
+    /* Create an iterator for the data */
+    if (size > 0) {
+        NpyIter *iter;
+        PyArrayObject *op[2] = {self, bmask};
+        npy_uint32 flags, op_flags[2];
+        npy_intp fixed_strides[3];
+        PyArray_StridedUnaryOp *stransfer = NULL;
+        NpyAuxData *transferdata = NULL;
+
+        NpyIter_IterNextFunc *iternext;
+        npy_intp innersize, *innerstrides;
+        char **dataptrs;
+
+        /* Set up the iterator */
+        flags = NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK;
+        if (self_has_maskna) {
+            op_flags[0] = NPY_ITER_READONLY |
+                          NPY_ITER_NO_BROADCAST |
+                          NPY_ITER_USE_MASKNA;
+        }
+        else {
+            op_flags[0] = NPY_ITER_READONLY |
+                          NPY_ITER_NO_BROADCAST;
+        }
+        /*
+         * Since we already checked PyArray_ContainsNA(bmask), can
+         * ignore any MASKNA of bmask.
+         */
+        op_flags[1] = NPY_ITER_READONLY | NPY_ITER_IGNORE_MASKNA;
+
+        iter = NpyIter_MultiNew(2, op, flags, order, NPY_NO_CASTING,
+                                op_flags, NULL);
+        if (iter == NULL) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+
+        /* Get a dtype transfer function */
+        NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+        if (PyArray_GetDTypeTransferFunction(PyArray_ISALIGNED(self),
+                        fixed_strides[0], itemsize,
+                        dtype, dtype,
+                        0,
+                        &stransfer, &transferdata,
+                        &needs_api) != NPY_SUCCEED) {
+            Py_DECREF(ret);
+            NpyIter_Deallocate(iter);
+            return NULL;
+        }
+
+        /* Get the values needed for the inner loop */
+        iternext = NpyIter_GetIterNext(iter, NULL);
+        if (iternext == NULL) {
+            Py_DECREF(ret);
+            NpyIter_Deallocate(iter);
+            NPY_AUXDATA_FREE(transferdata);
+            return NULL;
+        }
+        innerstrides = NpyIter_GetInnerStrideArray(iter);
+        dataptrs = NpyIter_GetDataPtrArray(iter);
+
+        /* Regular inner loop */
+        if (!self_has_maskna) {
+            npy_intp self_stride = innerstrides[0];
+            npy_intp bmask_stride = innerstrides[1];
+            npy_intp subloopsize;
+            char *self_data;
+            char *bmask_data;
+            do {
+                innersize = *NpyIter_GetInnerLoopSizePtr(iter);
+                self_data = dataptrs[0];
+                bmask_data = dataptrs[1];
+
+                while (innersize > 0) {
+                    /* Skip the run where bmask is False */
+                    subloopsize = 0;
+                    while (subloopsize < innersize && *bmask_data == 0) {
+                        ++subloopsize;
+                        bmask_data += bmask_stride;
+                    }
+                    innersize -= subloopsize;
+                    self_data += subloopsize * self_stride;
+                    /* Copy the run where bmask is True */
+                    subloopsize = 0;
+                    while (subloopsize < innersize && *bmask_data != 0) {
+                        ++subloopsize;
+                        bmask_data += bmask_stride;
+                    }
+                    stransfer(ret_data, itemsize, self_data, self_stride,
+                                subloopsize, itemsize, transferdata);
+                    innersize -= subloopsize;
+                    self_data += subloopsize * self_stride;
+                    ret_data += subloopsize * itemsize;
+                }
+            } while (iternext(iter));
+        }
+        /* NA masked inner loop */
+        else {
+            npy_intp i;
+            npy_intp self_stride = innerstrides[0];
+            npy_intp bmask_stride = innerstrides[1];
+            npy_intp maskna_stride = innerstrides[2];
+            npy_intp subloopsize;
+            char *self_data;
+            char *bmask_data;
+            char *maskna_data;
+            do {
+                innersize = *NpyIter_GetInnerLoopSizePtr(iter);
+                self_data = dataptrs[0];
+                bmask_data = dataptrs[1];
+                maskna_data = dataptrs[2];
+
+                while (innersize > 0) {
+                    /* Skip the run where bmask is False */
+                    subloopsize = 0;
+                    while (subloopsize < innersize && *bmask_data == 0) {
+                        ++subloopsize;
+                        bmask_data += bmask_stride;
+                    }
+                    innersize -= subloopsize;
+                    self_data += subloopsize * self_stride;
+                    maskna_data += subloopsize * maskna_stride;
+                    /* Copy the run where bmask is True */
+                    subloopsize = 0;
+                    while (subloopsize < innersize && *bmask_data != 0) {
+                        ++subloopsize;
+                        bmask_data += bmask_stride;
+                    }
+                    /*
+                     * Because it's a newly allocated array, we
+                     * don't have to be careful about not overwriting
+                     * NA masked values. If 'ret' were an output parameter,
+                     * we would have to avoid that.
+                     */
+                    stransfer(ret_data, itemsize, self_data, self_stride,
+                                subloopsize, itemsize, transferdata);
+                    /* Copy the mask as well */
+                    for (i = 0; i < subloopsize; ++i) {
+                        *ret_maskna_data = *maskna_data;
+                        ++ret_maskna_data;
+                        maskna_data += maskna_stride;
+                    }
+                    innersize -= subloopsize;
+                    self_data += subloopsize * self_stride;
+                    ret_data += subloopsize * itemsize;
+                }
+            } while (iternext(iter));
+        }
+
+        NpyIter_Deallocate(iter);
+        NPY_AUXDATA_FREE(transferdata);
+    }
+
+    return ret;
+}
+
+/*
+ * Implements boolean indexing assignment. This takes the one-dimensional
+ * array 'v' and assigns its values to all of the elements of 'self' for which
+ * the corresponding element of 'bmask' is True.
+ *
+ * This operation is somewhat unfortunate, because to match up with
+ * a one-dimensional output array, it has to choose a particular
+ * iteration order, in the case of NumPy that is always C order even
+ * though this function allows different choices.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+array_ass_boolean_subscript(PyArrayObject *self,
+                    PyArrayObject *bmask, PyArrayObject *v, NPY_ORDER order)
+{
+    npy_intp size, src_itemsize, v_stride, v_maskna_stride = 0;
+    char *v_data, *v_maskna_data = NULL;
+    int self_has_maskna = PyArray_HASMASKNA(self);
+    int v_has_maskna = PyArray_HASMASKNA(v);
+    int needs_api = 0, containsna;
+    npy_intp bmask_size;
+    char constant_valid_mask = 1;
+
+    if (PyArray_DESCR(bmask)->type_num != NPY_BOOL) {
+        PyErr_SetString(PyExc_TypeError,
+                "NumPy boolean array indexing assignment "
+                "requires a boolean index");
+        return -1;
+    }
+
+    if (PyArray_NDIM(v) > 1) {
+        PyErr_Format(PyExc_TypeError,
+                "NumPy boolean array indexing assignment "
+                "requires a 0 or 1-dimensional input, input "
+                "has %d dimensions", PyArray_NDIM(v));
+        return -1;
+    }
+
+    if (PyArray_NDIM(bmask) != PyArray_NDIM(self)) {
+        PyErr_SetString(PyExc_ValueError,
+                "The boolean mask assignment indexing array "
+                "must have the same number of dimensions as "
+                "the array being indexed");
+        return -1;
+    }
+
+    /* See the Boolean Indexing section of the missing data NEP */
+    containsna = PyArray_ContainsNA(bmask, NULL, NULL);
+    if (containsna == -1) {
+        return -1;
+    }
+    else if (containsna) {
+        PyErr_SetString(PyExc_ValueError,
+                "The boolean mask assignment indexing array "
+                "may not contain any NA values");
+        return -1;
+    }
+
+    /* Can't assign an NA to an array which doesn't support it */
+    if (v_has_maskna && !self_has_maskna) {
+        containsna = PyArray_ContainsNA(v, NULL, NULL);
+        if (containsna == -1) {
+            return -1;
+        }
+        else if (containsna) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Cannot assign NA to an array which "
+                    "does not support NAs");
+            return -1;
+        }
+        /* If there are no actual NAs, allow the assignment */
+        else {
+            v_has_maskna = 0;
+        }
+    }
+
+    /*
+     * Since we've checked that the mask contains no NAs, we
+     * can do a straightforward count of the boolean True values
+     * in the raw mask data array.
+     */
+    size = count_boolean_trues(PyArray_NDIM(bmask), PyArray_DATA(bmask),
+                                PyArray_DIMS(bmask), PyArray_STRIDES(bmask));
+    /* Correction factor for broadcasting 'bmask' to 'self' */
+    bmask_size = PyArray_SIZE(bmask);
+    if (bmask_size > 0) {
+        size *= PyArray_SIZE(self) / bmask_size;
+    }
+
+    /* Tweak the strides for 0-dim and broadcasting cases */
+    if (PyArray_NDIM(v) > 0 && PyArray_DIMS(v)[0] > 1) {
+        v_stride = PyArray_STRIDES(v)[0];
+        if (v_has_maskna) {
+            v_maskna_stride = PyArray_MASKNA_STRIDES(v)[0];
+        }
+        else {
+            v_maskna_stride = 0;
+        }
+
+        if (size != PyArray_DIMS(v)[0]) {
+            PyErr_Format(PyExc_ValueError,
+                    "NumPy boolean array indexing assignment "
+                    "cannot assign %d input values to "
+                    "the %d output values where the mask is true",
+                    (int)PyArray_DIMS(v)[0], (int)size);
+            return -1;
+        }
+    }
+    else {
+        v_stride = 0;
+        v_maskna_stride = 0;
+    }
+
+    src_itemsize = PyArray_DESCR(v)->elsize;
+    v_data = PyArray_DATA(v);
+    if (v_has_maskna) {
+        v_maskna_data = PyArray_MASKNA_DATA(v);
+    }
+    /* If assigning unmasked to masked, use a 0-stride all valid mask */
+    else if (self_has_maskna) {
+        v_maskna_data = &constant_valid_mask;
+    }
+
+    /* Create an iterator for the data */
+    if (size > 0) {
+        NpyIter *iter;
+        PyArrayObject *op[2] = {self, bmask};
+        npy_uint32 flags, op_flags[2];
+        npy_intp fixed_strides[3];
+
+        NpyIter_IterNextFunc *iternext;
+        npy_intp innersize, *innerstrides;
+        char **dataptrs;
+
+        /* Set up the iterator */
+        flags = NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK;
+        if (self_has_maskna) {
+            op_flags[0] = NPY_ITER_WRITEONLY |
+                          NPY_ITER_NO_BROADCAST |
+                          NPY_ITER_USE_MASKNA;
+        }
+        else {
+            op_flags[0] = NPY_ITER_WRITEONLY |
+                          NPY_ITER_NO_BROADCAST;
+        }
+        /*
+         * Since we already checked PyArray_ContainsNA(bmask), can
+         * ignore any MASKNA of bmask.
+         */
+        op_flags[1] = NPY_ITER_READONLY | NPY_ITER_IGNORE_MASKNA;
+
+        iter = NpyIter_MultiNew(2, op, flags, order, NPY_NO_CASTING,
+                                op_flags, NULL);
+        if (iter == NULL) {
+            return -1;
+        }
+
+        /* Get the values needed for the inner loop */
+        iternext = NpyIter_GetIterNext(iter, NULL);
+        if (iternext == NULL) {
+            NpyIter_Deallocate(iter);
+            return -1;
+        }
+        innerstrides = NpyIter_GetInnerStrideArray(iter);
+        dataptrs = NpyIter_GetDataPtrArray(iter);
+
+        /* Regular inner loop */
+        if (!self_has_maskna) {
+            PyArray_StridedUnaryOp *stransfer = NULL;
+            NpyAuxData *transferdata = NULL;
+            npy_intp self_stride = innerstrides[0];
+            npy_intp bmask_stride = innerstrides[1];
+            npy_intp subloopsize;
+            char *self_data;
+            char *bmask_data;
+
+            /* Get a dtype transfer function */
+            NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+            if (PyArray_GetDTypeTransferFunction(
+                            PyArray_ISALIGNED(self) && PyArray_ISALIGNED(v),
+                            v_stride, fixed_strides[0],
+                            PyArray_DESCR(v), PyArray_DESCR(self),
+                            0,
+                            &stransfer, &transferdata,
+                            &needs_api) != NPY_SUCCEED) {
+                NpyIter_Deallocate(iter);
+                return -1;
+            }
+
+            do {
+                innersize = *NpyIter_GetInnerLoopSizePtr(iter);
+                self_data = dataptrs[0];
+                bmask_data = dataptrs[1];
+
+                while (innersize > 0) {
+                    /* Skip masked values */
+                    subloopsize = 0;
+                    while (subloopsize < innersize && *bmask_data == 0) {
+                        ++subloopsize;
+                        bmask_data += bmask_stride;
+                    }
+                    innersize -= subloopsize;
+                    self_data += subloopsize * self_stride;
+                    /* Process unmasked values */
+                    subloopsize = 0;
+                    while (subloopsize < innersize && *bmask_data != 0) {
+                        ++subloopsize;
+                        bmask_data += bmask_stride;
+                    }
+                    stransfer(self_data, self_stride, v_data, v_stride,
+                                subloopsize, src_itemsize, transferdata);
+                    innersize -= subloopsize;
+                    self_data += subloopsize * self_stride;
+                    v_data += subloopsize * v_stride;
+                }
+            } while (iternext(iter));
+
+            NPY_AUXDATA_FREE(transferdata);
+        }
+        /* NA masked inner loop */
+        else {
+            PyArray_MaskedStridedUnaryOp *stransfer = NULL;
+            NpyAuxData *transferdata = NULL;
+            npy_intp i;
+            npy_intp self_stride = innerstrides[0];
+            npy_intp bmask_stride = innerstrides[1];
+            npy_intp self_maskna_stride = innerstrides[2];
+            npy_intp subloopsize;
+            PyArray_Descr *v_maskna_dtype;
+            char *self_data;
+            char *bmask_data;
+            char *self_maskna_data;
+
+            if (PyArray_HASMASKNA(v)) {
+                v_maskna_dtype = PyArray_MASKNA_DTYPE(v);
+                Py_INCREF(v_maskna_dtype);
+            }
+            else {
+                v_maskna_dtype = PyArray_DescrFromType(NPY_BOOL);
+                if (v_maskna_dtype == NULL) {
+                    NpyIter_Deallocate(iter);
+                    return -1;
+                }
+            }
+
+            /* Get a dtype transfer function */
+            NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+            if (PyArray_GetMaskedDTypeTransferFunction(
+                            PyArray_ISALIGNED(self) && PyArray_ISALIGNED(v),
+                            v_stride, fixed_strides[0], v_maskna_stride,
+                            PyArray_DESCR(v),
+                            PyArray_DESCR(self),
+                            v_maskna_dtype,
+                            0,
+                            &stransfer, &transferdata,
+                            &needs_api) != NPY_SUCCEED) {
+                Py_DECREF(v_maskna_dtype);
+                NpyIter_Deallocate(iter);
+                return -1;
+            }
+            Py_DECREF(v_maskna_dtype);
+
+            do {
+                innersize = *NpyIter_GetInnerLoopSizePtr(iter);
+                self_data = dataptrs[0];
+                bmask_data = dataptrs[1];
+                self_maskna_data = dataptrs[2];
+
+                while (innersize > 0) {
+                    /* Skip masked values */
+                    subloopsize = 0;
+                    while (subloopsize < innersize && *bmask_data == 0) {
+                        ++subloopsize;
+                        bmask_data += bmask_stride;
+                    }
+                    innersize -= subloopsize;
+                    self_data += subloopsize * self_stride;
+                    self_maskna_data += subloopsize * self_maskna_stride;
+                    /* Process unmasked values */
+                    subloopsize = 0;
+                    while (subloopsize < innersize && *bmask_data != 0) {
+                        ++subloopsize;
+                        bmask_data += bmask_stride;
+                    }
+                    /*
+                     * Because we're assigning to an existing array,
+                     * we have to be careful about not overwriting
+                     * NA masked values.
+                     */
+                    stransfer(self_data, self_stride, v_data, v_stride,
+                                (npy_mask *)v_maskna_data, v_maskna_stride,
+                                subloopsize, src_itemsize, transferdata);
+                    /* Copy the mask as well */
+                    for (i = 0; i < subloopsize; ++i) {
+                        *self_maskna_data = *v_maskna_data;
+                        self_maskna_data += self_maskna_stride;
+                        v_maskna_data += v_maskna_stride;
+                    }
+                    innersize -= subloopsize;
+                    self_data += subloopsize * self_stride;
+                    v_data += subloopsize * v_stride;
+                }
+            } while (iternext(iter));
+
+            NPY_AUXDATA_FREE(transferdata);
+        }
+
+        NpyIter_Deallocate(iter);
+    }
+
+    return 0;
 }
 
 NPY_NO_EXPORT PyObject *
@@ -639,7 +1337,8 @@ array_subscript(PyArrayObject *self, PyObject *op)
                 Py_INCREF(self);
                 return (PyObject *)self;
             }
-            if ((nd = count_new_axes_0d(op)) == -1) {
+            nd = count_new_axes_0d(op);
+            if (nd == -1) {
                 return NULL;
             }
             return add_new_axes_0d(self, nd);
@@ -652,7 +1351,7 @@ array_subscript(PyArrayObject *self, PyObject *op)
                 return (PyObject *)self;
             }
             else {
-                intp oned = 0;
+                npy_intp oned = 0;
                 Py_INCREF(PyArray_DESCR(self));
                 return PyArray_NewFromDescr(Py_TYPE(self),
                                             PyArray_DESCR(self),
@@ -666,6 +1365,13 @@ array_subscript(PyArrayObject *self, PyObject *op)
         return NULL;
     }
 
+    /* Boolean indexing special case which supports mask NA */
+    if (PyArray_Check(op) && (PyArray_TYPE((PyArrayObject *)op) == NPY_BOOL)
+                && (PyArray_NDIM(self) == PyArray_NDIM((PyArrayObject *)op))) {
+        return (PyObject *)array_boolean_subscript(self,
+                                        (PyArrayObject *)op, NPY_CORDER);
+    }
+
     fancy = fancy_indexing_check(op);
     if (fancy != SOBJ_NOTFANCY) {
         int oned;
@@ -714,7 +1420,7 @@ array_ass_sub_simple(PyArrayObject *self, PyObject *index, PyObject *op)
 {
     int ret;
     PyArrayObject *tmp;
-    intp value;
+    npy_intp value;
 
     value = PyArray_PyIntAsIntp(index);
     if (!error_converting(value)) {
@@ -751,12 +1457,7 @@ array_ass_sub_simple(PyArrayObject *self, PyObject *index, PyObject *op)
         tmp = (PyArrayObject *)tmp0;
     }
 
-    if (PyArray_ISOBJECT(self) && (PyArray_NDIM(tmp) == 0)) {
-        ret = PyArray_DESCR(tmp)->f->setitem(op, PyArray_DATA(tmp), tmp);
-    }
-    else {
-        ret = PyArray_CopyObject(tmp, op);
-    }
+    ret = PyArray_CopyObject(tmp, op);
     Py_DECREF(tmp);
     return ret;
 }
@@ -766,11 +1467,11 @@ array_ass_sub_simple(PyArrayObject *self, PyObject *index, PyObject *op)
    otherwise fill vals with converted integers
 */
 static int
-_tuple_of_integers(PyObject *seq, intp *vals, int maxvals)
+_tuple_of_integers(PyObject *seq, npy_intp *vals, int maxvals)
 {
     int i;
     PyObject *obj;
-    intp temp;
+    npy_intp temp;
 
     for(i=0; i<maxvals; i++) {
         obj = PyTuple_GET_ITEM(seq, i);
@@ -793,7 +1494,7 @@ array_ass_sub(PyArrayObject *self, PyObject *index, PyObject *op)
 {
     int ret, oned, fancy;
     PyArrayMapIterObject *mit;
-    intp vals[MAX_DIMS];
+    npy_intp vals[NPY_MAXDIMS];
 
     if (op == NULL) {
         PyErr_SetString(PyExc_ValueError,
@@ -809,7 +1510,7 @@ array_ass_sub(PyArrayObject *self, PyObject *index, PyObject *op)
     if (PyInt_Check(index) || PyArray_IsScalar(index, Integer) ||
         PyLong_Check(index) || (PyIndex_Check(index) &&
                                 !PySequence_Check(index))) {
-        intp value;
+        npy_intp value;
         value = PyArray_PyIntAsIntp(index);
         if (PyErr_Occurred()) {
             PyErr_Clear();
@@ -880,7 +1581,7 @@ array_ass_sub(PyArrayObject *self, PyObject *index, PyObject *op)
                          (PyArray_DIMS((PyArrayObject *)index)==0) &&
                          PyArray_ISBOOL((PyArrayObject *)index))) {
             if (PyObject_IsTrue(index)) {
-                return PyArray_DESCR(self)->f->setitem(op, PyArray_DATA(self), self);
+                return PyArray_CopyObject(self, op);
             }
             else { /* don't do anything */
                 return 0;
@@ -891,30 +1592,106 @@ array_ass_sub(PyArrayObject *self, PyObject *index, PyObject *op)
     }
 
     /* Integer-tuple */
-    if (PyTuple_Check(index) && (PyTuple_GET_SIZE(index) == PyArray_NDIM(self))
-        && (_tuple_of_integers(index, vals, PyArray_NDIM(self)) >= 0)) {
-        int i;
-        char *item;
-
-        for (i = 0; i < PyArray_NDIM(self); i++) {
-            if (vals[i] < 0) {
-                vals[i] += PyArray_DIMS(self)[i];
+    if (PyTuple_Check(index) &&
+                (PyTuple_GET_SIZE(index) == PyArray_NDIM(self)) &&
+                (_tuple_of_integers(index, vals, PyArray_NDIM(self)) >= 0)) {
+        int idim, ndim = PyArray_NDIM(self);
+        npy_intp *shape = PyArray_DIMS(self);
+        npy_intp *strides = PyArray_STRIDES(self);
+        char *item = PyArray_DATA(self);
+
+        if (!PyArray_HASMASKNA(self)) {
+            for (idim = 0; idim < ndim; idim++) {
+                npy_intp v = vals[idim];
+                if (v < 0) {
+                    v += shape[idim];
+                }
+                if (v < 0 || v >= shape[idim]) {
+                    PyErr_Format(PyExc_IndexError,
+                                 "index (%"INTP_FMT") out of range "\
+                                 "(0<=index<%"INTP_FMT") in dimension %d",
+                                 vals[idim], PyArray_DIMS(self)[idim], idim);
+                    return -1;
+                }
+                else {
+                    item += v * strides[idim];
+                }
             }
-            if ((vals[i] < 0) || (vals[i] >= PyArray_DIMS(self)[i])) {
-                PyErr_Format(PyExc_IndexError,
-                             "index (%"INTP_FMT") out of range "\
-                             "(0<=index<%"INTP_FMT") in dimension %d",
-                             vals[i], PyArray_DIMS(self)[i], i);
-                return -1;
+            return PyArray_DESCR(self)->f->setitem(op, item, self);
+        }
+        else {
+            char *maskna_item = PyArray_MASKNA_DATA(self);
+            npy_intp *maskna_strides = PyArray_MASKNA_STRIDES(self);
+            NpyNA *na;
+
+            for (idim = 0; idim < ndim; idim++) {
+                npy_intp v = vals[idim];
+                if (v < 0) {
+                    v += shape[idim];
+                }
+                if (v < 0 || v >= shape[idim]) {
+                    PyErr_Format(PyExc_IndexError,
+                                 "index (%"INTP_FMT") out of range "\
+                                 "(0<=index<%"INTP_FMT") in dimension %d",
+                                 vals[idim], PyArray_DIMS(self)[idim], idim);
+                    return -1;
+                }
+                else {
+                    item += v * strides[idim];
+                    maskna_item += v * maskna_strides[idim];
+                }
+            }
+            na = NpyNA_FromObject(op, 1);
+            if (na == NULL) {
+                *maskna_item = 1;
+                return PyArray_DESCR(self)->f->setitem(op, item, self);
+            }
+            else {
+                *maskna_item = NpyNA_AsMaskValue(na);
+                Py_DECREF(na);
+                return 0;
             }
         }
-        item = PyArray_GetPtr(self, vals);
-        return PyArray_DESCR(self)->f->setitem(op, item, self);
     }
     PyErr_Clear();
 
+    /* Boolean indexing special case with NA mask support */
+    if (PyArray_Check(index) &&
+                (PyArray_TYPE((PyArrayObject *)index) == NPY_BOOL) &&
+                (PyArray_NDIM(self) == PyArray_NDIM((PyArrayObject *)index))) {
+        int retcode;
+        PyArrayObject *op_arr;
+        PyArray_Descr *dtype = NULL;
+
+        /* If it's an NA with no dtype, specify the dtype explicitly */
+        if (NpyNA_Check(op) && ((NpyNA_fields *)op)->dtype == NULL) {
+            dtype = PyArray_DESCR(self);
+            Py_INCREF(dtype);
+        }
+        op_arr = (PyArrayObject *)PyArray_FromAny(op, dtype, 0, 0,
+                                                      NPY_ARRAY_ALLOWNA, NULL);
+        if (op_arr == NULL) {
+            return -1;
+        }
+
+        if (PyArray_NDIM(op_arr) < 2) {
+            retcode = array_ass_boolean_subscript(self,
+                            (PyArrayObject *)index,
+                            op_arr, NPY_CORDER);
+            Py_DECREF(op_arr);
+            return retcode;
+        }
+        /*
+         * Assigning from multi-dimensional 'op' in this case seems
+         * inconsistent, so falling through to old code for backwards
+         * compatibility.
+         */
+        Py_DECREF(op_arr);
+    }
+
     fancy = fancy_indexing_check(index);
     if (fancy != SOBJ_NOTFANCY) {
+
         oned = ((PyArray_NDIM(self) == 1) &&
                 !(PyTuple_Check(index) && PyTuple_GET_SIZE(index) > 1));
         mit = (PyArrayMapIterObject *) PyArray_MapIterNew(index, oned, fancy);
@@ -957,12 +1734,12 @@ array_subscript_nice(PyArrayObject *self, PyObject *op)
 {
 
     PyArrayObject *mp;
-    intp vals[MAX_DIMS];
+    npy_intp vals[NPY_MAXDIMS];
 
     if (PyInt_Check(op) || PyArray_IsScalar(op, Integer) ||
         PyLong_Check(op) || (PyIndex_Check(op) &&
                              !PySequence_Check(op))) {
-        intp value;
+        npy_intp value;
         value = PyArray_PyIntAsIntp(op);
         if (PyErr_Occurred()) {
             PyErr_Clear();
@@ -972,26 +1749,64 @@ array_subscript_nice(PyArrayObject *self, PyObject *op)
         }
     }
     /* optimization for a tuple of integers */
-    if (PyArray_NDIM(self) > 1 && PyTuple_Check(op) &&
-        (PyTuple_GET_SIZE(op) == PyArray_NDIM(self))
-        && (_tuple_of_integers(op, vals, PyArray_NDIM(self)) >= 0)) {
-        int i;
-        char *item;
+    if (PyArray_NDIM(self) > 1 &&
+                PyTuple_Check(op) &&
+                (PyTuple_GET_SIZE(op) == PyArray_NDIM(self)) &&
+                (_tuple_of_integers(op, vals, PyArray_NDIM(self)) >= 0)) {
+        int idim, ndim = PyArray_NDIM(self);
+        npy_intp *shape = PyArray_DIMS(self);
+        npy_intp *strides = PyArray_STRIDES(self);
+        char *item = PyArray_DATA(self);
+
+        if (!PyArray_HASMASKNA(self)) {
+            for (idim = 0; idim < ndim; idim++) {
+                npy_intp v = vals[idim];
+                if (v < 0) {
+                    v += shape[idim];
+                }
+                if (v < 0 || v >= shape[idim]) {
+                    PyErr_Format(PyExc_IndexError,
+                                 "index (%"INTP_FMT") out of range "\
+                                 "(0<=index<%"INTP_FMT") in dimension %d",
+                                 vals[idim], PyArray_DIMS(self)[idim], idim);
+                    return NULL;
+                }
+                else {
+                    item += v * strides[idim];
+                }
+            }
+            return PyArray_Scalar(item, PyArray_DESCR(self), (PyObject *)self);
+        }
+        else {
+            char *maskna_item = PyArray_MASKNA_DATA(self);
+            npy_intp *maskna_strides = PyArray_MASKNA_STRIDES(self);
 
-        for (i = 0; i < PyArray_NDIM(self); i++) {
-            if (vals[i] < 0) {
-                vals[i] += PyArray_DIMS(self)[i];
+            for (idim = 0; idim < ndim; idim++) {
+                npy_intp v = vals[idim];
+                if (v < 0) {
+                    v += shape[idim];
+                }
+                if (v < 0 || v >= shape[idim]) {
+                    PyErr_Format(PyExc_IndexError,
+                                 "index (%"INTP_FMT") out of range "\
+                                 "(0<=index<%"INTP_FMT") in dimension %d",
+                                 vals[idim], PyArray_DIMS(self)[idim], idim);
+                    return NULL;
+                }
+                else {
+                    item += v * strides[idim];
+                    maskna_item += v * maskna_strides[idim];
+                }
             }
-            if ((vals[i] < 0) || (vals[i] >= PyArray_DIMS(self)[i])) {
-                PyErr_Format(PyExc_IndexError,
-                             "index (%"INTP_FMT") out of range "\
-                             "(0<=index<%"INTP_FMT") in dimension %d",
-                             vals[i], PyArray_DIMS(self)[i], i);
-                return NULL;
+            if (NpyMaskValue_IsExposed((npy_mask)*maskna_item)) {
+                return PyArray_Scalar(item, PyArray_DESCR(self),
+                                                    (PyObject *)self);
+            }
+            else {
+                return (PyObject *)NpyNA_FromDTypeAndPayload(
+                                        PyArray_DESCR(self), 0, 0);
             }
         }
-        item = PyArray_GetPtr(self, vals);
-        return PyArray_Scalar(item, PyArray_DESCR(self), (PyObject *)self);
     }
     PyErr_Clear();
 
@@ -1002,20 +1817,22 @@ array_subscript_nice(PyArrayObject *self, PyObject *op)
      * array_subscript_simple).  So, this cast is a bit dangerous..
      */
 
-    /*
-     * The following is just a copy of PyArray_Return with an
-     * additional logic in the nd == 0 case.
-     */
-
     if (mp == NULL) {
         return NULL;
     }
+
     if (PyErr_Occurred()) {
         Py_XDECREF(mp);
         return NULL;
     }
+
+    /*
+     * The following adds some additional logic to avoid calling
+     * PyArray_Return if there is an ellipsis.
+     */
+
     if (PyArray_Check(mp) && PyArray_NDIM(mp) == 0) {
-        Bool noellipses = TRUE;
+        npy_bool noellipses = TRUE;
         if ((op == Py_Ellipsis) || PyString_Check(op) || PyUnicode_Check(op)) {
             noellipses = FALSE;
         }
@@ -1041,12 +1858,10 @@ array_subscript_nice(PyArrayObject *self, PyObject *op)
             }
         }
         if (noellipses) {
-            PyObject *ret;
-            ret = PyArray_ToScalar(PyArray_DATA(mp), mp);
-            Py_DECREF(mp);
-            return ret;
+            return PyArray_Return(mp);
         }
     }
+
     return (PyObject *)mp;
 }
 
@@ -1082,10 +1897,10 @@ _nonzero_indices(PyObject *myBool, PyArrayIterObject **iters)
     PyArray_Descr *typecode;
     PyArrayObject *ba = NULL, *new = NULL;
     int nd, j;
-    intp size, i, count;
-    Bool *ptr;
-    intp coords[MAX_DIMS], dims_m1[MAX_DIMS];
-    intp *dptr[MAX_DIMS];
+    npy_intp size, i, count;
+    npy_bool *ptr;
+    npy_intp coords[NPY_MAXDIMS], dims_m1[NPY_MAXDIMS];
+    npy_intp *dptr[NPY_MAXDIMS];
 
     typecode=PyArray_DescrFromType(NPY_BOOL);
     ba = (PyArrayObject *)PyArray_FromAny(myBool, typecode, 0, 0,
@@ -1122,7 +1937,7 @@ _nonzero_indices(PyObject *myBool, PyArrayIterObject **iters)
         if (iters[j] == NULL) {
             goto fail;
         }
-        dptr[j] = (intp *)PyArray_DATA(iters[j]->ao);
+        dptr[j] = (npy_intp *)PyArray_DATA(iters[j]->ao);
         coords[j] = 0;
         dims_m1[j] = PyArray_DIMS(ba)[j]-1;
     }
@@ -1201,7 +2016,7 @@ _convert_obj(PyObject *obj, PyArrayIterObject **iter)
 NPY_NO_EXPORT void
 PyArray_MapIterReset(PyArrayMapIterObject *mit)
 {
-    int i,j; intp coord[MAX_DIMS];
+    int i,j; npy_intp coord[NPY_MAXDIMS];
     PyArrayIterObject *it;
     PyArray_CopySwapFunc *copyswap;
 
@@ -1210,7 +2025,7 @@ PyArray_MapIterReset(PyArrayMapIterObject *mit)
     copyswap = PyArray_DESCR(mit->iters[0]->ao)->f->copyswap;
 
     if (mit->subspace != NULL) {
-        memcpy(coord, mit->bscoord, sizeof(intp)*PyArray_NDIM(mit->ait->ao));
+        memcpy(coord, mit->bscoord, sizeof(npy_intp)*PyArray_NDIM(mit->ait->ao));
         PyArray_ITER_RESET(mit->subspace);
         for (i = 0; i < mit->numiter; i++) {
             it = mit->iters[i];
@@ -1249,7 +2064,7 @@ NPY_NO_EXPORT void
 PyArray_MapIterNext(PyArrayMapIterObject *mit)
 {
     int i, j;
-    intp coord[MAX_DIMS];
+    npy_intp coord[NPY_MAXDIMS];
     PyArrayIterObject *it;
     PyArray_CopySwapFunc *copyswap;
 
@@ -1264,7 +2079,7 @@ PyArray_MapIterNext(PyArrayMapIterObject *mit)
         if (mit->subspace->index >= mit->subspace->size) {
             /* reset coord to coordinates of beginning of the subspace */
             memcpy(coord, mit->bscoord,
-                        sizeof(intp)*PyArray_NDIM(mit->ait->ao));
+                        sizeof(npy_intp)*PyArray_NDIM(mit->ait->ao));
             PyArray_ITER_RESET(mit->subspace);
             for (i = 0; i < mit->numiter; i++) {
                 it = mit->iters[i];
@@ -1314,8 +2129,8 @@ PyArray_MapIterBind(PyArrayMapIterObject *mit, PyArrayObject *arr)
     PyObject *sub, *obj = NULL;
     int i, j, n, curraxis, ellipexp, noellip;
     PyArrayIterObject *it;
-    intp dimsize;
-    intp *indptr;
+    npy_intp dimsize;
+    npy_intp *indptr;
 
     subnd = PyArray_NDIM(arr) - mit->numiter;
     if (subnd < 0) {
@@ -1387,7 +2202,7 @@ PyArray_MapIterBind(PyArrayMapIterObject *mit, PyArrayObject *arr)
     j = 0;
     /* Only expand the first ellipsis */
     noellip = 1;
-    memset(mit->bscoord, 0, sizeof(intp)*PyArray_NDIM(arr));
+    memset(mit->bscoord, 0, sizeof(npy_intp)*PyArray_NDIM(arr));
     for (i = 0; i < n; i++) {
         /*
          * We need to fill in the starting coordinates for
@@ -1402,8 +2217,8 @@ PyArray_MapIterBind(PyArrayMapIterObject *mit, PyArrayObject *arr)
             noellip = 0;
         }
         else {
-            intp start = 0;
-            intp stop, step;
+            npy_intp start = 0;
+            npy_intp stop, step;
             /* Should be slice object or another Ellipsis */
             if (obj == Py_Ellipsis) {
                 mit->bscoord[curraxis] = 0;
@@ -1441,12 +2256,12 @@ PyArray_MapIterBind(PyArrayMapIterObject *mit, PyArrayObject *arr)
     }
 
     for (i = 0; i < mit->numiter; i++) {
-        intp indval;
+        npy_intp indval;
         it = mit->iters[i];
         PyArray_ITER_RESET(it);
         dimsize = PyArray_DIMS(arr)[mit->iteraxes[i]];
         while (it->index < it->size) {
-            indptr = ((intp *)it->dataptr);
+            indptr = ((npy_intp *)it->dataptr);
             indval = *indptr;
             if (indval < 0) {
                 indval += dimsize;
@@ -1482,8 +2297,8 @@ PyArray_MapIterNew(PyObject *indexobj, int oned, int fancy)
     int i, n, started, nonindex;
 
     if (fancy == SOBJ_BADARRAY) {
-        PyErr_SetString(PyExc_IndexError,                       \
-                        "arrays used as indices must be of "    \
+        PyErr_SetString(PyExc_IndexError,
+                        "arrays used as indices must be of "
                         "integer (or boolean) type");
         return NULL;
     }
@@ -1492,12 +2307,12 @@ PyArray_MapIterNew(PyObject *indexobj, int oned, int fancy)
         return NULL;
     }
 
-    mit = (PyArrayMapIterObject *)_pya_malloc(sizeof(PyArrayMapIterObject));
+    mit = (PyArrayMapIterObject *)PyArray_malloc(sizeof(PyArrayMapIterObject));
     PyObject_Init((PyObject *)mit, &PyArrayMapIter_Type);
     if (mit == NULL) {
         return NULL;
     }
-    for (i = 0; i < MAX_DIMS; i++) {
+    for (i = 0; i < NPY_MAXDIMS; i++) {
         mit->iters[i] = NULL;
     }
     mit->index = 0;
@@ -1552,7 +2367,6 @@ PyArray_MapIterNew(PyObject *indexobj, int oned, int fancy)
             PyTuple_SET_ITEM(mit->indexobj, i, PyInt_FromLong(0));
         }
     }
-
     else if (PyArray_Check(indexobj) || !PyTuple_Check(indexobj)) {
         mit->numiter = 1;
         indtype = PyArray_DescrFromType(NPY_INTP);
@@ -1567,7 +2381,7 @@ PyArray_MapIterNew(PyObject *indexobj, int oned, int fancy)
             goto fail;
         }
         mit->nd = PyArray_NDIM(arr);
-        memcpy(mit->dimensions, PyArray_DIMS(arr), mit->nd*sizeof(intp));
+        memcpy(mit->dimensions, PyArray_DIMS(arr), mit->nd*sizeof(npy_intp));
         mit->size = PyArray_SIZE(arr);
         Py_DECREF(arr);
         Py_DECREF(mit->indexobj);
@@ -1660,7 +2474,7 @@ arraymapiter_dealloc(PyArrayMapIterObject *mit)
     for (i = 0; i < mit->numiter; i++) {
         Py_XDECREF(mit->iters[i]);
     }
-    _pya_free(mit);
+    PyArray_free(mit);
 }
 
 /*
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 28fb8f8cf..8960d8a11 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -16,9 +16,13 @@
 #include "common.h"
 #include "ctors.h"
 #include "calculation.h"
+#include "convert_datatype.h"
+#include "item_selection.h"
+#include "conversion_utils.h"
+#include "shape.h"
+#include "boolean_ops.h"
 
 #include "methods.h"
-#include "convert_datatype.h"
 
 
 /* NpyArg_ParseKeywords
@@ -46,8 +50,76 @@ NpyArg_ParseKeywords(PyObject *keys, const char *format, char **kwlist, ...)
     return ret;
 }
 
-/* Should only be used if x is known to be an nd-array */
-#define _ARET(x) PyArray_Return((PyArrayObject *)(x))
+/* Returns a new reference to numpy.core._methods.<name>, NULL on failure */
+static PyObject *
+get_forwarding_ndarray_method(const char *name)
+{
+    PyObject *module_methods, *callable;
+
+    module_methods = PyImport_ImportModule("numpy.core._methods");
+    if (module_methods == NULL) {
+        return NULL;
+    }
+    callable = PyDict_GetItemString(PyModule_GetDict(module_methods), name);
+    if (callable == NULL) {
+        PyErr_Format(PyExc_RuntimeError,
+                "NumPy internal error: could not find function "
+                "numpy.core._methods.%s", name);
+    }
+    else {
+        Py_INCREF(callable);  /* the dict lookup gave a borrowed reference */
+    }
+    Py_DECREF(module_methods);
+    return callable;
+}
+
+/*
+ * Calls 'forwarding_callable' with 'self' prepended to the positional
+ * arguments in 'args', passing 'kwds' through; returns the call's result.
+ */
+static PyObject *
+forward_ndarray_method(PyArrayObject *self, PyObject *args, PyObject *kwds,
+                            PyObject *forwarding_callable)
+{
+    PyObject *full_args, *result;
+    int idx, nargs = PyTuple_GET_SIZE(args);
+
+    /* Build the tuple (self, *args); it takes a reference to each entry */
+    full_args = PyTuple_New(nargs + 1);
+    if (full_args == NULL) {
+        return NULL;
+    }
+    Py_INCREF(self);
+    PyTuple_SET_ITEM(full_args, 0, (PyObject *)self);
+    for (idx = 1; idx <= nargs; ++idx) {
+        PyObject *arg = PyTuple_GET_ITEM(args, idx - 1);
+        Py_INCREF(arg);
+        PyTuple_SET_ITEM(full_args, idx, arg);
+    }
+
+    /* Invoke the callable, then release the temporary argument tuple */
+    result = PyObject_Call(forwarding_callable, full_args, kwds);
+    Py_DECREF(full_args);
+
+    return result;
+}
+
+/*
+ * Complete body for an ndarray method with 'self', 'args' and 'kwds' in
+ * scope: looks up numpy.core._methods.<name> on first use, caches it in a
+ * function-local static, and returns the forwarded call's result (or NULL
+ * if the lookup fails). The lazy initialization is not thread-safe, but
+ * relies on the CPython GIL to be correct.
+ */
+#define NPY_FORWARD_NDARRAY_METHOD(name) \
+        static PyObject *callable = NULL; \
+        if (callable == NULL) { \
+            callable = get_forwarding_ndarray_method(name); \
+            if (callable == NULL) { \
+                return NULL; \
+            } \
+        } \
+        return forward_ndarray_method(self, args, kwds, callable)
+
 
 static PyObject *
 array_take(PyArrayObject *self, PyObject *args, PyObject *kwds)
@@ -61,11 +133,12 @@ array_take(PyArrayObject *self, PyObject *args, PyObject *kwds)
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&O&O&", kwlist,
                                      &indices,
                                      PyArray_AxisConverter, &dimension,
-                                     PyArray_OutputConverter, &out,
+                                     PyArray_OutputAllowNAConverter, &out,
                                      PyArray_ClipmodeConverter, &mode))
         return NULL;
 
-    return _ARET(PyArray_TakeFrom(self, indices, dimension, out, mode));
+    return PyArray_Return((PyArrayObject *)
+                PyArray_TakeFrom(self, indices, dimension, out, mode));
 }
 
 static PyObject *
@@ -103,7 +176,7 @@ array_reshape(PyArrayObject *self, PyObject *args, PyObject *kwds)
     static char *keywords[] = {"order", NULL};
     PyArray_Dims newshape;
     PyObject *ret;
-    PyArray_ORDER order = PyArray_CORDER;
+    PyArray_ORDER order = NPY_CORDER;
     Py_ssize_t n = PyTuple_Size(args);
 
     if (!NpyArg_ParseKeywords(kwds, "|O&", keywords,
@@ -139,12 +212,28 @@ array_reshape(PyArrayObject *self, PyObject *args, PyObject *kwds)
 }
 
 static PyObject *
-array_squeeze(PyArrayObject *self, PyObject *args)
+array_squeeze(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
-    if (!PyArg_ParseTuple(args, "")) {
+    PyObject *axis_in = NULL;   /* stays NULL when 'axis' is not passed */
+    npy_bool axis_flags[NPY_MAXDIMS];   /* handed to PyArray_ConvertMultiAxis */
+
+    static char *kwlist[] = {"axis", NULL};
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist,
+                                     &axis_in)) {
         return NULL;
     }
-    return PyArray_Squeeze(self);
+
+    if (axis_in == NULL || axis_in == Py_None) {    /* no axis selection */
+        return PyArray_Squeeze(self);
+    }
+    else {
+        if (PyArray_ConvertMultiAxis(axis_in, PyArray_NDIM(self),
+                                            axis_flags) != NPY_SUCCEED) {
+            return NULL;
+        }
+
+        return PyArray_SqueezeSelected(self, axis_flags);
+    }
 }
 
 static PyObject *
@@ -153,12 +242,38 @@ array_view(PyArrayObject *self, PyObject *args, PyObject *kwds)
     PyObject *out_dtype = NULL;
     PyObject *out_type = NULL;
     PyArray_Descr *dtype = NULL;
+    PyObject *ret;
+    int maskna = -1, ownmaskna = 0;
+    PyObject *maskna_in = Py_None;
 
-    static char *kwlist[] = {"dtype", "type", NULL};
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OO", kwlist,
+    static char *kwlist[] = {"dtype", "type", "maskna", "ownmaskna", NULL};
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OOOi", kwlist,
                                      &out_dtype,
-                                     &out_type))
+                                     &out_type,
+                                     &maskna_in,
+                                     &ownmaskna)) {
         return NULL;
+    }
+
+    /* Treat None the same as not providing the parameter */
+    if (maskna_in != Py_None) {
+        maskna = PyObject_IsTrue(maskna_in);
+        if (maskna == -1) {
+            return NULL;
+        }
+    }
+
+    /* 'ownmaskna' forces 'maskna' to be True */
+    if (ownmaskna) {
+        if (maskna == 0) {
+            PyErr_SetString(PyExc_ValueError,
+                    "cannot specify maskna=False and ownmaskna=True");
+            return NULL;
+        }
+        else {
+            maskna = 1;
+        }
+    }
 
     /* If user specified a positional argument, guess whether it
        represents a type or a dtype for backward compatibility. */
@@ -190,7 +305,30 @@ array_view(PyArrayObject *self, PyObject *args, PyObject *kwds)
         return NULL;
     }
 
-    return PyArray_View(self, dtype, (PyTypeObject*)out_type);
+    ret = PyArray_View(self, dtype, (PyTypeObject*)out_type);
+    if (ret == NULL) {
+        return NULL;
+    }
+
+    if (maskna == 1) {
+        /* Ensure there is an NA mask if requested */
+        if (PyArray_AllocateMaskNA((PyArrayObject *)ret,
+                                        ownmaskna, 0, 1) < 0) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+        return ret;
+    }
+    else if (maskna == 0 && PyArray_HASMASKNA((PyArrayObject *)ret)) {
+        PyErr_SetString(PyExc_ValueError,
+                    "Cannot take a view of an NA-masked array "
+                    "with maskna=False");
+        Py_DECREF(ret);
+        return NULL;
+    }
+    else {
+        return ret;
+    }
 }
 
 static PyObject *
@@ -205,7 +343,7 @@ array_argmax(PyArrayObject *self, PyObject *args, PyObject *kwds)
                                      PyArray_OutputConverter, &out))
         return NULL;
 
-    return _ARET(PyArray_ArgMax(self, axis, out));
+    return PyArray_Return((PyArrayObject *)PyArray_ArgMax(self, axis, out));
 }
 
 static PyObject *
@@ -220,42 +358,23 @@ array_argmin(PyArrayObject *self, PyObject *args, PyObject *kwds)
                                      PyArray_OutputConverter, &out))
         return NULL;
 
-    return _ARET(PyArray_ArgMin(self, axis, out));
+    return PyArray_Return((PyArrayObject *)PyArray_ArgMin(self, axis, out));
 }
 
 static PyObject *
 array_max(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
-    int axis = MAX_DIMS;
-    PyArrayObject *out = NULL;
-    static char *kwlist[] = {"axis", "out", NULL};
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&", kwlist,
-                                     PyArray_AxisConverter, &axis,
-                                     PyArray_OutputConverter, &out))
-        return NULL;
-
-    return PyArray_Max(self, axis, out);
+    NPY_FORWARD_NDARRAY_METHOD("_amax");    /* numpy.core._methods._amax */
 }
 
 static PyObject *
-array_ptp(PyArrayObject *self, PyObject *args, PyObject *kwds)
+array_min(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
-    int axis = MAX_DIMS;
-    PyArrayObject *out = NULL;
-    static char *kwlist[] = {"axis", "out", NULL};
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&", kwlist,
-                                     PyArray_AxisConverter, &axis,
-                                     PyArray_OutputConverter, &out))
-        return NULL;
-
-    return PyArray_Ptp(self, axis, out);
+    NPY_FORWARD_NDARRAY_METHOD("_amin");    /* numpy.core._methods._amin */
 }
 
-
 static PyObject *
-array_min(PyArrayObject *self, PyObject *args, PyObject *kwds)
+array_ptp(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
     int axis = MAX_DIMS;
     PyArrayObject *out = NULL;
@@ -263,12 +382,13 @@ array_min(PyArrayObject *self, PyObject *args, PyObject *kwds)
 
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&", kwlist,
                                      PyArray_AxisConverter, &axis,
-                                     PyArray_OutputConverter, &out))
+                                     PyArray_OutputAllowNAConverter, &out))
         return NULL;
 
-    return PyArray_Min(self, axis, out);
+    return PyArray_Ptp(self, axis, out);
 }
 
+
 static PyObject *
 array_swapaxes(PyArrayObject *self, PyObject *args)
 {
@@ -292,7 +412,7 @@ PyArray_GetField(PyArrayObject *self, PyArray_Descr *typed, int offset)
 
     if (offset < 0 || (offset + typed->elsize) > PyArray_DESCR(self)->elsize) {
         PyErr_Format(PyExc_ValueError,
-                     "Need 0 <= offset <= %d for requested type "  \
+                     "Need 0 <= offset <= %d for requested type "
                      "but received offset = %d",
                      PyArray_DESCR(self)->elsize-typed->elsize, offset);
         Py_DECREF(typed);
@@ -349,16 +469,16 @@ PyArray_SetField(PyArrayObject *self, PyArray_Descr *dtype,
 
     if (offset < 0 || (offset + dtype->elsize) > PyArray_DESCR(self)->elsize) {
         PyErr_Format(PyExc_ValueError,
-                     "Need 0 <= offset <= %d for requested type "  \
+                     "Need 0 <= offset <= %d for requested type "
                      "but received offset = %d",
                      PyArray_DESCR(self)->elsize-dtype->elsize, offset);
         Py_DECREF(dtype);
         return -1;
     }
     ret = PyArray_NewFromDescr(Py_TYPE(self),
-                               dtype, PyArray_NDIM(self), PyArray_DIMS(self),
-                               PyArray_STRIDES(self), PyArray_DATA(self) + offset,
-                               PyArray_FLAGS(self), (PyObject *)self);
+                           dtype, PyArray_NDIM(self), PyArray_DIMS(self),
+                           PyArray_STRIDES(self), PyArray_DATA(self) + offset,
+                           PyArray_FLAGS(self), (PyObject *)self);
     if (ret == NULL) {
         return -1;
     }
@@ -407,7 +527,7 @@ NPY_NO_EXPORT PyObject *
 PyArray_Byteswap(PyArrayObject *self, Bool inplace)
 {
     PyArrayObject *ret;
-    intp size;
+    npy_intp size;
     PyArray_CopySwapNFunc *copyswapn;
     PyArrayIterObject *it;
 
@@ -425,7 +545,7 @@ PyArray_Byteswap(PyArrayObject *self, Bool inplace)
         }
         else { /* Use iterator */
             int axis = -1;
-            intp stride;
+            npy_intp stride;
             it = (PyArrayIterObject *)                      \
                 PyArray_IterAllButAxis((PyObject *)self, &axis);
             stride = PyArray_STRIDES(self)[axis];
@@ -534,103 +654,85 @@ array_tofile(PyArrayObject *self, PyObject *args, PyObject *kwds)
     return Py_None;
 }
 
-
 static PyObject *
-array_toscalar(PyArrayObject *self, PyObject *args) {
-    int n, nd;
-    n = PyTuple_GET_SIZE(args);
+array_toscalar(PyArrayObject *self, PyObject *args)
+{
+    npy_intp multi_index[NPY_MAXDIMS];
+    int n = PyTuple_GET_SIZE(args);
+    int idim, ndim = PyArray_NDIM(self);
 
-    if (n == 1) {
-        PyObject *obj;
-        obj = PyTuple_GET_ITEM(args, 0);
-        if (PyTuple_Check(obj)) {
-            args = obj;
-            n = PyTuple_GET_SIZE(args);
-        }
+    /* If there is a tuple as a single argument, treat it as the argument */
+    if (n == 1 && PyTuple_Check(PyTuple_GET_ITEM(args, 0))) {
+        args = PyTuple_GET_ITEM(args, 0);
+        n = PyTuple_GET_SIZE(args);
     }
 
     if (n == 0) {
-        if (PyArray_NDIM(self) == 0 || PyArray_SIZE(self) == 1)
-            return PyArray_DESCR(self)->f->getitem(PyArray_DATA(self), self);
+        if (PyArray_SIZE(self) == 1) {
+            for (idim = 0; idim < ndim; ++idim) {
+                multi_index[idim] = 0;
+            }
+        }
         else {
             PyErr_SetString(PyExc_ValueError,
-                            "can only convert an array "    \
+                            "can only convert an array "
                             " of size 1 to a Python scalar");
-            return NULL;
         }
     }
-    else if (n != PyArray_NDIM(self) && (n > 1 || PyArray_NDIM(self) == 0)) {
-        PyErr_SetString(PyExc_ValueError,
-                        "incorrect number of indices for "      \
-                        "array");
-        return NULL;
-    }
-    else if (n == 1) { /* allows for flat getting as well as 1-d case */
-        intp value, loc, index, factor;
-        intp factors[MAX_DIMS];
+    /* Special case of C-order flat indexing... :| */
+    else if (n == 1 && ndim != 1) {
+        npy_intp *shape = PyArray_SHAPE(self);
+        npy_intp value, size = PyArray_SIZE(self);
+
         value = PyArray_PyIntAsIntp(PyTuple_GET_ITEM(args, 0));
-        if (error_converting(value)) {
-            PyErr_SetString(PyExc_ValueError, "invalid integer");
+        if (value == -1 && PyErr_Occurred()) {
             return NULL;
         }
-        factor = PyArray_SIZE(self);
-        if (value < 0) value += factor;
-        if ((value >= factor) || (value < 0)) {
-            PyErr_SetString(PyExc_ValueError,
-                            "index out of bounds");
-            return NULL;
-        }
-        if (PyArray_NDIM(self) == 1) {
-            value *= PyArray_STRIDES(self)[0];
-            return PyArray_DESCR(self)->f->getitem(PyArray_DATA(self) + value,
-                                           self);
-        }
-        nd = PyArray_NDIM(self);
-        factor = 1;
-        while (nd--) {
-            factors[nd] = factor;
-            factor *= PyArray_DIMS(self)[nd];
-        }
-        loc = 0;
-        for (nd = 0; nd < PyArray_NDIM(self); nd++) {
-            index = value / factors[nd];
-            value = value % factors[nd];
-            loc += PyArray_STRIDES(self)[nd]*index;
-        }
 
-        return PyArray_DESCR(self)->f->getitem(PyArray_DATA(self) + loc,
-                                       self);
+        /* Negative indexing */
+        if (value < 0) {
+            value += size;
+        }
 
-    }
-    else {
-        intp loc, index[MAX_DIMS];
-        nd = PyArray_IntpFromSequence(args, index, MAX_DIMS);
-        if (nd < n) {
+        if (value < 0 || value >= size) {
+            PyErr_SetString(PyExc_ValueError, "index out of bounds");
             return NULL;
         }
-        loc = 0;
-        while (nd--) {
-            if (index[nd] < 0) {
-                index[nd] += PyArray_DIMS(self)[nd];
-            }
-            if (index[nd] < 0 ||
-                index[nd] >= PyArray_DIMS(self)[nd]) {
-                PyErr_SetString(PyExc_ValueError,
-                                "index out of bounds");
+
+        /* Convert the flat index into a multi-index */
+        for (idim = ndim-1; idim >= 0; --idim) {
+            multi_index[idim] = value % shape[idim];
+            value /= shape[idim];
+        }
+    }
+    /* A multi-index tuple */
+    else if (n == ndim) {
+        npy_intp value;
+
+        for (idim = 0; idim < ndim; ++idim) {
+            value = PyArray_PyIntAsIntp(PyTuple_GET_ITEM(args, idim));
+            if (value == -1 && PyErr_Occurred()) {
                 return NULL;
             }
-            loc += PyArray_STRIDES(self)[nd]*index[nd];
+            multi_index[idim] = value;
         }
-        return PyArray_DESCR(self)->f->getitem(PyArray_DATA(self) + loc, self);
     }
+    else {
+        PyErr_SetString(PyExc_ValueError,
+                        "incorrect number of indices for array");
+        return NULL;
+    }
+
+    return PyArray_MultiIndexGetItem(self, multi_index);
 }
 
 static PyObject *
-array_setscalar(PyArrayObject *self, PyObject *args) {
-    int n, nd;
-    int ret = -1;
+array_setscalar(PyArrayObject *self, PyObject *args)
+{
+    npy_intp multi_index[NPY_MAXDIMS];  /* filled by exactly one branch below */
+    int n = PyTuple_GET_SIZE(args) - 1; /* index count; last arg is the value */
+    int idim, ndim = PyArray_NDIM(self);
     PyObject *obj;
-    n = PyTuple_GET_SIZE(args) - 1;
 
     if (n < 0) {
         PyErr_SetString(PyExc_ValueError,
@@ -638,110 +740,76 @@ array_setscalar(PyArrayObject *self, PyObject *args) {
         return NULL;
     }
+    /* The last positional argument is the value to store (borrowed) */
     obj = PyTuple_GET_ITEM(args, n);
+
+    /* If there is a tuple as a single argument, treat it as the argument */
+    if (n == 1 && PyTuple_Check(PyTuple_GET_ITEM(args, 0))) {
+        args = PyTuple_GET_ITEM(args, 0);
+        n = PyTuple_GET_SIZE(args);
+    }
+
     if (n == 0) {
-        if (PyArray_NDIM(self) == 0 || PyArray_SIZE(self) == 1) {
-            ret = PyArray_DESCR(self)->f->setitem(obj, PyArray_DATA(self), self);
+        if (PyArray_SIZE(self) == 1) {      /* lone element: index is all zeros */
+            for (idim = 0; idim < ndim; ++idim) {
+                multi_index[idim] = 0;
+            }
         }
         else {
             PyErr_SetString(PyExc_ValueError,
                             "can only place a scalar for an "
                             " array of size 1");
             return NULL;
         }
     }
-    else if (n != PyArray_NDIM(self) && (n > 1 || PyArray_NDIM(self) == 0)) {
-        PyErr_SetString(PyExc_ValueError,
-                        "incorrect number of indices for "      \
-                        "array");
-        return NULL;
-    }
-    else if (n == 1) { /* allows for flat setting as well as 1-d case */
-        intp value, loc, index, factor;
-        intp factors[MAX_DIMS];
-        PyObject *indobj;
+    /* Special case of C-order flat indexing... :| */
+    else if (n == 1 && ndim != 1) {
+        npy_intp *shape = PyArray_SHAPE(self);
+        npy_intp value, size = PyArray_SIZE(self);
 
-        indobj = PyTuple_GET_ITEM(args, 0);
-        if (PyTuple_Check(indobj)) {
-            PyObject *res;
-            PyObject *newargs;
-            PyObject *tmp;
-            int i, nn;
-            nn = PyTuple_GET_SIZE(indobj);
-            newargs = PyTuple_New(nn+1);
-            Py_INCREF(obj);
-            for (i = 0; i < nn; i++) {
-                tmp = PyTuple_GET_ITEM(indobj, i);
-                Py_INCREF(tmp);
-                PyTuple_SET_ITEM(newargs, i, tmp);
-            }
-            PyTuple_SET_ITEM(newargs, nn, obj);
-            /* Call with a converted set of arguments */
-            res = array_setscalar(self, newargs);
-            Py_DECREF(newargs);
-            return res;
-        }
-        value = PyArray_PyIntAsIntp(indobj);
-        if (error_converting(value)) {
-            PyErr_SetString(PyExc_ValueError, "invalid integer");
-            return NULL;
-        }
-        if (value >= PyArray_SIZE(self)) {
-            PyErr_SetString(PyExc_ValueError,
-                            "index out of bounds");
+        value = PyArray_PyIntAsIntp(PyTuple_GET_ITEM(args, 0));
+        if (value == -1 && PyErr_Occurred()) {
             return NULL;
         }
-        if (PyArray_NDIM(self) == 1) {
-            value *= PyArray_STRIDES(self)[0];
-            ret = PyArray_DESCR(self)->f->setitem(obj, PyArray_DATA(self) + value,
-                                          self);
-            goto finish;
-        }
-        nd = PyArray_NDIM(self);
-        factor = 1;
-        while (nd--) {
-            factors[nd] = factor;
-            factor *= PyArray_DIMS(self)[nd];
-        }
-        loc = 0;
-        for (nd = 0; nd < PyArray_NDIM(self); nd++) {
-            index = value / factors[nd];
-            value = value % factors[nd];
-            loc += PyArray_STRIDES(self)[nd]*index;
+
+        /* Negative indexing */
+        if (value < 0) {
+            value += size;
         }
 
-        ret = PyArray_DESCR(self)->f->setitem(obj, PyArray_DATA(self) + loc, self);
-    }
-    else {
-        intp loc, index[MAX_DIMS];
-        PyObject *tupargs;
-        tupargs = PyTuple_GetSlice(args, 0, n);
-        nd = PyArray_IntpFromSequence(tupargs, index, MAX_DIMS);
-        Py_DECREF(tupargs);
-        if (nd < n) {
+        if (value < 0 || value >= size) {
+            PyErr_SetString(PyExc_ValueError, "index out of bounds");
             return NULL;
         }
-        loc = 0;
-        while (nd--) {
-            if (index[nd] < 0) {
-                index[nd] += PyArray_DIMS(self)[nd];
-            }
-            if (index[nd] < 0 ||
-                index[nd] >= PyArray_DIMS(self)[nd]) {
-                PyErr_SetString(PyExc_ValueError,
-                                "index out of bounds");
+
+        /* Convert the flat index into a multi-index */
+        for (idim = ndim-1; idim >= 0; --idim) {
+            multi_index[idim] = value % shape[idim];
+            value /= shape[idim];
+        }
+    }
+    /* A multi-index tuple */
+    else if (n == ndim) {
+        for (idim = 0; idim < ndim; ++idim) {
+            npy_intp value = PyArray_PyIntAsIntp(PyTuple_GET_ITEM(args, idim));
+            if (value == -1 && PyErr_Occurred()) {
                 return NULL;
             }
-            loc += PyArray_STRIDES(self)[nd]*index[nd];
+            multi_index[idim] = value;
         }
-        ret = PyArray_DESCR(self)->f->setitem(obj, PyArray_DATA(self) + loc, self);
+    }
+    else {
+        PyErr_SetString(PyExc_ValueError,
+                        "incorrect number of indices for array");
+        return NULL;
     }
 
- finish:
-    if (ret < 0) {
+    if (PyArray_MultiIndexSetItem(self, multi_index, obj) < 0) {
         return NULL;
     }
-    Py_INCREF(Py_None);
-    return Py_None;
+    else {
+        Py_INCREF(Py_None);
+        return Py_None;
+    }
 }
 
 /* Sets the array values from another array as if they were flat */
@@ -755,7 +823,8 @@ array_setasflat(PyArrayObject *self, PyObject *args)
         return NULL;
     }
 
-    arr = (PyArrayObject *)PyArray_FromAny(arr_in, NULL, 0, 0, 0, NULL);
+    arr = (PyArrayObject *)PyArray_FromAny(arr_in, NULL,
+                                        0, 0, NPY_ARRAY_ALLOWNA, NULL);
     if (arr == NULL) {
         return NULL;
     }
@@ -1024,15 +1093,54 @@ array_getarray(PyArrayObject *self, PyObject *args)
 static PyObject *
 array_copy(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
-    PyArray_ORDER order = PyArray_CORDER;
-    static char *kwlist[] = {"order", NULL};
+    PyArray_ORDER order = NPY_CORDER;   /* used when 'order' is omitted */
+    PyObject *maskna_in = Py_None;      /* raw 'maskna' argument */
+    int maskna = -1;                    /* -1: unspecified, 0: False, 1: True */
+    static char *kwlist[] = {"order", "maskna", NULL};
+    PyArrayObject *ret;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&", kwlist,
-                            PyArray_OrderConverter, &order)) {
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O", kwlist,
+                            PyArray_OrderConverter, &order,
+                            &maskna_in)) {
         return NULL;
     }
 
-    return PyArray_NewCopy(self, order);
+    /* Treat None the same as not providing the parameter */
+    if (maskna_in != Py_None) {
+        maskna = PyObject_IsTrue(maskna_in);
+        if (maskna == -1) {
+            return NULL;
+        }
+    }
+
+    /* If maskna=False was passed and self has an NA mask, strip it away */
+    if (maskna == 0 && PyArray_HASMASKNA(self)) {
+        /* An array with no NA mask */
+        ret = (PyArrayObject *)PyArray_NewLikeArray(self, order, NULL, 1);
+        if (ret == NULL) {
+            return NULL;
+        }
+
+        /* AssignArray validates that 'self' contains no NA values */
+        if (PyArray_AssignArray(ret, self, NULL, NPY_UNSAFE_CASTING,
+                                                        0, NULL) < 0) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+    }
+    else {
+        ret = (PyArrayObject *)PyArray_NewCopy(self, order);
+
+        /* Add the NA mask if requested */
+        if (ret != NULL && maskna == 1) {
+            if (PyArray_AllocateMaskNA(ret, 1, 0, 1) < 0) {
+                Py_DECREF(ret);
+                return NULL;
+            }
+        }
+    }
+
+    return (PyObject *)ret;     /* NULL here means PyArray_NewCopy failed */
 }
 
 #include <stdio.h>
@@ -1069,7 +1177,7 @@ array_resize(PyArrayObject *self, PyObject *args, PyObject *kwds)
         return NULL;
     }
 
-    ret = PyArray_Resize(self, &newshape, refcheck, PyArray_CORDER);
+    ret = PyArray_Resize(self, &newshape, refcheck, NPY_CORDER);
     PyDimMem_FREE(newshape.ptr);
     if (ret == NULL) {
         return NULL;
@@ -1090,7 +1198,7 @@ array_repeat(PyArrayObject *self, PyObject *args, PyObject *kwds) {
                                      PyArray_AxisConverter, &axis)) {
         return NULL;
     }
-    return _ARET(PyArray_Repeat(self, repeats, axis));
+    return PyArray_Return((PyArrayObject *)PyArray_Repeat(self, repeats, axis));
 }
 
 static PyObject *
@@ -1117,7 +1225,7 @@ array_choose(PyArrayObject *self, PyObject *args, PyObject *kwds)
         return NULL;
     }
 
-    return _ARET(PyArray_Choose(self, choices, out, clipmode));
+    return PyArray_Return((PyArrayObject *)PyArray_Choose(self, choices, out, clipmode));
 }
 
 static PyObject *
@@ -1161,13 +1269,13 @@ array_sort(PyArrayObject *self, PyObject *args, PyObject *kwds)
         }
         newd = PyArray_DescrNew(saved);
         newd->names = new_name;
-        ((PyArrayObject_fieldaccess *)self)->descr = newd;
+        ((PyArrayObject_fields *)self)->descr = newd;
     }
 
     val = PyArray_Sort(self, axis, sortkind);
     if (order != NULL) {
         Py_XDECREF(PyArray_DESCR(self));
-        ((PyArrayObject_fieldaccess *)self)->descr = saved;
+        ((PyArrayObject_fields *)self)->descr = saved;
     }
     if (val < 0) {
         return NULL;
@@ -1199,7 +1307,7 @@ array_argsort(PyArrayObject *self, PyObject *args, PyObject *kwds)
         PyObject *_numpy_internal;
         saved = PyArray_DESCR(self);
         if (!PyDataType_HASFIELDS(saved)) {
-            PyErr_SetString(PyExc_ValueError, "Cannot specify " \
+            PyErr_SetString(PyExc_ValueError, "Cannot specify "
                             "order when the array has no fields.");
             return NULL;
         }
@@ -1215,15 +1323,15 @@ array_argsort(PyArrayObject *self, PyObject *args, PyObject *kwds)
         }
         newd = PyArray_DescrNew(saved);
         newd->names = new_name;
-        ((PyArrayObject_fieldaccess *)self)->descr = newd;
+        ((PyArrayObject_fields *)self)->descr = newd;
     }
 
     res = PyArray_ArgSort(self, axis, sortkind);
     if (order != NULL) {
         Py_XDECREF(PyArray_DESCR(self));
-        ((PyArrayObject_fieldaccess *)self)->descr = saved;
+        ((PyArrayObject_fields *)self)->descr = saved;
     }
-    return _ARET(res);
+    return PyArray_Return((PyArrayObject *)res);
 }
 
 static PyObject *
@@ -1238,7 +1346,7 @@ array_searchsorted(PyArrayObject *self, PyObject *args, PyObject *kwds)
                                      PyArray_SearchsideConverter, &side)) {
         return NULL;
     }
-    return _ARET(PyArray_SearchSorted(self, keys, side));
+    return PyArray_Return((PyArrayObject *)PyArray_SearchSorted(self, keys, side));
 }
 
 static void
@@ -1460,10 +1568,10 @@ array_setstate(PyArrayObject *self, PyObject *args)
     PyObject *rawdata = NULL;
     char *datastr;
     Py_ssize_t len;
-    intp size, dimensions[MAX_DIMS];
+    npy_intp size, dimensions[MAX_DIMS];
     int nd;
 
-    PyArrayObject_fieldaccess *fa = (PyArrayObject_fieldaccess *)self;
+    PyArrayObject_fields *fa = (PyArrayObject_fields *)self;
 
     /* This will free any memory associated with a and
        use the string in setstate as the (writeable) memory.
@@ -1573,9 +1681,10 @@ array_setstate(PyArrayObject *self, PyObject *args)
     fa->nd = nd;
 
     if (nd > 0) {
-        fa->dimensions = PyDimMem_NEW(nd * 2);
+        fa->dimensions = PyDimMem_NEW(3*nd);
         fa->strides = PyArray_DIMS(self) + nd;
-        memcpy(PyArray_DIMS(self), dimensions, sizeof(intp)*nd);
+        fa->maskna_strides = PyArray_DIMS(self) + 2*nd;
+        memcpy(PyArray_DIMS(self), dimensions, sizeof(npy_intp)*nd);
         _array_fill_strides(PyArray_STRIDES(self), dimensions, nd,
                                PyArray_DESCR(self)->elsize,
                                (is_f_order ? NPY_ARRAY_F_CONTIGUOUS :
@@ -1593,7 +1702,7 @@ array_setstate(PyArrayObject *self, PyObject *args)
         /* Bytes are never interned */
         if (!_IsAligned(self) || swap) {
 #endif
-            intp num = PyArray_NBYTES(self);
+            npy_intp num = PyArray_NBYTES(self);
             fa->data = PyDataMem_NEW(num);
             if (PyArray_DATA(self) == NULL) {
                 fa->nd = 0;
@@ -1602,7 +1711,7 @@ array_setstate(PyArrayObject *self, PyObject *args)
                 return PyErr_NoMemory();
             }
             if (swap) { /* byte-swap on pickle-read */
-                intp numels = num / PyArray_DESCR(self)->elsize;
+                npy_intp numels = num / PyArray_DESCR(self)->elsize;
                 PyArray_DESCR(self)->f->copyswapn(PyArray_DATA(self),
                                         PyArray_DESCR(self)->elsize,
                                         datastr, PyArray_DESCR(self)->elsize,
@@ -1800,45 +1909,13 @@ _get_type_num_double(PyArray_Descr *dtype1, PyArray_Descr *dtype2)
 static PyObject *
 array_mean(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
-    int axis = MAX_DIMS;
-    PyArray_Descr *dtype = NULL;
-    PyArrayObject *out = NULL;
-    int num;
-    static char *kwlist[] = {"axis", "dtype", "out", NULL};
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&O&", kwlist,
-                                     PyArray_AxisConverter, &axis,
-                                     PyArray_DescrConverter2, &dtype,
-                                     PyArray_OutputConverter, &out)) {
-        Py_XDECREF(dtype);
-        return NULL;
-    }
-
-    num = _get_type_num_double(PyArray_DESCR(self), dtype);
-    Py_XDECREF(dtype);
-    return PyArray_Mean(self, axis, num, out);
+    NPY_FORWARD_NDARRAY_METHOD("_mean");    /* numpy.core._methods._mean */
 }
 
 static PyObject *
 array_sum(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
-    int axis = MAX_DIMS;
-    PyArray_Descr *dtype = NULL;
-    PyArrayObject *out = NULL;
-    int rtype;
-    static char *kwlist[] = {"axis", "dtype", "out", NULL};
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&O&", kwlist,
-                                     PyArray_AxisConverter, &axis,
-                                     PyArray_DescrConverter2, &dtype,
-                                     PyArray_OutputConverter, &out)) {
-        Py_XDECREF(dtype);
-        return NULL;
-    }
-
-    rtype = _CHKTYPENUM(dtype);
-    Py_XDECREF(dtype);
-    return PyArray_Sum(self, axis, rtype, out);
+    NPY_FORWARD_NDARRAY_METHOD("_sum");     /* numpy.core._methods._sum */
 }
 
 
@@ -1867,23 +1944,7 @@ array_cumsum(PyArrayObject *self, PyObject *args, PyObject *kwds)
 static PyObject *
 array_prod(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
-    int axis = MAX_DIMS;
-    PyArray_Descr *dtype = NULL;
-    PyArrayObject *out = NULL;
-    int rtype;
-    static char *kwlist[] = {"axis", "dtype", "out", NULL};
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&O&", kwlist,
-                                     PyArray_AxisConverter, &axis,
-                                     PyArray_DescrConverter2, &dtype,
-                                     PyArray_OutputConverter, &out)) {
-        Py_XDECREF(dtype);
-        return NULL;
-    }
-
-    rtype = _CHKTYPENUM(dtype);
-    Py_XDECREF(dtype);
-    return PyArray_Prod(self, axis, rtype, out);
+    NPY_FORWARD_NDARRAY_METHOD("_prod");    /* numpy.core._methods._prod */
 }
 
 static PyObject *
@@ -1933,87 +1994,88 @@ array_dot(PyArrayObject *self, PyObject *args, PyObject *kwds)
 
 
 static PyObject *
-array_any(PyArrayObject *self, PyObject *args, PyObject *kwds)
+array_any(PyArrayObject *array, PyObject *args, PyObject *kwds)
 {
-    int axis = MAX_DIMS;
+    static char *kwlist[] = {"axis", "out", "skipna", "keepdims", NULL};
+
+    PyObject *axis_in = NULL;
     PyArrayObject *out = NULL;
-    static char *kwlist[] = {"axis", "out", NULL};
+    PyArrayObject *ret = NULL;
+    npy_bool axis_flags[NPY_MAXDIMS];
+    int skipna = 0, keepdims = 0;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&", kwlist,
-                                     PyArray_AxisConverter, &axis,
-                                     PyArray_OutputConverter, &out))
+    if (!PyArg_ParseTupleAndKeywords(args, kwds,
+                                "|OO&ii:any", kwlist,
+                                &axis_in,
+                                &PyArray_OutputAllowNAConverter, &out,
+                                &skipna,
+                                &keepdims)) {
         return NULL;
+    }
+
+    if (PyArray_ConvertMultiAxis(axis_in, PyArray_NDIM(array),
+                                        axis_flags) != NPY_SUCCEED) {
+        return NULL;
+    }
 
-    return PyArray_Any(self, axis, out);
+    ret = PyArray_ReduceAny(array, out, axis_flags, skipna, keepdims);
+
+    if (out == NULL) {
+        return PyArray_Return(ret);
+    }
+    else {
+        return (PyObject *)ret;
+    }
 }
 
 
 static PyObject *
-array_all(PyArrayObject *self, PyObject *args, PyObject *kwds)
+array_all(PyArrayObject *array, PyObject *args, PyObject *kwds)
 {
-    int axis = MAX_DIMS;
+    static char *kwlist[] = {"axis", "out", "skipna", "keepdims", NULL};
+
+    PyObject *axis_in = NULL;
     PyArrayObject *out = NULL;
-    static char *kwlist[] = {"axis", "out", NULL};
+    PyArrayObject *ret = NULL;
+    npy_bool axis_flags[NPY_MAXDIMS];
+    int skipna = 0, keepdims = 0;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&", kwlist,
-                                     PyArray_AxisConverter, &axis,
-                                     PyArray_OutputConverter, &out))
+    if (!PyArg_ParseTupleAndKeywords(args, kwds,
+                                "|OO&ii:all", kwlist,
+                                &axis_in,
+                                &PyArray_OutputAllowNAConverter, &out,
+                                &skipna,
+                                &keepdims)) {
         return NULL;
+    }
 
-    return PyArray_All(self, axis, out);
-}
+    if (PyArray_ConvertMultiAxis(axis_in, PyArray_NDIM(array),
+                                        axis_flags) != NPY_SUCCEED) {
+        return NULL;
+    }
+
+    ret = PyArray_ReduceAll(array, out, axis_flags, skipna, keepdims);
 
+    if (out == NULL) {
+        return PyArray_Return(ret);
+    }
+    else {
+        return (PyObject *)ret;
+    }
+}
 
 static PyObject *
 array_stddev(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
-    int axis = MAX_DIMS;
-    PyArray_Descr *dtype = NULL;
-    PyArrayObject *out = NULL;
-    int num;
-    int ddof = 0;
-    static char *kwlist[] = {"axis", "dtype", "out", "ddof", NULL};
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&O&i", kwlist,
-                                     PyArray_AxisConverter, &axis,
-                                     PyArray_DescrConverter2, &dtype,
-                                     PyArray_OutputConverter, &out,
-                                     &ddof)) {
-        Py_XDECREF(dtype);
-        return NULL;
-    }
-
-    num = _get_type_num_double(PyArray_DESCR(self), dtype);
-    Py_XDECREF(dtype);
-    return __New_PyArray_Std(self, axis, num, out, 0, ddof);
+    NPY_FORWARD_NDARRAY_METHOD("_std");
 }
 
-
 static PyObject *
 array_variance(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
-    int axis = MAX_DIMS;
-    PyArray_Descr *dtype = NULL;
-    PyArrayObject *out = NULL;
-    int num;
-    int ddof = 0;
-    static char *kwlist[] = {"axis", "dtype", "out", "ddof", NULL};
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&O&i", kwlist,
-                                     PyArray_AxisConverter, &axis,
-                                     PyArray_DescrConverter2, &dtype,
-                                     PyArray_OutputConverter, &out,
-                                     &ddof)) {
-        Py_XDECREF(dtype);
-        return NULL;
-    }
-
-    num = _get_type_num_double(PyArray_DESCR(self), dtype);
-    Py_XDECREF(dtype);
-    return __New_PyArray_Std(self, axis, num, out, 1, ddof);
+    NPY_FORWARD_NDARRAY_METHOD("_var");
 }
 
-
 static PyObject *
 array_compress(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
@@ -2025,10 +2087,11 @@ array_compress(PyArrayObject *self, PyObject *args, PyObject *kwds)
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&O&", kwlist,
                                      &condition,
                                      PyArray_AxisConverter, &axis,
-                                     PyArray_OutputConverter, &out)) {
+                                     PyArray_OutputAllowNAConverter, &out)) {
         return NULL;
     }
-    return _ARET(PyArray_Compress(self, condition, axis, out));
+    return PyArray_Return(
+                (PyArrayObject *)PyArray_Compress(self, condition, axis, out));
 }
 
 
@@ -2056,14 +2119,14 @@ array_trace(PyArrayObject *self, PyObject *args, PyObject *kwds)
                                      &axis1,
                                      &axis2,
                                      PyArray_DescrConverter2, &dtype,
-                                     PyArray_OutputConverter, &out)) {
+                                     PyArray_OutputAllowNAConverter, &out)) {
         Py_XDECREF(dtype);
         return NULL;
     }
 
     rtype = _CHKTYPENUM(dtype);
     Py_XDECREF(dtype);
-    return _ARET(PyArray_Trace(self, offset, axis1, axis2, rtype, out));
+    return PyArray_Return((PyArrayObject *)PyArray_Trace(self, offset, axis1, axis2, rtype, out));
 }
 
 #undef _CHKTYPENUM
@@ -2079,14 +2142,14 @@ array_clip(PyArrayObject *self, PyObject *args, PyObject *kwds)
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OOO&", kwlist,
                                      &min,
                                      &max,
-                                     PyArray_OutputConverter, &out)) {
+                                     PyArray_OutputAllowNAConverter, &out)) {
         return NULL;
     }
     if (max == NULL && min == NULL) {
         PyErr_SetString(PyExc_ValueError, "One of max or min must be given.");
         return NULL;
     }
-    return _ARET(PyArray_Clip(self, min, max, out));
+    return PyArray_Return((PyArrayObject *)PyArray_Clip(self, min, max, out));
 }
 
 
@@ -2096,7 +2159,7 @@ array_conjugate(PyArrayObject *self, PyObject *args)
 
     PyArrayObject *out = NULL;
     if (!PyArg_ParseTuple(args, "|O&",
-                          PyArray_OutputConverter,
+                          PyArray_OutputAllowNAConverter,
                           &out)) {
         return NULL;
     }
@@ -2109,6 +2172,7 @@ array_diagonal(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
     int axis1 = 0, axis2 = 1, offset = 0;
     static char *kwlist[] = {"offset", "axis1", "axis2", NULL};
+    PyArrayObject *ret;
 
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iii", kwlist,
                                      &offset,
@@ -2116,14 +2180,16 @@ array_diagonal(PyArrayObject *self, PyObject *args, PyObject *kwds)
                                      &axis2)) {
         return NULL;
     }
-    return _ARET(PyArray_Diagonal(self, offset, axis1, axis2));
+
+    ret = (PyArrayObject *)PyArray_Diagonal(self, offset, axis1, axis2);
+    return PyArray_Return(ret);
 }
 
 
 static PyObject *
 array_flatten(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
-    PyArray_ORDER order = PyArray_CORDER;
+    PyArray_ORDER order = NPY_CORDER;
     static char *kwlist[] = {"order", NULL};
 
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&", kwlist,
@@ -2137,7 +2203,7 @@ array_flatten(PyArrayObject *self, PyObject *args, PyObject *kwds)
 static PyObject *
 array_ravel(PyArrayObject *self, PyObject *args, PyObject *kwds)
 {
-    PyArray_ORDER order = PyArray_CORDER;
+    PyArray_ORDER order = NPY_CORDER;
     static char *kwlist[] = {"order", NULL};
 
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&", kwlist,
@@ -2157,10 +2223,10 @@ array_round(PyArrayObject *self, PyObject *args, PyObject *kwds)
 
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iO&", kwlist,
                                      &decimals,
-                                     PyArray_OutputConverter, &out)) {
+                                     PyArray_OutputAllowNAConverter, &out)) {
         return NULL;
     }
-    return _ARET(PyArray_Round(self, decimals, out));
+    return PyArray_Return((PyArrayObject *)PyArray_Round(self, decimals, out));
 }
 
 
@@ -2174,7 +2240,7 @@ array_setflags(PyArrayObject *self, PyObject *args, PyObject *kwds)
     PyObject *uic = Py_None;
     int flagback = PyArray_FLAGS(self);
 
-    PyArrayObject_fieldaccess *fa = (PyArrayObject_fieldaccess *)self;
+    PyArrayObject_fields *fa = (PyArrayObject_fields *)self;
 
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OOO", kwlist,
                                      &write,
@@ -2412,7 +2478,7 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = {
         METH_VARARGS | METH_KEYWORDS, NULL},
     {"squeeze",
         (PyCFunction)array_squeeze,
-        METH_VARARGS, NULL},
+        METH_VARARGS | METH_KEYWORDS, NULL},
     {"std",
         (PyCFunction)array_stddev,
         METH_VARARGS | METH_KEYWORDS, NULL},
@@ -2448,5 +2514,3 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = {
         METH_VARARGS | METH_KEYWORDS, NULL},
     {NULL, NULL, 0, NULL}           /* sentinel */
 };
-
-#undef _ARET
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 2c1061656..cfbc0a3af 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -32,8 +32,6 @@
 
 NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 
-#define PyAO PyArrayObject
-
 /* Internal APIs */
 #include "arraytypes.h"
 #include "arrayobject.h"
@@ -44,12 +42,19 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 #include "scalartypes.h"
 #include "numpymemoryview.h"
 #include "convert_datatype.h"
+#include "conversion_utils.h"
 #include "nditer_pywrap.h"
 #include "methods.h"
 #include "_datetime.h"
 #include "datetime_strings.h"
 #include "datetime_busday.h"
 #include "datetime_busdaycal.h"
+#include "item_selection.h"
+#include "shape.h"
+#include "ctors.h"
+#include "na_object.h"
+#include "na_mask.h"
+#include "reduction.h"
 
 /* Only here for API compatibility */
 NPY_NO_EXPORT PyTypeObject PyBigArray_Type;
@@ -61,7 +66,7 @@ NPY_NO_EXPORT double
 PyArray_GetPriority(PyObject *obj, double default_)
 {
     PyObject *ret;
-    double priority = PyArray_PRIORITY;
+    double priority = NPY_PRIORITY;
 
     if (PyArray_CheckExact(obj))
         return priority;
@@ -202,7 +207,7 @@ PyArray_AsCArray(PyObject **op, void *ptr, npy_intp *dims, int nd,
         break;
     case 2:
         n = PyArray_DIMS(ap)[0];
-        ptr2 = (char **)_pya_malloc(n * sizeof(char *));
+        ptr2 = (char **)PyArray_malloc(n * sizeof(char *));
         if (!ptr2) {
             goto fail;
         }
@@ -214,7 +219,7 @@ PyArray_AsCArray(PyObject **op, void *ptr, npy_intp *dims, int nd,
     case 3:
         n = PyArray_DIMS(ap)[0];
         m = PyArray_DIMS(ap)[1];
-        ptr3 = (char ***)_pya_malloc(n*(m+1) * sizeof(char *));
+        ptr3 = (char ***)PyArray_malloc(n*(m+1) * sizeof(char *));
         if (!ptr3) {
             goto fail;
         }
@@ -294,177 +299,354 @@ PyArray_Free(PyObject *op, void *ptr)
         return -1;
     }
     if (PyArray_NDIM(ap) >= 2) {
-        _pya_free(ptr);
+        PyArray_free(ptr);
     }
     Py_DECREF(ap);
     return 0;
 }
 
 
-static PyObject *
-_swap_and_concat(PyObject *op, int axis, int n)
+/*
+ * Concatenates 'narrays' ndarrays along 'axis'; returns NULL on error.
+ */
+NPY_NO_EXPORT PyArrayObject *
+PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis)
 {
-    PyObject *newtup = NULL;
-    PyObject *otmp, *arr;
-    int i;
+    PyTypeObject *subtype = &PyArray_Type;
+    double priority = NPY_PRIORITY;
+    int iarrays, idim, ndim;
+    npy_intp shape[NPY_MAXDIMS], s, strides[NPY_MAXDIMS];
+    int strideperm[NPY_MAXDIMS];
+    PyArray_Descr *dtype = NULL;
+    PyArrayObject *ret = NULL;
+    PyArrayObject_fields *sliding_view = NULL;
+    int has_maskna;
 
-    newtup = PyTuple_New(n);
-    if (newtup == NULL) {
+    if (narrays <= 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "need at least one array to concatenate");
         return NULL;
     }
-    for (i = 0; i < n; i++) {
-        otmp = PySequence_GetItem(op, i);
-        arr = PyArray_FROM_O(otmp);
-        Py_DECREF(otmp);
-        if (arr == NULL) {
-            goto fail;
+
+    /* All the arrays must have the same 'ndim' */
+    ndim = PyArray_NDIM(arrays[0]);
+
+    if (ndim == 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "zero-dimensional arrays cannot be concatenated");
+        return NULL;
+    }
+
+    /* Handle standard Python negative indexing */
+    if (axis < 0) {
+        axis += ndim;
+    }
+    if (axis < 0 || axis >= ndim) {
+        PyErr_Format(PyExc_IndexError,
+                        "axis %d out of bounds [0, %d)", axis, ndim);
+        return NULL;
+    }
+
+    /*
+     * Figure out the final concatenated shape starting from the first
+     * array's shape. Also check whether any of the inputs have an
+     * NA mask.
+     */
+    memcpy(shape, PyArray_SHAPE(arrays[0]), ndim * sizeof(shape[0]));
+    has_maskna = PyArray_HASMASKNA(arrays[0]);
+    for (iarrays = 1; iarrays < narrays; ++iarrays) {
+        npy_intp *arr_shape;
+
+        if (PyArray_NDIM(arrays[iarrays]) != ndim) {
+            PyErr_SetString(PyExc_ValueError,
+                            "all the input arrays must have same "
+                            "number of dimensions");
+            return NULL;
         }
-        otmp = PyArray_SwapAxes((PyArrayObject *)arr, axis, 0);
-        Py_DECREF(arr);
-        if (otmp == NULL) {
-            goto fail;
+        arr_shape = PyArray_SHAPE(arrays[iarrays]);
+
+        for (idim = 0; idim < ndim; ++idim) {
+            /* Build up the size of the concatenation axis */
+            if (idim == axis) {
+                shape[idim] += arr_shape[idim];
+            }
+            /* Validate that the rest of the dimensions match */
+            else if (shape[idim] != arr_shape[idim]) {
+                PyErr_SetString(PyExc_ValueError,
+                                "all the input array dimensions "
+                                "except for the concatenation axis "
+                                "must match exactly");
+                return NULL;
+            }
         }
-        PyTuple_SET_ITEM(newtup, i, otmp);
+
+        has_maskna = has_maskna || PyArray_HASMASKNA(arrays[iarrays]);
     }
-    otmp = PyArray_Concatenate(newtup, 0);
-    Py_DECREF(newtup);
-    if (otmp == NULL) {
+
+    /* Pick the subtype of the highest-priority input for the result */
+    for (iarrays = 0; iarrays < narrays; ++iarrays) {
+        if (Py_TYPE(arrays[iarrays]) != subtype) {
+            double pr = PyArray_GetPriority((PyObject *)(arrays[iarrays]), 0.0);
+            if (pr > priority) {
+                priority = pr;
+                subtype = Py_TYPE(arrays[iarrays]);
+            }
+        }
+    }
+
+    /* Get the resulting dtype from combining all the arrays */
+    dtype = PyArray_ResultType(narrays, arrays, 0, NULL);
+    if (dtype == NULL) {
         return NULL;
     }
-    arr = PyArray_SwapAxes((PyArrayObject *)otmp, axis, 0);
-    Py_DECREF(otmp);
-    return arr;
 
- fail:
-    Py_DECREF(newtup);
-    return NULL;
+    /*
+     * Figure out the permutation to apply to the strides to match
+     * the memory layout of the input arrays, using ambiguity
+     * resolution rules matching that of the NpyIter.
+     */
+    PyArray_CreateMultiSortedStridePerm(narrays, arrays, ndim, strideperm);
+    s = dtype->elsize;
+    for (idim = ndim-1; idim >= 0; --idim) {
+        int iperm = strideperm[idim];
+        strides[iperm] = s;
+        s *= shape[iperm];
+    }
+
+    /* Allocate the array for the result. This steals the 'dtype' reference. */
+    ret = (PyArrayObject *)PyArray_NewFromDescr(subtype,
+                                                    dtype,
+                                                    ndim,
+                                                    shape,
+                                                    strides,
+                                                    NULL,
+                                                    0,
+                                                    NULL);
+    if (ret == NULL) {
+        return NULL;
+    }
+
+    /* Add an NA mask if required */
+    if (has_maskna) {
+        if (PyArray_AllocateMaskNA(ret, 1, 0, 1) < 0) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+    }
+
+    /*
+     * Create a view which slides through ret for assigning the
+     * successive input arrays.
+     */
+    sliding_view = (PyArrayObject_fields *)PyArray_View(ret,
+                                                        NULL, &PyArray_Type);
+    if (sliding_view == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    for (iarrays = 0; iarrays < narrays; ++iarrays) {
+        /* Resize the view along 'axis' to this input's extent */
+        sliding_view->dimensions[axis] = PyArray_SHAPE(arrays[iarrays])[axis];
+
+        /* Copy the data for this array */
+        if (PyArray_AssignArray((PyArrayObject *)sliding_view, arrays[iarrays],
+                            NULL, NPY_SAME_KIND_CASTING, 0, NULL) < 0) {
+            Py_DECREF(sliding_view);
+            Py_DECREF(ret);
+            return NULL;
+        }
+
+        /* Slide to the start of the next window */
+        sliding_view->data += sliding_view->dimensions[axis] *
+                                 sliding_view->strides[axis];
+        if (has_maskna) {
+            sliding_view->maskna_data += sliding_view->dimensions[axis] *
+                                     sliding_view->maskna_strides[axis];
+        }
+    }
+
+    Py_DECREF(sliding_view);
+    return ret;
 }
 
+/*
+ * Flattens each ndarray in 'order' and stacks them as rows of a 2-D result.
+ */
+NPY_NO_EXPORT PyArrayObject *
+PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays,
+                                    NPY_ORDER order)
+{
+    PyTypeObject *subtype = &PyArray_Type;
+    double priority = NPY_PRIORITY;
+    int iarrays;
+    npy_intp shape[2], strides[2];
+    PyArray_Descr *dtype = NULL;
+    PyArrayObject *ret = NULL;
+    PyArrayObject_fields *sliding_view = NULL;
+    int has_maskna;
+
+    if (narrays <= 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "need at least one array to concatenate");
+        return NULL;
+    }
+
+    /* Result shape: one row per input, one column per flattened element */
+    shape[0] = narrays;
+    shape[1] = PyArray_SIZE(arrays[0]);
+
+    /*
+     * Every input must have as many elements as the first one; also
+     * note whether any input has an NA mask. NOTE(review): the code
+     * this replaces allowed differing sizes (1-D result) -- confirm.
+     */
+    has_maskna = PyArray_HASMASKNA(arrays[0]);
+    for (iarrays = 1; iarrays < narrays; ++iarrays) {
+        if (PyArray_SIZE(arrays[iarrays]) != shape[1]) {
+            PyErr_SetString(PyExc_ValueError,
+                            "all the input arrays must have same "
+                            "number of elements");
+            return NULL;
+        }
+
+        has_maskna = has_maskna || PyArray_HASMASKNA(arrays[iarrays]);
+    }
+
+    /* Pick the subtype of the highest-priority input for the result */
+    for (iarrays = 0; iarrays < narrays; ++iarrays) {
+        if (Py_TYPE(arrays[iarrays]) != subtype) {
+            double pr = PyArray_GetPriority((PyObject *)(arrays[iarrays]), 0.0);
+            if (pr > priority) {
+                priority = pr;
+                subtype = Py_TYPE(arrays[iarrays]);
+            }
+        }
+    }
+
+    /* Get the resulting dtype from combining all the arrays */
+    dtype = PyArray_ResultType(narrays, arrays, 0, NULL);
+    if (dtype == NULL) {
+        return NULL;
+    }
+
+    strides[1] = dtype->elsize;
+    strides[0] = strides[1] * shape[1];
+
+    /* Allocate the array for the result. This steals the 'dtype' reference. */
+    ret = (PyArrayObject *)PyArray_NewFromDescr(subtype,
+                                                    dtype,
+                                                    2,
+                                                    shape,
+                                                    strides,
+                                                    NULL,
+                                                    0,
+                                                    NULL);
+    if (ret == NULL) {
+        return NULL;
+    }
+
+    /* Add an NA mask if required */
+    if (has_maskna) {
+        if (PyArray_AllocateMaskNA(ret, 1, 0, 1) < 0) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+    }
+
+    /*
+     * Create a view which slides through ret for assigning the
+     * successive input arrays.
+     */
+    sliding_view = (PyArrayObject_fields *)PyArray_View(ret,
+                                                        NULL, &PyArray_Type);
+    if (sliding_view == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    /* Each array fills exactly one row, so the view is one row tall */
+    sliding_view->dimensions[0] = 1;
+    for (iarrays = 0; iarrays < narrays; ++iarrays) {
+
+        /* Copy the data for this array */
+        if (PyArray_CopyAsFlat((PyArrayObject *)sliding_view, arrays[iarrays],
+                            order) < 0) {
+            Py_DECREF(sliding_view);
+            Py_DECREF(ret);
+            return NULL;
+        }
+
+        /* Slide to the start of the next window */
+        sliding_view->data += sliding_view->strides[0];
+        if (has_maskna) {
+            sliding_view->maskna_data += sliding_view->maskna_strides[0];
+        }
+    }
+
+    Py_DECREF(sliding_view);
+    return ret;
+}
+
+
 /*NUMPY_API
  * Concatenate
  *
  * Concatenate an arbitrary Python sequence into an array.
  * op is a python object supporting the sequence interface.
  * Its elements will be concatenated together to form a single
- * multidimensional array. If axis is MAX_DIMS or bigger, then
+ * multidimensional array. If axis is NPY_MAXDIMS or bigger, then
  * each sequence object will be flattened before concatenation
 */
 NPY_NO_EXPORT PyObject *
 PyArray_Concatenate(PyObject *op, int axis)
 {
-    PyArrayObject *ret, **mps;
-    PyObject *otmp;
-    int i, n, tmp, nd = 0, new_dim;
-    char *data;
-    PyTypeObject *subtype;
-    double prior1, prior2;
-    npy_intp numbytes;
+    int iarrays, narrays;
+    PyArrayObject **arrays;
+    PyArrayObject *ret;
 
-    n = PySequence_Length(op);
-    if (n == -1) {
-        return NULL;
-    }
-    if (n == 0) {
-        PyErr_SetString(PyExc_ValueError,
-                        "concatenation of zero-length sequences is "\
-                        "impossible");
+    /* Convert the input sequence into a table of owned array references */
+    narrays = PySequence_Size(op);
+    if (narrays < 0) {
         return NULL;
     }
-
-    if ((axis < 0) || ((0 < axis) && (axis < MAX_DIMS))) {
-        return _swap_and_concat(op, axis, n);
-    }
-    mps = PyArray_ConvertToCommonType(op, &n);
-    if (mps == NULL) {
+    arrays = PyArray_malloc(narrays * sizeof(arrays[0]));
+    if (arrays == NULL) {
+        PyErr_NoMemory();
         return NULL;
     }
-
-    /*
-     * Make sure these arrays are legal to concatenate.
-     * Must have same dimensions except d0
-     */
-    prior1 = PyArray_PRIORITY;
-    subtype = &PyArray_Type;
-    ret = NULL;
-    for (i = 0; i < n; i++) {
-        if (axis >= MAX_DIMS) {
-            otmp = PyArray_Ravel(mps[i],0);
-            Py_DECREF(mps[i]);
-            mps[i] = (PyArrayObject *)otmp;
-        }
-        if (Py_TYPE(mps[i]) != subtype) {
-            prior2 = PyArray_GetPriority((PyObject *)(mps[i]), 0.0);
-            if (prior2 > prior1) {
-                prior1 = prior2;
-                subtype = Py_TYPE(mps[i]);
-            }
-        }
-    }
-
-    new_dim = 0;
-    for (i = 0; i < n; i++) {
-        if (mps[i] == NULL) {
+    for (iarrays = 0; iarrays < narrays; ++iarrays) {
+        PyObject *item = PySequence_GetItem(op, iarrays);
+        if (item == NULL) {
+            narrays = iarrays;
             goto fail;
         }
-        if (i == 0) {
-            nd = PyArray_NDIM(mps[i]);
-        }
-        else {
-            if (nd != PyArray_NDIM(mps[i])) {
-                PyErr_SetString(PyExc_ValueError,
-                                "arrays must have same "\
-                                "number of dimensions");
-                goto fail;
-            }
-            if (!PyArray_CompareLists(PyArray_DIMS(mps[0])+1,
-                                      PyArray_DIMS(mps[i])+1,
-                                      nd-1)) {
-                PyErr_SetString(PyExc_ValueError,
-                                "array dimensions must "\
-                                "agree except for d_0");
-                goto fail;
-            }
-        }
-        if (nd == 0) {
-            PyErr_SetString(PyExc_ValueError,
-                            "0-d arrays can't be concatenated");
+        arrays[iarrays] = (PyArrayObject *)PyArray_FromAny(item, NULL,
+                                            0, 0, NPY_ARRAY_ALLOWNA, NULL);
+        Py_DECREF(item);
+        if (arrays[iarrays] == NULL) {
+            narrays = iarrays;
             goto fail;
         }
-        new_dim += PyArray_DIMS(mps[i])[0];
     }
-    tmp = PyArray_DIMS(mps[0])[0];
-    PyArray_DIMS(mps[0])[0] = new_dim;
-    Py_INCREF(PyArray_DESCR(mps[0]));
-    ret = (PyArrayObject *)PyArray_NewFromDescr(subtype,
-                                                PyArray_DESCR(mps[0]), nd,
-                                                PyArray_DIMS(mps[0]),
-                                                NULL, NULL, 0,
-                                                (PyObject *)ret);
-    PyArray_DIMS(mps[0])[0] = tmp;
 
-    if (ret == NULL) {
-        goto fail;
+    if (axis >= NPY_MAXDIMS) {
+        ret = PyArray_ConcatenateFlattenedArrays(narrays, arrays, NPY_CORDER);
     }
-    data = PyArray_DATA(ret);
-    for (i = 0; i < n; i++) {
-        numbytes = PyArray_NBYTES(mps[i]);
-        memcpy(data, PyArray_DATA(mps[i]), numbytes);
-        data += numbytes;
+    else {
+        ret = PyArray_ConcatenateArrays(narrays, arrays, axis);
     }
 
-    PyArray_INCREF(ret);
-    for (i = 0; i < n; i++) {
-        Py_XDECREF(mps[i]);
+    for (iarrays = 0; iarrays < narrays; ++iarrays) {
+        Py_DECREF(arrays[iarrays]);
     }
-    PyDataMem_FREE(mps);
+    PyArray_free(arrays);  /* release the pointer table from PyArray_malloc */
     return (PyObject *)ret;
 
- fail:
-    Py_XDECREF(ret);
-    for (i = 0; i < n; i++) {
-        Py_XDECREF(mps[i]);
+fail:
+    /* 'narrays' was set to how far we got in the conversion */
+    for (iarrays = 0; iarrays < narrays; ++iarrays) {
+        Py_DECREF(arrays[iarrays]);
     }
-    PyDataMem_FREE(mps);
+    PyArray_free(arrays);  /* the table itself must not leak on failure */
     return NULL;
 }
 
 
@@ -652,7 +834,7 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
     int typenum, nd, axis;
     npy_intp is1, is2, os;
     char *op;
-    npy_intp dimensions[MAX_DIMS];
+    npy_intp dimensions[NPY_MAXDIMS];
     PyArray_DotFunc *dot;
     PyArray_Descr *typec;
     NPY_BEGIN_THREADS_DEF;
@@ -761,7 +943,7 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
     int typenum, nd, axis, matchDim;
     npy_intp is1, is2, os;
     char *op;
-    npy_intp dimensions[MAX_DIMS];
+    npy_intp dimensions[NPY_MAXDIMS];
     PyArray_DotFunc *dot;
     PyArray_Descr *typec;
     NPY_BEGIN_THREADS_DEF;
@@ -941,7 +1123,8 @@ PyArray_CopyAndTranspose(PyObject *op)
 }
 
 /*
- * Implementation which is common between PyArray_Correlate and PyArray_Correlate2
+ * Implementation which is common between PyArray_Correlate
+ * and PyArray_Correlate2.
  *
  * inverted is set to 1 if computed correlate(ap2, ap1), 0 otherwise
  */
@@ -1229,159 +1412,6 @@ array_putmask(PyObject *NPY_UNUSED(module), PyObject *args, PyObject *kwds)
     return PyArray_PutMask((PyArrayObject *)array, values, mask);
 }
 
-/*NUMPY_API
- * Convert an object to FORTRAN / C / ANY / KEEP
- */
-NPY_NO_EXPORT int
-PyArray_OrderConverter(PyObject *object, NPY_ORDER *val)
-{
-    char *str;
-    /* Leave the desired default from the caller for NULL/Py_None */
-    if (object == NULL || object == Py_None) {
-        return PY_SUCCEED;
-    }
-    else if (PyUnicode_Check(object)) {
-        PyObject *tmp;
-        int ret;
-        tmp = PyUnicode_AsASCIIString(object);
-        ret = PyArray_OrderConverter(tmp, val);
-        Py_DECREF(tmp);
-        return ret;
-    }
-    else if (!PyBytes_Check(object) || PyBytes_GET_SIZE(object) < 1) {
-        if (PyObject_IsTrue(object)) {
-            *val = NPY_FORTRANORDER;
-        }
-        else {
-            *val = NPY_CORDER;
-        }
-        if (PyErr_Occurred()) {
-            return PY_FAIL;
-        }
-        return PY_SUCCEED;
-    }
-    else {
-        str = PyBytes_AS_STRING(object);
-        if (str[0] == 'C' || str[0] == 'c') {
-            *val = NPY_CORDER;
-        }
-        else if (str[0] == 'F' || str[0] == 'f') {
-            *val = NPY_FORTRANORDER;
-        }
-        else if (str[0] == 'A' || str[0] == 'a') {
-            *val = NPY_ANYORDER;
-        }
-        else if (str[0] == 'K' || str[0] == 'k') {
-            *val = NPY_KEEPORDER;
-        }
-        else {
-            PyErr_SetString(PyExc_TypeError,
-                            "order not understood");
-            return PY_FAIL;
-        }
-    }
-    return PY_SUCCEED;
-}
-
-/*NUMPY_API
- * Convert an object to NPY_RAISE / NPY_CLIP / NPY_WRAP
- */
-NPY_NO_EXPORT int
-PyArray_ClipmodeConverter(PyObject *object, NPY_CLIPMODE *val)
-{
-    if (object == NULL || object == Py_None) {
-        *val = NPY_RAISE;
-    }
-    else if (PyBytes_Check(object)) {
-        char *str;
-        str = PyBytes_AS_STRING(object);
-        if (str[0] == 'C' || str[0] == 'c') {
-            *val = NPY_CLIP;
-        }
-        else if (str[0] == 'W' || str[0] == 'w') {
-            *val = NPY_WRAP;
-        }
-        else if (str[0] == 'R' || str[0] == 'r') {
-            *val = NPY_RAISE;
-        }
-        else {
-            PyErr_SetString(PyExc_TypeError,
-                            "clipmode not understood");
-            return PY_FAIL;
-        }
-    }
-    else if (PyUnicode_Check(object)) {
-        PyObject *tmp;
-        int ret;
-        tmp = PyUnicode_AsASCIIString(object);
-        ret = PyArray_ClipmodeConverter(tmp, val);
-        Py_DECREF(tmp);
-        return ret;
-    }
-    else {
-        int number = PyInt_AsLong(object);
-        if (number == -1 && PyErr_Occurred()) {
-            goto fail;
-        }
-        if (number <= (int) NPY_RAISE
-                && number >= (int) NPY_CLIP) {
-            *val = (NPY_CLIPMODE) number;
-        }
-        else {
-            goto fail;
-        }
-    }
-    return PY_SUCCEED;
-
- fail:
-    PyErr_SetString(PyExc_TypeError,
-                    "clipmode not understood");
-    return PY_FAIL;
-}
-
-/*NUMPY_API
- * Convert an object to an array of n NPY_CLIPMODE values.
- * This is intended to be used in functions where a different mode
- * could be applied to each axis, like in ravel_multi_index.
- */
-NPY_NO_EXPORT int
-PyArray_ConvertClipmodeSequence(PyObject *object, NPY_CLIPMODE *modes, int n)
-{
-    int i;
-    /* Get the clip mode(s) */
-    if (object && (PyTuple_Check(object) || PyList_Check(object))) {
-        if (PySequence_Size(object) != n) {
-            PyErr_Format(PyExc_ValueError,
-                    "list of clipmodes has wrong length (%d instead of %d)",
-                    (int)PySequence_Size(object), n);
-            return PY_FAIL;
-        }
-
-        for (i = 0; i < n; ++i) {
-            PyObject *item = PySequence_GetItem(object, i);
-            if(item == NULL) {
-                return PY_FAIL;
-            }
-
-            if(PyArray_ClipmodeConverter(item, &modes[i]) != PY_SUCCEED) {
-                Py_DECREF(item);
-                return PY_FAIL;
-            }
-
-            Py_DECREF(item);
-        }
-    }
-    else if (PyArray_ClipmodeConverter(object, &modes[0]) == PY_SUCCEED) {
-        for (i = 1; i < n; ++i) {
-            modes[i] = modes[0];
-        }
-    }
-    else {
-        return PY_FAIL;
-    }
-    return PY_SUCCEED;
-}
-
 /*
  * Compare the field dictionaries for two types.
  *
@@ -1510,8 +1540,8 @@ PyArray_EquivTypenums(int typenum1, int typenum2)
 static PyObject *
 _prepend_ones(PyArrayObject *arr, int nd, int ndmin)
 {
-    npy_intp newdims[MAX_DIMS];
-    npy_intp newstrides[MAX_DIMS];
+    npy_intp newdims[NPY_MAXDIMS];
+    npy_intp newstrides[NPY_MAXDIMS];
     int i, k, num;
     PyArrayObject *ret;
     PyArray_Descr *dtype;
@@ -1530,7 +1560,9 @@ _prepend_ones(PyArrayObject *arr, int nd, int ndmin)
     Py_INCREF(dtype);
     ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(arr),
                         dtype, ndmin, newdims, newstrides,
-                        PyArray_DATA(arr), PyArray_FLAGS(arr), (PyObject *)arr);
+                        PyArray_DATA(arr),
+            PyArray_FLAGS(arr) & ~(NPY_ARRAY_MASKNA | NPY_ARRAY_OWNMASKNA),
+                        (PyObject *)arr);
     if (ret == NULL) {
         return NULL;
     }
@@ -1539,17 +1571,34 @@ _prepend_ones(PyArrayObject *arr, int nd, int ndmin)
         Py_DECREF(ret);
         return NULL;
     }
+
+    /* Take a view of the NA mask as well if necessary */
+    if (PyArray_HASMASKNA(arr)) {
+        PyArrayObject_fields *fret = (PyArrayObject_fields *)ret;
+
+        fret->maskna_dtype = PyArray_MASKNA_DTYPE(arr);
+        Py_INCREF(fret->maskna_dtype);
+        fret->maskna_data = PyArray_MASKNA_DATA(arr);
+
+        for (i = 0; i < num; ++i) {
+            fret->maskna_strides[i] = 0;
+        }
+        for (i = num; i < ndmin; ++i) {
+            fret->maskna_strides[i] = PyArray_MASKNA_STRIDES(arr)[i - num];
+        }
+        fret->flags |= NPY_ARRAY_MASKNA;
+    }
+
+
     return (PyObject *)ret;
 }
 
 
-#define _ARET(x) PyArray_Return((PyArrayObject *)(x))
-
-#define STRIDING_OK(op, order) ((order) == NPY_ANYORDER ||          \
-                                ((order) == NPY_CORDER &&           \
-                                 PyArray_ISCONTIGUOUS(op)) ||           \
-                                ((order) == NPY_FORTRANORDER &&     \
-                                 PyArray_ISFORTRAN(op)))
+#define STRIDING_OK(op, order) \
+                ((order) == NPY_ANYORDER || \
+                 (order) == NPY_KEEPORDER || \
+                 ((order) == NPY_CORDER && PyArray_ISCONTIGUOUS(op)) || \
+                 ((order) == NPY_FORTRANORDER && PyArray_ISFORTRAN(op)))
 
 static PyObject *
 _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws)
@@ -1561,40 +1610,88 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws)
     int ndmin = 0, nd;
     PyArray_Descr *type = NULL;
     PyArray_Descr *oldtype = NULL;
-    NPY_ORDER order = NPY_ANYORDER;
-    int flags = 0;
+    NPY_ORDER order = NPY_KEEPORDER;
+    int flags = 0, maskna = -1, ownmaskna = 0;
+    PyObject *maskna_in = Py_None;
 
     static char *kwd[]= {"object", "dtype", "copy", "order", "subok",
-                         "ndmin", NULL};
+                         "ndmin", "maskna", "ownmaskna", NULL};
 
     if (PyTuple_GET_SIZE(args) > 2) {
         PyErr_SetString(PyExc_ValueError,
                         "only 2 non-keyword arguments accepted");
         return NULL;
     }
-    if(!PyArg_ParseTupleAndKeywords(args, kws, "O|O&O&O&O&i", kwd, &op,
+    if(!PyArg_ParseTupleAndKeywords(args, kws, "O|O&O&O&O&iOi", kwd,
+                &op,
                 PyArray_DescrConverter2, &type,
                 PyArray_BoolConverter, &copy,
                 PyArray_OrderConverter, &order,
                 PyArray_BoolConverter, &subok,
-                &ndmin)) {
+                &ndmin,
+                &maskna_in,
+                &ownmaskna)) {
         goto clean_type;
     }
 
+    /*
+     * Treat None the same as not providing the parameter, set
+     * maskna to -1 (unprovided), 0 (False), or 1 (True).
+     */
+    if (maskna_in != Py_None) {
+        maskna = PyObject_IsTrue(maskna_in);
+        if (maskna == -1) {
+            goto clean_type;
+        }
+    }
+
+    /* 'ownmaskna' forces 'maskna' to be True; errors must release 'type' */
+    if (ownmaskna) {
+        if (maskna == 0) {
+            PyErr_SetString(PyExc_ValueError,
+                    "cannot specify maskna=False and ownmaskna=True");
+            goto clean_type;
+        }
+        else {
+            maskna = 1;
+        }
+    }
+
     if (ndmin > NPY_MAXDIMS) {
         PyErr_Format(PyExc_ValueError,
-                "ndmin bigger than allowable number of dimensions "\
+                "ndmin bigger than allowable number of dimensions "
                 "NPY_MAXDIMS (=%d)", NPY_MAXDIMS);
         goto clean_type;
     }
     /* fast exit if simple call */
-    if ((subok && PyArray_Check(op))
-            || (!subok && PyArray_CheckExact(op))) {
+    if (((subok && PyArray_Check(op)) ||
+                 (!subok && PyArray_CheckExact(op))) &&
+              ((maskna == -1) ||
+               (maskna == 1 && PyArray_HASMASKNA((PyArrayObject *)op)) ||
+               (maskna == 0 && !PyArray_HASMASKNA((PyArrayObject *)op)))) {
         oparr = (PyArrayObject *)op;
         if (type == NULL) {
             if (!copy && STRIDING_OK(oparr, order)) {
-                Py_INCREF(op);
-                ret = oparr;
+                /*
+                 * If mask ownership is requested and the array doesn't
+                 * already own its own mask, make a view which owns its
+                 * own mask.
+                 */
+                if (ownmaskna &&
+                            !(PyArray_FLAGS(oparr) & NPY_ARRAY_OWNMASKNA)) {
+                    ret = (PyArrayObject *)PyArray_View(oparr, NULL, NULL);
+                    if (ret == NULL) {
+                        return NULL;
+                    }
+                    if (PyArray_AllocateMaskNA(ret, 1, 0, 1) < 0) {
+                        Py_DECREF(ret);
+                        return NULL;
+                    }
+                }
+                else {
+                    ret = oparr;
+                    Py_INCREF(ret);
+                }
                 goto finish;
             }
             else {
@@ -1605,7 +1702,7 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws)
         /* One more chance */
         oldtype = PyArray_DESCR(oparr);
         if (PyArray_EquivTypes(oldtype, type)) {
-            if (!copy && STRIDING_OK(oparr, order)) {
+            if (!copy && !ownmaskna && STRIDING_OK(oparr, order)) {
                 Py_INCREF(op);
                 ret = oparr;
                 goto finish;
@@ -1617,7 +1714,7 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws)
                 }
                 Py_INCREF(oldtype);
                 Py_DECREF(PyArray_DESCR(ret));
-                ((PyArrayObject_fieldaccess *)ret)->descr = oldtype;
+                ((PyArrayObject_fields *)ret)->descr = oldtype;
                 goto finish;
             }
         }
@@ -1639,6 +1736,23 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws)
         flags |= NPY_ARRAY_ENSUREARRAY;
     }
 
+    /* If maskna is the default, allow NA to pass through */
+    if (maskna == -1) {
+        flags |= NPY_ARRAY_ALLOWNA;
+    }
+    /* If maskna is True, force there to be an NA mask */
+    else if (maskna == 1) {
+        flags |= NPY_ARRAY_MASKNA | NPY_ARRAY_ALLOWNA;
+        if (ownmaskna) {
+            flags |= NPY_ARRAY_OWNMASKNA;
+        }
+    }
+    /*
+     * Otherwise maskna is False, so we don't specify NPY_ARRAY_ALLOWNA.
+     * An array with an NA mask will cause a copy into an array
+     * without an NA mask
+     */
+
     flags |= NPY_ARRAY_FORCECAST;
     Py_XINCREF(type);
     ret = (PyArrayObject *)PyArray_CheckFromAny(op, type,
@@ -1646,10 +1760,12 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws)
 
  finish:
     Py_XDECREF(type);
-    if (!ret) {
-        return (PyObject *)ret;
+    if (ret == NULL) {
+        return NULL;
     }
-    else if ((nd=PyArray_NDIM(ret)) >= ndmin) {
+
+    nd = PyArray_NDIM(ret);
+    if (nd >= ndmin) {
         return (PyObject *)ret;
     }
     /*
@@ -1667,16 +1783,18 @@ static PyObject *
 array_copyto(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 {
 
-    static char *kwlist[] = {"dst","src","casting","where",NULL};
+    static char *kwlist[] = {"dst","src","casting","where","preservena",NULL};
     PyObject *wheremask_in = NULL;
     PyArrayObject *dst = NULL, *src = NULL, *wheremask = NULL;
     NPY_CASTING casting = NPY_SAME_KIND_CASTING;
+    int preservena = 0;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!O&|O&O", kwlist,
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!O&|O&Oi", kwlist,
                 &PyArray_Type, &dst,
-                &PyArray_Converter, &src,
+                &PyArray_AllowNAConverter, &src,
                 &PyArray_CastingConverter, &casting,
-                &wheremask_in)) {
+                &wheremask_in,
+                &preservena)) {
         goto fail;
     }
 
@@ -1687,41 +1805,15 @@ array_copyto(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
             goto fail;
         }
         wheremask = (PyArrayObject *)PyArray_FromAny(wheremask_in,
-                                                dtype, 0, 0, 0, NULL);
+                                        dtype, 0, 0, NPY_ARRAY_ALLOWNA, NULL);
         if (wheremask == NULL) {
             goto fail;
         }
-
-        /* Use the 'move' function which handles overlapping */
-        if (PyArray_MaskedMoveInto(dst, src, wheremask, casting) < 0) {
-            goto fail;
-        }
     }
-    else {
-        /*
-         * MoveInto doesn't accept a casting rule, must check it
-         * ourselves.
-         */
-        if (!PyArray_CanCastArrayTo(src, PyArray_DESCR(dst), casting)) {
-            PyObject *errmsg;
-            errmsg = PyUString_FromString("Cannot cast array data from ");
-            PyUString_ConcatAndDel(&errmsg,
-                    PyObject_Repr((PyObject *)PyArray_DESCR(src)));
-            PyUString_ConcatAndDel(&errmsg,
-                    PyUString_FromString(" to "));
-            PyUString_ConcatAndDel(&errmsg,
-                    PyObject_Repr((PyObject *)PyArray_DESCR(dst)));
-            PyUString_ConcatAndDel(&errmsg,
-                    PyUString_FromFormat(" according to the rule %s",
-                            npy_casting_to_string(casting)));
-            PyErr_SetObject(PyExc_TypeError, errmsg);
-            goto fail;
-        }
 
-        /* Use the 'move' function which handles overlapping */
-        if (PyArray_MoveInto(dst, src) < 0) {
-            goto fail;
-        }
+    if (PyArray_AssignArray(dst, src,
+                    wheremask, casting, preservena, NULL) < 0) {
+        goto fail;
     }
 
     Py_XDECREF(src);
@@ -1740,17 +1832,19 @@ static PyObject *
 array_empty(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 {
 
-    static char *kwlist[] = {"shape","dtype","order",NULL};
+    static char *kwlist[] = {"shape","dtype","order","maskna",NULL};
     PyArray_Descr *typecode = NULL;
     PyArray_Dims shape = {NULL, 0};
     NPY_ORDER order = NPY_CORDER;
     npy_bool is_f_order;
-    PyObject *ret = NULL;
+    PyArrayObject *ret = NULL;
+    int maskna = 0;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&", kwlist,
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&i", kwlist,
                 PyArray_IntpConverter, &shape,
                 PyArray_DescrConverter, &typecode,
-                PyArray_OrderConverter, &order)) {
+                PyArray_OrderConverter, &order,
+                &maskna)) {
         goto fail;
     }
 
@@ -1767,9 +1861,22 @@ array_empty(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
             goto fail;
     }
 
-    ret = PyArray_Empty(shape.len, shape.ptr, typecode, is_f_order);
+    ret = (PyArrayObject *)PyArray_Empty(shape.len, shape.ptr,
+                                            typecode, is_f_order);
+
+    /*
+     * Default the mask to all NA values, because the data is not
+     * initialized. Nothing to do if creating the array itself failed.
+     */
+    if (maskna && ret != NULL &&
+                PyArray_AllocateMaskNA(ret, 1, 0, 0) < 0) {
+        /* Fall through with ret == NULL so shape.ptr is still freed */
+        Py_DECREF(ret);
+        ret = NULL;
+    }
+
     PyDimMem_FREE(shape.ptr);
-    return ret;
+    return (PyObject *)ret;
 
  fail:
     Py_XDECREF(typecode);
@@ -1781,24 +1888,38 @@ static PyObject *
 array_empty_like(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 {
 
-    static char *kwlist[] = {"prototype","dtype","order","subok",NULL};
+    static char *kwlist[] = {"prototype","dtype","order","subok","maskna",NULL};
     PyArrayObject *prototype = NULL;
     PyArray_Descr *dtype = NULL;
     NPY_ORDER order = NPY_KEEPORDER;
-    PyObject *ret = NULL;
-    int subok = 1;
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&i", kwlist,
-                PyArray_Converter, &prototype,
-                PyArray_DescrConverter2, &dtype,
-                PyArray_OrderConverter, &order,
-                &subok)) {
+    PyArrayObject *ret = NULL;
+    int subok = 1, maskna = 0;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&ii", kwlist,
+                &PyArray_AllowNAConverter, &prototype,
+                &PyArray_DescrConverter2, &dtype,
+                &PyArray_OrderConverter, &order,
+                &subok,
+                &maskna)) {
         goto fail;
     }
     /* steals the reference to dtype if it's not NULL */
-    ret = PyArray_NewLikeArray(prototype, order, dtype, subok);
+    ret = (PyArrayObject *)PyArray_NewLikeArray(prototype,
+                                            order, dtype, subok);
     Py_DECREF(prototype);
-    return ret;
+
+    if (maskna && ret != NULL) {
+        /*
+         * Default the mask to all NA values, because the data is
+         * not initialized (skipped when the array creation failed)
+         */
+        if (PyArray_AllocateMaskNA(ret, 1, 0, 0) < 0) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+    }
+
+    return (PyObject *)ret;
 
  fail:
     Py_XDECREF(prototype);
@@ -1840,7 +1961,7 @@ array_scalar(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
     }
     else {
         if (obj == NULL) {
-            dptr = _pya_malloc(typecode->elsize);
+            dptr = PyArray_malloc(typecode->elsize);
             if (dptr == NULL) {
                 return PyErr_NoMemory();
             }
@@ -1865,7 +1986,7 @@ array_scalar(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 
     /* free dptr which contains zeros */
     if (alloc) {
-        _pya_free(dptr);
+        PyArray_free(dptr);
     }
     return ret;
 }
@@ -1873,17 +1994,19 @@ array_scalar(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 static PyObject *
 array_zeros(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 {
-    static char *kwlist[] = {"shape","dtype","order",NULL}; /* XXX ? */
+    static char *kwlist[] = {"shape","dtype","order","maskna",NULL};
     PyArray_Descr *typecode = NULL;
     PyArray_Dims shape = {NULL, 0};
     NPY_ORDER order = NPY_CORDER;
     npy_bool is_f_order = FALSE;
-    PyObject *ret = NULL;
+    PyArrayObject *ret = NULL;
+    int maskna = 0;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&", kwlist,
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&i", kwlist,
                 PyArray_IntpConverter, &shape,
                 PyArray_DescrConverter, &typecode,
-                PyArray_OrderConverter, &order)) {
+                PyArray_OrderConverter, &order,
+                &maskna)) {
         goto fail;
     }
 
@@ -1900,48 +2023,112 @@ array_zeros(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
             goto fail;
     }
 
-    ret = PyArray_Zeros(shape.len, shape.ptr, typecode, (int) is_f_order);
+    ret = (PyArrayObject *)PyArray_Zeros(shape.len, shape.ptr,
+                                        typecode, (int) is_f_order);
+
+    /* Attach the NA mask; on failure fall through to free shape.ptr */
+    if (maskna && ret != NULL &&
+                PyArray_AllocateMaskNA(ret, 1, 0, 1) < 0) {
+        Py_DECREF(ret);
+        ret = NULL;
+    }
+
     PyDimMem_FREE(shape.ptr);
-    return ret;
+    return (PyObject *)ret;
 
  fail:
     Py_XDECREF(typecode);
     PyDimMem_FREE(shape.ptr);
-    return ret;
+    return (PyObject *)ret;
 }
 
 static PyObject *
-array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *args)
+array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
 {
-    PyObject *array_in;
-    PyArrayObject *array;
-    npy_intp count;
+    static char *kwlist[] = {"arr", "axis", "out", "skipna", "keepdims", NULL};
 
-    if (!PyArg_ParseTuple(args, "O", &array_in)) {
+    PyObject *array_in, *axis_in = NULL, *out_in = NULL;
+    PyObject *ret = NULL;
+    PyArrayObject *array, *out = NULL;
+    npy_bool axis_flags[NPY_MAXDIMS];
+    int skipna = 0, keepdims = 0;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds,
+                                "O|OOii:count_nonzero", kwlist,
+                                &array_in,
+                                &axis_in,
+                                &out_in,
+                                &skipna,
+                                &keepdims)) {
         return NULL;
     }
 
-    array = (PyArrayObject *)PyArray_FromAny(array_in, NULL, 0, 0, 0, NULL);
+    array = (PyArrayObject *)PyArray_FromAny(array_in, NULL,
+                                        0, 0, NPY_ARRAY_ALLOWNA, NULL);
     if (array == NULL) {
         return NULL;
     }
 
-    count =  PyArray_CountNonzero(array);
+    if (PyArray_ConvertMultiAxis(axis_in, PyArray_NDIM(array),
+                                        axis_flags) != NPY_SUCCEED) {
+        Py_DECREF(array);
+        return NULL;
+    }
+
+    /* 'out' is used as passed in (no new reference is taken here) */
+    if (out_in != NULL) {
+        if (!PyArray_Check(out_in)) {
+            PyErr_SetString(PyExc_TypeError, "'out' must be an array");
+            Py_DECREF(array);
+            return NULL;
+        }
+        out = (PyArrayObject *)out_in;
+    }
+
+    ret = PyArray_ReduceCountNonzero(array, out, axis_flags, skipna, keepdims);
 
     Py_DECREF(array);
 
-#if defined(NPY_PY3K)
-    return (count == -1) ? NULL : PyLong_FromSsize_t(count);
-#elif PY_VERSION_HEX >= 0x02050000
-    return (count == -1) ? NULL : PyInt_FromSsize_t(count);
-#else
-    if ((npy_intp)((long)count) == count) {
-        return (count == -1) ? NULL : PyInt_FromLong(count);
+    return ret;
+}
+
+static PyObject *
+array_count_reduce_items(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
+{
+    static char *kwlist[] = {"arr", "axis", "skipna", "keepdims", NULL};
+
+    PyObject *array_in, *axis_in = NULL;
+    PyObject *ret = NULL;
+    PyArrayObject *array;
+    npy_bool axis_flags[NPY_MAXDIMS];
+    int skipna = 0, keepdims = 0;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds,
+                                "O|Oii:count_reduce_items", kwlist,
+                                &array_in,
+                                &axis_in,
+                                &skipna,
+                                &keepdims)) {
+        return NULL;
     }
-    else {
-        return (count == -1) ? NULL : PyLong_FromVoidPtr((void*)count);
+
+    array = (PyArrayObject *)PyArray_FromAny(array_in, NULL,
+                                        0, 0, NPY_ARRAY_ALLOWNA, NULL);
+    if (array == NULL) {
+        return NULL;
     }
-#endif
+
+    if (PyArray_ConvertMultiAxis(axis_in, PyArray_NDIM(array),
+                                        axis_flags) != NPY_SUCCEED) {
+        Py_DECREF(array);
+        return NULL;
+    }
+
+    ret = PyArray_CountReduceItems(array, axis_flags, skipna, keepdims);
+
+    Py_DECREF(array);
+
+    return ret;
 }
 
 static PyObject *
@@ -2070,7 +2257,7 @@ array_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
     if (!PyArg_ParseTuple(args, "OO", &a0, &b0)) {
         return NULL;
     }
-    return _ARET(PyArray_InnerProduct(a0, b0));
+    return PyArray_Return((PyArrayObject *)PyArray_InnerProduct(a0, b0));
 }
 
 static PyObject *
@@ -2090,7 +2277,7 @@ array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds)
                         "'out' must be an array");
         return NULL;
     }
-    return _ARET(PyArray_MatrixProduct2(a, v, (PyArrayObject *)o));
+    return PyArray_Return((PyArrayObject *)PyArray_MatrixProduct2(a, v, (PyArrayObject *)o));
 }
 
 static int
@@ -2431,7 +2618,7 @@ array_einsum(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
 
     /* If no output was supplied, possibly convert to a scalar */
     if (ret != NULL && out == NULL) {
-        ret = _ARET(ret);
+        ret = PyArray_Return((PyArrayObject *)ret);
     }
 
 finish:
@@ -2447,6 +2634,18 @@ finish:
 }
 
 static PyObject *
+array_isna(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
+{
+    PyObject *a;
+    static char *kwlist[] = {"a", NULL};
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:isna", kwlist, &a)) {
+        return NULL;
+    }
+    return PyArray_IsNA(a);
+}
+
+static PyObject *
 array_fastCopyAndTranspose(PyObject *NPY_UNUSED(dummy), PyObject *args)
 {
     PyObject *a0;
@@ -2454,7 +2653,7 @@ array_fastCopyAndTranspose(PyObject *NPY_UNUSED(dummy), PyObject *args)
     if (!PyArg_ParseTuple(args, "O", &a0)) {
         return NULL;
     }
-    return _ARET(PyArray_CopyAndTranspose(a0));
+    return PyArray_Return((PyArrayObject *)PyArray_CopyAndTranspose(a0));
 }
 
 static PyObject *
@@ -2488,17 +2687,30 @@ array_correlate2(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
 static PyObject *
 array_arange(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws) {
     PyObject *o_start = NULL, *o_stop = NULL, *o_step = NULL, *range=NULL;
-    static char *kwd[]= {"start", "stop", "step", "dtype", NULL};
+    static char *kwd[]= {"start", "stop", "step", "dtype", "maskna", NULL};
     PyArray_Descr *typecode = NULL;
-
-    if(!PyArg_ParseTupleAndKeywords(args, kws, "O|OOO&", kwd,
-                &o_start, &o_stop, &o_step,
-                PyArray_DescrConverter2, &typecode)) {
+    int maskna = 0;
+
+    if(!PyArg_ParseTupleAndKeywords(args, kws, "O|OOO&i", kwd,
+                &o_start,
+                &o_stop,
+                &o_step,
+                PyArray_DescrConverter2, &typecode,
+                &maskna)) {
         Py_XDECREF(typecode);
         return NULL;
     }
     range = PyArray_ArangeObj(o_start, o_stop, o_step, typecode);
     Py_XDECREF(typecode);
+
+    /* Allocate an NA mask if requested and the range was created */
+    if (maskna && range != NULL) {
+        if (PyArray_AllocateMaskNA((PyArrayObject *)range, 1, 0, 1) < 0) {
+            Py_DECREF(range);
+            return NULL;
+        }
+    }
+
     return range;
 }
 
@@ -2640,7 +2852,7 @@ static PyObject *
 array_set_datetimeparse_function(PyObject *NPY_UNUSED(self),
         PyObject *NPY_UNUSED(args), PyObject *NPY_UNUSED(kwds))
 {
-    PyErr_SetString(PyExc_RuntimeError, "This function is to be removed");
+    PyErr_SetString(PyExc_RuntimeError, "This function has been removed");
     return NULL;
 }
 
@@ -2685,7 +2897,7 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y)
         Py_DECREF(obj);
         return NULL;
     }
-    ret = PyArray_Choose((PyAO *)obj, tup, NULL, NPY_RAISE);
+    ret = PyArray_Choose((PyArrayObject *)obj, tup, NULL, NPY_RAISE);
     Py_DECREF(obj);
     Py_DECREF(tup);
     return ret;
@@ -2712,11 +2924,9 @@ array_lexsort(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|i", kwlist, &obj, &axis)) {
         return NULL;
     }
-    return _ARET(PyArray_LexSort(obj, axis));
+    return PyArray_Return((PyArrayObject *)PyArray_LexSort(obj, axis));
 }
 
-#undef _ARET
-
 static PyObject *
 array_can_cast_safely(PyObject *NPY_UNUSED(self), PyObject *args,
         PyObject *kwds)
@@ -2749,7 +2959,7 @@ array_can_cast_safely(PyObject *NPY_UNUSED(self), PyObject *args,
                                 PyArray_IsPythonNumber(from_obj)) {
         PyArrayObject *arr;
         arr = (PyArrayObject *)PyArray_FromAny(from_obj,
-                                        NULL, 0, 0, 0, NULL);
+                                        NULL, 0, 0, NPY_ARRAY_ALLOWNA, NULL);
         if (arr == NULL) {
             goto finish;
         }
@@ -2811,7 +3021,8 @@ array_min_scalar_type(PyObject *NPY_UNUSED(dummy), PyObject *args)
         return NULL;
     }
 
-    array = (PyArrayObject *)PyArray_FromAny(array_in, NULL, 0, 0, 0, NULL);
+    array = (PyArrayObject *)PyArray_FromAny(array_in,
+                                NULL, 0, 0, NPY_ARRAY_ALLOWNA, NULL);
     if (array == NULL) {
         return NULL;
     }
@@ -2856,7 +3067,7 @@ array_result_type(PyObject *NPY_UNUSED(dummy), PyObject *args)
                 goto finish;
             }
             arr[narr] = (PyArrayObject *)PyArray_FromAny(obj,
-                                            NULL, 0, 0, 0, NULL);
+                                        NULL, 0, 0, NPY_ARRAY_ALLOWNA, NULL);
             if (arr[narr] == NULL) {
                 goto finish;
             }
@@ -2953,17 +3164,17 @@ _SigSegv_Handler(int signum)
 }
 #endif
 
-#define _test_code() {                          \
-        test = *((char*)memptr);                \
-        if (!ro) {                              \
-            *((char *)memptr) = '\0';           \
-            *((char *)memptr) = test;           \
-        }                                       \
-        test = *((char*)memptr+size-1);         \
-        if (!ro) {                              \
-            *((char *)memptr+size-1) = '\0';    \
-            *((char *)memptr+size-1) = test;    \
-        }                                       \
+#define _test_code() { \
+        test = *((char*)memptr); \
+        if (!ro) { \
+            *((char *)memptr) = '\0'; \
+            *((char *)memptr) = test; \
+        } \
+        test = *((char*)memptr+size-1); \
+        if (!ro) { \
+            *((char *)memptr+size-1) = '\0'; \
+            *((char *)memptr+size-1) = test; \
+        } \
     }
 
 static PyObject *
@@ -3465,7 +3676,10 @@ static struct PyMethodDef array_module_methods[] = {
         METH_VARARGS|METH_KEYWORDS, NULL},
     {"count_nonzero",
         (PyCFunction)array_count_nonzero,
-        METH_VARARGS, NULL},
+        METH_VARARGS|METH_KEYWORDS, NULL},
+    {"count_reduce_items",
+        (PyCFunction)array_count_reduce_items,
+        METH_VARARGS|METH_KEYWORDS, NULL},
     {"empty",
         (PyCFunction)array_empty,
         METH_VARARGS|METH_KEYWORDS, NULL},
@@ -3502,6 +3716,9 @@ static struct PyMethodDef array_module_methods[] = {
     {"einsum",
         (PyCFunction)array_einsum,
         METH_VARARGS|METH_KEYWORDS, NULL},
+    {"isna",
+        (PyCFunction)array_isna,
+        METH_VARARGS|METH_KEYWORDS, NULL},
     {"_fastCopyAndTranspose",
         (PyCFunction)array_fastCopyAndTranspose,
         METH_VARARGS, NULL},
@@ -3824,7 +4041,7 @@ PyMODINIT_FUNC initmultiarray(void) {
     if (!d) {
         goto err;
     }
-    PyArray_Type.tp_free = _pya_free;
+    PyArray_Type.tp_free = PyArray_free;
     if (PyType_Ready(&PyArray_Type) < 0) {
         return RETVAL;
     }
@@ -3834,7 +4051,7 @@ PyMODINIT_FUNC initmultiarray(void) {
     PyArrayIter_Type.tp_iter = PyObject_SelfIter;
     NpyIter_Type.tp_iter = PyObject_SelfIter;
     PyArrayMultiIter_Type.tp_iter = PyObject_SelfIter;
-    PyArrayMultiIter_Type.tp_free = _pya_free;
+    PyArrayMultiIter_Type.tp_free = PyArray_free;
     if (PyType_Ready(&PyArrayIter_Type) < 0) {
         return RETVAL;
     }
@@ -3851,6 +4068,9 @@ PyMODINIT_FUNC initmultiarray(void) {
     if (PyType_Ready(&NpyIter_Type) < 0) {
         return RETVAL;
     }
+    if (PyType_Ready(&NpyNA_Type) < 0) {
+        return RETVAL;
+    }
 
     PyArrayDescr_Type.tp_hash = PyArray_DescrHash;
     if (PyType_Ready(&PyArrayDescr_Type) < 0) {
@@ -3946,6 +4166,12 @@ PyMODINIT_FUNC initmultiarray(void) {
     Py_INCREF(&NpyBusDayCalendar_Type);
     PyDict_SetItemString(d, "busdaycalendar",
                             (PyObject *)&NpyBusDayCalendar_Type);
+    /* NA Type */
+    PyDict_SetItemString(d, "NAType", (PyObject *)&NpyNA_Type);
+    Py_INCREF(&NpyNA_Type);
+    /* NA  Singleton */
+    Py_INCREF(Npy_NA);
+    PyDict_SetItemString(d, "NA", Npy_NA);
 
     set_flaginfo(d);
 
diff --git a/numpy/core/src/multiarray/multiarraymodule_onefile.c b/numpy/core/src/multiarray/multiarraymodule_onefile.c
index da732c600..15d281d2e 100644
--- a/numpy/core/src/multiarray/multiarraymodule_onefile.c
+++ b/numpy/core/src/multiarray/multiarraymodule_onefile.c
@@ -45,7 +45,13 @@
 #include "lowlevel_strided_loops.c"
 #include "dtype_transfer.c"
 #include "einsum.c"
-
+#include "array_assign.c"
+#include "array_assign_scalar.c"
+#include "array_assign_array.c"
+#include "reduction.c"
+#include "na_mask.c"
+#include "na_object.c"
+#include "boolean_ops.c"
 
 #ifndef Py_UNICODE_WIDE
 #include "ucsnarrow.c"
diff --git a/numpy/core/src/multiarray/na_mask.c b/numpy/core/src/multiarray/na_mask.c
new file mode 100644
index 000000000..ce3fe3863
--- /dev/null
+++ b/numpy/core/src/multiarray/na_mask.c
@@ -0,0 +1,955 @@
+/*
+ * This file implements missing value NA mask support for the NumPy array.
+ *
+ * Written by Mark Wiebe (mwwiebe@gmail.com)
+ * Copyright (c) 2011 by Enthought, Inc.
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API
+#define _MULTIARRAYMODULE
+#include <numpy/arrayobject.h>
+
+#include "npy_config.h"
+#include "numpy/npy_3kcompat.h"
+
+#include "shape.h"
+#include "lowlevel_strided_loops.h"
+#include "array_assign.h"
+#include "na_object.h"
+
+/*NUMPY_API
+ *
+ * Reports whether 'arr' can hold NA values. At present this is
+ * true exactly when the array has an NA mask; once NA dtypes are
+ * implemented, an array whose dtype supports NA will qualify too.
+ */
+NPY_NO_EXPORT npy_bool
+PyArray_HasNASupport(PyArrayObject *arr)
+{
+    npy_bool has_maskna = PyArray_HASMASKNA(arr);
+
+    return has_maskna;
+}
+
+/*NUMPY_API
+ *
+ * Tests whether 'arr' holds at least one NA.
+ *
+ * An array without NA support never contains an NA, so the answer
+ * for such an array is always False.
+ *
+ * arr: The array to inspect.
+ * wheremask: If non-NULL, a boolean array broadcastable to 'arr';
+ *            only positions where it is True are inspected. It may
+ *            not contain NA values itself.
+ * whichna: Reserved for future multi-NA support, must be NULL.
+ *
+ * Returns -1 on failure, otherwise 0 for False and 1 for True.
+ */
+NPY_NO_EXPORT int
+PyArray_ContainsNA(PyArrayObject *arr, PyArrayObject *wheremask,
+                    npy_bool *whichna)
+{
+    int idim, ndim, where_has_na;
+    char *mask_ptr, *where_ptr;
+    npy_intp k, coord[NPY_MAXDIMS];
+    npy_intp shape_it[NPY_MAXDIMS], mask_strides_it[NPY_MAXDIMS];
+    npy_intp where_strides_bcast[NPY_MAXDIMS];
+    npy_intp where_strides_it[NPY_MAXDIMS];
+
+    /* The expansion parameter has to be left as NULL for now */
+    if (whichna != NULL) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "multi-NA is not yet supported in PyArray_ContainsNA");
+        return -1;
+    }
+
+    /* Simple case: look at every element of the NA mask */
+    if (wheremask == NULL) {
+        /* Without an NA mask there is nothing that could be NA */
+        if (!PyArray_HASMASKNA(arr)) {
+            return 0;
+        }
+
+        if (PyArray_HASFIELDS(arr)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                    "field-NA is not yet supported");
+            return -1;
+        }
+
+        /* Raw iteration needs no heap memory allocation */
+        if (PyArray_PrepareOneRawArrayIter(
+                    PyArray_NDIM(arr), PyArray_DIMS(arr),
+                    PyArray_MASKNA_DATA(arr), PyArray_MASKNA_STRIDES(arr),
+                    &ndim, shape_it,
+                    &mask_ptr, mask_strides_it) < 0) {
+            return -1;
+        }
+
+        NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+            char *m = mask_ptr;
+            /* Scan the innermost dimension for a hidden element */
+            for (k = 0; k < shape_it[0]; ++k) {
+                if (!NpyMaskValue_IsExposed((npy_mask)(*m))) {
+                    return 1;
+                }
+                m += mask_strides_it[0];
+            }
+        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape_it,
+                                mask_ptr, mask_strides_it);
+
+        return 0;
+    }
+
+    /* The where-mask itself has to be free of NA values */
+    where_has_na = PyArray_ContainsNA(wheremask, NULL, NULL);
+    if (where_has_na == -1) {
+        return -1;
+    }
+    if (where_has_na != 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "the where mask may not contain any NA values");
+        return -1;
+    }
+
+    /*
+     * The where-mask is broadcast onto 'arr' before looking at
+     * whether 'arr' has an NA mask, so that broadcasting errors
+     * are always reported.
+     */
+    if (broadcast_strides(PyArray_NDIM(arr), PyArray_DIMS(arr),
+                    PyArray_NDIM(wheremask), PyArray_DIMS(wheremask),
+                    PyArray_STRIDES(wheremask), "where mask",
+                    where_strides_bcast) < 0) {
+        return -1;
+    }
+
+    if (PyArray_DTYPE(wheremask)->type_num != NPY_BOOL) {
+        PyErr_SetString(PyExc_ValueError,
+                "the where mask must have a 'bool' dtype");
+        return -1;
+    }
+
+    /* Without an NA mask there is nothing that could be NA */
+    if (!PyArray_HASMASKNA(arr)) {
+        return 0;
+    }
+
+    if (PyArray_HASFIELDS(arr)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "field-NA is not yet supported");
+        return -1;
+    }
+
+    /* Raw iteration needs no heap memory allocation */
+    if (PyArray_PrepareTwoRawArrayIter(
+                PyArray_NDIM(arr), PyArray_DIMS(arr),
+                PyArray_MASKNA_DATA(arr), PyArray_MASKNA_STRIDES(arr),
+                PyArray_DATA(wheremask), where_strides_bcast,
+                &ndim, shape_it,
+                &mask_ptr, mask_strides_it,
+                &where_ptr, where_strides_it) < 0) {
+        return -1;
+    }
+
+    NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+        char *m = mask_ptr, *w = where_ptr;
+        /* Scan the innermost dimension, honoring the where-mask */
+        for (k = 0; k < shape_it[0]; ++k) {
+            if (*w && !NpyMaskValue_IsExposed((npy_mask)(*m))) {
+                return 1;
+            }
+            m += mask_strides_it[0];
+            w += where_strides_it[0];
+        }
+    } NPY_RAW_ITER_TWO_NEXT(idim, ndim, coord, shape_it,
+                            mask_ptr, mask_strides_it,
+                            where_ptr, where_strides_it);
+
+    return 0;
+}
+
+/*
+ * Fills a raw array whose dtype has size one with the specified byte.
+ *
+ * ndim, shape: The geometry of the array.
+ * data, strides: The data pointer and the strides of the array.
+ * fillvalue: The byte written to every element.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+fill_raw_byte_array(int ndim, npy_intp *shape,
+                char *data, npy_intp *strides, char fillvalue)
+{
+    int idim;
+    npy_intp shape_it[NPY_MAXDIMS], strides_it[NPY_MAXDIMS];
+    npy_intp i, coord[NPY_MAXDIMS];
+
+    /* Use raw iteration with no heap memory allocation */
+    if (PyArray_PrepareOneRawArrayIter(
+                    ndim, shape,
+                    data, strides,
+                    &ndim, shape_it,
+                    &data, strides_it) < 0) {
+        /*
+         * Propagate the failure the same way the other users of the
+         * raw iterator in this file do. The caller tests for a
+         * negative return value, so returning a positive value (or
+         * clearing the error) would make it continue with an
+         * unfilled destination.
+         */
+        return -1;
+    }
+
+    /* Special case contiguous inner stride */
+    if (strides_it[0] == 1) {
+        NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+            /* Process the innermost dimension */
+            memset(data, fillvalue, shape_it[0]);
+        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape_it, data, strides_it);
+    }
+    /* General inner stride */
+    else {
+        NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+            char *d = data;
+            /* Process the innermost dimension */
+            for (i = 0; i < shape_it[0]; ++i, d += strides_it[0]) {
+                *d = fillvalue;
+            }
+        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape_it, data, strides_it);
+    }
+
+    return 0;
+}
+
+/*NUMPY_API
+ *
+ * Writes 'maskvalue' into the NA mask of 'arr'. Use this either to
+ * mask all the elements of an array, or, if values are going to be
+ * assigned to every element at the same time, to unmask all of them.
+ *
+ * arr: The array whose NA mask is written. It must have an NA mask.
+ * maskvalue: The mask value to assign.
+ * wheremask: If non-NULL, a boolean mask, broadcast to 'arr', which
+ *            selects where the assignment is done.
+ * preservena, preservewhichna: NOT YET SUPPORTED, reserved for the
+ *            future expansion to multi-NA. 'preservewhichna' should
+ *            be NULL; 'preservena' has no effect for straight
+ *            NPY_BOOL NA masks, because different NAs are
+ *            indistinguishable.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_AssignMaskNA(PyArrayObject *arr, npy_mask maskvalue,
+                PyArrayObject *wheremask,
+                npy_bool preservena, npy_bool *preservewhichna)
+{
+    PyArray_Descr *src_dtype;
+    int status;
+
+    /* There must be an NA mask to write into */
+    if (!PyArray_HASMASKNA(arr)) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot assign to the NA mask of an "
+                "array which has no NA mask");
+        return -1;
+    }
+
+    if (preservewhichna != NULL) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "multi-NA support is not yet implemented");
+        return -1;
+    }
+
+    /*
+     * A mask value which carries no payload is assigned as a boolean,
+     * anything else is assigned as the mask type.
+     */
+    src_dtype = PyArray_DescrFromType(
+                    ((maskvalue & (~0x01)) == 0) ? NPY_BOOL : NPY_MASK);
+    if (src_dtype == NULL) {
+        return -1;
+    }
+
+    if (wheremask != NULL) {
+        npy_intp where_strides[NPY_MAXDIMS];
+
+        /* Conform the strides of the where-mask to the shape of 'arr' */
+        if (broadcast_strides(PyArray_NDIM(arr), PyArray_DIMS(arr),
+                    PyArray_NDIM(wheremask), PyArray_DIMS(wheremask),
+                    PyArray_STRIDES(wheremask), "where mask",
+                    where_strides) < 0) {
+            status = -1;
+        }
+        else {
+            status = raw_array_wheremasked_assign_scalar(
+                            PyArray_NDIM(arr), PyArray_DIMS(arr),
+                            PyArray_MASKNA_DTYPE(arr),
+                            PyArray_MASKNA_DATA(arr),
+                            PyArray_MASKNA_STRIDES(arr),
+                            src_dtype, (char *)&maskvalue,
+                            PyArray_DESCR(wheremask),
+                            PyArray_DATA(wheremask),
+                            where_strides);
+        }
+    }
+    else {
+        status = raw_array_assign_scalar(
+                        PyArray_NDIM(arr), PyArray_DIMS(arr),
+                        PyArray_MASKNA_DTYPE(arr),
+                        PyArray_MASKNA_DATA(arr),
+                        PyArray_MASKNA_STRIDES(arr),
+                        src_dtype, (char *)&maskvalue);
+    }
+
+    /* Single exit point releasing the source dtype reference */
+    Py_DECREF(src_dtype);
+    return status;
+}
+
+/*NUMPY_API
+ *
+ * If the array does not have an NA mask already, allocates one for it.
+ *
+ * If 'ownmaskna' is True, it also allocates one for it if the array does
+ * not already own its own mask, then copies the data from the old mask
+ * to the new mask.
+ *
+ * If 'multina' is True, the mask is allocated with an NPY_MASK dtype
+ * instead of NPY_BOOL.
+ *
+ * If a new mask is allocated, and no mask was there to copy from,
+ * the mask is filled with the 'defaultmask' value. Normally you
+ * set this to 1, so all the values are exposed.
+ *
+ * On success with a newly allocated mask, the array's maskna_dtype,
+ * maskna_data and maskna_strides are replaced and both the
+ * NPY_ARRAY_MASKNA and NPY_ARRAY_OWNMASKNA flags are set. Arrays with
+ * fields are rejected with a RuntimeError.
+ *
+ * Returns -1 on failure, 0 on success.
+ */
+NPY_NO_EXPORT int
+PyArray_AllocateMaskNA(PyArrayObject *arr,
+                npy_bool ownmaskna,
+                npy_bool multina,
+                npy_mask defaultmask)
+{
+    PyArrayObject_fields *fa = (PyArrayObject_fields *)arr;
+    PyArray_Descr *maskna_dtype = NULL;
+    char *maskna_data = NULL;
+    npy_intp size;
+
+    /* If the array already owns a mask, done */
+    if (fa->flags & NPY_ARRAY_OWNMASKNA) {
+        return 0;
+    }
+
+    /* If ownership wasn't requested, and there's already a mask, done */
+    if (!ownmaskna && (fa->flags & NPY_ARRAY_MASKNA)) {
+        return 0;
+    }
+
+    /* Number of elements, which is also the number of mask elements */
+    size = PyArray_SIZE(arr);
+
+    /* Create the mask dtype */
+    if (PyArray_HASFIELDS(arr)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "NumPy field-NA isn't supported yet");
+        return -1;
+    }
+    else {
+        maskna_dtype = PyArray_DescrFromType(multina ? NPY_MASK
+                                                         : NPY_BOOL);
+        if (maskna_dtype == NULL) {
+            return -1;
+        }
+    }
+
+    /* Allocate the mask memory */
+    maskna_data = PyArray_malloc(size * maskna_dtype->elsize);
+    if (maskna_data == NULL) {
+        Py_DECREF(maskna_dtype);
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    /* Copy the data and fill in the strides */
+    if (fa->nd == 1) {
+        /* If there already was a mask copy it, otherwise fill with 'defaultmask' */
+        if (fa->flags & NPY_ARRAY_MASKNA) {
+            /* Unit stride: the old mask is copied as raw bytes */
+            if (fa->maskna_strides[0] == 1) {
+                memcpy(maskna_data, fa->maskna_data,
+                            size * maskna_dtype->elsize);
+            }
+            /* Otherwise do a strided copy/cast into the packed new mask */
+            else {
+                if (PyArray_CastRawArrays(fa->dimensions[0],
+                                (char *)fa->maskna_data, maskna_data,
+                                fa->maskna_strides[0], maskna_dtype->elsize,
+                                fa->maskna_dtype, maskna_dtype, 0) < 0) {
+                    Py_DECREF(maskna_dtype);
+                    PyArray_free(maskna_data);
+                    return -1;
+                }
+            }
+        }
+        else {
+            memset(maskna_data, defaultmask, size * maskna_dtype->elsize);
+        }
+
+        /* The new mask is packed, so its stride is the element size */
+        fa->maskna_strides[0] = maskna_dtype->elsize;
+    }
+    else if (fa->nd > 1) {
+        npy_stride_sort_item strideperm[NPY_MAXDIMS];
+        npy_intp stride, maskna_strides[NPY_MAXDIMS], *shape;
+        int i;
+
+        shape = fa->dimensions;
+
+        /* This causes the NA mask and data memory orderings to match */
+        PyArray_CreateSortedStridePerm(fa->nd, fa->dimensions,
+                                            fa->strides, strideperm);
+        /*
+         * Build packed mask strides, walking the permutation from its
+         * last entry to its first and growing the stride by each
+         * dimension's extent along the way.
+         */
+        stride = maskna_dtype->elsize;
+        for (i = fa->nd-1; i >= 0; --i) {
+            npy_intp i_perm = strideperm[i].perm;
+            maskna_strides[i_perm] = stride;
+            stride *= shape[i_perm];
+        }
+
+        /* If there already was a mask copy it, otherwise fill with 'defaultmask' */
+        if (fa->flags & NPY_ARRAY_MASKNA) {
+            if (PyArray_CastRawNDimArrays(fa->nd, fa->dimensions,
+                            (char *)fa->maskna_data, maskna_data,
+                            fa->maskna_strides, maskna_strides,
+                            fa->maskna_dtype, maskna_dtype, 0) < 0) {
+                Py_DECREF(maskna_dtype);
+                PyArray_free(maskna_data);
+                return -1;
+            }
+        }
+        else {
+            memset(maskna_data, defaultmask, size * maskna_dtype->elsize);
+        }
+
+        /* Only install the new strides once the copy has succeeded */
+        memcpy(fa->maskna_strides, maskna_strides, fa->nd * sizeof(npy_intp));
+    }
+    /* Zero-dimensional array: a single mask element and no strides */
+    else {
+        /* If there already was a mask copy it, otherwise use 'defaultmask' */
+        if (fa->flags & NPY_ARRAY_MASKNA) {
+            maskna_data[0] = fa->maskna_data[0];
+        }
+        else {
+            maskna_data[0] = defaultmask;
+        }
+    }
+
+    /*
+     * Set the NA mask data in the array.
+     *
+     * NOTE(review): the previous fa->maskna_dtype and fa->maskna_data
+     * are overwritten here without being released. The data is not
+     * owned on this path (OWNMASKNA was clear), but confirm whether the
+     * array held its own reference to the old maskna_dtype.
+     */
+    fa->maskna_dtype = maskna_dtype;
+    fa->maskna_data = maskna_data;
+    fa->flags |= (NPY_ARRAY_MASKNA | NPY_ARRAY_OWNMASKNA);
+
+    return 0;
+}
+
+/*NUMPY_API
+ *
+ * Assigns the given NA value to all the elements in the array. If
+ * 'arr' has a mask, masks all the elements of the array.
+ *
+ * In the future, when 'arr' has an NA dtype, will assign the
+ * appropriate NA bitpatterns to the elements.
+ *
+ * arr: The array to assign to. It must have an NA mask; an NA which
+ *      carries a payload additionally requires an NPY_MASK mask dtype.
+ * na: The NA object to assign.
+ * wheremask: If non-NULL, a boolean mask selecting where to assign.
+ *
+ * The parameters 'preservena' and 'preservewhichna' are NOT YET
+ * SUPPORTED, but are in place to allow for future expansion to
+ * multi-NA. 'preservewhichna' should be set to NULL, while
+ * preservena has no effect for straight NPY_BOOL NA masks, because
+ * different NAs are indistinguishable.
+ *
+ * Returns -1 on failure, 0 on success.
+ */
+NPY_NO_EXPORT int
+PyArray_AssignNA(PyArrayObject *arr, NpyNA *na,
+                PyArrayObject *wheremask,
+                npy_bool preservena, npy_bool *preservewhichna)
+{
+    NpyNA_fields *fna = (NpyNA_fields *)na;
+    npy_mask maskvalue;
+
+    /*
+     * Check for the NA mask before anything else. The payload branch
+     * below reads the type number of the mask dtype, which is not
+     * guaranteed to be valid for an array without an NA mask. This is
+     * the same error PyArray_AssignMaskNA raises for such an array.
+     */
+    if (!PyArray_HASMASKNA(arr)) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot assign to the NA mask of an "
+                "array which has no NA mask");
+        return -1;
+    }
+
+    /* Turn the payload into a mask value */
+    if (fna->payload == NPY_NA_NOPAYLOAD) {
+        maskvalue = 0;
+    }
+    else if (PyArray_MASKNA_DTYPE(arr)->type_num !=
+                                        NPY_MASK) {
+        /* TODO: also handle struct-NA mask dtypes */
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot assign an NA with a payload to an "
+                "NA-array with a boolean mask, requires a multi-NA mask");
+        return -1;
+    }
+    else {
+        maskvalue = (npy_mask)NpyMaskValue_Create(0, fna->payload);
+    }
+
+    return PyArray_AssignMaskNA(arr, maskvalue,
+                        wheremask, preservena, preservewhichna);
+}
+
+/*
+ * A ufunc-like function, which returns a boolean or an array
+ * of booleans indicating which values are NA.
+ *
+ *  - An NA object produces True.
+ *  - Any other object which is not an array produces False.
+ *  - An array produces a new boolean array, which is True wherever
+ *    the NA mask of the array marks the element as NA. For an array
+ *    without an NA mask, the result is all False.
+ *
+ * Arrays with fields are not supported yet and raise a RuntimeError.
+ *
+ * Returns a new reference, or NULL on failure.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_IsNA(PyObject *obj)
+{
+    /* NA objects are NA */
+    if (NpyNA_Check(obj)) {
+        Py_INCREF(Py_True);
+        return Py_True;
+    }
+    /* Otherwise non-array objects are not NA */
+    else if (!PyArray_Check(obj)) {
+        Py_INCREF(Py_False);
+        return Py_False;
+    }
+    /* Create a boolean array based on the mask */
+    else {
+        PyArrayObject *ret;
+        PyArray_Descr *dtype;
+
+        if (PyArray_HASFIELDS((PyArrayObject *)obj)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                    "field-NA is not supported yet");
+            return NULL;
+        }
+
+        dtype = PyArray_DescrFromType(NPY_BOOL);
+        if (dtype == NULL) {
+            return NULL;
+        }
+
+        /*
+         * TODO: Could change this to use raw iteration to
+         *       avoid the iterator creation overhead.
+         */
+        if (PyArray_HASMASKNA((PyArrayObject *)obj)) {
+            NpyIter *iter;
+            PyArrayObject *op[2] = {(PyArrayObject *)obj, NULL};
+            npy_uint32 flags, op_flags[2];
+            PyArray_Descr *op_dtypes[2] = {NULL, dtype};
+
+            flags = NPY_ITER_EXTERNAL_LOOP |
+                    NPY_ITER_ZEROSIZE_OK |
+                    NPY_ITER_REFS_OK;
+            /*
+             * This USE_MASKNA causes there to be 3 operands, where operand
+             * 2 is the mask for operand 0
+             */
+            op_flags[0] = NPY_ITER_READONLY | NPY_ITER_USE_MASKNA;
+            op_flags[1] = NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE;
+
+            iter = NpyIter_MultiNew(2, op, flags, NPY_KEEPORDER, NPY_NO_CASTING,
+                                    op_flags, op_dtypes);
+            if (iter == NULL) {
+                Py_DECREF(dtype);
+                return NULL;
+            }
+
+            if (NpyIter_GetIterSize(iter) > 0) {
+                NpyIter_IterNextFunc *iternext;
+                npy_intp innersize, *innerstrides;
+                npy_intp innerstridemask, innerstride1;
+                char **dataptrs, *dataptrmask, *dataptr1;
+
+                iternext = NpyIter_GetIterNext(iter, NULL);
+                if (iternext == NULL) {
+                    /* The iterator must be released on this path too */
+                    NpyIter_Deallocate(iter);
+                    Py_DECREF(dtype);
+                    return NULL;
+                }
+                innerstrides = NpyIter_GetInnerStrideArray(iter);
+                innerstridemask = innerstrides[2];
+                innerstride1 = innerstrides[1];
+                /* Because buffering is disabled, the innersize is fixed */
+                innersize = *NpyIter_GetInnerLoopSizePtr(iter);
+                dataptrs = NpyIter_GetDataPtrArray(iter);
+
+                do {
+                    npy_intp i;
+                    dataptrmask = dataptrs[2];
+                    dataptr1 = dataptrs[1];
+
+                    for (i = 0; i < innersize; ++i) {
+                        /*
+                         * Bit 0 of the mask is 0 -> NA, 1 -> available,
+                         * so invert it and clear the rest of the bits.
+                         */
+                        *dataptr1 = ~(*dataptrmask) & 0x01;
+                        dataptrmask += innerstridemask;
+                        dataptr1 += innerstride1;
+                    }
+                } while (iternext(iter));
+            }
+
+            /*
+             * Take a reference to the allocated output before the
+             * iterator, which owns it until then, is destroyed.
+             */
+            ret = NpyIter_GetOperandArray(iter)[1];
+            Py_INCREF(ret);
+            Py_DECREF(dtype);
+            NpyIter_Deallocate(iter);
+        }
+        /* Create an array of all zeros */
+        else {
+            npy_intp size;
+            /* No DECREF of 'dtype' in this branch, even on failure */
+            ret = (PyArrayObject *)PyArray_NewLikeArray(
+                            (PyArrayObject *)obj, NPY_KEEPORDER, dtype, 0);
+            if (ret == NULL) {
+                return NULL;
+            }
+            /*
+             * Can use memset because the newly allocated array is
+             * packed tightly in memory
+             */
+            size = PyArray_SIZE(ret);
+            if (size > 0) {
+                memset(PyArray_DATA(ret), 0, dtype->elsize * size);
+            }
+        }
+
+        return (PyObject *)ret;
+    }
+}
+
+/*
+ * This function performs a reduction on the masks for an array.
+ * The masks are provided in raw form, with their strides conformed
+ * for the reduction.
+ *
+ * This is for use with a reduction where 'skipna=False'. Each
+ * destination mask element ends up exposed only if every source
+ * mask element reduced into it is exposed.
+ *
+ * ndim, shape: The geometry of the arrays
+ * src_dtype, dst_dtype: The NA mask dtypes. Only NPY_BOOL is supported.
+ * src_data, dst_data: The NA mask data pointers.
+ * src_strides, dst_strides: The NA mask strides, matching the geometry.
+ *                           A zero stride in dst marks a reduction axis.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+raw_reduce_maskna_array(int ndim, npy_intp *shape,
+            PyArray_Descr *src_dtype, char *src_data, npy_intp *src_strides,
+            PyArray_Descr *dst_dtype, char *dst_data, npy_intp *dst_strides)
+{
+    int idim;
+    npy_intp i, coord[NPY_MAXDIMS];
+    npy_intp shape_it[NPY_MAXDIMS];
+    npy_intp src_strides_it[NPY_MAXDIMS];
+    npy_intp dst_strides_it[NPY_MAXDIMS];
+
+    /* Confirm that dst is not larger than src */
+    for (idim = 0; idim < ndim; ++idim) {
+        if (src_strides[idim] == 0 && dst_strides[idim] != 0) {
+            PyErr_SetString(PyExc_RuntimeError,
+                    "ReduceMaskNAArray cannot reduce into a larger array");
+            return -1;
+        }
+    }
+
+    if (src_dtype->type_num != NPY_BOOL || dst_dtype->type_num != NPY_BOOL) {
+        PyErr_SetString(PyExc_ValueError,
+                "multi-NA and field-NA are not yet supported");
+        return -1;
+    }
+
+    /* Initialize the destination mask to all ones, exposed data */
+    if (fill_raw_byte_array(ndim, shape, dst_data, dst_strides, 1) < 0) {
+        return -1;
+    }
+
+    /*
+     * Sort axes based on 'src', which has more non-zero strides,
+     * by making it the first operand here
+     */
+    if (PyArray_PrepareTwoRawArrayIter(ndim, shape,
+                                    src_data, src_strides,
+                                    dst_data, dst_strides,
+                                    &ndim, shape_it,
+                                    &src_data, src_strides_it,
+                                    &dst_data, dst_strides_it) < 0) {
+        /*
+         * Must be -1 and not NPY_FAIL: NPY_FAIL is 0, which is what
+         * this function returns on success.
+         */
+        return -1;
+    }
+
+    /* Special case a reduction in the inner loop */
+    if (dst_strides_it[0] == 0) {
+        /* Special case a contiguous reduction in the inner loop */
+        if (src_strides_it[0] == 1) {
+            NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+                /* If there's a zero in src, set dst to zero */
+                if (memchr(src_data, 0, shape_it[0]) != NULL) {
+                    *dst_data = 0;
+                }
+            } NPY_RAW_ITER_TWO_NEXT(idim, ndim, coord, shape_it,
+                                        src_data, src_strides_it,
+                                        dst_data, dst_strides_it);
+        }
+        else {
+            NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+                char *src_d = src_data;
+                /* If there's a zero in src, set dst to zero */
+                for (i = 0; i < shape_it[0]; ++i) {
+                    if (*src_d == 0) {
+                        *dst_data = 0;
+                        break;
+                    }
+                    src_d += src_strides_it[0];
+                }
+            } NPY_RAW_ITER_TWO_NEXT(idim, ndim, coord, shape_it,
+                                        src_data, src_strides_it,
+                                        dst_data, dst_strides_it);
+        }
+    }
+    /* The inner loop is elementwise, AND each src value into dst */
+    else {
+        NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+            char *src_d = src_data, *dst_d = dst_data;
+            for (i = 0; i < shape_it[0]; ++i) {
+                *dst_d &= *src_d;
+                src_d += src_strides_it[0];
+                dst_d += dst_strides_it[0];
+            }
+        } NPY_RAW_ITER_TWO_NEXT(idim, ndim, coord, shape_it,
+                                    src_data, src_strides_it,
+                                    dst_data, dst_strides_it);
+    }
+
+    return 0;
+}
+
+/*
+ * Reduces the NA mask of 'operand' into the NA mask of 'result'.
+ *
+ * This is for use with a reduction where 'skipna=False'.
+ *
+ * operand: The array being reduced. It must have an NA mask.
+ * result: The array receiving the reduction. It must have an NA mask
+ *         and the same 'ndim' as 'operand', with a dimension of size
+ *         one for every reduction axis.
+ * wheremask: NOT SUPPORTED YET, but is for a boolean mask which can
+ *            broadcast to 'result', indicating where to do the
+ *            reduction. Should pass in NULL.
+ * skipwhichna: NOT SUPPORTED YET, but for future expansion to
+ *              multi-NA, where reductions can be done on NAs with a
+ *              subset of the possible payloads. Should pass in NULL.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_ReduceMaskNAArray(PyArrayObject *operand, PyArrayObject *result,
+                            PyArrayObject *wheremask, npy_bool *skipwhichna)
+{
+    npy_intp reduce_strides[NPY_MAXDIMS];
+    npy_intp *res_dims, *op_dims, *res_mask_strides;
+    int axis;
+    int ndim = PyArray_NDIM(operand);
+
+    if (PyArray_NDIM(result) != ndim) {
+        PyErr_SetString(PyExc_ValueError,
+                "result and operand must have the same 'ndim' in "
+                "ReduceMaskNAArray");
+        return -1;
+    }
+    if (!(PyArray_HASMASKNA(result) && PyArray_HASMASKNA(operand))) {
+        PyErr_SetString(PyExc_ValueError,
+                "both result and operand must have NA masks in "
+                "ReduceMaskNAArray");
+        return -1;
+    }
+
+    /* The parameters reserved for future expansion must be NULL */
+    if (wheremask != NULL) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "the NA mask reduction operation in NumPy does not "
+                "yet support a where mask");
+        return -1;
+    }
+    if (skipwhichna != NULL) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "multi-NA support is not yet implemented in "
+                "the NA mask reduction operation");
+        return -1;
+    }
+
+    /*
+     * Build the strides for 'result', using a stride of zero along
+     * every axis where 'result' has size one.
+     */
+    res_dims = PyArray_SHAPE(result);
+    op_dims = PyArray_SHAPE(operand);
+    res_mask_strides = PyArray_MASKNA_STRIDES(result);
+    for (axis = 0; axis < ndim; ++axis) {
+        if (res_dims[axis] == 1) {
+            reduce_strides[axis] = 0;
+            continue;
+        }
+        if (res_dims[axis] != op_dims[axis]) {
+            PyErr_SetString(PyExc_ValueError,
+                "the result shape must match the operand shape wherever "
+                "it is not 1 in ReduceMaskNAArray");
+            return -1;
+        }
+        reduce_strides[axis] = res_mask_strides[axis];
+    }
+
+    /* The raw routine iterates over the geometry of 'operand' */
+    return raw_reduce_maskna_array(ndim, PyArray_DIMS(operand),
+                    PyArray_MASKNA_DTYPE(operand),
+                    PyArray_MASKNA_DATA(operand),
+                    PyArray_MASKNA_STRIDES(operand),
+                    PyArray_MASKNA_DTYPE(result),
+                    PyArray_MASKNA_DATA(result),
+                    reduce_strides);
+}
+
+/*
+ * Strided unary loop which writes the inverse of bit 0 of each
+ * source mask byte to the destination, with all other bits cleared.
+ */
+static void
+_strided_bool_mask_inversion(char *dst, npy_intp dst_stride,
+                            char *src, npy_intp src_stride,
+                            npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+                            NpyAuxData *NPY_UNUSED(opdata))
+{
+    npy_intp remaining;
+
+    for (remaining = N; remaining > 0; --remaining) {
+        *dst = ((*src) ^ 0x01) & 0x01;
+        dst += dst_stride;
+        src += src_stride;
+    }
+}
+
+/*
+ * Gets a strided unary operation which inverts mask values.
+ *
+ * Only NPY_BOOL and NPY_MASK mask dtypes without fields are accepted.
+ * On success, '*out_unop' receives the loop function and
+ * '*out_opdata' is set to NULL.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_GetMaskInversionFunction(
+        npy_intp dst_mask_stride, npy_intp src_mask_stride,
+        PyArray_Descr *mask_dtype,
+        PyArray_StridedUnaryOp **out_unop, NpyAuxData **out_opdata)
+{
+    int mask_type_num;
+
+    /* The field version is what will make use of the opdata */
+    if (PyDataType_HASFIELDS(mask_dtype)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "field-based masks are not supported yet");
+        return -1;
+    }
+
+    mask_type_num = mask_dtype->type_num;
+    if (!(mask_type_num == NPY_BOOL || mask_type_num == NPY_MASK)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "unsupported data type for mask");
+        return -1;
+    }
+
+    /* TODO: Specialize for contiguous data */
+
+    *out_opdata = NULL;
+    *out_unop = &_strided_bool_mask_inversion;
+    return 0;
+}
+
+/* Strided binary loop: dst = src0 & src1 */
+static void
+_strided_bool_mask_noinv0_noinv1_and(char *dst, npy_intp dst_stride,
+                            char *src0, npy_intp src0_stride,
+                            char *src1, npy_intp src1_stride,
+                            npy_intp N, NpyAuxData *NPY_UNUSED(opdata))
+{
+    npy_intp remaining;
+
+    for (remaining = N; remaining > 0; --remaining) {
+        *dst = (*src0) & (*src1);
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+    }
+}
+
+/* Strided binary loop: dst = (src0 with bit 0 flipped) & src1 */
+static void
+_strided_bool_mask_inv0_noinv1_and(char *dst, npy_intp dst_stride,
+                            char *src0, npy_intp src0_stride,
+                            char *src1, npy_intp src1_stride,
+                            npy_intp N, NpyAuxData *NPY_UNUSED(opdata))
+{
+    npy_intp remaining;
+
+    for (remaining = N; remaining > 0; --remaining) {
+        *dst = ((*src0) ^ 0x01) & (*src1);
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+    }
+}
+
+/* Strided binary loop: dst = src0 & (src1 with bit 0 flipped) */
+static void
+_strided_bool_mask_noinv0_inv1_and(char *dst, npy_intp dst_stride,
+                            char *src0, npy_intp src0_stride,
+                            char *src1, npy_intp src1_stride,
+                            npy_intp N, NpyAuxData *NPY_UNUSED(opdata))
+{
+    npy_intp remaining;
+
+    for (remaining = N; remaining > 0; --remaining) {
+        *dst = (*src0) & ((*src1) ^ 0x01);
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+    }
+}
+
+/* Strided binary loop: dst = (src0 | src1) with bit 0 flipped */
+static void
+_strided_bool_mask_inv0_inv1_and(char *dst, npy_intp dst_stride,
+                            char *src0, npy_intp src0_stride,
+                            char *src1, npy_intp src1_stride,
+                            npy_intp N, NpyAuxData *NPY_UNUSED(opdata))
+{
+    npy_intp remaining;
+
+    for (remaining = N; remaining > 0; --remaining) {
+        *dst = ((*src0) | (*src1)) ^ 0x01;
+        dst += dst_stride;
+        src0 += src0_stride;
+        src1 += src1_stride;
+    }
+}
+
+/*
+ * Gets a strided binary operation which ANDs together two masks,
+ * with each of the masks optionally inverted first.
+ *
+ * Both mask dtypes must be NPY_BOOL. On success, '*out_binop'
+ * receives the loop function and '*out_opdata' is set to NULL.
+ *
+ * The dtype of the output must match 'mask0_dtype'.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_GetMaskAndFunction(
+        npy_intp mask0_stride, PyArray_Descr *mask0_dtype, int invert_mask0,
+        npy_intp mask1_stride, PyArray_Descr *mask1_dtype, int invert_mask1,
+        PyArray_StridedBinaryOp **out_binop, NpyAuxData **out_opdata)
+{
+    /* The loops, indexed as [invert mask0][invert mask1] */
+    static PyArray_StridedBinaryOp *const and_loops[2][2] = {
+        {&_strided_bool_mask_noinv0_noinv1_and,
+         &_strided_bool_mask_noinv0_inv1_and},
+        {&_strided_bool_mask_inv0_noinv1_and,
+         &_strided_bool_mask_inv0_inv1_and}
+    };
+    int type0, type1;
+
+    /* The field version is what will make use of the opdata */
+    if (PyDataType_HASFIELDS(mask0_dtype) ||
+                        PyDataType_HASFIELDS(mask1_dtype)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "field-based masks are not supported yet");
+        return -1;
+    }
+
+    type0 = mask0_dtype->type_num;
+    type1 = mask1_dtype->type_num;
+
+    if (type0 == NPY_MASK || type1 == NPY_MASK) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "multi-NA masks are not supported yet");
+        return -1;
+    }
+
+    if (!(type0 == NPY_BOOL && type1 == NPY_BOOL)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "unsupported data type for mask");
+        return -1;
+    }
+
+    /* TODO: Specialize for contiguous data */
+
+    *out_binop = and_loops[invert_mask0 ? 1 : 0][invert_mask1 ? 1 : 0];
+    *out_opdata = NULL;
+    return 0;
+}
diff --git a/numpy/core/src/multiarray/na_mask.h b/numpy/core/src/multiarray/na_mask.h
new file mode 100644
index 000000000..160327de2
--- /dev/null
+++ b/numpy/core/src/multiarray/na_mask.h
@@ -0,0 +1,58 @@
+#ifndef _NPY_PRIVATE__NA_MASK_H_
+#define _NPY_PRIVATE__NA_MASK_H_
+
+#include "lowlevel_strided_loops.h"
+
+/*
+ * A ufunc-like function, which returns a boolean or an array
+ * of booleans indicating which values are NA.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_IsNA(PyObject *obj);
+
+/*
+ * Gets a strided unary operation which inverts mask values.
+ */
+NPY_NO_EXPORT int
+PyArray_GetMaskInversionFunction(npy_intp dst_mask_stride,
+                            npy_intp src_mask_stride,
+                            PyArray_Descr *mask_dtype,
+                            PyArray_StridedUnaryOp **out_unop,
+                            NpyAuxData **out_opdata);
+
+/*
+ * Gets a function which ANDs together two masks, possibly inverting
+ * one or both of the masks as well.
+ *
+ * Only NPY_BOOL masks without fields are supported at present; for
+ * field-based masks, NPY_MASK (multi-NA) masks, or any other mask dtype
+ * a RuntimeError is set and -1 is returned. On success, 'out_binop'
+ * receives the strided loop, 'out_opdata' is set to NULL, and 0 is
+ * returned.
+ *
+ * The dtype of the output must match 'mask0_dtype'.
+ */
+NPY_NO_EXPORT int
+PyArray_GetMaskAndFunction(
+        npy_intp mask0_stride, PyArray_Descr *mask0_dtype, int invert_mask0,
+        npy_intp mask1_stride, PyArray_Descr *mask1_dtype, int invert_mask1,
+        PyArray_StridedBinaryOp **out_binop, NpyAuxData **out_opdata);
+
+/*
+ * This function performs a reduction on the masks for an array.
+ *
+ * This is for use with a reduction where 'skipna=False'.
+ *
+ * operand: The operand for which the reduction is being done. This array
+ *          must have an NA mask.
+ * result: The result array, which should have the same 'ndim' as
+ *         'operand' but with dimensions of size one for every reduction
+ *         axis. This array must have an NA mask.
+ * wheremask: NOT SUPPORTED YET, but is for a boolean mask which can
+ *            broadcast to 'result', indicating where to do the reduction.
+ *            Should pass in NULL.
+ * skipwhichna: NOT SUPPORTED YET, but for future expansion to multi-NA,
+ *              where reductions can be done on NAs with a subset of
+ *              the possible payloads. Should pass in NULL.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_ReduceMaskNAArray(PyArrayObject *operand, PyArrayObject *result,
+                            PyArrayObject *wheremask, npy_bool *skipwhichna);
+
+#endif
diff --git a/numpy/core/src/multiarray/na_object.c b/numpy/core/src/multiarray/na_object.c
new file mode 100644
index 000000000..ed319a348
--- /dev/null
+++ b/numpy/core/src/multiarray/na_object.c
@@ -0,0 +1,791 @@
+/*
+ * This file implements the missing value NA singleton object for NumPy.
+ *
+ * Written by Mark Wiebe (mwwiebe@gmail.com)
+ * Copyright (c) 2011 by Enthought, Inc.
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API
+#define _MULTIARRAYMODULE
+#include <numpy/arrayobject.h>
+#include <numpy/arrayscalars.h>
+
+#include "npy_config.h"
+#include "numpy/npy_3kcompat.h"
+
+#include "descriptor.h"
+#include "common.h"
+#include "na_object.h"
+
+/*
+ * tp_new for the NA type: allocates an instance through the type's
+ * tp_alloc and gives it the defaults of no payload, no dtype, and
+ * not being the singleton. 'args' and 'kwds' are not examined here.
+ *
+ * Returns a new reference, or NULL if the allocation failed.
+ */
+static PyObject *
+na_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds)
+{
+    NpyNA_fields *na = (NpyNA_fields *)subtype->tp_alloc(subtype, 0);
+
+    if (na == NULL) {
+        return NULL;
+    }
+
+    na->is_singleton = 0;
+    na->dtype = NULL;
+    na->payload = NPY_NA_NOPAYLOAD;
+
+    return (PyObject *)na;
+}
+
+/*
+ * tp_init for the NA type, also used by na_call to fill in new NA objects.
+ *
+ * Accepts two optional arguments:
+ *   payload: an integer in [0, 127]. When omitted (or when NPY_MAX_INT,
+ *            the internal default, is passed), the NA has no payload.
+ *   dtype:   anything PyArray_DescrConverter accepts. When omitted, the
+ *            NA has no dtype.
+ *
+ * The singleton is rejected with a RuntimeError, the same way the
+ * 'payload' and 'dtype' setters reject it, so that calling __init__
+ * directly on numpy.NA cannot modify the shared instance.
+ *
+ * Returns 0 on success, or -1 with an exception set, in which case
+ * 'self' has not been modified.
+ */
+static int
+na_init(NpyNA_fields *self, PyObject *args, PyObject *kwds)
+{
+    static char *kwlist[] = {"payload", "dtype", NULL};
+    int payload = NPY_MAX_INT;
+    PyArray_Descr *dtype = NULL;
+
+    /* Don't allow changing the static singleton instance */
+    if (self->is_singleton) {
+        PyErr_SetString(PyExc_RuntimeError,
+                    "cannot reinitialize the NumPy NA singleton, "
+                    "make a new copy like 'numpy.NA(payload, dtype=val)'");
+        return -1;
+    }
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iO&:NA", kwlist,
+                        &payload,
+                        &PyArray_DescrConverter, &dtype)) {
+        Py_XDECREF(dtype);
+        return -1;
+    }
+
+    /* Using NPY_MAX_INT as the default for 'payload' */
+    if (payload == NPY_MAX_INT) {
+        self->payload = NPY_NA_NOPAYLOAD;
+    }
+    else if (payload < 0 || payload > 127) {
+        PyErr_Format(PyExc_ValueError,
+                    "out of bounds payload for NumPy NA, "
+                    "%d is not in the range [0,127]", payload);
+        Py_XDECREF(dtype);
+        return -1;
+    }
+    else {
+        self->payload = (npy_uint8)payload;
+    }
+
+    /* Take ownership of the converted dtype, releasing any previous one */
+    Py_XDECREF(self->dtype);
+    self->dtype = dtype;
+
+    return 0;
+}
+
+/*
+ * Calling an NA object builds a brand new NA: a fresh instance is
+ * allocated with na_new, then na_init interprets the payload and
+ * dtype parameters. The object being called is not used.
+ *
+ * Returns a new reference, or NULL with an exception set.
+ */
+static PyObject *
+na_call(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
+{
+    PyObject *fresh = na_new(&NpyNA_Type, NULL, NULL);
+
+    if (fresh == NULL) {
+        return NULL;
+    }
+
+    if (na_init((NpyNA_fields *)fresh, args, kwds) < 0) {
+        Py_DECREF(fresh);
+        return NULL;
+    }
+
+    return fresh;
+}
+
+/*
+ * tp_dealloc for the NA type: drops the reference to the dtype, if
+ * there is one, and then releases the object through tp_free.
+ */
+static void
+na_dealloc(NpyNA_fields *self)
+{
+    PyTypeObject *type = Py_TYPE(self);
+
+    if (self->dtype != NULL) {
+        Py_DECREF(self->dtype);
+    }
+    self->dtype = NULL;
+
+    type->tp_free((PyObject *)self);
+}
+
+/*
+ * The repr is one of "NA", "NA(payload)", "NA(dtype=...)" or
+ * "NA(payload, dtype=...)", depending on which of the payload
+ * and the dtype are present.
+ */
+static PyObject *
+na_repr(NpyNA_fields *self)
+{
+    PyObject *text;
+
+    /* No dtype: just the name, with the payload if there is one */
+    if (self->dtype == NULL) {
+        return (self->payload == NPY_NA_NOPAYLOAD)
+                    ? PyUString_FromString("NA")
+                    : PyUString_FromFormat("NA(%d)", (int)self->payload);
+    }
+
+    /* Everything up to and including "dtype=" */
+    text = (self->payload == NPY_NA_NOPAYLOAD)
+                ? PyUString_FromString("NA(dtype=")
+                : PyUString_FromFormat("NA(%d, dtype=", (int)self->payload);
+
+    /* Then the dtype's construction repr and the closing parenthesis */
+    PyUString_ConcatAndDel(&text,
+            arraydescr_construction_repr(self->dtype, 1, 0));
+    PyUString_ConcatAndDel(&text,
+            PyUString_FromString(")"));
+
+    return text;
+}
+
+/*
+ * The str of an NA never mentions the dtype: it is "NA" when there
+ * is no payload, and "NA(payload)" otherwise.
+ */
+static PyObject *
+na_str(NpyNA_fields *self)
+{
+    if (self->payload != NPY_NA_NOPAYLOAD) {
+        return PyUString_FromFormat("NA(%d)", (int)self->payload);
+    }
+
+    return PyUString_FromString("NA");
+}
+
+/*
+ * Any comparison with NA produces an NA with a boolean dtype,
+ * regardless of which comparison operator was used. The one exception
+ * is a comparison against an ndarray, which is deferred to the array
+ * by returning NotImplemented.
+ */
+static PyObject *
+na_richcompare(NpyNA_fields *self, PyObject *other, int cmp_op)
+{
+    PyArray_Descr *bool_descr;
+    PyObject *result;
+
+    /* If an ndarray is compared directly with NA, let the array handle it */
+    if (PyArray_Check(other)) {
+        Py_INCREF(Py_NotImplemented);
+        return Py_NotImplemented;
+    }
+
+    bool_descr = PyArray_DescrFromType(NPY_BOOL);
+    if (bool_descr == NULL) {
+        return NULL;
+    }
+
+    /* The NA doesn't steal the dtype reference, so release ours after */
+    result = (PyObject *)NpyNA_FromDTypeAndPayload(bool_descr, 0, 0);
+    Py_DECREF(bool_descr);
+
+    return result;
+}
+
+/*
+ * Getter for the 'payload' attribute: the payload as a Python integer,
+ * or None when the NA has no payload.
+ */
+static PyObject *
+na_payload_get(NpyNA_fields *self)
+{
+    if (self->payload != NPY_NA_NOPAYLOAD) {
+        return PyInt_FromLong(self->payload);
+    }
+
+    Py_INCREF(Py_None);
+    return Py_None;
+}
+
+/*
+ * Setter for the 'payload' attribute.
+ *
+ * Deleting the attribute (value == NULL) or assigning None removes the
+ * payload. Any other value is converted to an integer and must lie in
+ * [0, 127], otherwise a ValueError is raised. The singleton can never
+ * be modified and raises a RuntimeError.
+ *
+ * Returns 0 on success, -1 with an exception set on failure.
+ */
+static int
+na_payload_set(NpyNA_fields *self, PyObject *value)
+{
+    PyObject *index;
+    long requested;
+
+    /* Don't allow changing the static singleton instance */
+    if (self->is_singleton) {
+        PyErr_SetString(PyExc_RuntimeError,
+                    "cannot change the payload of the NumPy NA singleton, "
+                    "make a new copy like 'numpy.NA(payload)'");
+        return -1;
+    }
+
+    if (value == NULL || value == Py_None) {
+        self->payload = NPY_NA_NOPAYLOAD;
+        return 0;
+    }
+
+    /* Use PyNumber_Index to ensure an integer in Python >= 2.5 */
+#if PY_VERSION_HEX >= 0x02050000
+    index = PyNumber_Index(value);
+    if (index == NULL) {
+        return -1;
+    }
+#else
+    index = value;
+    Py_INCREF(index);
+#endif
+
+    /* Either way 'index' is an owned reference at this point */
+    requested = PyInt_AsLong(index);
+    Py_DECREF(index);
+
+    if (requested == -1 && PyErr_Occurred()) {
+        return -1;
+    }
+
+    if (requested < 0 || requested > 127) {
+        PyErr_Format(PyExc_ValueError,
+                    "out of bounds payload for NumPy NA, "
+                    "%ld is not in the range [0,127]", requested);
+        return -1;
+    }
+
+    self->payload = (npy_uint8)requested;
+    return 0;
+}
+
+/*
+ * Getter for the 'dtype' attribute: a new reference to the dtype,
+ * or to None when the NA has no dtype.
+ */
+static PyObject *
+na_dtype_get(NpyNA_fields *self)
+{
+    PyObject *result = Py_None;
+
+    if (self->dtype != NULL) {
+        result = (PyObject *)self->dtype;
+    }
+
+    Py_INCREF(result);
+    return result;
+}
+
+/*
+ * Setter for the 'dtype' attribute.
+ *
+ * Deleting the attribute (value == NULL) removes the dtype. Any other
+ * value is converted with PyArray_DescrConverter and replaces the
+ * current dtype. The singleton can never be modified and raises a
+ * RuntimeError.
+ *
+ * Returns 0 on success, -1 with an exception set on failure.
+ */
+static int
+na_dtype_set(NpyNA_fields *self, PyObject *value)
+{
+    PyArray_Descr *dtype = NULL;
+
+    /* Don't allow changing the static singleton instance */
+    if (self->is_singleton) {
+        PyErr_SetString(PyExc_RuntimeError,
+                    "cannot change the dtype of the NumPy NA singleton, "
+                    "make a new copy like 'numpy.NA(dtype=val)'");
+        return -1;
+    }
+
+    /*
+     * Attribute deletion arrives as value == NULL, which must not be
+     * handed to the converter. Treat it as removing the dtype, the
+     * same way na_payload_set treats deletion of the payload.
+     */
+    if (value != NULL) {
+        /* Convert the input into a dtype object */
+        if (!PyArray_DescrConverter(value, &dtype)) {
+            return -1;
+        }
+    }
+
+    /* Replace the existing dtype in self */
+    Py_XDECREF(self->dtype);
+    self->dtype = dtype;
+
+    return 0;
+}
+
+/*
+ * Attribute table for the NA type, exposing 'payload' and 'dtype' as
+ * readable and writable attributes. Neither entry has a docstring or
+ * closure data (the trailing NULL, NULL), and the all-NULL entry
+ * terminates the table.
+ */
+static PyGetSetDef na_getsets[] = {
+    {"payload",
+        (getter)na_payload_get,
+        (setter)na_payload_set,
+        NULL, NULL},
+    {"dtype",
+        (getter)na_dtype_get,
+        (setter)na_dtype_set,
+        NULL, NULL},
+
+    {NULL, NULL, NULL, NULL, NULL}
+};
+
+/*
+ * Combines two NA values together into a new NA.
+ *
+ * The payloads are merged with NpyNA_CombinePayloads. When both NAs
+ * have a dtype, the result gets their promoted type; when just one of
+ * them has a dtype, the result shares it; otherwise the result has
+ * no dtype.
+ *
+ * Returns a new reference, or NULL with an exception set.
+ */
+NPY_NO_EXPORT NpyNA *
+NpyNA_CombineNA(NpyNA *na1, NpyNA *na2)
+{
+    NpyNA_fields *lhs = (NpyNA_fields *)na1;
+    NpyNA_fields *rhs = (NpyNA_fields *)na2;
+    NpyNA_fields *combined;
+
+    /* The new NA starts out with no payload and a NULL dtype */
+    combined = (NpyNA_fields *)na_new(&NpyNA_Type, NULL, NULL);
+    if (combined == NULL) {
+        return NULL;
+    }
+
+    combined->payload = NpyNA_CombinePayloads(lhs->payload, rhs->payload);
+
+    if (lhs->dtype == NULL || rhs->dtype == NULL) {
+        /* At most one dtype is available, so share it if it exists */
+        combined->dtype = (lhs->dtype != NULL) ? lhs->dtype : rhs->dtype;
+        Py_XINCREF(combined->dtype);
+    }
+    else {
+        /* Both have a dtype, so the result is their promotion */
+        combined->dtype = PyArray_PromoteTypes(lhs->dtype, rhs->dtype);
+        if (combined->dtype == NULL) {
+            Py_DECREF(combined);
+            return NULL;
+        }
+    }
+
+    return (NpyNA *)combined;
+}
+
+/*
+ * Combines an NA with an object, raising an error if the object has
+ * no extractable NumPy dtype.
+ *
+ * If 'obj' is itself an NA, this is the same as NpyNA_CombineNA.
+ * Otherwise a dtype is taken from 'obj' (array scalar, ndarray, or
+ * Python scalar), and the result is a new NA with the payload of 'na'
+ * and either that dtype, when 'na' has no dtype, or the promotion of
+ * the two dtypes.
+ *
+ * Returns a new reference, or NULL with an exception set.
+ */
+NPY_NO_EXPORT NpyNA *
+NpyNA_CombineNAWithObject(NpyNA *na, PyObject *obj)
+{
+    NpyNA_fields *ret, *fna;
+    PyArray_Descr *dtype = NULL;
+
+    fna = (NpyNA_fields *)na;
+
+    /* If 'obj' is NA, handle it specially */
+    if (NpyNA_Check(obj)) {
+        return NpyNA_CombineNA(na, (NpyNA *)obj);
+    }
+
+    /* Extract a dtype from 'obj', owning a reference in every branch */
+    if (PyArray_IsScalar(obj, Generic)) {
+        dtype = PyArray_DescrFromScalar(obj);
+        if (dtype == NULL) {
+            return NULL;
+        }
+    }
+    else if (PyArray_Check(obj)) {
+        /* TODO: This needs to be more complicated... */
+        dtype = PyArray_DESCR((PyArrayObject *)obj);
+        Py_INCREF(dtype);
+    }
+    else {
+        dtype = _array_find_python_scalar_type(obj);
+        if (dtype == NULL) {
+            PyErr_SetString(PyExc_TypeError,
+                    "numpy.NA only supports operations with scalars "
+                    "and NumPy arrays");
+            return NULL;
+        }
+    }
+
+    /* The new NA starts out with no payload and a NULL dtype */
+    ret = (NpyNA_fields *)na_new(&NpyNA_Type, NULL, NULL);
+    if (ret == NULL) {
+        /* Don't leak the dtype reference acquired above */
+        Py_DECREF(dtype);
+        return NULL;
+    }
+
+    /* Copy the payload */
+    ret->payload = fna->payload;
+
+    /* Combine the dtypes */
+    if (fna->dtype == NULL) {
+        /* Ownership of the 'dtype' reference moves into the result */
+        ret->dtype = dtype;
+    }
+    else {
+        ret->dtype = PyArray_PromoteTypes(fna->dtype, dtype);
+        Py_DECREF(dtype);
+        if (ret->dtype == NULL) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+    }
+
+    return (NpyNA *)ret;
+}
+
+/*NUMPY_API
+ *
+ * Returns the *dtype* field of the NA object, which is NULL when
+ * the NA has no dtype.  Does not raise an error. The reference
+ * count of the dtype is not incremented.
+ */
+NPY_NO_EXPORT PyArray_Descr *
+NpyNA_GetDType(NpyNA* na)
+{
+    return ((NpyNA_fields *)na)->dtype;
+}
+
+/*NUMPY_API
+ *
+ * Returns true if the NA has a multi-NA payload, false otherwise.
+ */
+NPY_NO_EXPORT npy_bool
+NpyNA_IsMultiNA(NpyNA* na)
+{
+    /* Any stored value other than the 'no payload' marker is a payload */
+    return ((NpyNA_fields *)na)->payload != NPY_NA_NOPAYLOAD;
+}
+
+/*NUMPY_API
+ *
+ * Gets the multi-NA payload of the NA, or 0 if *na* doesn't have
+ * a multi-NA payload.
+ */
+NPY_NO_EXPORT int
+NpyNA_GetPayload(NpyNA* na)
+{
+    npy_uint8 stored = ((NpyNA_fields *)na)->payload;
+
+    if (stored == NPY_NA_NOPAYLOAD) {
+        return 0;
+    }
+
+    return stored;
+}
+
+
+/*NUMPY_API
+ *
+ * Converts an object into an NA if possible.
+ *
+ * An existing NA is passed through with a new reference. A
+ * zero-dimensional NA-masked array without fields, whose mask value
+ * is not exposed, becomes a new NA carrying the array's dtype. Anything
+ * else fails and NULL is returned.
+ *
+ * If 'suppress_error' is enabled, doesn't raise an error when something
+ * isn't NA.
+ */
+NPY_NO_EXPORT NpyNA *
+NpyNA_FromObject(PyObject *obj, int suppress_error)
+{
+    /* Pass through existing NAs */
+    if (NpyNA_Check(obj)) {
+        Py_INCREF(obj);
+        return (NpyNA *)obj;
+    }
+
+    if (PyArray_Check(obj)) {
+        PyArrayObject *arr = (PyArrayObject *)obj;
+
+        /* Only zero-dimensional arrays without fields are candidates */
+        if (PyArray_NDIM(arr) != 0 || PyArray_HASFIELDS(arr)) {
+            if (!suppress_error) {
+                PyErr_SetString(PyExc_ValueError,
+                        "Cannot convert array with one or more dimensions "
+                        "into an NA");
+            }
+            return NULL;
+        }
+
+        /* Convert zero-dimensional masked elements into NAs */
+        if (PyArray_HASMASKNA(arr)) {
+            npy_mask maskvalue = (npy_mask)*PyArray_MASKNA_DATA(arr);
+            NpyNA_fields *fna;
+
+            if (NpyMaskValue_IsExposed(maskvalue)) {
+                if (!suppress_error) {
+                    PyErr_SetString(PyExc_ValueError,
+                            "Cannot convert zero-dimensional array with "
+                            "valid value into NA");
+                }
+                return NULL;
+            }
+
+            fna = (NpyNA_fields *)na_new(&NpyNA_Type, NULL, NULL);
+            if (fna == NULL) {
+                return NULL;
+            }
+
+            fna->dtype = PyArray_DESCR(arr);
+            Py_INCREF(fna->dtype);
+
+            /* Only a multi-NA mask has a payload to carry over */
+            if (PyArray_MASKNA_DTYPE(arr)->type_num == NPY_MASK) {
+                fna->payload = NpyMaskValue_GetPayload(maskvalue);
+            }
+
+            return (NpyNA *)fna;
+        }
+
+        /* A zero-dimensional array with no NA mask is handled below */
+    }
+
+    if (!suppress_error) {
+        PyErr_SetString(PyExc_ValueError, "Cannot convert object into an NA");
+    }
+    return NULL;
+}
+
+/*NUMPY_API
+ *
+ * Converts a dtype reference and payload value into an NA.
+ * Doesn't steal the 'dtype' reference. Raises an error
+ * if the payload is invalid.
+ *
+ * When 'dtype' is NULL, 'multina' is 0 and 'payload' is 0, a new
+ * reference to the Npy_NA singleton is returned instead of a new object.
+ * When 'multina' is nonzero, 'payload' must be in [0, 128); when it is
+ * zero, 'payload' must be 0.
+ *
+ * Returns a new reference, or NULL with an exception set.
+ */
+NPY_NO_EXPORT NpyNA *
+NpyNA_FromDTypeAndPayload(PyArray_Descr *dtype, int multina, int payload)
+{
+    NpyNA_fields *fna;
+
+    if (dtype == NULL && multina == 0 && payload == 0) {
+        Py_INCREF(Npy_NA);
+        return (NpyNA *)Npy_NA;
+    }
+
+    /*
+     * Validate the payload before allocating anything, so that every
+     * error path can simply return without having an object to release.
+     */
+    if (multina) {
+        if (payload < 0 || payload > 0x7f) {
+            PyErr_Format(PyExc_ValueError,
+                    "Given NA payload, %d, is out of bounds [0, 128)",
+                    payload);
+            return NULL;
+        }
+    }
+    else if (payload != 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "NA payload must be zero when multi-NA is disabled");
+        return NULL;
+    }
+
+    /* The new NA starts out with no payload and a NULL dtype */
+    fna = (NpyNA_fields *)na_new(&NpyNA_Type, NULL, NULL);
+    if (fna == NULL) {
+        return NULL;
+    }
+
+    fna->dtype = dtype;
+    Py_XINCREF(fna->dtype);
+
+    if (multina) {
+        fna->payload = (npy_uint8)payload;
+    }
+
+    return (NpyNA *)fna;
+}
+
+/*
+ * Returns a mask value corresponding to the NA, built by
+ * NpyMaskValue_Create from the NA's payload (0 if it has none).
+ */
+NPY_NO_EXPORT npy_mask
+NpyNA_AsMaskValue(NpyNA *na)
+{
+    int payload = NpyNA_GetPayload(na);
+
+    return NpyMaskValue_Create(0, payload);
+}
+
+/*
+ * An NA unary op simply passes along the same NA: the operand itself
+ * is returned with a new reference, no new object is created.
+ */
+static PyObject *
+na_unaryop(PyObject *self)
+{
+    Py_INCREF(self);
+    return self;
+}
+
+/*
+ * Generic binary operator for NA. Whichever operand is the NA gets
+ * combined with the other operand through NpyNA_CombineNAWithObject,
+ * with the first operand taking precedence when both are NA.
+ *
+ * Returns NotImplemented if either operand is an ndarray, or if
+ * neither operand is an NA.
+ */
+static PyObject *
+na_binaryop(PyObject *op1, PyObject *op2)
+{
+    /* If an ndarray is operated on with NA, let the array handle it */
+    if (PyArray_Check(op1) || PyArray_Check(op2)) {
+        Py_INCREF(Py_NotImplemented);
+        return Py_NotImplemented;
+    }
+
+    /* Combine NAs according to standard rules */
+    if (NpyNA_Check(op1)) {
+        return (PyObject *)NpyNA_CombineNAWithObject((NpyNA *)op1, op2);
+    }
+    if (NpyNA_Check(op2)) {
+        return (PyObject *)NpyNA_CombineNAWithObject((NpyNA *)op2, op1);
+    }
+
+    Py_INCREF(Py_NotImplemented);
+    return Py_NotImplemented;
+}
+
+/*
+ * The nb_power slot takes three operands. The third one is ignored
+ * and the first two are handled by na_binaryop.
+ */
+static PyObject *
+na_power(PyObject *op1, PyObject *op2, PyObject *NPY_UNUSED(op3))
+{
+    return na_binaryop(op1, op2);
+}
+
+/*
+ * Bitwise <and> for NA, with a special case for a boolean 'other':
+ * NA & False is False, since the result doesn't depend on the missing
+ * value. Everything else is combined like any other binary operator.
+ *
+ * Returns NotImplemented if neither operand is an NA, or if the
+ * other operand is an ndarray.
+ */
+static PyObject *
+na_and(PyObject *op1, PyObject *op2)
+{
+    NpyNA *na;
+    PyObject *other;
+    int other_is_false;
+
+    /* Work out which operand is the NA, preferring the first */
+    if (NpyNA_Check(op1)) {
+        na = (NpyNA *)op1;
+        other = op2;
+    }
+    else if (NpyNA_Check(op2)) {
+        na = (NpyNA *)op2;
+        other = op1;
+    }
+    else {
+        Py_INCREF(Py_NotImplemented);
+        return Py_NotImplemented;
+    }
+
+    /* If an ndarray is operated on with NA, let the array handle it */
+    if (PyArray_Check(other)) {
+        Py_INCREF(Py_NotImplemented);
+        return Py_NotImplemented;
+    }
+
+    /* Either the Python False or a NumPy boolean scalar holding false */
+    other_is_false = (other == Py_False);
+    if (!other_is_false && Py_TYPE(other) == &PyBoolArrType_Type) {
+        other_is_false = (((PyBoolScalarObject *)other)->obval == 0);
+    }
+
+    /* NA & False is False */
+    if (other_is_false) {
+        Py_INCREF(Py_False);
+        return Py_False;
+    }
+
+    /* Combine NAs according to standard rules */
+    return (PyObject *)NpyNA_CombineNAWithObject(na, other);
+}
+
+/*
+ * Bitwise <or> for NA, with a special case for a boolean 'other':
+ * NA | True is True, since the result doesn't depend on the missing
+ * value. Everything else is combined like any other binary operator.
+ *
+ * Returns NotImplemented if neither operand is an NA, or if the
+ * other operand is an ndarray.
+ */
+static PyObject *
+na_or(PyObject *op1, PyObject *op2)
+{
+    NpyNA *na;
+    PyObject *other;
+    int other_is_true;
+
+    /* Work out which operand is the NA, preferring the first */
+    if (NpyNA_Check(op1)) {
+        na = (NpyNA *)op1;
+        other = op2;
+    }
+    else if (NpyNA_Check(op2)) {
+        na = (NpyNA *)op2;
+        other = op1;
+    }
+    else {
+        Py_INCREF(Py_NotImplemented);
+        return Py_NotImplemented;
+    }
+
+    /* If an ndarray is operated on with NA, let the array handle it */
+    if (PyArray_Check(other)) {
+        Py_INCREF(Py_NotImplemented);
+        return Py_NotImplemented;
+    }
+
+    /* Either the Python True or a NumPy boolean scalar holding true */
+    other_is_true = (other == Py_True);
+    if (!other_is_true && Py_TYPE(other) == &PyBoolArrType_Type) {
+        other_is_true = (((PyBoolScalarObject *)other)->obval != 0);
+    }
+
+    /* NA | True is True */
+    if (other_is_true) {
+        Py_INCREF(Py_True);
+        return Py_True;
+    }
+
+    /* Combine NAs according to standard rules */
+    return (PyObject *)NpyNA_CombineNAWithObject(na, other);
+}
+
+/*
+ * Using NA in an if statement is always an error: the truth test
+ * unconditionally sets a ValueError and returns -1.
+ */
+static int
+na_nonzero(PyObject *NPY_UNUSED(self))
+{
+    PyErr_SetString(PyExc_ValueError,
+            "numpy.NA represents an unknown missing value, "
+            "so its truth value cannot be determined");
+    return -1;
+}
+
+/*
+ * Number protocol for the NA type. The binary arithmetic, shift and xor
+ * slots go through na_binaryop, the unary slots through na_unaryop,
+ * '&' and '|' use the special-cased na_and and na_or, and the truth
+ * test always raises via na_nonzero. The conversion slots (int, long,
+ * float, oct, hex, index) and all of the in-place slots are left empty.
+ *
+ * The preprocessor conditionals drop the slots which are absent from
+ * the struct on Python 3 (divide, coerce, oct, hex, inplace_divide).
+ */
+NPY_NO_EXPORT PyNumberMethods na_as_number = {
+    (binaryfunc)na_binaryop,                    /*nb_add*/
+    (binaryfunc)na_binaryop,                    /*nb_subtract*/
+    (binaryfunc)na_binaryop,                    /*nb_multiply*/
+#if defined(NPY_PY3K)
+#else
+    (binaryfunc)na_binaryop,                    /*nb_divide*/
+#endif
+    (binaryfunc)na_binaryop,                    /*nb_remainder*/
+    (binaryfunc)na_binaryop,                    /*nb_divmod*/
+    (ternaryfunc)na_power,                      /*nb_power*/
+    (unaryfunc)na_unaryop,                      /*nb_neg*/
+    (unaryfunc)na_unaryop,                      /*nb_pos*/
+    (unaryfunc)na_unaryop,                      /*nb_abs,*/
+    (inquiry)na_nonzero,                        /*nb_nonzero*/
+    (unaryfunc)na_unaryop,                      /*nb_invert*/
+    (binaryfunc)na_binaryop,                    /*nb_lshift*/
+    (binaryfunc)na_binaryop,                    /*nb_rshift*/
+    (binaryfunc)na_and,                         /*nb_and*/
+    (binaryfunc)na_binaryop,                    /*nb_xor*/
+    (binaryfunc)na_or,                          /*nb_or*/
+#if defined(NPY_PY3K)
+#else
+    0,                                          /*nb_coerce*/
+#endif
+    0,                                          /*nb_int*/
+#if defined(NPY_PY3K)
+    0,                                          /*nb_reserved*/
+#else
+    0,                                          /*nb_long*/
+#endif
+    0,                                          /*nb_float*/
+#if defined(NPY_PY3K)
+#else
+    0,                                          /*nb_oct*/
+    0,                                          /*nb_hex*/
+#endif
+    0,                                          /*inplace_add*/
+    0,                                          /*inplace_subtract*/
+    0,                                          /*inplace_multiply*/
+#if defined(NPY_PY3K)
+#else
+    0,                                          /*inplace_divide*/
+#endif
+    0,                                          /*inplace_remainder*/
+    0,                                          /*inplace_power*/
+    0,                                          /*inplace_lshift*/
+    0,                                          /*inplace_rshift*/
+    0,                                          /*inplace_and*/
+    0,                                          /*inplace_xor*/
+    0,                                          /*inplace_or*/
+    (binaryfunc)na_binaryop,                    /*nb_floor_divide*/
+    (binaryfunc)na_binaryop,                    /*nb_true_divide*/
+    0,                                          /*nb_inplace_floor_divide*/
+    0,                                          /*nb_inplace_true_divide*/
+#if PY_VERSION_HEX >= 0x02050000
+    0,                                          /*nb_index*/
+#endif
+};
+
+/*
+ * The type object for NA, named "numpy.NAType".
+ *
+ * Instances are created with na_new and na_init, and calling an
+ * instance (tp_call) goes to na_call. The number protocol is in
+ * na_as_number, comparisons go to na_richcompare, and the 'payload'
+ * and 'dtype' attributes come from na_getsets. No hash function,
+ * methods, members or docstring are provided.
+ *
+ * On Python 2, Py_TPFLAGS_CHECKTYPES is added to the flags.
+ */
+NPY_NO_EXPORT PyTypeObject NpyNA_Type = {
+#if defined(NPY_PY3K)
+    PyVarObject_HEAD_INIT(NULL, 0)
+#else
+    PyObject_HEAD_INIT(NULL)
+    0,                                          /* ob_size */
+#endif
+    "numpy.NAType",                             /* tp_name */
+    sizeof(NpyNA_fields),                  /* tp_basicsize */
+    0,                                          /* tp_itemsize */
+    /* methods */
+    (destructor)na_dealloc,                     /* tp_dealloc */
+    0,                                          /* tp_print */
+    0,                                          /* tp_getattr */
+    0,                                          /* tp_setattr */
+#if defined(NPY_PY3K)
+    0,                                          /* tp_reserved */
+#else
+    0,                                          /* tp_compare */
+#endif
+    (reprfunc)na_repr,                          /* tp_repr */
+    &na_as_number,                              /* tp_as_number */
+    0,                                          /* tp_as_sequence */
+    0,                                          /* tp_as_mapping */
+    0,                                          /* tp_hash */
+    (ternaryfunc)na_call,                       /* tp_call */
+    (reprfunc)na_str,                           /* tp_str */
+    0,                                          /* tp_getattro */
+    0,                                          /* tp_setattro */
+    0,                                          /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT                          /* tp_flags */
+#if !defined(NPY_PY3K)
+    | Py_TPFLAGS_CHECKTYPES,
+#else
+    ,
+#endif
+    0,                                          /* tp_doc */
+    0,                                          /* tp_traverse */
+    0,                                          /* tp_clear */
+    (richcmpfunc)na_richcompare,                /* tp_richcompare */
+    0,                                          /* tp_weaklistoffset */
+    0,                                          /* tp_iter */
+    0,                                          /* tp_iternext */
+    0,                                          /* tp_methods */
+    0,                                          /* tp_members */
+    na_getsets,                                 /* tp_getset */
+    0,                                          /* tp_base */
+    0,                                          /* tp_dict */
+    0,                                          /* tp_descr_get */
+    0,                                          /* tp_descr_set */
+    0,                                          /* tp_dictoffset */
+    (initproc)na_init,                          /* tp_init */
+    0,                                          /* tp_alloc */
+    na_new,                                     /* tp_new */
+    0,                                          /* tp_free */
+    0,                                          /* tp_is_gc */
+    0,                                          /* tp_bases */
+    0,                                          /* tp_mro */
+    0,                                          /* tp_cache */
+    0,                                          /* tp_subclasses */
+    0,                                          /* tp_weaklist */
+    0,                                          /* tp_del */
+#if PY_VERSION_HEX >= 0x02060000
+    0,                                          /* tp_version_tag */
+#endif
+};
+
+/*
+ * The statically allocated NA singleton: no payload, no dtype, and
+ * the 'is_singleton' flag set, which is what makes the 'payload' and
+ * 'dtype' setters refuse to modify it.
+ */
+NPY_NO_EXPORT NpyNA_fields _Npy_NASingleton = {
+    PyObject_HEAD_INIT(&NpyNA_Type)
+    NPY_NA_NOPAYLOAD,  /* payload */
+    NULL,              /* dtype */
+    1                  /* is_singleton */
+};
+
+/* This symbol is exported in the NumPy C API */
+NPY_NO_EXPORT PyObject *Npy_NA = (PyObject *)&_Npy_NASingleton;
diff --git a/numpy/core/src/multiarray/na_object.h b/numpy/core/src/multiarray/na_object.h
new file mode 100644
index 000000000..3e602ccdf
--- /dev/null
+++ b/numpy/core/src/multiarray/na_object.h
@@ -0,0 +1,60 @@
+#ifndef _NPY_PRIVATE__NA_SINGLETON_H_
+#define _NPY_PRIVATE__NA_SINGLETON_H_
+
+/* Direct access to the fields of the NA object is just internal to NumPy. */
+typedef struct {
+    PyObject_HEAD
+    /*
+     * NA payload. This is NPY_NA_NOPAYLOAD when the NA has no payload,
+     * which is what newly allocated NA objects start with. The
+     * constructor and the 'payload' setter accept values in [0, 127].
+     */
+    npy_uint8 payload;
+    /* NA dtype, NULL by default */
+    PyArray_Descr *dtype;
+    /* Internal flag, whether this is the singleton numpy.NA or not */
+    int is_singleton;
+} NpyNA_fields;
+
+/* Marker stored in the 'payload' field of an NA which has no payload */
+#define NPY_NA_NOPAYLOAD (255)
+
+/*
+ * Merges the payloads of two NAs. If either side has no payload
+ * (NPY_NA_NOPAYLOAD), the result has no payload either; otherwise
+ * the result is the smaller of the two payloads.
+ */
+static NPY_INLINE npy_uint8
+NpyNA_CombinePayloads(npy_uint p1, npy_uint p2)
+{
+    if (p1 != NPY_NA_NOPAYLOAD && p2 != NPY_NA_NOPAYLOAD) {
+        return (p1 < p2) ? p1 : p2;
+    }
+
+    return NPY_NA_NOPAYLOAD;
+}
+
+/* Combines two NA values together, merging their payloads and dtypes. */
+NPY_NO_EXPORT NpyNA *
+NpyNA_CombineNA(NpyNA *na1, NpyNA *na2);
+
+/*
+ * Combines an NA with an object, raising an error if the object has
+ * no extractable NumPy dtype.
+ */
+NPY_NO_EXPORT NpyNA *
+NpyNA_CombineNAWithObject(NpyNA *na, PyObject *obj);
+
+/*
+ * Returns a mask value corresponding to the NA.
+ */
+NPY_NO_EXPORT npy_mask
+NpyNA_AsMaskValue(NpyNA *na);
+
+/*
+ * Returns True if the object is an NA in the form of a 0-dimensional
+ * array: an ndarray with zero dimensions, an NA mask, no fields, and
+ * a mask value which is not exposed.
+ */
+static NPY_INLINE npy_bool
+NpyNA_IsZeroDimArrayNA(PyObject *obj)
+{
+    PyArrayObject *arr;
+
+    if (!PyArray_Check(obj)) {
+        return 0;
+    }
+    arr = (PyArrayObject *)obj;
+
+    if (PyArray_NDIM(arr) != 0) {
+        return 0;
+    }
+    if (!PyArray_HASMASKNA(arr)) {
+        return 0;
+    }
+    if (PyArray_HASFIELDS(arr)) {
+        return 0;
+    }
+
+    /* The mask is only read once the array is known to have one */
+    return !NpyMaskValue_IsExposed((npy_mask)*PyArray_MASKNA_DATA(arr));
+}
+
+#endif
diff --git a/numpy/core/src/multiarray/nditer_api.c b/numpy/core/src/multiarray/nditer_api.c
index 4875c1e34..fd5d4d74e 100644
--- a/numpy/core/src/multiarray/nditer_api.c
+++ b/numpy/core/src/multiarray/nditer_api.c
@@ -272,7 +272,9 @@ NpyIter_Reset(NpyIter *iter, char **errmsg)
 }
 
 /*NUMPY_API
- * Resets the iterator to its initial state, with new base data pointers
+ * Resets the iterator to its initial state, with new base data pointers.
+ * This function requires great caution, even more so if any
+ * NPY_ITER_USE_MASKNA operands were specified.
  *
  * If errmsg is non-NULL, it should point to a variable which will
  * receive the error message, and no Python exception will be set.
@@ -691,6 +693,74 @@ NpyIter_HasIndex(NpyIter *iter)
 }
 
 /*NUMPY_API
+ * Checks whether the elements of the specified reduction operand
+ * which the iterator points at are being seen for the first time.
+ * The function returns a reasonable answer for reduction operands
+ * and when buffering is disabled. The answer may be incorrect for
+ * buffered non-reduction operands.
+ *
+ * This function is intended to be used in EXTERNAL_LOOP mode only,
+ * and will produce some wrong answers when that mode is not enabled.
+ *
+ * If this function returns true, the caller should also
+ * check the inner loop stride of the operand, because if
+ * that stride is 0, then only the first element of the innermost
+ * external loop is being visited for the first time.
+ *
+ * Returns 0 if some axis has a zero stride for 'iop' together with a
+ * nonzero index, or, when the iterator is buffered, if the outer
+ * reduce stride for 'iop' is zero while the reduce position is
+ * nonzero. Returns 1 otherwise.
+ *
+ * WARNING: For performance reasons, 'iop' is not bounds-checked,
+ *          it is not confirmed that 'iop' is actually a reduction
+ *          operand, and it is not confirmed that EXTERNAL_LOOP
+ *          mode is enabled. These checks are the responsibility of
+ *          the caller, and should be done outside of any inner loops.
+ */
+NPY_NO_EXPORT npy_bool
+NpyIter_IsFirstVisit(NpyIter *iter, int iop)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    int idim, ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+
+    NpyIter_AxisData *axisdata;
+    npy_intp sizeof_axisdata;
+
+    /*
+     * NOTE(review): 'sizeof_axisdata' is not referenced directly below,
+     * presumably the NIT_ADVANCE_AXISDATA macro picks it up by name.
+     */
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+    axisdata = NIT_AXISDATA(iter);
+
+    /* Examine the current index and the stride of 'iop' on every axis */
+    for (idim = 0; idim < ndim; ++idim) {
+        npy_intp coord = NAD_INDEX(axisdata);
+        npy_intp stride = NAD_STRIDES(axisdata)[iop];
+
+        /*
+         * If this is a reduction dimension and the coordinate
+         * is not at the start, it's definitely not the first visit
+         */
+        if (stride == 0 && coord != 0) {
+            return 0;
+        }
+
+        NIT_ADVANCE_AXISDATA(axisdata, 1);
+    }
+
+    /*
+     * In reduction buffering mode, there's a double loop being
+     * tracked in the buffer part of the iterator data structure.
+     * We only need to check the outer level of this two-level loop,
+     * because of the requirement that EXTERNAL_LOOP be enabled.
+     */
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+        /* The outer reduce loop */
+        if (NBF_REDUCE_OUTERSTRIDES(bufferdata)[iop] == 0 &&
+                NBF_REDUCE_POS(bufferdata) != 0) {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+/*NUMPY_API
  * Whether the iteration could be done with no buffering.
  */
 NPY_NO_EXPORT npy_bool
@@ -748,6 +818,35 @@ NpyIter_GetNOp(NpyIter *iter)
 }
 
 /*NUMPY_API
+ * Gets the index of the first mask operand, i.e. the first operand
+ * which is the mask for an NPY_ITER_USE_MASKNA operand.
+ */
+NPY_NO_EXPORT int
+NpyIter_GetFirstMaskNAOp(NpyIter *iter)
+{
+    return NIT_FIRST_MASKNA_OP(iter);
+}
+
+/*NUMPY_API
+ * Gets the correspondences between the operands with
+ * NPY_ITER_USE_MASKNA set and their corresponding masks.
+ *
+ * If i < NpyIter_GetFirstMaskNAOp(iter), then
+ * NpyIter_GetMaskNAIndexArray(iter)[i] is either -1 or
+ * an index >= NpyIter_GetFirstMaskNAOp(iter) of the corresponding
+ * mask.
+ *
+ * If i >= NpyIter_GetFirstMaskNAOp(iter), then
+ * NpyIter_GetMaskNAIndexArray(iter)[i] is the index
+ * of the corresponding maskna operand for the mask.
+ */
+NPY_NO_EXPORT npy_int8 *
+NpyIter_GetMaskNAIndexArray(NpyIter *iter)
+{
+    return NIT_MASKNA_INDICES(iter);
+}
+
+/*NUMPY_API
  * Gets the number of elements being iterated
  */
 NPY_NO_EXPORT npy_intp
@@ -1004,6 +1103,7 @@ NpyIter_GetIterView(NpyIter *iter, npy_intp i)
     npy_uint32 itflags = NIT_ITFLAGS(iter);
     int idim, ndim = NIT_NDIM(iter);
     int nop = NIT_NOP(iter);
+    int first_maskna_op = NIT_FIRST_MASKNA_OP(iter);
 
     npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
     PyArrayObject *obj, *view;
@@ -1012,8 +1112,9 @@ NpyIter_GetIterView(NpyIter *iter, npy_intp i)
     NpyIter_AxisData *axisdata;
     npy_intp sizeof_axisdata;
     int writeable;
+    npy_int8 *maskna_indices = NIT_MASKNA_INDICES(iter);
 
-    if (i < 0 || i >= nop) {
+    if (i < 0 || i >= first_maskna_op) {
         PyErr_SetString(PyExc_IndexError,
                 "index provided for an iterator view was out of bounds");
         return NULL;
@@ -1034,9 +1135,11 @@ NpyIter_GetIterView(NpyIter *iter, npy_intp i)
     sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
 
     /* Retrieve the shape and strides from the axisdata */
-    for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+    for (idim = 0; idim < ndim; ++idim) {
         shape[ndim-idim-1] = NAD_SHAPE(axisdata);
         strides[ndim-idim-1] = NAD_STRIDES(axisdata)[i];
+
+        NIT_ADVANCE_AXISDATA(axisdata, 1);
     }
 
     Py_INCREF(dtype);
@@ -1055,6 +1158,29 @@ NpyIter_GetIterView(NpyIter *iter, npy_intp i)
     }
     /* Make sure all the flags are good */
     PyArray_UpdateFlags(view, NPY_ARRAY_UPDATE_ALL);
+    /*
+     * Add the mask to the view if the operand was NPY_ITER_USE_MASKNA.
+     */
+    if (maskna_indices[i] >= 0) {
+        PyArrayObject_fields *fview = (PyArrayObject_fields *)view;
+        int i_maskna = maskna_indices[i];
+        npy_intp *maskna_strides = fview->maskna_strides;
+
+        fview->maskna_dtype = PyArray_MASKNA_DTYPE(obj);
+        Py_INCREF(fview->maskna_dtype);
+        fview->maskna_data = NIT_RESETDATAPTR(iter)[i_maskna];
+
+        axisdata = NIT_AXISDATA(iter);
+        for (idim = 0; idim < ndim; ++idim) {
+            maskna_strides[ndim-idim-1] = NAD_STRIDES(axisdata)[i_maskna];
+
+            NIT_ADVANCE_AXISDATA(axisdata, 1);
+        }
+
+        /* This view doesn't own the mask */
+        fview->flags |= NPY_ARRAY_MASKNA;
+        fview->flags &= ~NPY_ARRAY_OWNMASKNA;
+    }
 
     return view;
 }
@@ -1337,9 +1463,20 @@ NpyIter_DebugPrint(NpyIter *iter)
         printf("REDUCE ");
     if (itflags&NPY_ITFLAG_REUSE_REDUCE_LOOPS)
         printf("REUSE_REDUCE_LOOPS ");
+    if (itflags&NPY_ITFLAG_HAS_MASKNA_OP)
+        printf("HAS_MASKNA_OP ");
+
     printf("\n");
     printf("| NDim: %d\n", (int)ndim);
     printf("| NOp: %d\n", (int)nop);
+    if (itflags&NPY_ITFLAG_HAS_MASKNA_OP) {
+        printf("| First MaskNA Op: %d\n", (int)NIT_FIRST_MASKNA_OP(iter));
+        printf("| MaskNA Indices: ");
+        for (iop = 0; iop < nop; ++iop) {
+            printf("%d ", (int)NIT_MASKNA_INDICES(iter)[iop]);
+        }
+        printf("\n");
+    }
     if (NIT_MASKOP(iter) >= 0) {
         printf("| MaskOp: %d\n", (int)NIT_MASKOP(iter));
     }
@@ -1522,6 +1659,7 @@ NpyIter_DebugPrint(NpyIter *iter)
     }
 
     printf("------- END ITERATOR DUMP -------\n");
+    fflush(stdout);
 
     PyGILState_Release(gilstate);
 }
@@ -1599,7 +1737,6 @@ npyiter_coalesce_axes(NpyIter *iter)
 }
 
 /*
- *
  * If errmsg is non-NULL, it should point to a variable which will
  * receive the error message, and no Python exception will be set.
  * This is so that the function can be called from code not holding
@@ -1749,6 +1886,7 @@ npyiter_copy_from_buffers(NpyIter *iter)
     int ndim = NIT_NDIM(iter);
     int iop, nop = NIT_NOP(iter);
     int maskop = NIT_MASKOP(iter);
+    int first_maskna_op = NIT_FIRST_MASKNA_OP(iter);
 
     char *op_itflags = NIT_OPITFLAGS(iter);
     NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
@@ -1764,11 +1902,12 @@ npyiter_copy_from_buffers(NpyIter *iter)
     char **ptrs = NBF_PTRS(bufferdata), **ad_ptrs = NAD_PTRS(axisdata);
     char **buffers = NBF_BUFFERS(bufferdata);
     char *buffer;
+    npy_int8 *maskna_indices = NIT_MASKNA_INDICES(iter);
 
     npy_intp reduce_outerdim = 0;
     npy_intp *reduce_outerstrides = NULL;
 
-    PyArray_StridedTransferFn *stransfer = NULL;
+    PyArray_StridedUnaryOp *stransfer = NULL;
     NpyAuxData *transferdata = NULL;
 
     npy_intp axisdata_incr = NIT_AXISDATA_SIZEOF(itflags, ndim, nop) /
@@ -1866,7 +2005,37 @@ npyiter_copy_from_buffers(NpyIter *iter)
                                     "operand %d (%d items)\n",
                                     (int)iop, (int)op_transfersize);
 
-                if (op_itflags[iop] & NPY_OP_ITFLAG_WRITEMASKED) {
+                /* USE_MASKNA operand */
+                if (iop < first_maskna_op && maskna_indices[iop] >= 0) {
+                    int iop_maskna = maskna_indices[iop];
+                    npy_mask *maskptr;
+                    /* TODO: support WRITEMASKED + USE_MASKNA together */
+
+                    /*
+                     * The mask pointer may be in the buffer or in
+                     * the array, detect which one.
+                     */
+                    delta = (ptrs[iop_maskna] - buffers[iop_maskna]);
+                    if (0 <= delta &&
+                            delta <= buffersize*dtypes[iop_maskna]->elsize) {
+                        maskptr = (npy_mask *)buffers[iop_maskna];
+                    }
+                    else {
+                        maskptr = (npy_mask *)ad_ptrs[iop_maskna];
+                    }
+
+                    PyArray_TransferMaskedStridedToNDim(ndim_transfer,
+                            ad_ptrs[iop], dst_strides, axisdata_incr,
+                            buffer, src_stride,
+                            maskptr, strides[iop_maskna],
+                            dst_coords, axisdata_incr,
+                            dst_shape, axisdata_incr,
+                            op_transfersize, dtypes[iop]->elsize,
+                            (PyArray_MaskedStridedUnaryOp *)stransfer,
+                            transferdata);
+                }
+                /* WRITEMASKED operand */
+                else if (op_itflags[iop] & NPY_OP_ITFLAG_WRITEMASKED) {
                     npy_mask *maskptr;
 
                     /*
@@ -1889,9 +2058,10 @@ npyiter_copy_from_buffers(NpyIter *iter)
                             dst_coords, axisdata_incr,
                             dst_shape, axisdata_incr,
                             op_transfersize, dtypes[iop]->elsize,
-                            (PyArray_MaskedStridedTransferFn *)stransfer,
+                            (PyArray_MaskedStridedUnaryOp *)stransfer,
                             transferdata);
                 }
+                /* Regular operand */
                 else {
                     PyArray_TransferStridedToNDim(ndim_transfer,
                             ad_ptrs[iop], dst_strides, axisdata_incr,
@@ -1943,6 +2113,7 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs)
     npy_uint32 itflags = NIT_ITFLAGS(iter);
     int ndim = NIT_NDIM(iter);
     int iop, nop = NIT_NOP(iter);
+    int first_maskna_op = NIT_FIRST_MASKNA_OP(iter);
 
     char *op_itflags = NIT_OPITFLAGS(iter);
     NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
@@ -1963,7 +2134,7 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs)
     npy_intp *reduce_outerstrides = NULL;
     char **reduce_outerptrs = NULL;
 
-    PyArray_StridedTransferFn *stransfer = NULL;
+    PyArray_StridedUnaryOp *stransfer = NULL;
     NpyAuxData *transferdata = NULL;
 
     /*
@@ -2272,7 +2443,7 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs)
         }
 
         if (stransfer != NULL) {
-            npy_intp src_itemsize = PyArray_DESCR(operands[iop])->elsize;
+            npy_intp src_itemsize;
             npy_intp op_transfersize;
 
             npy_intp dst_stride, *src_strides, *src_coords, *src_shape;
@@ -2280,6 +2451,14 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs)
 
             npy_bool skip_transfer = 0;
 
+            /* Need to pick the right item size for the data vs mask */
+            if (iop < first_maskna_op) {
+                src_itemsize = PyArray_DTYPE(operands[iop])->elsize;
+            }
+            else {
+                src_itemsize = PyArray_MASKNA_DTYPE(operands[iop])->elsize;
+            }
+
             /* If stransfer wasn't set to NULL, buffering is required */
             any_buffered = 1;
 
@@ -2454,7 +2633,7 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count,
     npy_intp sizeof_axisdata;
     npy_intp coord, shape, *strides;
     npy_intp reducespace = 1, factor;
-    npy_bool nonzerocoord = 0;
+    npy_bool nonzerocoord;
 
     char *op_itflags = NIT_OPITFLAGS(iter);
     char stride0op[NPY_MAXARGS];
@@ -2463,7 +2642,7 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count,
     *reduce_outerdim = 0;
 
     /* If there's only one dimension, no need to calculate anything */
-    if (ndim == 1) {
+    if (ndim == 1 || count == 0) {
         *reduce_innersize = count;
         return count;
     }
@@ -2485,6 +2664,9 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count,
     factor = shape;
     NIT_ADVANCE_AXISDATA(axisdata, 1);
 
+    /* Initialize nonzerocoord based on the first coordinate */
+    nonzerocoord = (coord != 0);
+
     /* Go forward through axisdata, calculating the space available */
     for (idim = 1; idim < ndim && reducespace < count;
                                 ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c
index 441f19de0..2754b2098 100644
--- a/numpy/core/src/multiarray/nditer_constr.c
+++ b/numpy/core/src/multiarray/nditer_constr.c
@@ -14,6 +14,8 @@
 #define NPY_ITERATOR_IMPLEMENTATION_CODE
 #include "nditer_impl.h"
 
+#include "arrayobject.h"
+
 /* Internal helper functions private to this file */
 static int
 npyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags);
@@ -33,14 +35,16 @@ npyiter_prepare_one_operand(PyArrayObject **op,
                         npy_uint32 flags,
                         npy_uint32 op_flags, char *op_itflags);
 static int
-npyiter_prepare_operands(int nop, PyArrayObject **op_in,
+npyiter_prepare_operands(int nop, int first_maskna_op,
+                    PyArrayObject **op_in,
                     PyArrayObject **op,
                     char **op_dataptr,
                     PyArray_Descr **op_request_dtypes,
                     PyArray_Descr **op_dtype,
                     npy_uint32 flags,
                     npy_uint32 *op_flags, char *op_itflags,
-                    npy_int8 *out_maskop);
+                    npy_int8 *out_maskop,
+                    npy_int8 *out_maskna_indices);
 static int
 npyiter_check_casting(int nop, PyArrayObject **op,
                     PyArray_Descr **op_dtype,
@@ -68,7 +72,7 @@ npyiter_reverse_axis_ordering(NpyIter *iter);
 static void
 npyiter_find_best_axis_ordering(NpyIter *iter);
 static PyArray_Descr *
-npyiter_get_common_dtype(int nop, PyArrayObject **op,
+npyiter_get_common_dtype(int first_maskna_op, PyArrayObject **op,
                         char *op_itflags, PyArray_Descr **op_dtype,
                         PyArray_Descr **op_request_dtypes,
                         int only_inputs, int output_scalars);
@@ -83,8 +87,10 @@ npyiter_allocate_arrays(NpyIter *iter,
                         PyArray_Descr **op_dtype, PyTypeObject *subtype,
                         npy_uint32 *op_flags, char *op_itflags,
                         int **op_axes, int output_scalars);
+static int
+npyiter_fill_maskna_axisdata(NpyIter *iter, int **op_axes);
 static void
-npyiter_get_priority_subtype(int nop, PyArrayObject **op,
+npyiter_get_priority_subtype(int first_maskna_op, PyArrayObject **op,
                             char *op_itflags,
                             double *subtype_priority, PyTypeObject **subtype);
 static int
@@ -105,7 +111,7 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
 {
     npy_uint32 itflags = NPY_ITFLAG_IDENTPERM;
     int idim, ndim;
-    int iop;
+    int iop, maskna_nop, first_maskna_op;
 
     /* The iterator being constructed */
     NpyIter *iter;
@@ -179,6 +185,18 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
 
     NPY_IT_TIME_POINT(c_calculate_ndim);
 
+    /* Calculate maskna_nop */
+    maskna_nop = 0;
+    for (iop = 0; iop < nop; ++iop) {
+        if (op_flags[iop]&NPY_ITER_USE_MASKNA) {
+            itflags |= NPY_ITFLAG_HAS_MASKNA_OP;
+            ++maskna_nop;
+        }
+    }
+    /* Adjust nop to include the masks at the end */
+    first_maskna_op = nop;
+    nop += maskna_nop;
+
     /* Allocate memory for the iterator */
     iter = (NpyIter*)
                 PyArray_malloc(NIT_SIZEOF_ITERATOR(itflags, ndim, nop));
@@ -189,6 +207,7 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
     NIT_ITFLAGS(iter) = itflags;
     NIT_NDIM(iter) = ndim;
     NIT_NOP(iter) = nop;
+    NIT_FIRST_MASKNA_OP(iter) = first_maskna_op;
     NIT_MASKOP(iter) = -1;
     NIT_ITERINDEX(iter) = 0;
     memset(NIT_BASEOFFSETS(iter), 0, (nop+1)*NPY_SIZEOF_INTP);
@@ -199,10 +218,12 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
     op_dataptr = NIT_RESETDATAPTR(iter);
 
     /* Prepare all the operands */
-    if (!npyiter_prepare_operands(nop, op_in, op, op_dataptr,
+    if (!npyiter_prepare_operands(nop, first_maskna_op, op_in, op, op_dataptr,
                         op_request_dtypes, op_dtype,
                         flags,
-                        op_flags, op_itflags, &NIT_MASKOP(iter))) {
+                        op_flags, op_itflags,
+                        &NIT_MASKOP(iter),
+                        NIT_MASKNA_INDICES(iter))) {
         PyArray_free(iter);
         return NULL;
     }
@@ -219,6 +240,7 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
         bufferdata = NIT_BUFFERDATA(iter);
         NBF_SIZE(bufferdata) = 0;
         memset(NBF_BUFFERS(bufferdata), 0, nop*NPY_SIZEOF_INTP);
+        memset(NBF_PTRS(bufferdata), 0, nop*NPY_SIZEOF_INTP);
         memset(NBF_READTRANSFERDATA(bufferdata), 0, nop*NPY_SIZEOF_INTP);
         memset(NBF_WRITETRANSFERDATA(bufferdata), 0, nop*NPY_SIZEOF_INTP);
     }
@@ -273,7 +295,7 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
     NPY_IT_TIME_POINT(c_apply_forced_iteration_order);
 
     /* Set some flags for allocated outputs */
-    for (iop = 0; iop < nop; ++iop) {
+    for (iop = 0; iop < first_maskna_op; ++iop) {
         if (op[iop] == NULL) {
             /* Flag this so later we can avoid flipping axes */
             any_allocate = 1;
@@ -312,7 +334,7 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
     NPY_IT_TIME_POINT(c_find_best_axis_ordering);
 
     if (need_subtype) {
-        npyiter_get_priority_subtype(nop, op, op_itflags,
+        npyiter_get_priority_subtype(first_maskna_op, op, op_itflags,
                                      &subtype_priority, &subtype);
     }
 
@@ -329,7 +351,7 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
         op = NIT_OPERANDS(iter);
         op_dtype = NIT_DTYPES(iter);
 
-        dtype = npyiter_get_common_dtype(nop, op,
+        dtype = npyiter_get_common_dtype(first_maskna_op, op,
                                     op_itflags, op_dtype,
                                     op_request_dtypes,
                                     only_inputs,
@@ -341,7 +363,7 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
         if (flags & NPY_ITER_COMMON_DTYPE) {
             NPY_IT_DBG_PRINT("Iterator: Replacing all data types\n");
             /* Replace all the data types */
-            for (iop = 0; iop < nop; ++iop) {
+            for (iop = 0; iop < first_maskna_op; ++iop) {
                 if (op_dtype[iop] != dtype) {
                     Py_XDECREF(op_dtype[iop]);
                     Py_INCREF(dtype);
@@ -352,7 +374,7 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
         else {
             NPY_IT_DBG_PRINT("Iterator: Setting unset output data types\n");
             /* Replace the NULL data types */
-            for (iop = 0; iop < nop; ++iop) {
+            for (iop = 0; iop < first_maskna_op; ++iop) {
                 if (op_dtype[iop] == NULL) {
                     Py_INCREF(dtype);
                     op_dtype[iop] = dtype;
@@ -369,7 +391,8 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
      * to check that data type conversions are following the
      * casting rules.
      */
-    if (!npyiter_check_casting(nop, op, op_dtype, casting, op_itflags)) {
+    if (!npyiter_check_casting(first_maskna_op, op,
+                                op_dtype, casting, op_itflags)) {
         NpyIter_Deallocate(iter);
         return NULL;
     }
@@ -388,6 +411,58 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
         return NULL;
     }
 
+    /*
+     * If there were any NA masks added to the iteration, fill in
+     * the strides and other data they need. This is being done
+     * after all the 'allocate' arrays are finished.
+     */
+    if (maskna_nop > 0) {
+        npy_int8 *maskna_indices = NIT_MASKNA_INDICES(iter);
+
+        /* Copy op references to the maskna op's masks */
+        for (iop = first_maskna_op; iop < nop; ++iop) {
+            int iop_maskna = maskna_indices[iop];
+            op[iop] = op[iop_maskna];
+            Py_INCREF(op[iop]);
+            /* If the operand has a mask, use its dtype */
+            if (PyArray_HASMASKNA(op[iop])) {
+                op_dtype[iop] = PyArray_MASKNA_DTYPE(op[iop]);
+                Py_INCREF(op_dtype[iop]);
+            }
+            /* Otherwise a virtual all-ones operand will be used */
+            else {
+                if (PyArray_HASFIELDS(op[iop])) {
+                    PyErr_SetString(PyExc_ValueError,
+                            "struct-NA is not supported yet");
+                    NpyIter_Deallocate(iter);
+                    return NULL;
+                }
+                else {
+                    op_dtype[iop] = PyArray_DescrFromType(NPY_BOOL);
+                    if (op_dtype[iop] == NULL) {
+                        NpyIter_Deallocate(iter);
+                        return NULL;
+                    }
+                }
+            }
+            /* Propagate select flags from the main operand */
+            op_itflags[iop] = op_itflags[iop_maskna] &
+                                (NPY_OP_ITFLAG_WRITE |
+                                 NPY_OP_ITFLAG_READ |
+                                 NPY_OP_ITFLAG_BUFNEVER |
+                                 NPY_OP_ITFLAG_REDUCE |
+                                 NPY_OP_ITFLAG_WRITEMASKED);
+            /* The mask is always aligned (alignment 1) */
+            op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+        }
+
+        /* Fill in the strides for the masks */
+        if (!npyiter_fill_maskna_axisdata(iter, op_axes)) {
+            NpyIter_Deallocate(iter);
+            return NULL;
+        }
+    }
+
     NPY_IT_TIME_POINT(c_allocate_arrays);
 
     /*
@@ -431,7 +506,7 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
      * reference arrays and flag it if so.
      */
     if (flags & NPY_ITER_REFS_OK) {
-        for (iop = 0; iop < nop; ++iop) {
+        for (iop = 0; iop < first_maskna_op; ++iop) {
             PyArray_Descr *rdt = op_dtype[iop];
             if ((rdt->flags & (NPY_ITEM_REFCOUNT |
                                      NPY_ITEM_IS_POINTER |
@@ -448,12 +523,7 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
             NpyIter_Deallocate(iter);
             return NULL;
         }
-        if (itflags & NPY_ITFLAG_DELAYBUF) {
-            bufferdata = NIT_BUFFERDATA(iter);
-            /* Make the data pointers NULL */
-            memset(NBF_PTRS(bufferdata), 0, nop*NPY_SIZEOF_INTP);
-        }
-        else {
+        if (!(itflags & NPY_ITFLAG_DELAYBUF)) {
             /* Allocate the buffers */
             if (!npyiter_allocate_buffers(iter, NULL)) {
                 NpyIter_Deallocate(iter);
@@ -915,6 +985,18 @@ npyiter_check_per_op_flags(npy_uint32 op_flags, char *op_itflags)
                 "be used together with ARRAYMASK");
             return 0;
         }
+        /*
+         * When WRITEMASKED and USE_MASKNA are supported together,
+         * it will probably require that buffering is enabled as well,
+         * because that will need yet another temporary mask buffer to combine
+         * the two masks before doing the masked copies.
+         */
+        if ((op_flags & NPY_ITER_USE_MASKNA) != 0) {
+            PyErr_SetString(PyExc_ValueError,
+                "The combination of iterator flags WRITEMASKED "
+                "and USE_MASKNA is not yet supported");
+            return 0;
+        }
         *op_itflags |= NPY_OP_ITFLAG_WRITEMASKED;
     }
 
@@ -933,8 +1015,8 @@ npyiter_check_per_op_flags(npy_uint32 op_flags, char *op_itflags)
 
 /*
  * Prepares a a constructor operand.  Assumes a reference to 'op'
- * is owned, and that 'op' may be replaced.  Fills in 'op_dtype'
- * and 'ndim'.
+ * is owned, and that 'op' may be replaced.  Fills in 'op_dataptr',
+ * 'op_dtype', and may modify 'op_itflags'.
  *
  * Returns 1 on success, 0 on failure.
  */
@@ -1011,11 +1093,14 @@ npyiter_prepare_one_operand(PyArrayObject **op,
         return 0;
     }
 
+
     if (PyArray_Check(*op)) {
+        npy_uint32 tmp;
+
         if (((*op_itflags) & NPY_OP_ITFLAG_WRITE) &&
                     (!PyArray_CHKFLAGS(*op, NPY_ARRAY_WRITEABLE))) {
             PyErr_SetString(PyExc_ValueError,
-                    "Iterator operand was a non-writeable array, but was "
+                    "Operand was a non-writeable array, but "
                     "flagged as writeable");
             return 0;
         }
@@ -1024,6 +1109,27 @@ npyiter_prepare_one_operand(PyArrayObject **op,
                     "Iteration of zero-sized operands is not enabled");
             return 0;
         }
+        /*
+         * Writeable USE_MASKNA operands must have a mask
+         * (or NA dtype, later)
+         */
+        if ((op_flags & NPY_ITER_USE_MASKNA) != 0 &&
+                            ((*op_itflags) & NPY_OP_ITFLAG_WRITE) != 0 &&
+                            !PyArray_HASMASKNA(*op)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Operand is writeable and flagged USE_MASKNA, "
+                    "but the operand does not have an NA mask");
+            return 0;
+        }
+        /* Arrays with NA masks must have USE_MASKNA specified */
+        tmp = op_flags & (NPY_ITER_USE_MASKNA | NPY_ITER_IGNORE_MASKNA);
+        if (tmp != NPY_ITER_USE_MASKNA && tmp != NPY_ITER_IGNORE_MASKNA &&
+                                    PyArray_HASMASKNA(*op)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Operand has an NA mask but the operation does "
+                    "not support NA via the flag USE_MASKNA");
+            return 0;
+        }
         *op_dataptr = PyArray_BYTES(*op);
         /* PyArray_DESCR does not give us a reference */
         *op_dtype = PyArray_DESCR(*op);
@@ -1115,20 +1221,26 @@ npyiter_prepare_one_operand(PyArrayObject **op,
  * can replace the arrays if copying is necessary.
  */
 static int
-npyiter_prepare_operands(int nop, PyArrayObject **op_in,
+npyiter_prepare_operands(int nop, int first_maskna_op, PyArrayObject **op_in,
                     PyArrayObject **op,
                     char **op_dataptr,
                     PyArray_Descr **op_request_dtypes,
                     PyArray_Descr **op_dtype,
                     npy_uint32 flags,
                     npy_uint32 *op_flags, char *op_itflags,
-                    npy_int8 *out_maskop)
+                    npy_int8 *out_maskop,
+                    npy_int8 *out_maskna_indices)
 {
-    int iop, i;
+    int iop, iop_maskna = first_maskna_op, i;
     npy_int8 maskop = -1;
     int any_writemasked_ops = 0;
 
-    for (iop = 0; iop < nop; ++iop) {
+    /*
+     * Here we just prepare the provided operands. The masks for
+     * the maskna operands get prepared later, after any 'allocate'
+     * operands are allocated.
+     */
+    for (iop = 0; iop < first_maskna_op; ++iop) {
         op[iop] = op_in[iop];
         Py_XINCREF(op[iop]);
         op_dtype[iop] = NULL;
@@ -1163,6 +1275,16 @@ npyiter_prepare_operands(int nop, PyArrayObject **op_in,
             any_writemasked_ops = 1;
         }
 
+        /* Link the operands to their maskna operands */
+        if (op_flags[iop] & NPY_ITER_USE_MASKNA) {
+            out_maskna_indices[iop] = iop_maskna;
+            out_maskna_indices[iop_maskna] = iop;
+            ++iop_maskna;
+        }
+        else {
+            out_maskna_indices[iop] = -1;
+        }
+
         /*
          * Prepare the operand.  This produces an op_dtype[iop] reference
          * on success.
@@ -1181,11 +1303,17 @@ npyiter_prepare_operands(int nop, PyArrayObject **op_in,
         }
     }
 
+    /* Initialize the mask virtual operands to NULL for now */
+    for (iop = first_maskna_op; iop < nop; ++iop) {
+        op[iop] = NULL;
+        op_dataptr[iop] = NULL;
+        op_dtype[iop] = NULL;
+    }
 
     /* If all the operands were NULL, it's an error */
     if (op[0] == NULL) {
         int all_null = 1;
-        for (iop = 1; iop < nop; ++iop) {
+        for (iop = 1; iop < first_maskna_op; ++iop) {
             if (op[iop] != NULL) {
                 all_null = 0;
                 break;
@@ -1197,7 +1325,7 @@ npyiter_prepare_operands(int nop, PyArrayObject **op_in,
                 Py_XDECREF(op_dtype[i]);
             }
             PyErr_SetString(PyExc_ValueError,
-                    "At least one iterator input must be non-NULL");
+                    "At least one iterator operand must be non-NULL");
             return 0;
         }
     }
@@ -1247,7 +1375,7 @@ npyiter_shape_string(npy_intp n, npy_intp *vals, char *ending)
 
     /*
      * Negative dimension indicates "newaxis", which can
-     * be discarded for printing if its a leading dimension.
+     * be discarded for printing if it's a leading dimension.
      * Find the first non-"newaxis" dimension.
      */
     i = 0;
@@ -1289,14 +1417,14 @@ npyiter_shape_string(npy_intp n, npy_intp *vals, char *ending)
 }
 
 static int
-npyiter_check_casting(int nop, PyArrayObject **op,
+npyiter_check_casting(int first_maskna_op, PyArrayObject **op,
                     PyArray_Descr **op_dtype,
                     NPY_CASTING casting,
                     char *op_itflags)
 {
     int iop;
 
-    for(iop = 0; iop < nop; ++iop) {
+    for(iop = 0; iop < first_maskna_op; ++iop) {
         NPY_IT_DBG_PRINT1("Iterator: Checking casting for operand %d\n",
                             (int)iop);
 #if NPY_IT_DBG_TRACING
@@ -1439,6 +1567,7 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags,
     npy_uint32 itflags = NIT_ITFLAGS(iter);
     int idim, ndim = NIT_NDIM(iter);
     int iop, nop = NIT_NOP(iter);
+    int first_maskna_op = NIT_FIRST_MASKNA_OP(iter);
     int maskop = NIT_MASKOP(iter);
 
     int ondim;
@@ -1462,7 +1591,7 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags,
             }
         }
     }
-    for (iop = 0; iop < nop; ++iop) {
+    for (iop = 0; iop < first_maskna_op; ++iop) {
         op_cur = op[iop];
         if (op_cur != NULL) {
             npy_intp *shape = PyArray_DIMS(op_cur);
@@ -1543,7 +1672,8 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags,
         NAD_INDEX(axisdata) = 0;
         memcpy(NAD_PTRS(axisdata), op_dataptr, NPY_SIZEOF_INTP*nop);
 
-        for (iop = 0; iop < nop; ++iop) {
+        /* Not processing the maskna masks until after allocation */
+        for (iop = 0; iop < first_maskna_op; ++iop) {
             op_cur = op[iop];
 
             if (op_axes == NULL || op_axes[iop] == NULL) {
@@ -1555,7 +1685,7 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags,
                     if (bshape == 1) {
                         strides[iop] = 0;
                         if (idim >= ondim && !output_scalars &&
-                                        (op_flags[iop] & NPY_ITER_NO_BROADCAST)) {
+                                    (op_flags[iop] & NPY_ITER_NO_BROADCAST)) {
                             goto operand_different_than_broadcast;
                         }
                     }
@@ -1696,7 +1826,7 @@ broadcast_error: {
             if (errmsg == NULL) {
                 return 0;
             }
-            for (iop = 0; iop < nop; ++iop) {
+            for (iop = 0; iop < first_maskna_op; ++iop) {
                 if (op[iop] != NULL) {
                     tmp = npyiter_shape_string(PyArray_NDIM(op[iop]),
                                                     PyArray_DIMS(op[iop]),
@@ -1739,7 +1869,7 @@ broadcast_error: {
             errmsg = PyUString_FromString("operands could not be broadcast "
                                           "together with remapped shapes "
                                           "[original->remapped]: ");
-            for (iop = 0; iop < nop; ++iop) {
+            for (iop = 0; iop < first_maskna_op; ++iop) {
                 if (op[iop] != NULL) {
                     int *axes = op_axes[iop];
 
@@ -2077,6 +2207,7 @@ npyiter_apply_forced_iteration_order(NpyIter *iter, NPY_ORDER order)
     /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/
     int ndim = NIT_NDIM(iter);
     int iop, nop = NIT_NOP(iter);
+    int first_maskna_op = NIT_FIRST_MASKNA_OP(iter);
 
     switch (order) {
     case NPY_CORDER:
@@ -2097,7 +2228,7 @@ npyiter_apply_forced_iteration_order(NpyIter *iter, NPY_ORDER order)
             int forder = 1;
 
             /* Check that all the array inputs are fortran order */
-            for (iop = 0; iop < nop; ++iop, ++op) {
+            for (iop = 0; iop < first_maskna_op; ++iop, ++op) {
                 if (*op && !PyArray_CHKFLAGS(*op, NPY_ARRAY_F_CONTIGUOUS)) {
                    forder = 0;
                    break;
@@ -2127,6 +2258,7 @@ npyiter_flip_negative_strides(NpyIter *iter)
     npy_uint32 itflags = NIT_ITFLAGS(iter);
     int idim, ndim = NIT_NDIM(iter);
     int iop, nop = NIT_NOP(iter);
+    int first_maskna_op = NIT_FIRST_MASKNA_OP(iter);
 
     npy_intp istrides, nstrides = NAD_NSTRIDES();
     NpyIter_AxisData *axisdata, *axisdata0;
@@ -2141,10 +2273,9 @@ npyiter_flip_negative_strides(NpyIter *iter)
         int any_negative = 0;
 
         /*
-         * Check the signs of all the strides, excluding
-         * the index stride at the end.
+         * Check the signs of all the operand strides.
          */
-        for (iop = 0; iop < nop; ++iop) {
+        for (iop = 0; iop < first_maskna_op; ++iop) {
             if (strides[iop] < 0) {
                 any_negative = 1;
             }
@@ -2153,10 +2284,10 @@ npyiter_flip_negative_strides(NpyIter *iter)
             }
         }
         /*
-         * If at least on stride is negative and none are positive,
+         * If at least one stride is negative and none are positive,
          * flip all the strides for this dimension.
          */
-        if (any_negative && iop == nop) {
+        if (any_negative && iop == first_maskna_op) {
             npy_intp shapem1 = NAD_SHAPE(axisdata) - 1;
 
             for (istrides = 0; istrides < nstrides; ++istrides) {
@@ -2238,7 +2369,7 @@ npyiter_reverse_axis_ordering(NpyIter *iter)
     NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_IDENTPERM;
 }
 
-static npy_intp
+static NPY_INLINE npy_intp
 intp_abs(npy_intp x)
 {
     return (x < 0) ? -x : x;
@@ -2250,6 +2381,7 @@ npyiter_find_best_axis_ordering(NpyIter *iter)
     npy_uint32 itflags = NIT_ITFLAGS(iter);
     int idim, ndim = NIT_NDIM(iter);
     int iop, nop = NIT_NOP(iter);
+    int first_maskna_op = NIT_FIRST_MASKNA_OP(iter);
 
     npy_intp ax_i0, ax_i1, ax_ipos;
     npy_int8 ax_j0, ax_j1;
@@ -2281,7 +2413,7 @@ npyiter_find_best_axis_ordering(NpyIter *iter)
 
             strides1 = NAD_STRIDES(NIT_INDEX_AXISDATA(axisdata, ax_j1));
 
-            for (iop = 0; iop < nop; ++iop) {
+            for (iop = 0; iop < first_maskna_op; ++iop) {
                 if (strides0[iop] != 0 && strides1[iop] != 0) {
                     if (intp_abs(strides1[iop]) <=
                                             intp_abs(strides0[iop])) {
@@ -2388,7 +2520,7 @@ npyiter_find_best_axis_ordering(NpyIter *iter)
  * are not read from out of the calculation.
  */
 static PyArray_Descr *
-npyiter_get_common_dtype(int nop, PyArrayObject **op,
+npyiter_get_common_dtype(int first_maskna_op, PyArrayObject **op,
                         char *op_itflags, PyArray_Descr **op_dtype,
                         PyArray_Descr **op_request_dtypes,
                         int only_inputs, int output_scalars)
@@ -2401,7 +2533,7 @@ npyiter_get_common_dtype(int nop, PyArrayObject **op,
 
     NPY_IT_DBG_PRINT("Iterator: Getting a common data type from operands\n");
 
-    for (iop = 0; iop < nop; ++iop) {
+    for (iop = 0; iop < first_maskna_op; ++iop) {
         if (op_dtype[iop] != NULL &&
                     (!only_inputs || (op_itflags[iop] & NPY_OP_ITFLAG_READ))) {
             /* If no dtype was requested and the op is a scalar, pass the op */
@@ -2737,6 +2869,7 @@ npyiter_allocate_arrays(NpyIter *iter,
     npy_uint32 itflags = NIT_ITFLAGS(iter);
     int idim, ndim = NIT_NDIM(iter);
     int iop, nop = NIT_NOP(iter);
+    int first_maskna_op = NIT_FIRST_MASKNA_OP(iter);
 
     int check_writemasked_reductions = 0;
 
@@ -2747,7 +2880,7 @@ npyiter_allocate_arrays(NpyIter *iter,
         bufferdata = NIT_BUFFERDATA(iter);
     }
 
-    for (iop = 0; iop < nop; ++iop) {
+    for (iop = 0; iop < first_maskna_op; ++iop) {
         /*
          * Check whether there are any WRITEMASKED REDUCE operands
          * which should be validated after all the strides are filled
@@ -2783,6 +2916,17 @@ npyiter_allocate_arrays(NpyIter *iter,
             op[iop] = out;
 
             /*
+             * Add an NA mask to the array if needed. When NA dtypes
+             * are supported, this should skip allocating the mask
+             * if the allocated array has an NA dtype.
+             */
+            if (op_flags[iop] & NPY_ITER_USE_MASKNA) {
+                if (PyArray_AllocateMaskNA(out, 1, 0, 1) < 0) {
+                    return 0;
+                }
+            }
+
+            /*
              * Now we need to replace the pointers and strides with values
              * from the new array.
              */
@@ -2811,6 +2955,15 @@ npyiter_allocate_arrays(NpyIter *iter,
             if (temp == NULL) {
                 return 0;
             }
+            /*
+             * Add an NA mask if needed, defaulting to all NAs because
+             * the data is uninitialized
+             */
+            if (PyArray_HASMASKNA(op[iop])) {
+                if (PyArray_AllocateMaskNA(temp, 1, 0, 0) < 0) {
+                    return 0;
+                }
+            }
             if (PyArray_CopyInto(temp, op[iop]) != 0) {
                 Py_DECREF(temp);
                 return 0;
@@ -2852,8 +3005,18 @@ npyiter_allocate_arrays(NpyIter *iter,
             if (temp == NULL) {
                 return 0;
             }
+            /* Add an NA mask if needed */
+            if (PyArray_HASMASKNA(op[iop])) {
+                if (PyArray_AllocateMaskNA(temp, 1, 0, 1) < 0) {
+                    return 0;
+                }
+            }
 
-            /* If the data will be read, copy it into temp */
+            /*
+             * If the data will be read, copy it into temp.
+             * TODO: It might be possible to do a view into
+             *       op[iop]'s mask instead here.
+             */
             if (op_itflags[iop] & NPY_OP_ITFLAG_READ) {
                 if (PyArray_CopyInto(temp, op[iop]) != 0) {
                     Py_DECREF(temp);
@@ -2867,7 +3030,7 @@ npyiter_allocate_arrays(NpyIter *iter,
                  * the chain of bases.
                  */
                 Py_INCREF(op[iop]);
-                ((PyArrayObject_fieldaccess *)temp)->base =
+                ((PyArrayObject_fields *)temp)->base =
                                                         (PyObject *)op[iop];
                 PyArray_ENABLEFLAGS(temp, NPY_ARRAY_UPDATEIFCOPY);
                 PyArray_CLEARFLAGS(op[iop], NPY_ARRAY_WRITEABLE);
@@ -2929,7 +3092,7 @@ npyiter_allocate_arrays(NpyIter *iter,
         }
 
         /*
-         * If no alignment, byte swap, or casting is needed, and
+         * If no alignment, byte swap, or casting is needed,
          * the inner stride of this operand works for the whole
          * array, we can set NPY_OP_ITFLAG_BUFNEVER.
          */
@@ -2987,7 +3150,7 @@ npyiter_allocate_arrays(NpyIter *iter,
     }
 
     if (check_writemasked_reductions) {
-        for (iop = 0; iop < nop; ++iop) {
+        for (iop = 0; iop < first_maskna_op; ++iop) {
             /*
              * Check whether there are any WRITEMASKED REDUCE operands
              * which should be validated now that all the strides are filled
@@ -3015,19 +3178,167 @@ npyiter_allocate_arrays(NpyIter *iter,
 }
 
 /*
+ * Prepares the maskna virtual operands for the constructor
+ * operands, and fills in the axisdata. Fills in the reset data
+ * pointers of the mask operands, and may modify their 'op_itflags'.
+ *
+ * This needs to be called after any 'allocate' operands have
+ * been allocated. There is no validation of the shape/strides done,
+ * because the shape of a mask exactly matches the shape of the
+ * operand to which it is attached.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+static int
+npyiter_fill_maskna_axisdata(NpyIter *iter, int **op_axes)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    int idim, ndim = NIT_NDIM(iter);
+    int iop, iop_maskna, nop = NIT_NOP(iter);
+    int first_maskna_op = NIT_FIRST_MASKNA_OP(iter);
+    npy_int8 *perm;
+
+    char *op_itflags = NIT_OPITFLAGS(iter);
+    npy_int8 *maskna_indices = NIT_MASKNA_INDICES(iter);
+    NpyIter_AxisData *axisdata;
+    npy_intp sizeof_axisdata;
+    PyArrayObject **op = NIT_OPERANDS(iter), *op_cur;
+    char **op_dataptr = NIT_RESETDATAPTR(iter);
+    NpyIter_BufferData *bufferdata = NULL;
+
+    axisdata = NIT_AXISDATA(iter);
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+    perm = NIT_PERM(iter);
+
+    if (itflags & NPY_ITFLAG_BUFFER) {
+        bufferdata = NIT_BUFFERDATA(iter);
+    }
+
+    /* Fill in the reset dataptr array with the mask pointers */
+    for (iop = first_maskna_op; iop < nop; ++iop) {
+        /* If there's a mask, process that */
+        if (PyArray_HASMASKNA(op[iop])) {
+            op_dataptr[iop] = PyArray_MASKNA_DATA(op[iop]);
+        }
+        /*
+         * Otherwise we create a virtual operand with a single value
+         * of one, broadcast everywhere
+         */
+        else {
+            static char ones_virtual_mask_data = 1;
+
+            op_itflags[iop] |= (NPY_OP_ITFLAG_VIRTUAL |
+                                NPY_OP_ITFLAG_BUFNEVER);
+            op_dataptr[iop] = &ones_virtual_mask_data;
+            if (itflags & NPY_ITFLAG_BUFFER) {
+                NBF_PTRS(bufferdata)[iop] = op_dataptr[iop];
+            }
+        }
+    }
+
+    /* Process the maskna operands, filling in the axisdata */
+    for (idim = 0; idim < ndim; ++idim) {
+        npy_intp *strides = NAD_STRIDES(axisdata);
+        npy_int8 p;
+        int idim_permuted;
+
+        p = perm[idim];
+        if (p < 0) {
+            idim_permuted = -1-p;
+        }
+        else {
+            idim_permuted = p;
+        }
+        for (iop = first_maskna_op; iop < nop; ++iop) {
+            /*
+             * iop_maskna is the index of the USE_MASKNA input,
+             * iop is the index of the corresponding mask.
+             */
+            iop_maskna = maskna_indices[iop];
+            op_cur = op[iop_maskna];
+
+            /*
+             * The strides of the mask will be zero exactly
+             * where they're zero for the main data, or will
+             * be zero always if the operand has no NA support and
+             * a virtual mask of all ones is being used.
+             */
+            if (strides[iop_maskna] == 0 || !PyArray_HASMASKNA(op_cur)) {
+                strides[iop] = 0;
+            }
+            else {
+                int i;
+
+                if (op_axes == NULL || op_axes[iop_maskna] == NULL) {
+                    i = PyArray_NDIM(op_cur) - idim_permuted - 1;
+                }
+                else {
+                    i = op_axes[iop_maskna][ndim-idim_permuted-1];
+                }
+
+                strides[iop] = PyArray_MASKNA_STRIDES(op_cur)[i];
+                /* Reverse the axis if necessary */
+                if (p < 0) {
+                    op_dataptr[iop] += (NAD_SHAPE(axisdata)-1) * strides[iop];
+                    strides[iop] = -strides[iop];
+                }
+            }
+        }
+
+        NIT_ADVANCE_AXISDATA(axisdata, 1);
+    }
+
+    /* Initialize the mask data pointers */
+    axisdata = NIT_AXISDATA(iter);
+    for (idim = 0; idim < ndim; ++idim) {
+        memcpy(NAD_PTRS(axisdata) + first_maskna_op,
+                op_dataptr + first_maskna_op,
+                NPY_SIZEOF_INTP*(nop - first_maskna_op));
+
+        NIT_ADVANCE_AXISDATA(axisdata, 1);
+    }
+
+    /* Initialize the strides of any BUFNEVER mask operands */
+    if (itflags & NPY_ITFLAG_BUFFER) {
+        npy_intp *strides = NBF_STRIDES(bufferdata);
+
+        for (iop = first_maskna_op; iop < nop; ++iop) {
+            if (op_itflags[iop] & NPY_OP_ITFLAG_BUFNEVER) {
+                if (PyArray_HASMASKNA(op[iop])) {
+                    axisdata = NIT_AXISDATA(iter);
+                    /* Find stride of the first axis with shape != 1 */
+                    for (idim = 0; idim < ndim; ++idim) {
+                        if (NAD_SHAPE(axisdata) != 1) {
+                            strides[iop] = NAD_STRIDES(axisdata)[iop];
+                            break;
+                        }
+                        NIT_ADVANCE_AXISDATA(axisdata, 1);
+                    }
+                }
+                else {
+                    strides[iop] = 0;
+                }
+            }
+        }
+    }
+
+    return 1;
+}
+
+/*
  * The __array_priority__ attribute of the inputs determines
  * the subtype of any output arrays.  This function finds the
  * subtype of the input array with highest priority.
  */
 static void
-npyiter_get_priority_subtype(int nop, PyArrayObject **op,
+npyiter_get_priority_subtype(int first_maskna_op, PyArrayObject **op,
                             char *op_itflags,
                             double *subtype_priority,
                             PyTypeObject **subtype)
 {
     int iop;
 
-    for (iop = 0; iop < nop; ++iop) {
+    for (iop = 0; iop < first_maskna_op; ++iop) {
         if (op[iop] != NULL && op_itflags[iop] & NPY_OP_ITFLAG_READ) {
             double priority = PyArray_GetPriority((PyObject *)op[iop], 0.0);
             if (priority > *subtype_priority) {
@@ -3044,6 +3355,7 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
     npy_uint32 itflags = NIT_ITFLAGS(iter);
     /*int ndim = NIT_NDIM(iter);*/
     int iop = 0, nop = NIT_NOP(iter);
+    int first_maskna_op = NIT_FIRST_MASKNA_OP(iter);
 
     npy_intp i;
     char *op_itflags = NIT_OPITFLAGS(iter);
@@ -3052,12 +3364,13 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
     PyArrayObject **op = NIT_OPERANDS(iter);
     PyArray_Descr **op_dtype = NIT_DTYPES(iter);
     npy_intp *strides = NAD_STRIDES(axisdata), op_stride;
-    PyArray_StridedTransferFn **readtransferfn = NBF_READTRANSFERFN(bufferdata),
+    PyArray_StridedUnaryOp **readtransferfn = NBF_READTRANSFERFN(bufferdata),
                         **writetransferfn = NBF_WRITETRANSFERFN(bufferdata);
     NpyAuxData **readtransferdata = NBF_READTRANSFERDATA(bufferdata),
                **writetransferdata = NBF_WRITETRANSFERDATA(bufferdata);
+    npy_int8 *maskna_indices = NIT_MASKNA_INDICES(iter);
 
-    PyArray_StridedTransferFn *stransfer = NULL;
+    PyArray_StridedUnaryOp *stransfer = NULL;
     NpyAuxData *transferdata = NULL;
     int needs_api = 0;
 
@@ -3075,13 +3388,18 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
          * allocate the appropriate transfer functions
          */
         if (!(flags & NPY_OP_ITFLAG_BUFNEVER)) {
+            PyArray_Descr *op_orig_dtype;
+
+            /* Get either the array's or the array NA mask's dtype */
+            op_orig_dtype = (iop < first_maskna_op) ? PyArray_DESCR(op[iop])
+                                           : PyArray_MASKNA_DTYPE(op[iop]);
             if (flags & NPY_OP_ITFLAG_READ) {
                 int move_references = 0;
                 if (PyArray_GetDTypeTransferFunction(
                                         (flags & NPY_OP_ITFLAG_ALIGNED) != 0,
                                         op_stride,
                                         op_dtype[iop]->elsize,
-                                        PyArray_DESCR(op[iop]),
+                                        op_orig_dtype,
                                         op_dtype[iop],
                                         move_references,
                                         &stransfer,
@@ -3099,9 +3417,39 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
                 int move_references = 1;
 
                 /*
-                 * If the operand is WRITEMASKED, use a masked transfer fn.
+                 * If the operand has USE_MASKNA, use a masked transfer fn.
+                 * The masks for the maskna operands can be copied straight
+                 * unless the operand is also WRITEMASKED.
                  */
-                if (flags & NPY_OP_ITFLAG_WRITEMASKED) {
+                if (iop < first_maskna_op && maskna_indices[iop] >= 0) {
+                    /* TODO: support USE_MASKNA + WRITEMASKED together */
+                    PyArray_Descr *mask_dtype =
+                                    PyArray_MASKNA_DTYPE(op[iop]);
+                    int iop_maskna = maskna_indices[iop];
+
+                    /*
+                     * If the mask's stride is contiguous, use it, otherwise
+                     * the mask may or may not be buffered, so the stride
+                     * could be inconsistent.
+                     */
+                    if (PyArray_GetMaskedDTypeTransferFunction(
+                                (flags & NPY_OP_ITFLAG_ALIGNED) != 0,
+                                op_dtype[iop]->elsize,
+                                op_stride,
+                            (flags & NPY_OP_ITFLAG_REDUCE) ? NPY_MAX_INTP :
+                                                   strides[iop_maskna],
+                                op_dtype[iop],
+                                op_orig_dtype,
+                                mask_dtype,
+                                move_references,
+                                (PyArray_MaskedStridedUnaryOp **)&stransfer,
+                                &transferdata,
+                                &needs_api) != NPY_SUCCEED) {
+                        goto fail;
+                    }
+                }
+                /* If the operand is WRITEMASKED, use a masked transfer fn */
+                else if (flags & NPY_OP_ITFLAG_WRITEMASKED) {
                     int maskop = NIT_MASKOP(iter);
                     PyArray_Descr *mask_dtype = PyArray_DESCR(op[maskop]);
 
@@ -3118,10 +3466,10 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
                                                 mask_dtype->elsize :
                                                 NPY_MAX_INTP,
                                 op_dtype[iop],
-                                PyArray_DESCR(op[iop]),
+                                op_orig_dtype,
                                 mask_dtype,
                                 move_references,
-                                (PyArray_MaskedStridedTransferFn **)&stransfer,
+                                (PyArray_MaskedStridedUnaryOp **)&stransfer,
                                 &transferdata,
                                 &needs_api) != NPY_SUCCEED) {
                         goto fail;
@@ -3133,7 +3481,7 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
                                         op_dtype[iop]->elsize,
                                         op_stride,
                                         op_dtype[iop],
-                                        PyArray_DESCR(op[iop]),
+                                        op_orig_dtype,
                                         move_references,
                                         &stransfer,
                                         &transferdata,
diff --git a/numpy/core/src/multiarray/nditer_impl.h b/numpy/core/src/multiarray/nditer_impl.h
index e5ec487f8..de361470d 100644
--- a/numpy/core/src/multiarray/nditer_impl.h
+++ b/numpy/core/src/multiarray/nditer_impl.h
@@ -101,6 +101,8 @@
 #define NPY_ITFLAG_REDUCE       0x1000
 /* Reduce iteration doesn't need to recalculate reduce loops next time */
 #define NPY_ITFLAG_REUSE_REDUCE_LOOPS 0x2000
+/* The iterator has one or more operands with NPY_ITER_USE_MASKNA set */
+#define NPY_ITFLAG_HAS_MASKNA_OP 0x4000
 
 /* Internal iterator per-operand iterator flags */
 
@@ -133,9 +135,8 @@
 struct NpyIter_InternalOnly {
     /* Initial fixed position data */
     npy_uint32 itflags;
-    npy_uint8 ndim, nop;
+    npy_uint8 ndim, nop, first_maskna_op;
     npy_int8 maskop;
-    npy_uint8 unused_padding;
     npy_intp itersize, iterstart, iterend;
     /* iterindex is only used if RANGED or BUFFERED is set */
     npy_intp iterindex;
@@ -149,6 +150,8 @@ typedef struct NpyIter_BD NpyIter_BufferData;
 /* Byte sizes of the iterator members */
 #define NIT_PERM_SIZEOF(itflags, ndim, nop) \
         NPY_INTP_ALIGNED(NPY_MAXDIMS)
+#define NIT_MASKNA_INDICES_SIZEOF(itflags, ndim, nop) \
+        NPY_INTP_ALIGNED(NPY_MAXDIMS)
 #define NIT_DTYPES_SIZEOF(itflags, ndim, nop) \
         ((NPY_SIZEOF_INTP)*(nop))
 #define NIT_RESETDATAPTR_SIZEOF(itflags, ndim, nop) \
@@ -165,9 +168,12 @@ typedef struct NpyIter_BD NpyIter_BufferData;
 /* Byte offsets of the iterator members starting from iter->iter_flexdata */
 #define NIT_PERM_OFFSET() \
         (0)
-#define NIT_DTYPES_OFFSET(itflags, ndim, nop) \
+#define NIT_MASKNA_INDICES_OFFSET(itflags, ndim, nop) \
         (NIT_PERM_OFFSET() + \
          NIT_PERM_SIZEOF(itflags, ndim, nop))
+#define NIT_DTYPES_OFFSET(itflags, ndim, nop) \
+        (NIT_MASKNA_INDICES_OFFSET(itflags, ndim, nop) + \
+         NIT_MASKNA_INDICES_SIZEOF(itflags, ndim, nop))
 #define NIT_RESETDATAPTR_OFFSET(itflags, ndim, nop) \
         (NIT_DTYPES_OFFSET(itflags, ndim, nop) + \
          NIT_DTYPES_SIZEOF(itflags, ndim, nop))
@@ -194,6 +200,8 @@ typedef struct NpyIter_BD NpyIter_BufferData;
         ((iter)->ndim)
 #define NIT_NOP(iter) \
         ((iter)->nop)
+#define NIT_FIRST_MASKNA_OP(iter) \
+        ((iter)->first_maskna_op)
 #define NIT_MASKOP(iter) \
         ((iter)->maskop)
 #define NIT_ITERSIZE(iter) \
@@ -206,6 +214,8 @@ typedef struct NpyIter_BD NpyIter_BufferData;
         (iter->iterindex)
 #define NIT_PERM(iter)  ((npy_int8 *)( \
         &(iter)->iter_flexdata + NIT_PERM_OFFSET()))
+#define NIT_MASKNA_INDICES(iter) ((npy_int8 *)( \
+        &(iter)->iter_flexdata + NIT_MASKNA_INDICES_OFFSET(itflags, ndim, nop)))
 #define NIT_DTYPES(iter) ((PyArray_Descr **)( \
         &(iter)->iter_flexdata + NIT_DTYPES_OFFSET(itflags, ndim, nop)))
 #define NIT_RESETDATAPTR(iter) ((char **)( \
@@ -241,11 +251,11 @@ struct NpyIter_BD {
         (&(bufferdata)->bd_flexdata + 2*(nop)))
 #define NBF_REDUCE_OUTERPTRS(bufferdata) ((char **) \
         (&(bufferdata)->bd_flexdata + 3*(nop)))
-#define NBF_READTRANSFERFN(bufferdata) ((PyArray_StridedTransferFn **) \
+#define NBF_READTRANSFERFN(bufferdata) ((PyArray_StridedUnaryOp **) \
         (&(bufferdata)->bd_flexdata + 4*(nop)))
 #define NBF_READTRANSFERDATA(bufferdata) ((NpyAuxData **) \
         (&(bufferdata)->bd_flexdata + 5*(nop)))
-#define NBF_WRITETRANSFERFN(bufferdata) ((PyArray_StridedTransferFn **) \
+#define NBF_WRITETRANSFERFN(bufferdata) ((PyArray_StridedUnaryOp **) \
         (&(bufferdata)->bd_flexdata + 6*(nop)))
 #define NBF_WRITETRANSFERDATA(bufferdata) ((NpyAuxData **) \
         (&(bufferdata)->bd_flexdata + 7*(nop)))
diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c
index 5e4f0ea74..e5061c999 100644
--- a/numpy/core/src/multiarray/nditer_pywrap.c
+++ b/numpy/core/src/multiarray/nditer_pywrap.c
@@ -256,71 +256,6 @@ npyiter_order_converter(PyObject *order_in, NPY_ORDER *order)
     return 0;
 }
 
-/*NUMPY_API
- * Convert any Python object, *obj*, to an NPY_CASTING enum.
- * TODO: Move elsewhere
- */
-NPY_NO_EXPORT int
-PyArray_CastingConverter(PyObject *obj, NPY_CASTING *casting)
-{
-    char *str = NULL;
-    Py_ssize_t length = 0;
-
-    if (PyUnicode_Check(obj)) {
-        PyObject *str_obj;
-        int ret;
-        str_obj = PyUnicode_AsASCIIString(obj);
-        if (str_obj == NULL) {
-            return 0;
-        }
-        ret = PyArray_CastingConverter(str_obj, casting);
-        Py_DECREF(str_obj);
-        return ret;
-    }
-
-    if (PyBytes_AsStringAndSize(obj, &str, &length) == -1) {
-        return 0;
-    }
-
-    if (length >= 2) switch (str[2]) {
-        case 0:
-            if (strcmp(str, "no") == 0) {
-                *casting = NPY_NO_CASTING;
-                return 1;
-            }
-            break;
-        case 'u':
-            if (strcmp(str, "equiv") == 0) {
-                *casting = NPY_EQUIV_CASTING;
-                return 1;
-            }
-            break;
-        case 'f':
-            if (strcmp(str, "safe") == 0) {
-                *casting = NPY_SAFE_CASTING;
-                return 1;
-            }
-            break;
-        case 'm':
-            if (strcmp(str, "same_kind") == 0) {
-                *casting = NPY_SAME_KIND_CASTING;
-                return 1;
-            }
-            break;
-        case 's':
-            if (strcmp(str, "unsafe") == 0) {
-                *casting = NPY_UNSAFE_CASTING;
-                return 1;
-            }
-            break;
-    }
-
-    PyErr_SetString(PyExc_ValueError,
-            "casting must be one of 'no', 'equiv', 'safe', "
-            "'same_kind', or 'unsafe'");
-    return 0;
-}
-
 static int
 NpyIter_OpFlagsConverter(PyObject *op_flags_in,
                          npy_uint32 *op_flags)
@@ -428,8 +363,17 @@ NpyIter_OpFlagsConverter(PyObject *op_flags_in,
                 }
                 break;
             case 'u':
-                if (strcmp(str, "updateifcopy") == 0) {
-                    flag = NPY_ITER_UPDATEIFCOPY;
+                switch (str[1]) {
+                    case 'p':
+                        if (strcmp(str, "updateifcopy") == 0) {
+                            flag = NPY_ITER_UPDATEIFCOPY;
+                        }
+                        break;
+                    case 's':
+                        if (strcmp(str, "use_maskna") == 0) {
+                            flag = NPY_ITER_USE_MASKNA;
+                        }
+                        break;
                 }
                 break;
             case 'v':
@@ -659,9 +603,9 @@ npyiter_convert_op_axes(PyObject *op_axes_in, npy_intp nop,
 }
 
 /*
- * Converts the operand array and op_flags array into the form NpyIter_AdvancedNew
- * needs.  Sets nop, and on success, each op[i] owns a reference
- * to an array object.
+ * Converts the operand array and op_flags array into the form
+ * NpyIter_AdvancedNew needs.  Sets nop, and on success, each
+ * op[i] owns a reference to an array object.
  */
 static int
 npyiter_convert_ops(PyObject *op_in, PyObject *op_flags_in,
@@ -669,6 +613,7 @@ npyiter_convert_ops(PyObject *op_in, PyObject *op_flags_in,
                     int *nop_out)
 {
     int iop, nop;
+    int any_maskna;
 
     /* nop and op */
     if (PyTuple_Check(op_in) || PyList_Check(op_in)) {
@@ -738,10 +683,10 @@ npyiter_convert_ops(PyObject *op_in, PyObject *op_flags_in,
     for (iop = 0; iop < nop; ++iop) {
         if (op[iop] != NULL) {
             PyArrayObject *ao;
-            int fromanyflags = 0;
+            int fromanyflags = NPY_ARRAY_ALLOWNA;
 
             if (op_flags[iop]&(NPY_ITER_READWRITE|NPY_ITER_WRITEONLY)) {
-                fromanyflags = NPY_ARRAY_UPDATEIFCOPY;
+                fromanyflags |= NPY_ARRAY_UPDATEIFCOPY;
             }
             ao = (PyArrayObject *)PyArray_FromAny((PyObject *)op[iop],
                                             NULL, 0, 0, fromanyflags, NULL);
@@ -764,6 +709,33 @@ npyiter_convert_ops(PyObject *op_in, PyObject *op_flags_in,
         }
     }
 
+    /*
+     * Because the Python exposure of nditer knows how to deal with
+     * NA-masked arrays, we automatically add NPY_ITER_USE_MASKNA
+     * flags for convenience.
+     */
+    any_maskna = 0;
+    for (iop = 0; iop < nop; ++iop) {
+        /* Enable MASKNA iteration if the op needs it */
+        if (op[iop] != NULL && PyArray_HASMASKNA(op[iop])) {
+            op_flags[iop] |= NPY_ITER_USE_MASKNA;
+            any_maskna = 1;
+        }
+    }
+    /*
+     * If any operands had an NA-mask, add it to the 'allocate' ones too.
+     * This causes the Python exposure nditer to have slightly less control
+     * than the C NpyIter usage, but is generally going to be what people
+     * want.
+     */
+    if (any_maskna) {
+        for (iop = 0; iop < nop; ++iop) {
+            if (op[iop] == NULL) {
+                op_flags[iop] |= NPY_ITER_USE_MASKNA;
+            }
+        }
+    }
+
     return 1;
 }
 
@@ -859,6 +831,7 @@ npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds)
         itershape.ptr = NULL;
     }
 
+
     self->iter = NpyIter_AdvancedNew(nop, op, flags, order, casting, op_flags,
                                   op_request_dtypes,
                                   oa_ndim, oa_ndim > 0 ? op_axes : NULL,
@@ -1442,7 +1415,7 @@ static PyObject *npyiter_value_get(NewNpyArrayIterObject *self)
 {
     PyObject *ret;
 
-    npy_intp iop, nop;
+    npy_intp iop, first_maskna_op;
 
     if (self->iter == NULL || self->finished) {
         PyErr_SetString(PyExc_ValueError,
@@ -1450,18 +1423,18 @@ static PyObject *npyiter_value_get(NewNpyArrayIterObject *self)
         return NULL;
     }
 
-    nop = NpyIter_GetNOp(self->iter);
+    first_maskna_op = NpyIter_GetFirstMaskNAOp(self->iter);
 
     /* Return an array  or tuple of arrays with the values */
-    if (nop == 1) {
+    if (first_maskna_op == 1) {
         ret = npyiter_seq_item(self, 0);
     }
     else {
-        ret = PyTuple_New(nop);
+        ret = PyTuple_New(first_maskna_op);
         if (ret == NULL) {
             return NULL;
         }
-        for (iop = 0; iop < nop; ++iop) {
+        for (iop = 0; iop < first_maskna_op; ++iop) {
             PyObject *a = npyiter_seq_item(self, iop);
             if (a == NULL) {
                 Py_DECREF(ret);
@@ -1478,7 +1451,7 @@ static PyObject *npyiter_operands_get(NewNpyArrayIterObject *self)
 {
     PyObject *ret;
 
-    npy_intp iop, nop;
+    npy_intp iop, first_maskna_op;
     PyArrayObject **operands;
 
     if (self->iter == NULL) {
@@ -1487,14 +1460,14 @@ static PyObject *npyiter_operands_get(NewNpyArrayIterObject *self)
         return NULL;
     }
 
-    nop = NpyIter_GetNOp(self->iter);
+    first_maskna_op = NpyIter_GetFirstMaskNAOp(self->iter);
     operands = self->operands;
 
-    ret = PyTuple_New(nop);
+    ret = PyTuple_New(first_maskna_op);
     if (ret == NULL) {
         return NULL;
     }
-    for (iop = 0; iop < nop; ++iop) {
+    for (iop = 0; iop < first_maskna_op; ++iop) {
         PyObject *operand = (PyObject *)operands[iop];
 
         Py_INCREF(operand);
@@ -1508,7 +1481,7 @@ static PyObject *npyiter_itviews_get(NewNpyArrayIterObject *self)
 {
     PyObject *ret;
 
-    npy_intp iop, nop;
+    npy_intp iop, first_maskna_op;
 
     if (self->iter == NULL) {
         PyErr_SetString(PyExc_ValueError,
@@ -1516,13 +1489,13 @@ static PyObject *npyiter_itviews_get(NewNpyArrayIterObject *self)
         return NULL;
     }
 
-    nop = NpyIter_GetNOp(self->iter);
+    first_maskna_op = NpyIter_GetFirstMaskNAOp(self->iter);
 
-    ret = PyTuple_New(nop);
+    ret = PyTuple_New(first_maskna_op);
     if (ret == NULL) {
         return NULL;
     }
-    for (iop = 0; iop < nop; ++iop) {
+    for (iop = 0; iop < first_maskna_op; ++iop) {
         PyArrayObject *view = NpyIter_GetIterView(self->iter, iop);
 
         if (view == NULL) {
@@ -1629,7 +1602,8 @@ static PyObject *npyiter_multi_index_get(NewNpyArrayIterObject *self)
     }
 }
 
-static int npyiter_multi_index_set(NewNpyArrayIterObject *self, PyObject *value)
+static int
+npyiter_multi_index_set(NewNpyArrayIterObject *self, PyObject *value)
 {
     npy_intp idim, ndim, multi_index[NPY_MAXDIMS];
 
@@ -1976,7 +1950,11 @@ static PyObject *npyiter_nop_get(NewNpyArrayIterObject *self)
         return NULL;
     }
 
-    return PyInt_FromLong(NpyIter_GetNOp(self->iter));
+    /*
+     * We only expose the provided operands, which is everything
+     * before the first MASKNA operand.
+     */
+    return PyInt_FromLong(NpyIter_GetFirstMaskNAOp(self->iter));
 }
 
 static PyObject *npyiter_itersize_get(NewNpyArrayIterObject *self)
@@ -2007,7 +1985,11 @@ npyiter_seq_length(NewNpyArrayIterObject *self)
         return 0;
     }
     else {
-        return NpyIter_GetNOp(self->iter);
+        /*
+         * We only expose the provided operands, which is everything
+         * before the first MASKNA operand.
+         */
+        return NpyIter_GetFirstMaskNAOp(self->iter);
     }
 }
 
@@ -2016,10 +1998,13 @@ npyiter_seq_item(NewNpyArrayIterObject *self, Py_ssize_t i)
 {
     PyArrayObject *ret;
 
+    npy_int8 *maskna_indices;
     npy_intp ret_ndim;
     npy_intp nop, innerloopsize, innerstride;
     char *dataptr;
     PyArray_Descr *dtype;
+    int has_external_loop;
+    Py_ssize_t i_orig = i;
 
     if (self->iter == NULL || self->finished) {
         PyErr_SetString(PyExc_ValueError,
@@ -2034,10 +2019,20 @@ npyiter_seq_item(NewNpyArrayIterObject *self, Py_ssize_t i)
         return NULL;
     }
 
-    nop = NpyIter_GetNOp(self->iter);
+    /*
+     * We only expose the provided operands, which is everything
+     * before the first MASKNA operand.
+     */
+    nop = NpyIter_GetFirstMaskNAOp(self->iter);
+
+    /* Negative indexing */
+    if (i < 0) {
+        i += nop;
+    }
+
     if (i < 0 || i >= nop) {
         PyErr_Format(PyExc_IndexError,
-                "Iterator operand index %d is out of bounds", (int)i);
+                "Iterator operand index %d is out of bounds", (int)i_orig);
         return NULL;
     }
 
@@ -2058,8 +2053,10 @@ npyiter_seq_item(NewNpyArrayIterObject *self, Py_ssize_t i)
 
     dataptr = self->dataptrs[i];
     dtype = self->dtypes[i];
+    has_external_loop = NpyIter_HasExternalLoop(self->iter);
+    maskna_indices = NpyIter_GetMaskNAIndexArray(self->iter);
 
-    if (NpyIter_HasExternalLoop(self->iter)) {
+    if (has_external_loop) {
         innerloopsize = *self->innerloopsizeptr;
         innerstride = self->innerstrides[i];
         ret_ndim = 1;
@@ -2082,6 +2079,22 @@ npyiter_seq_item(NewNpyArrayIterObject *self, Py_ssize_t i)
         return NULL;
     }
 
+    /* If this is a USE_MASKNA operand, include the mask */
+    if (maskna_indices[i] >= 0) {
+        PyArrayObject_fields *fret = (PyArrayObject_fields *)ret;
+        int i_maskna = maskna_indices[i];
+
+        fret->maskna_dtype = NpyIter_GetDescrArray(self->iter)[i_maskna];
+        Py_INCREF(fret->maskna_dtype);
+        fret->maskna_data = self->dataptrs[i_maskna];
+        if (has_external_loop) {
+            fret->maskna_strides[0] = self->innerstrides[i_maskna];
+        }
+
+        fret->flags |= NPY_ARRAY_MASKNA;
+        fret->flags &= ~NPY_ARRAY_OWNMASKNA;
+    }
+
     PyArray_UpdateFlags(ret, NPY_ARRAY_UPDATE_ALL);
 
     return (PyObject *)ret;
@@ -2108,7 +2121,11 @@ npyiter_seq_slice(NewNpyArrayIterObject *self,
         return NULL;
     }
 
-    nop = NpyIter_GetNOp(self->iter);
+    /*
+     * We only expose the provided operands, which is everything
+     * before the first MASKNA operand.
+     */
+    nop = NpyIter_GetFirstMaskNAOp(self->iter);
     if (ilow < 0) {
         ilow = 0;
     }
@@ -2142,10 +2159,13 @@ npyiter_seq_ass_item(NewNpyArrayIterObject *self, Py_ssize_t i, PyObject *v)
 {
 
     npy_intp nop, innerloopsize, innerstride;
+    npy_int8 *maskna_indices;
     char *dataptr;
     PyArray_Descr *dtype;
     PyArrayObject *tmp;
-    int ret;
+    int ret, has_external_loop;
+    Py_ssize_t i_orig = i;
+
 
     if (v == NULL) {
         PyErr_SetString(PyExc_ValueError,
@@ -2166,22 +2186,33 @@ npyiter_seq_ass_item(NewNpyArrayIterObject *self, Py_ssize_t i, PyObject *v)
         return -1;
     }
 
-    nop = NpyIter_GetNOp(self->iter);
+    /*
+     * We only expose the provided operands, which is everything
+     * before the first MASKNA operand.
+     */
+    nop = NpyIter_GetFirstMaskNAOp(self->iter);
+
+    /* Negative indexing */
+    if (i < 0) {
+        i += nop;
+    }
+
     if (i < 0 || i >= nop) {
         PyErr_Format(PyExc_IndexError,
-                "Iterator operand index %d is out of bounds", (int)i);
+                "Iterator operand index %d is out of bounds", (int)i_orig);
         return -1;
     }
     if (!self->writeflags[i]) {
         PyErr_Format(PyExc_RuntimeError,
-                "Iterator operand %d is not writeable", (int)i);
+                "Iterator operand %d is not writeable", (int)i_orig);
         return -1;
     }
 
     dataptr = self->dataptrs[i];
     dtype = self->dtypes[i];
+    has_external_loop = NpyIter_HasExternalLoop(self->iter);
 
-    if (NpyIter_HasExternalLoop(self->iter)) {
+    if (has_external_loop) {
         innerloopsize = *self->innerloopsizeptr;
         innerstride = self->innerstrides[i];
     }
@@ -2190,6 +2221,8 @@ npyiter_seq_ass_item(NewNpyArrayIterObject *self, Py_ssize_t i, PyObject *v)
         innerstride = 0;
     }
 
+    maskna_indices = NpyIter_GetMaskNAIndexArray(self->iter);
+
     /* TODO - there should be a better way than this... */
     Py_INCREF(dtype);
     tmp = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, dtype,
@@ -2199,7 +2232,27 @@ npyiter_seq_ass_item(NewNpyArrayIterObject *self, Py_ssize_t i, PyObject *v)
     if (tmp == NULL) {
         return -1;
     }
+    /* If this is a USE_MASKNA operand, include the mask */
+    if (maskna_indices[i] >= 0) {
+        PyArrayObject_fields *ftmp = (PyArrayObject_fields *)tmp;
+        int i_maskna = maskna_indices[i];
+
+        ftmp->maskna_dtype = NpyIter_GetDescrArray(self->iter)[i_maskna];
+        Py_INCREF(ftmp->maskna_dtype);
+        ftmp->maskna_data = self->dataptrs[i_maskna];
+        if (has_external_loop) {
+            ftmp->maskna_strides[0] = self->innerstrides[i_maskna];
+        }
+        else {
+            ftmp->maskna_strides[0] = 0;
+        }
+
+        ftmp->flags |= NPY_ARRAY_MASKNA;
+        ftmp->flags &= ~NPY_ARRAY_OWNMASKNA;
+    }
+
     PyArray_UpdateFlags(tmp, NPY_ARRAY_UPDATE_ALL);
+
     ret = PyArray_CopyObject(tmp, v);
     Py_DECREF(tmp);
     return ret;
@@ -2231,7 +2284,11 @@ npyiter_seq_ass_slice(NewNpyArrayIterObject *self, Py_ssize_t ilow,
         return -1;
     }
 
-    nop = NpyIter_GetNOp(self->iter);
+    /*
+     * We only expose the provided operands, which is everything
+     * before the first MASKNA operand.
+     */
+    nop = NpyIter_GetFirstMaskNAOp(self->iter);
     if (ilow < 0) {
         ilow = 0;
     }
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index 8bb9dfc3d..861ef5c05 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -52,7 +52,7 @@ PyArray_SetNumericOps(PyObject *dict)
     SET(power);
     SET(square);
     SET(reciprocal);
-    SET(ones_like);
+    SET(_ones_like);
     SET(sqrt);
     SET(negative);
     SET(absolute);
@@ -103,7 +103,7 @@ PyArray_GetNumericOps(void)
     GET(power);
     GET(square);
     GET(reciprocal);
-    GET(ones_like);
+    GET(_ones_like);
     GET(sqrt);
     GET(negative);
     GET(absolute);
@@ -348,7 +348,7 @@ fast_scalar_power(PyArrayObject *a1, PyObject *o2, int inplace)
                 fastop = n_ops.reciprocal;
             }
             else if (exp ==  0.0) {
-                fastop = n_ops.ones_like;
+                fastop = n_ops._ones_like;
             }
             else if (exp ==  0.5) {
                 fastop = n_ops.sqrt;
diff --git a/numpy/core/src/multiarray/number.h b/numpy/core/src/multiarray/number.h
index 8f1cb3b91..0018b7348 100644
--- a/numpy/core/src/multiarray/number.h
+++ b/numpy/core/src/multiarray/number.h
@@ -10,7 +10,7 @@ typedef struct {
     PyObject *power;
     PyObject *square;
     PyObject *reciprocal;
-    PyObject *ones_like;
+    PyObject *_ones_like;
     PyObject *sqrt;
     PyObject *negative;
     PyObject *absolute;
diff --git a/numpy/core/src/multiarray/reduction.c b/numpy/core/src/multiarray/reduction.c
new file mode 100644
index 000000000..b17e3ca6e
--- /dev/null
+++ b/numpy/core/src/multiarray/reduction.c
@@ -0,0 +1,1192 @@
+/*
+ * This file implements generic methods for computing reductions on arrays.
+ *
+ * Written by Mark Wiebe (mwwiebe@gmail.com)
+ * Copyright (c) 2011 by Enthought, Inc.
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API
+#define _MULTIARRAYMODULE
+#include <numpy/arrayobject.h>
+
+#include "npy_config.h"
+#include "numpy/npy_3kcompat.h"
+
+#include "lowlevel_strided_loops.h"
+#include "na_mask.h"
+#include "reduction.h"
+
+/*
+ * Allocates a result array for a reduction operation, with
+ * dimensions matching 'arr' except set to 1 with 0 stride
+ * wherever axis_flags is True. Dropping the reduction axes
+ * from the result must be done later by the caller once the
+ * computation is complete.
+ *
+ * This function never adds an NA mask to the allocated
+ * result, that is the responsibility of the caller.
+ *
+ * arr        : The array being reduced; supplies the number of
+ *              dimensions, the shape and the strides used to order
+ *              the axes of the result.
+ * axis_flags : One flag per axis of 'arr', true for reduction axes.
+ * dtype      : The dtype of the result, or NULL to use the dtype
+ *              of 'arr'.
+ * subok      : If true, the result is created with the Python type of
+ *              'arr' and 'arr' is passed as the last argument of
+ *              PyArray_NewFromDescr; otherwise a base class ndarray
+ *              is created and NULL is passed there.
+ *
+ * If 'dtype' isn't NULL, this function steals its reference.
+ *
+ * Returns the value produced by PyArray_NewFromDescr.
+ */
+static PyArrayObject *
+allocate_reduce_result(PyArrayObject *arr, npy_bool *axis_flags,
+                        PyArray_Descr *dtype, int subok)
+{
+    npy_intp strides[NPY_MAXDIMS], stride;
+    npy_intp shape[NPY_MAXDIMS], *arr_shape = PyArray_DIMS(arr);
+    npy_stride_sort_item strideperm[NPY_MAXDIMS];
+    int idim, ndim = PyArray_NDIM(arr);
+
+    if (dtype == NULL) {
+        dtype = PyArray_DTYPE(arr);
+        Py_INCREF(dtype); /* own a reference, as when the caller passes one */
+    }
+
+    PyArray_CreateSortedStridePerm(PyArray_NDIM(arr), PyArray_SHAPE(arr),
+                                    PyArray_STRIDES(arr), strideperm);
+
+    /*
+     * Build the new strides and shape.
+     *
+     * The axes are visited through 'strideperm', from its last entry to
+     * its first. Each non-reduction axis receives the running 'stride',
+     * which starts at the item size and is then multiplied by the length
+     * of that axis; each reduction axis gets length 1 and stride 0 and
+     * does not advance the running stride.
+     *
+     * NOTE(review): presumably 'strideperm' is ordered from the largest
+     * to the smallest stride of 'arr', so that the result follows the
+     * memory layout of 'arr' - confirm against
+     * PyArray_CreateSortedStridePerm.
+     */
+    stride = dtype->elsize;
+    memcpy(shape, arr_shape, ndim * sizeof(shape[0]));
+    for (idim = ndim-1; idim >= 0; --idim) {
+        npy_intp i_perm = strideperm[idim].perm;
+        if (axis_flags[i_perm]) {
+            strides[i_perm] = 0;
+            shape[i_perm] = 1;
+        }
+        else {
+            strides[i_perm] = stride;
+            stride *= shape[i_perm];
+        }
+    }
+
+    /*
+     * Finally, allocate the array. No data pointer is supplied (NULL)
+     * and the flags argument is 0.
+     */
+    return (PyArrayObject *)PyArray_NewFromDescr(
+                                    subok ? Py_TYPE(arr) : &PyArray_Type,
+                                    dtype, ndim, shape, strides,
+                                    NULL, 0, subok ? (PyObject *)arr : NULL);
+}
+
+/*
+ * Conforms an output parameter 'out' to have 'ndim' dimensions
+ * with dimensions of size one added in the appropriate places
+ * indicated by 'axis_flags'.
+ *
+ * ndim       : The number of dimensions of the operand being reduced.
+ * axis_flags : One flag per operand axis, true for reduction axes.
+ * out        : The output array supplied by the caller.
+ * keepdims   : If true, 'out' must already have 'ndim' dimensions with
+ *              length one on every reduction axis.
+ * funcname   : Name of the reduction, used in error messages.
+ *
+ * When 'keepdims' is true the return value is a new reference to 'out'
+ * itself. Otherwise the return value is a view into 'out' (with 'out'
+ * as its base object) in which every reduction axis has length 1 and
+ * stride 0; if 'out' has an NA mask, the view refers to the same mask
+ * data, again with stride 0 on the reduction axes.
+ *
+ * Returns NULL with a ValueError set when the dimensions of 'out' do
+ * not fit, and NULL when creating the view fails.
+ */
+static PyArrayObject *
+conform_reduce_result(int ndim, npy_bool *axis_flags,
+                    PyArrayObject *out, int keepdims, const char *funcname)
+{
+    npy_intp strides[NPY_MAXDIMS], shape[NPY_MAXDIMS];
+    npy_intp *strides_out = PyArray_STRIDES(out);
+    npy_intp *shape_out = PyArray_DIMS(out);
+    int idim, idim_out, ndim_out = PyArray_NDIM(out);
+    PyArray_Descr *dtype;
+    PyArrayObject_fields *ret;
+
+    /*
+     * If the 'keepdims' parameter is true, do a simpler validation and
+     * return a new reference to 'out'.
+     *
+     * Only the number of dimensions and the length of the reduction
+     * axes are checked here; the lengths of the other axes are not
+     * compared against anything in this function.
+     */
+    if (keepdims) {
+        if (PyArray_NDIM(out) != ndim) {
+            PyErr_Format(PyExc_ValueError,
+                    "output parameter for reduction operation %s "
+                    "has the wrong number of dimensions (must match "
+                    "the operand's when keepdims=True)", funcname);
+            return NULL;
+        }
+
+        for (idim = 0; idim < ndim; ++idim) {
+            if (axis_flags[idim]) {
+                if (shape_out[idim] != 1) {
+                    PyErr_Format(PyExc_ValueError,
+                            "output parameter for reduction operation %s "
+                            "has a reduction dimension not equal to one "
+                            "(required when keepdims=True)", funcname);
+                    return NULL;
+                }
+            }
+        }
+
+        Py_INCREF(out);
+        return out;
+    }
+
+    /*
+     * Construct the strides and shape.
+     *
+     * 'idim' walks the axes of the operand while 'idim_out' walks the
+     * axes of 'out'; 'idim_out' only advances on non-reduction axes, so
+     * the axes of 'out' are matched, in order, to the non-reduction
+     * axes of the operand.
+     */
+    idim_out = 0;
+    for (idim = 0; idim < ndim; ++idim) {
+        if (axis_flags[idim]) {
+            strides[idim] = 0;
+            shape[idim] = 1;
+        }
+        else {
+            if (idim_out >= ndim_out) {
+                PyErr_Format(PyExc_ValueError,
+                        "output parameter for reduction operation %s "
+                        "does not have enough dimensions", funcname);
+                return NULL;
+            }
+            strides[idim] = strides_out[idim_out];
+            shape[idim] = shape_out[idim_out];
+            ++idim_out;
+        }
+    }
+
+    if (idim_out != ndim_out) { /* some axes of 'out' were left unmatched */
+        PyErr_Format(PyExc_ValueError,
+                "output parameter for reduction operation %s "
+                "has too many dimensions", funcname);
+        return NULL;
+    }
+
+    /*
+     * Allocate the view. It uses the data pointer of 'out' and the flags
+     * of 'out' with both NA mask flags cleared; the mask, if any, is
+     * attached by hand further down.
+     */
+    dtype = PyArray_DESCR(out);
+    Py_INCREF(dtype);
+    ret = (PyArrayObject_fields *)PyArray_NewFromDescr(&PyArray_Type,
+                               dtype,
+                               ndim, shape,
+                               strides,
+                               PyArray_DATA(out),
+               PyArray_FLAGS(out) & ~(NPY_ARRAY_MASKNA|NPY_ARRAY_OWNMASKNA),
+                               NULL);
+    if (ret == NULL) {
+        return NULL;
+    }
+    Py_INCREF(out); /* reference handed to PyArray_SetBaseObject below */
+    if (PyArray_SetBaseObject((PyArrayObject *)ret, (PyObject *)out) < 0) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+
+    /*
+     * Take a view of the mask if it exists.
+     *
+     * The mask strides are built with the same axis matching as the
+     * data strides above. Only NPY_ARRAY_MASKNA is set on the view;
+     * NPY_ARRAY_OWNMASKNA stays cleared, and the mask data pointer is
+     * the one of 'out'.
+     */
+    if (PyArray_HASMASKNA(out)) {
+        npy_intp *strides_ret = ret->maskna_strides;
+        strides_out = PyArray_MASKNA_STRIDES(out);
+        idim_out = 0;
+        for (idim = 0; idim < ndim; ++idim) {
+            if (axis_flags[idim]) {
+                strides_ret[idim] = 0;
+            }
+            else {
+                strides_ret[idim] = strides_out[idim_out];
+                ++idim_out;
+            }
+        }
+
+        ret->maskna_dtype = PyArray_MASKNA_DTYPE(out);
+        Py_INCREF(ret->maskna_dtype);
+        ret->maskna_data = PyArray_MASKNA_DATA(out);
+        ret->flags |= NPY_ARRAY_MASKNA;
+    }
+
+    return (PyArrayObject *)ret;
+}
+
+/*
+ * Produces the array into which a reduction of 'operand' along the
+ * axes flagged in 'axis_flags' is computed.
+ *
+ * The reference to 'dtype' is stolen in every case, when it isn't NULL.
+ *
+ * When 'out' is provided, the result is 'out' conformed to the number
+ * of dimensions of 'operand', with a singleton dimension for each
+ * reduction axis (see conform_reduce_result for how 'keepdims' is
+ * handled). 'dtype' and 'subok' play no role on this path, and the
+ * caller must handle any type conversion/validity check for 'out'.
+ * If 'need_namask' is true and 'out' has no NA mask, a ValueError
+ * is raised.
+ *
+ * When 'out' is NULL, a new array is allocated whose shape matches
+ * that of 'operand' except at the reduction axes. If 'dtype' is NULL
+ * the dtype of 'operand' is used. If 'subok' is true the new array has
+ * the subtype of 'operand', otherwise it is a base class ndarray.
+ * An NA mask is added to it when 'need_namask' is true.
+ *
+ * 'funcname' is only used to build error messages.
+ *
+ * Returns NULL on failure.
+ */
+NPY_NO_EXPORT PyArrayObject *
+PyArray_CreateReduceResult(PyArrayObject *operand, PyArrayObject *out,
+                    PyArray_Descr *dtype, npy_bool *axis_flags,
+                    int need_namask, int keepdims, int subok,
+                    const char *funcname)
+{
+    PyArrayObject *result;
+
+    /* The caller supplied an output array: validate and conform it */
+    if (out != NULL) {
+        /* 'dtype' isn't needed here, but its reference belongs to us */
+        Py_XDECREF(dtype);
+
+        if (need_namask && !PyArray_HASMASKNA(out)) {
+            PyErr_Format(PyExc_ValueError,
+                    "output parameter for reduction operation %s "
+                    "needs an NA mask, but the array provided does "
+                    "not have one", funcname);
+            return NULL;
+        }
+
+        return conform_reduce_result(PyArray_NDIM(operand), axis_flags,
+                                        out, keepdims, funcname);
+    }
+
+    /* No output array: allocate one, which consumes 'dtype' */
+    result = allocate_reduce_result(operand, axis_flags, dtype, subok);
+    if (result == NULL) {
+        return NULL;
+    }
+
+    /* Give the fresh array an NA mask if the reduction needs one */
+    if (need_namask && PyArray_AllocateMaskNA(result, 1, 0, 1) < 0) {
+        Py_DECREF(result);
+        return NULL;
+    }
+
+    return result;
+}
+
+/*
+ * Verifies that 'axis_flags' selects at most one of the 'ndim' axes.
+ *
+ * Returns 0 when zero or one axis is flagged. As soon as a second
+ * flagged axis is found, sets a ValueError saying that the reduction
+ * 'funcname' is not reorderable and returns -1.
+ */
+static int
+check_nonreorderable_axes(int ndim, npy_bool *axis_flags, const char *funcname)
+{
+    int axis, seen_axis = 0;
+
+    for (axis = 0; axis < ndim; ++axis) {
+        /* Axes which aren't reduced are of no interest */
+        if (!axis_flags[axis]) {
+            continue;
+        }
+
+        /* A reduction axis was already seen, this is one too many */
+        if (seen_axis) {
+            PyErr_Format(PyExc_ValueError,
+                    "reduction operation '%s' is not reorderable, "
+                    "so only one axis may be specified",
+                    funcname);
+            return -1;
+        }
+
+        seen_axis = 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Initializes the reduce result for skipna reductions where operand
+ * has more than one dimension.
+ *
+ * 'operand' must have an NA mask, 'result' may or may not have an
+ * NA mask, and 'skipna' must be True to call this function.
+ *
+ * 'operand', 'result' and an auxiliary boolean array shaped like
+ * 'result' are iterated together. Each time an exposed element of
+ * 'operand' lines up with an element of 'result' which has not been
+ * filled yet, the element is copied into 'result' and flagged in the
+ * auxiliary array. If some element of 'result' is still unfilled at the
+ * end, a ValueError mentioning 'funcname' is raised. If 'result' has
+ * an NA mask, it is finally set to all exposed.
+ *
+ * 'axis_flags' is not referenced by this function.
+ *
+ * Returns a new reference to a view of 'operand' which has its own
+ * NA mask, or NULL on failure.
+ */
+static PyArrayObject *
+initialize_reduce_result_noidentity_skipna(
+                    PyArrayObject *operand, PyArrayObject *result,
+                    npy_bool *axis_flags, const char *funcname)
+{
+    PyArrayObject *initialized = NULL;
+    npy_intp initialized_countdown;
+    npy_intp op_itemsize;
+    PyArray_Descr *bool_dtype;
+
+    /* Iterator parameters */
+    NpyIter *iter = NULL;
+    PyArrayObject *op[3];
+    npy_uint32 flags, op_flags[3];
+    /* One more entry than 'op': the mask of op[0] is read as index 3 */
+    npy_intp fixed_strides[4];
+
+    /* Data transfer function */
+    PyArray_StridedUnaryOp *stransfer = NULL;
+    NpyAuxData *transferdata = NULL;
+    int needs_api;
+
+    op_itemsize = PyArray_DTYPE(operand)->elsize;
+
+    /*
+     * Create a view of operand which owns its own mask, so that
+     * we can change it.
+     *
+     * From here on 'operand' is that view (or NULL if creating it
+     * failed), which is why the 'fail' path may release it.
+     */
+    operand = (PyArrayObject *)PyArray_View(operand, NULL, &PyArray_Type);
+    if (operand == NULL) {
+        goto fail;
+    }
+    if (PyArray_AllocateMaskNA(operand, 1, 0, 1) < 0) {
+        goto fail;
+    }
+
+    /*
+     * Allocate a flag array to keep track of which elements in the result
+     * have already been initialized.
+     *
+     * This reference to bool_dtype gets stolen by NewLikeArray.
+     */
+    bool_dtype = PyArray_DescrFromType(NPY_BOOL);
+    if (bool_dtype == NULL) {
+        goto fail;
+    }
+    initialized = (PyArrayObject *)PyArray_NewLikeArray(result,
+                                            NPY_KEEPORDER, bool_dtype, 0);
+    if (initialized == NULL) {
+        goto fail;
+    }
+    if (PyArray_AssignZero(initialized, NULL, 0, NULL) < 0) {
+        /*
+         * The 'fail' path releases 'initialized'; releasing it here
+         * as well would drop the same reference twice.
+         */
+        goto fail;
+    }
+
+    /* Set up the iterator for copying the elements */
+    op[0] = operand;
+    op[1] = result;
+    op[2] = initialized;
+    op_flags[0] = NPY_ITER_READWRITE | NPY_ITER_USE_MASKNA;
+    op_flags[1] = NPY_ITER_READWRITE | NPY_ITER_IGNORE_MASKNA;
+    op_flags[2] = NPY_ITER_READWRITE;
+    flags = NPY_ITER_EXTERNAL_LOOP |
+            NPY_ITER_REFS_OK |
+            NPY_ITER_REDUCE_OK |
+            NPY_ITER_ZEROSIZE_OK |
+            NPY_ITER_DONT_NEGATE_STRIDES;
+
+    iter = NpyIter_MultiNew(3, op, flags,
+                               NPY_KEEPORDER, NPY_UNSAFE_CASTING,
+                               op_flags,
+                               NULL);
+    if (iter == NULL) {
+        goto fail;
+    }
+    NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+    needs_api = NpyIter_IterationNeedsAPI(iter);
+
+    /* Get a function for copying the elements */
+    if (PyArray_GetDTypeTransferFunction(
+                    PyArray_ISALIGNED(operand) && PyArray_ISALIGNED(result),
+                    fixed_strides[0], fixed_strides[1],
+                    PyArray_DTYPE(operand), PyArray_DTYPE(result),
+                    0,
+                    &stransfer, &transferdata,
+                    &needs_api) != NPY_SUCCEED) {
+        goto fail;
+    }
+
+    /*
+     * Track how many initializations we've done, both to
+     * short circuit completion and to raise an error if
+     * any remained uninitialized.
+     */
+    initialized_countdown = PyArray_SIZE(result);
+
+    if (NpyIter_GetIterSize(iter) != 0) {
+        NpyIter_IterNextFunc *iternext;
+        char **dataptr;
+        npy_intp *strideptr;
+        npy_intp *countptr, count, subcount;
+        NPY_BEGIN_THREADS_DEF;
+
+        iternext = NpyIter_GetIterNext(iter, NULL);
+        if (iternext == NULL) {
+            goto fail;
+        }
+        dataptr = NpyIter_GetDataPtrArray(iter);
+        strideptr = NpyIter_GetInnerStrideArray(iter);
+        countptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+        if (!needs_api) {
+            NPY_BEGIN_THREADS;
+        }
+
+        /*
+         * NOTE(review): the documentation of
+         * PyArray_InitializeReduceResult says each copied element gets
+         * masked as NA in the returned view so the later reduction
+         * skips it, but no store through 'op_namask_d' is visible in
+         * this loop - confirm whether that masking is missing or is
+         * done somewhere else.
+         */
+        do {
+            /* Index 3 is the NA mask which goes with op[0] */
+            char *op_d = dataptr[0], *res_d = dataptr[1];
+            char *init_d = dataptr[2], *op_namask_d = dataptr[3];
+            npy_intp op_s = strideptr[0], res_s = strideptr[1];
+            npy_intp init_s = strideptr[2], op_namask_s = strideptr[3];
+
+            count = *countptr;
+
+            /* If the result stride is 0, copy at most one value */
+            if (res_s == 0) {
+                npy_intp i;
+                for (i = 0; i < count; ++i) {
+                    if (*init_d == 0 && NpyMaskValue_IsExposed(
+                                                *(npy_mask *)op_namask_d)) {
+
+                        /* Mark it as initialized */
+                        *init_d = 1;
+                        stransfer(res_d, 0, op_d + i * op_s, op_s,
+                                        1, op_itemsize, transferdata);
+
+                        --initialized_countdown;
+                        if (initialized_countdown == 0) {
+                            goto finish_loop;
+                        }
+                        break;
+                    }
+
+                    init_d += init_s;
+                    op_namask_d += op_namask_s;
+                }
+            }
+            /* Otherwise process the data in runs as large as possible */
+            else {
+                do {
+                    /* Skip values that are initialized or masked */
+                    subcount = 0;
+                    while (subcount < count && (*init_d == 1 ||
+                                !NpyMaskValue_IsExposed(
+                                                *(npy_mask *)op_namask_d))) {
+                        ++subcount;
+                        init_d += init_s;
+                        op_namask_d += op_namask_s;
+                    }
+                    op_d += subcount * op_s;
+                    res_d += subcount * res_s;
+                    count -= subcount;
+
+                    /* Transfer values that are uninitialized and exposed */
+                    subcount = 0;
+                    while (subcount < count && (*init_d == 0 &&
+                                NpyMaskValue_IsExposed(
+                                                *(npy_mask *)op_namask_d))) {
+                        ++subcount;
+                        /* Mark it as initialized */
+                        *init_d = 1;
+                        init_d += init_s;
+                        op_namask_d += op_namask_s;
+                    }
+                    /* 'subcount' may be zero here if the run is empty */
+                    stransfer(res_d, res_s, op_d, op_s,
+                                subcount, op_itemsize, transferdata);
+                    op_d += subcount * op_s;
+                    res_d += subcount * res_s;
+                    count -= subcount;
+
+                    initialized_countdown -= subcount;
+                    if (initialized_countdown == 0) {
+                        goto finish_loop;
+                    }
+                } while (count > 0);
+            }
+        } while (iternext(iter));
+
+    finish_loop:
+        if (!needs_api) {
+            NPY_END_THREADS;
+        }
+    }
+
+    if (needs_api && PyErr_Occurred()) {
+        goto fail;
+    }
+
+    /* Since this ufunc has no identity, all elements must be initialized */
+    if (initialized_countdown != 0) {
+        PyErr_Format(PyExc_ValueError,
+                "reduction operation %s with skipna=True "
+                "had an output element with all its inputs NA", funcname);
+        goto fail;
+    }
+
+    /* If 'result' has an NA mask, set it to all exposed */
+    if (PyArray_HASMASKNA(result)) {
+        if (PyArray_AssignMaskNA(result, 1, NULL, 0, NULL) < 0) {
+            goto fail;
+        }
+    }
+
+    Py_DECREF(initialized);
+    NpyIter_Deallocate(iter);
+    NPY_AUXDATA_FREE(transferdata);
+    return operand;
+
+fail:
+    /* Every owned reference is released here, and only here */
+    Py_XDECREF(operand);
+    Py_XDECREF(initialized);
+    if (iter != NULL) {
+        NpyIter_Deallocate(iter);
+    }
+    NPY_AUXDATA_FREE(transferdata);
+
+    return NULL;
+}
+
+/*
+ * This function initializes a result array for a reduction operation
+ * which has no identity. This means it needs to copy the first element
+ * it sees along the reduction axes to result, then return a view of
+ * the operand which excludes that element.
+ *
+ * If a reduction has an identity, such as 0 or 1, the result should
+ * be initialized by calling PyArray_AssignZero(result, NULL, !skipna, NULL)
+ * or PyArray_AssignOne(result, NULL, !skipna, NULL), because this
+ * function raises an exception when there are no elements to reduce.
+ *
+ * For regular reduction, this means it copies the subarray indexed
+ * at zero along each reduction axis into 'result', then returns a view
+ * into 'operand' excluding those copied elements. If 'operand' has
+ * an NA mask in this case, the caller should have already done
+ * the reduction on the mask. This function copies the subarray with
+ * 'preservena' set to True, so that the already accumulated NA mask
+ * in result doesn't get overwritten.
+ *
+ * For 'skipna' reduction, this is more complicated. In the one dimensional
+ * case, it searches for the first non-NA element, copies that element
+ * to 'result', then returns a view into the rest of 'operand'. For
+ * multi-dimensional reductions, the initial elements may be scattered
+ * throughout the array.
+ *
+ * To deal with this, a view of 'operand' is taken, and given its own
+ * copy of the NA mask. Additionally, an array of flags is created,
+ * matching the shape of 'result', and initialized to all False.
+ * Then, the elements of the 'operand' view are gone through, and any time
+ * an exposed element is encountered which isn't already flagged in the
+ * auxiliary array, it is copied into 'result' and flagged as copied.
+ * The element is masked as an NA in the view of 'operand', so that the
+ * later reduction step will skip it during processing.
+ *
+ * result  : The array into which the result is computed. This must have
+ *           the same number of dimensions as 'operand', but for each
+ *           axis i where 'axis_flags[i]' is True, it has a single element.
+ * operand : The array being reduced.
+ * axis_flags : An array of boolean flags, one for each axis of 'operand'.
+ *              When a flag is True, it indicates to reduce along that axis.
+ * reorderable : If True, the reduction being done is reorderable, which
+ *               means specifying multiple axes of reduction at once is ok,
+ *               and the reduction code may calculate the reduction in an
+ *               arbitrary order. The calculation may be reordered because
+ *               of cache behavior or multithreading requirements.
+ * skipna  : If True, indicates that the reduction is being calculated
+ *           as if the NA values are being dropped from the computation
+ *           instead of accumulating into an NA result.
+ * out_skip_first_count : This gets populated with the number of first-visit
+ *                        elements that should be skipped during the
+ *                        iteration loop.
+ * funcname : The name of the reduction operation, for the purpose of
+ *            better quality error messages. For example, "numpy.max"
+ *            would be a good name for NumPy's max function.
+ *
+ * Returns a view which contains the remaining elements on which to do
+ * the reduction.
+ */
+NPY_NO_EXPORT PyArrayObject *
+PyArray_InitializeReduceResult(
+                    PyArrayObject *result, PyArrayObject *operand,
+                    npy_bool *axis_flags, int reorderable, int skipna,
+                    npy_intp *out_skip_first_count, const char *funcname)
+{
+    npy_intp *strides, *shape, shape_orig[NPY_MAXDIMS], shape0;
+    PyArrayObject *op_view = NULL;
+    int idim, ndim, nreduce_axes;
+
+    ndim = PyArray_NDIM(operand);
+
+    /* Default to no skipping first-visit elements in the iteration */
+    *out_skip_first_count = 0;
+
+    /*
+     * If this reduction is non-reorderable, make sure there are
+     * only 0 or 1 axes in axis_flags.
+     */
+    if (!reorderable && check_nonreorderable_axes(ndim,
+                                    axis_flags, funcname) < 0) {
+        return NULL;
+    }
+
+    /*
+     * If 'skipna' is False, or 'operand' has no NA mask in which
+     * case the 'skipna' flag does nothing.
+     *
+     * A zero-size operand is rejected here, because there is no first
+     * element to copy into 'result'.
+     */
+    if (!skipna || !PyArray_HASMASKNA(operand)) {
+        if (PyArray_SIZE(operand) == 0) {
+            PyErr_Format(PyExc_ValueError,
+                    "zero-size array to reduction operation %s "
+                    "which has no identity",
+                    funcname);
+            return NULL;
+        }
+
+        /*
+         * Take a view into 'operand' which we can modify. Its shape
+         * and data pointers are adjusted in place further down.
+         */
+        op_view = (PyArrayObject *)PyArray_View(operand, NULL, &PyArray_Type);
+        if (op_view == NULL) {
+            return NULL;
+        }
+    }
+    /*
+     * Otherwise 'skipna' is True and 'operand' has an NA mask. Deal
+     * with the simple one-dimensional case first
+     */
+    else if (ndim == 1) {
+        char *data, *maskna_data;
+        npy_intp *maskna_strides;
+
+        ndim = PyArray_NDIM(operand); /* same value 'ndim' already holds */
+
+        op_view = (PyArrayObject *)PyArray_View(operand, NULL, &PyArray_Type);
+        if (op_view == NULL) {
+            return NULL;
+        }
+
+        shape = PyArray_DIMS(op_view);
+        shape0 = shape[0];
+        data = PyArray_DATA(op_view);
+        strides = PyArray_STRIDES(op_view);
+        maskna_data = PyArray_MASKNA_DATA(op_view);
+        maskna_strides = PyArray_MASKNA_STRIDES(op_view);
+
+        /*
+         * Shrink the array from the start until we find an exposed element.
+         * The data and mask pointers advance together, one element each
+         * time the remaining length 'shape0' drops by one.
+         */
+        while (shape0 > 0 &&
+                    !NpyMaskValue_IsExposed((npy_mask)*maskna_data)) {
+            --shape0;
+            data += strides[0];
+            maskna_data += maskna_strides[0];
+        }
+
+        if (shape0 == 0) { /* no exposed element was found */
+            Py_DECREF(op_view);
+            PyErr_Format(PyExc_ValueError,
+                    "fully NA array with skipna=True to reduction operation "
+                    "%s which has no identity", funcname);
+            return NULL;
+        }
+
+        /*
+         * With the first element exposed, fall through to the code
+         * which copies the element and adjusts the view just as in the
+         * non-skipna case.
+         *
+         * The view is changed in place: its length and its data and
+         * mask pointers now start at the first exposed element.
+         */
+        shape[0] = shape0;
+        ((PyArrayObject_fields *)op_view)->data = data;
+        ((PyArrayObject_fields *)op_view)->maskna_data = maskna_data;
+    }
+    /*
+     * Here 'skipna' is True and 'operand' has an NA mask, but
+     * 'operand' has more than one dimension, so it's the complicated
+     * case
+     *
+     * NOTE(review): a zero-dimensional 'operand' also ends up in this
+     * branch, since only ndim == 1 is tested above.
+     */
+    else {
+        return initialize_reduce_result_noidentity_skipna(
+                                    operand, result, axis_flags, funcname);
+    }
+
+    /*
+     * Now copy the subarray of the first element along each reduction axis,
+     * then return a view to the rest.
+     *
+     * Adjust the shape to only look at the first element along
+     * any of the reduction axes. We count the number of reduction axes
+     * at the same time.
+     *
+     * 'shape' is the view's own shape array, so writing to it changes
+     * 'op_view' itself; the previous lengths are saved in 'shape_orig'.
+     */
+    shape = PyArray_SHAPE(op_view);
+    nreduce_axes = 0;
+    memcpy(shape_orig, shape, ndim * sizeof(npy_intp));
+    for (idim = 0; idim < ndim; ++idim) {
+        if (axis_flags[idim]) {
+            shape[idim] = 1;
+            ++nreduce_axes;
+        }
+    }
+
+    /*
+     * Copy the elements into the result to start, with
+     * 'preservena' set to True so that we don't overwrite
+     * what we already calculated in ReduceNAMask.
+     */
+    if (PyArray_AssignArray(result, op_view, NULL, NPY_UNSAFE_CASTING,
+                                                            1, NULL) < 0) {
+        Py_DECREF(op_view);
+        return NULL;
+    }
+
+    /*
+     * If there is one reduction axis, adjust the view's
+     * shape to only look at the remaining elements
+     *
+     * The reduction axis gets its original length minus one, and the
+     * data pointer (and the mask pointer, if there is a mask) moves
+     * forward by one element along that axis.
+     */
+    if (nreduce_axes == 1) {
+        strides = PyArray_STRIDES(op_view);
+        for (idim = 0; idim < ndim; ++idim) {
+            if (axis_flags[idim]) {
+                shape[idim] = shape_orig[idim] - 1;
+                ((PyArrayObject_fields *)op_view)->data += strides[idim];
+            }
+        }
+        if (PyArray_HASMASKNA(op_view)) {
+            strides = PyArray_MASKNA_STRIDES(op_view);
+            for (idim = 0; idim < ndim; ++idim) {
+                if (axis_flags[idim]) {
+                    ((PyArrayObject_fields *)op_view)->maskna_data +=
+                                                                strides[idim];
+                }
+            }
+        }
+    }
+    /* If there are zero reduction axes, make the view empty */
+    else if (nreduce_axes == 0) {
+        for (idim = 0; idim < ndim; ++idim) {
+            shape[idim] = 0;
+        }
+    }
+    /*
+     * Otherwise iterate over the whole operand, but tell the inner loop
+     * to skip the elements we already copied by setting the skip_first_count.
+     *
+     * The modified view is dropped and a new reference to the
+     * unmodified 'operand' is returned in its place.
+     */
+    else {
+        *out_skip_first_count = PyArray_SIZE(result);
+
+        Py_DECREF(op_view);
+        Py_INCREF(operand);
+        op_view = operand;
+    }
+
+    return op_view;
+}
+
+/*NUMPY_API
+ *
+ * This function executes all the standard NumPy reduction function
+ * boilerplate code, just calling assign_identity and the appropriate
+ * inner loop function where necessary.
+ *
+ * operand     : The array to be reduced.
+ * out         : NULL, or the array into which to place the result.
+ * wheremask   : NOT YET SUPPORTED, but this parameter is placed here
+ *               so that support can be added in the future without breaking
+ *               API compatibility. Pass in NULL.
+ * operand_dtype : The dtype the inner loop expects for the operand.
+ * result_dtype : The dtype the inner loop expects for the result.
+ * casting     : The casting rule to apply to the operands.
+ * axis_flags  : Flags indicating the reduction axes of 'operand'.
+ * reorderable : If True, the reduction being done is reorderable, which
+ *               means specifying multiple axes of reduction at once is ok,
+ *               and the reduction code may calculate the reduction in an
+ *               arbitrary order. The calculation may be reordered because
+ *               of cache behavior or multithreading requirements.
+ * skipna      : If true, NAs are skipped instead of propagating.
+ * skipwhichna : NOT YET SUPPORTED, but this parameter is placed here
+ *               so that support can be added for multi-NA without
+ *               breaking API compatibility. Pass in NULL.
+ * keepdims    : If true, leaves the reduction dimensions in the result
+ *               with size one.
+ * subok       : If true, the result uses the subclass of operand, otherwise
+ *               it is always a base class ndarray.
+ * assign_identity : If NULL, PyArray_InitializeReduceResult is used, otherwise
+ *               this function is called to initialize the result to
+ *               the reduction's unit.
+ * loop        : The loop which does the reduction.
+ * masked_loop : The loop which does the reduction with a mask.
+ * advanced_masked_loop: If non-NULL, this is a loop which uses a mask from
+ *               both the operand and the result. The 'result' is
+ *               initialized to a usual reduction of the operand's mask,
+ *               but both the operand's mask and the result's mask
+ *               are provided so that the loop may decide to expose
+ *               elements, which normally would not be exposed by the
+ *               normal NA propagation rules, based on the input data.
+ * data        : Data which is passed to assign_identity and the inner loop.
+ * buffersize  : Buffer size for the iterator. For the default, pass in 0.
+ * funcname    : The name of the reduction function, for error messages.
+ */
+NPY_NO_EXPORT PyArrayObject *
+PyArray_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
+                        PyArrayObject *wheremask,
+                        PyArray_Descr *operand_dtype,
+                        PyArray_Descr *result_dtype,
+                        NPY_CASTING casting,
+                        npy_bool *axis_flags, int reorderable,
+                        int skipna, npy_bool *skipwhichna, int keepdims,
+                        int subok,
+                        PyArray_AssignReduceIdentityFunc *assign_identity,
+                        PyArray_ReduceLoopFunc *loop,
+                        PyArray_ReduceLoopFunc *masked_loop,
+                        PyArray_ReduceLoopFunc *advanced_masked_loop,
+                        void *data, npy_intp buffersize, const char *funcname)
+{
+    int use_maskna;
+    PyArrayObject *result = NULL, *op_view = NULL;
+    npy_intp skip_first_count = 0;
+
+    /* Iterator parameters */
+    NpyIter *iter = NULL;
+    PyArrayObject *op[2];
+    PyArray_Descr *op_dtypes[2];
+    npy_uint32 flags, op_flags[2];
+
+    /* Validate that the parameters for future expansion are NULL */
+    if (wheremask != NULL) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "Reduce operations in NumPy do not yet support "
+                "a where mask");
+        return NULL;
+    }
+    if (skipwhichna != NULL) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "multi-NA support is not yet implemented in "
+                "reduce operations");
+        return NULL;
+    }
+
+    use_maskna = PyArray_HASMASKNA(operand);
+
+    /*
+     * If 'operand' has an NA mask, 'out' doesn't, and NAs aren't skipped,
+     * validate that 'operand' contains no NAs so we can ignore the mask.
+     */
+    if (use_maskna && !skipna && out != NULL && !PyArray_HASMASKNA(out)) {
+        int containsna = PyArray_ContainsNA(operand, wheremask, NULL);
+        if (containsna == -1) {
+            goto fail;
+        }
+        else if (containsna) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Cannot assign NA to an array which "
+                    "does not support NAs");
+            goto fail;
+        }
+        else {
+            use_maskna = 0;
+        }
+    }
+
+    /*
+     * This either conforms 'out' to the ndim of 'operand', or allocates
+     * a new array appropriate for this reduction.
+     */
+    Py_INCREF(result_dtype);
+    result = PyArray_CreateReduceResult(operand, out,
+                            result_dtype, axis_flags, !skipna && use_maskna,
+                            keepdims, subok, funcname);
+    if (result == NULL) {
+        goto fail;
+    }
+
+    /*
+     * Do the reduction on the NA mask before the data. This way
+     * we can avoid modifying the outputs which end up masked, obeying
+     * the required NA masking semantics.
+     */
+    if (use_maskna && !skipna) {
+        if (PyArray_ReduceMaskNAArray(operand, result, NULL, NULL) < 0) {
+            goto fail;
+        }
+
+        /*
+         * Short circuit any calculation if the result is a single
+         * element which is NA, and the advanced masked loop which
+         * could expose it isn't provided.
+         */
+        if (advanced_masked_loop == NULL && PyArray_SIZE(result) == 1 &&
+                !NpyMaskValue_IsExposed(
+                            (npy_mask)*PyArray_MASKNA_DATA(result))) {
+            goto finish;
+        }
+    }
+
+    /*
+     * Initialize the result to the reduction unit if possible,
+     * otherwise copy the initial values and get a view to the rest.
+     */
+    if (assign_identity != NULL) {
+        /*
+         * If this reduction is non-reorderable, make sure there are
+         * only 0 or 1 axes in axis_flags.
+         */
+        if (!reorderable && check_nonreorderable_axes(PyArray_NDIM(operand),
+                                        axis_flags, funcname) < 0) {
+            goto fail;
+        }
+
+        if (assign_identity(result, !skipna, data) < 0) {
+            goto fail;
+        }
+        op_view = operand;
+        Py_INCREF(op_view);
+    }
+    else {
+        op_view = PyArray_InitializeReduceResult(result, operand,
+                            axis_flags, reorderable, skipna,
+                            &skip_first_count, funcname);
+        if (op_view == NULL) {
+            goto fail;
+        }
+        if (PyArray_SIZE(op_view) == 0) {
+            Py_DECREF(op_view);
+            op_view = NULL;
+            goto finish;
+        }
+    }
+
+    /* Set up the iterator */
+    op[0] = result;
+    op[1] = op_view;
+    op_dtypes[0] = result_dtype;
+    op_dtypes[1] = operand_dtype;
+
+    flags = NPY_ITER_BUFFERED |
+            NPY_ITER_EXTERNAL_LOOP |
+            NPY_ITER_GROWINNER |
+            NPY_ITER_DONT_NEGATE_STRIDES |
+            NPY_ITER_ZEROSIZE_OK |
+            NPY_ITER_REDUCE_OK |
+            NPY_ITER_REFS_OK;
+    op_flags[0] = NPY_ITER_READWRITE |
+                  NPY_ITER_ALIGNED |
+                  NPY_ITER_NO_SUBTYPE;
+    op_flags[1] = NPY_ITER_READONLY |
+                  NPY_ITER_ALIGNED;
+
+    /* Add mask-related flags */
+    if (use_maskna) {
+        if (skipna) {
+            /* The output's mask has been set to all exposed already */
+            op_flags[0] |= NPY_ITER_IGNORE_MASKNA;
+            /* Need the input's mask to determine what to skip */
+            op_flags[1] |= NPY_ITER_USE_MASKNA;
+        }
+        else {
+            /* Iterate over the output's mask */
+            op_flags[0] |= NPY_ITER_USE_MASKNA;
+            if (advanced_masked_loop == NULL) {
+                /* Input's mask is already incorporated in the output's mask */
+                op_flags[1] |= NPY_ITER_IGNORE_MASKNA;
+            }
+            else {
+                /* The reduction wants to use the operand's mask as well */
+                op_flags[1] |= NPY_ITER_USE_MASKNA;
+            }
+        }
+    }
+    else {
+        /*
+         * Either 'operand' has no NA mask, or 'out' has none and 'operand'
+         * was checked above to contain no NA values: ignore the masks.
+         */
+        op_flags[0] |= NPY_ITER_IGNORE_MASKNA;
+        op_flags[1] |= NPY_ITER_IGNORE_MASKNA;
+    }
+
+    iter = NpyIter_AdvancedNew(2, op, flags,
+                               NPY_KEEPORDER, casting,
+                               op_flags,
+                               op_dtypes,
+                               0, NULL, NULL, buffersize);
+    if (iter == NULL) {
+        goto fail;
+    }
+
+    if (NpyIter_GetIterSize(iter) != 0) {
+        NpyIter_IterNextFunc *iternext;
+        char **dataptr;
+        npy_intp *strideptr;
+        npy_intp *countptr;
+        int needs_api;
+
+        iternext = NpyIter_GetIterNext(iter, NULL);
+        if (iternext == NULL) {
+            goto fail;
+        }
+        dataptr = NpyIter_GetDataPtrArray(iter);
+        strideptr = NpyIter_GetInnerStrideArray(iter);
+        countptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+        needs_api = NpyIter_IterationNeedsAPI(iter);
+
+        /* Straightforward reduction */
+        if (!use_maskna) {
+            if (loop == NULL) {
+                PyErr_Format(PyExc_RuntimeError,
+                        "reduction operation %s did not supply an "
+                        "unmasked inner loop function", funcname);
+                goto fail;
+            }
+
+            if (loop(iter, dataptr, strideptr, countptr,
+                            iternext, needs_api, skip_first_count, data) < 0) {
+
+                goto fail;
+            }
+        }
+        /* Masked reduction with both masks */
+        else if (!skipna && advanced_masked_loop != NULL) {
+            if (advanced_masked_loop(iter, dataptr, strideptr, countptr,
+                            iternext, needs_api, skip_first_count, data) < 0) {
+                goto fail;
+            }
+        }
+        /* Regular masked reduction with just one mask */
+        else {
+            if (masked_loop == NULL) {
+                PyErr_Format(PyExc_RuntimeError,
+                        "reduction operation %s did not supply a "
+                        "masked inner loop function", funcname);
+                goto fail;
+            }
+
+            if (masked_loop(iter, dataptr, strideptr, countptr,
+                            iternext, needs_api, skip_first_count, data) < 0) {
+                goto fail;
+            }
+        }
+    }
+
+    NpyIter_Deallocate(iter);
+    Py_DECREF(op_view);
+
+finish:
+    /* Strip the size-one reduction axes, or hand back 'out' itself */
+    if (out == NULL) {
+        if (!keepdims) {
+            PyArray_RemoveAxesInPlace(result, axis_flags);
+        }
+    }
+    else {
+        Py_DECREF(result);
+        result = out;
+        Py_INCREF(result);
+    }
+
+    return result;
+
+fail:
+    Py_XDECREF(result);
+    Py_XDECREF(op_view);
+    if (iter != NULL) {
+        NpyIter_Deallocate(iter);
+    }
+
+    return NULL;
+}
+
+/*
+ * This function counts the number of elements that a reduction
+ * will see along the reduction directions, given the provided options.
+ *
+ * If the reduction operand has no NA mask or 'skipna' is false, this
+ * is simply the product of all the reduction axis sizes. A NumPy
+ * scalar is returned in this case.
+ *
+ * If the reduction operand has an NA mask and 'skipna' is true, this
+ * counts the number of elements which are not NA along the reduction
+ * dimensions, and returns an array with the counts.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_CountReduceItems(PyArrayObject *operand,
+                            npy_bool *axis_flags, int skipna, int keepdims)
+{
+    int idim, ndim = PyArray_NDIM(operand);
+
+    /* The product of the reduction dimensions in this case */
+    if (!skipna || !PyArray_HASMASKNA(operand)) {
+        npy_intp count = 1, *shape = PyArray_SHAPE(operand);
+        PyArray_Descr *dtype;
+        PyObject *ret;
+
+        for (idim = 0; idim < ndim; ++idim) {
+            if (axis_flags[idim]) {
+                count *= shape[idim];
+            }
+        }
+
+        dtype = PyArray_DescrFromType(NPY_INTP);
+        if (dtype == NULL) {
+            return NULL;
+        }
+        ret = PyArray_Scalar(&count, dtype, NULL);
+        Py_DECREF(dtype);
+        return ret;
+    }
+    /* Otherwise we need to do a count based on the NA mask */
+    else {
+        npy_intp *strides;
+        PyArrayObject *result;
+        PyArray_Descr *result_dtype;
+
+        npy_intp i, coord[NPY_MAXDIMS];
+        npy_intp shape_it[NPY_MAXDIMS];
+        npy_intp operand_strides_it[NPY_MAXDIMS];
+        npy_intp result_strides_it[NPY_MAXDIMS];
+        char *operand_data = NULL, *result_data = NULL;
+
+        /*
+         * To support field-NA, we would create a result type
+         * with an INTP matching each field, then separately count
+         * the available elements per-field.
+         */
+        if (PyArray_HASFIELDS(operand)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                    "field-NA isn't implemented yet");
+            return NULL;
+        }
+
+        /*
+         * TODO: The loop below is specialized for NPY_BOOL masks,
+         *       will need another version for NPY_MASK masks.
+         */
+        if (PyArray_MASKNA_DTYPE(operand)->type_num != NPY_BOOL) {
+            PyErr_SetString(PyExc_RuntimeError,
+                    "multi-NA isn't implemented yet");
+            return NULL;
+        }
+
+        /* Allocate the count array (the call steals the dtype reference) */
+        result_dtype = PyArray_DescrFromType(NPY_INTP);
+        if (result_dtype == NULL) {
+            return NULL;
+        }
+        result = PyArray_CreateReduceResult(operand, NULL,
+                                result_dtype, axis_flags, 0,
+                                keepdims, 0, "count_reduce_items");
+        if (result == NULL) {
+            return NULL;
+        }
+
+        /* Initialize result to all zeros */
+        if (PyArray_AssignZero(result, NULL, 0, NULL) < 0) {
+            Py_DECREF(result);
+            return NULL;
+        }
+
+        /*
+         * Set all the reduction strides to 0 in result so
+         * we can use them for raw iteration
+         */
+        strides = PyArray_STRIDES(result);
+        for (idim = 0; idim < ndim; ++idim) {
+            if (axis_flags[idim]) {
+                strides[idim] = 0;
+            }
+        }
+
+        /*
+         * Sort axes based on 'operand', which has more non-zero strides,
+         * by making it the first operand here
+         */
+        if (PyArray_PrepareTwoRawArrayIter(ndim, PyArray_SHAPE(operand),
+                PyArray_MASKNA_DATA(operand), PyArray_MASKNA_STRIDES(operand),
+                            PyArray_DATA(result), PyArray_STRIDES(result),
+                            &ndim, shape_it,
+                            &operand_data, operand_strides_it,
+                            &result_data, result_strides_it) < 0) {
+            Py_DECREF(result);
+            return NULL;
+        }
+
+        /*
+         * NOTE: Adding the mask byte directly only works for NPY_BOOL masks.
+         */
+        NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+            char *operand_d = operand_data, *result_d = result_data;
+            for (i = 0; i < shape_it[0]; ++i) {
+                *(npy_intp *)result_d += *operand_d;
+
+                operand_d += operand_strides_it[0];
+                result_d += result_strides_it[0];
+            }
+        } NPY_RAW_ITER_TWO_NEXT(idim, ndim, coord, shape_it,
+                                    operand_data, operand_strides_it,
+                                    result_data, result_strides_it);
+
+        /* Remove the reduction axes and return the result */
+        if (!keepdims) {
+            PyArray_RemoveAxesInPlace(result, axis_flags);
+        }
+        return PyArray_Return(result);
+    }
+}
diff --git a/numpy/core/src/multiarray/reduction.h b/numpy/core/src/multiarray/reduction.h
new file mode 100644
index 000000000..268fdaf29
--- /dev/null
+++ b/numpy/core/src/multiarray/reduction.h
@@ -0,0 +1,110 @@
+#ifndef _NPY_PRIVATE__REDUCTION_H_
+#define _NPY_PRIVATE__REDUCTION_H_
+
+/*
+ * This function counts the number of elements that a reduction
+ * will see along the reduction directions, given the provided options.
+ *
+ * If the reduction operand has no NA mask or 'skipna' is false, this
+ * is simply the product of all the reduction axis sizes. A NumPy
+ * scalar is returned in this case.
+ *
+ * If the reduction operand has an NA mask and 'skipna' is true, this
+ * counts the number of elements which are not NA along the reduction
+ * dimensions, and returns an array with the counts.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_CountReduceItems(PyArrayObject *operand,
+                            npy_bool *axis_flags, int skipna, int keepdims);
+
+/*
+ * This function initializes a result array for a reduction operation
+ * which has no identity. This means it needs to copy the first element
+ * it sees along the reduction axes to result, then return a view of
+ * the operand which excludes that element.
+ *
+ * If a reduction has an identity, such as 0 or 1, the result should
+ * be initialized by calling PyArray_AssignZero(result, NULL, !skipna, NULL)
+ * or PyArray_AssignOne(result, NULL, !skipna, NULL), because this
+ * function raises an exception when there are no elements to reduce.
+ *
+ * For regular reduction, this means it copies the subarray indexed
+ * at zero along each reduction axis into 'result', then returns a view
+ * into 'operand' excluding those copied elements. If 'operand' has
+ * an NA mask in this case, the caller should have already done
+ * the reduction on the mask. This function copies the subarray with
+ * 'preservena' set to True, so that the already accumulated NA mask
+ * in result doesn't get overwritten.
+ *
+ * For 'skipna' reduction, this is more complicated. In the one dimensional
+ * case, it searches for the first non-NA element, copies that element
+ * to 'result', then returns a view into the rest of 'operand'. For
+ * multi-dimensional reductions, the initial elements may be scattered
+ * throughout the array.
+ *
+ * To deal with this, a view of 'operand' is taken, and given its own
+ * copy of the NA mask. Additionally, an array of flags is created,
+ * matching the shape of 'result', and initialized to all False.
+ * Then, the elements of the 'operand' view are gone through, and any time
+ * an exposed element is encountered which isn't already flagged in the
+ * auxiliary array, it is copied into 'result' and flagged as copied.
+ * The element is masked as an NA in the view of 'operand', so that the
+ * later reduction step will skip it during processing.
+ *
+ * result  : The array into which the result is computed. This must have
+ *           the same number of dimensions as 'operand', but for each
+ *           axis i where 'axis_flags[i]' is True, it has a single element.
+ * operand : The array being reduced.
+ * axis_flags : An array of boolean flags, one for each axis of 'operand'.
+ *              When a flag is True, it indicates to reduce along that axis.
+ * reorderable : If True, the reduction being done is reorderable, which
+ *               means specifying multiple axes of reduction at once is ok,
+ *               and the reduction code may calculate the reduction in an
+ *               arbitrary order. The calculation may be reordered because
+ *               of cache behavior or multithreading requirements.
+ * skipna  : If True, indicates that the reduction is being calculated
+ *           as if the NA values are being dropped from the computation
+ *           instead of accumulating into an NA result.
+ * out_skip_first_count : This gets populated with the number of first-visit
+ *                        elements that should be skipped during the
+ *                        iteration loop.
+ * funcname : The name of the reduction operation, for the purpose of
+ *            better quality error messages. For example, "numpy.max"
+ *            would be a good name for NumPy's max function.
+ *
+ * Returns a view which contains the remaining elements on which to do
+ * the reduction.
+ */
+NPY_NO_EXPORT PyArrayObject *
+PyArray_InitializeReduceResult(
+                    PyArrayObject *result, PyArrayObject *operand,
+                    npy_bool *axis_flags, int reorderable, int skipna,
+                    npy_intp *out_skip_first_count, const char *funcname);
+
+/*
+ * Creates a result for reducing 'operand' along the axes specified
+ * in 'axis_flags'. If 'dtype' isn't NULL, this function steals a
+ * reference to 'dtype'.
+ *
+ * If 'out' isn't NULL, this function creates a view conforming
+ * to the number of dimensions of 'operand', adding a singleton dimension
+ * for each reduction axis specified. In this case, 'dtype' is ignored
+ * (but its reference is still stolen), and the caller must handle any
+ * type conversion/validity check for 'out'. When 'need_namask' is true,
+ * raises an exception if 'out' doesn't have an NA mask.
+ *
+ * If 'subok' is true, creates a result with the subtype of 'operand',
+ * otherwise creates one with the base ndarray class.
+ *
+ * If 'out' is NULL, it allocates a new array whose shape matches
+ * that of 'operand', except for at the reduction axes. An NA mask
+ * is added if 'need_namask' is true.  If 'dtype' is NULL, the dtype
+ * of 'operand' is used for the result.
+ */
+NPY_NO_EXPORT PyArrayObject *
+PyArray_CreateReduceResult(PyArrayObject *operand, PyArrayObject *out,
+                    PyArray_Descr *dtype, npy_bool *axis_flags,
+                    int need_namask, int keepdims, int subok,
+                    const char *funcname);
+
+#endif
diff --git a/numpy/core/src/multiarray/scalarapi.c b/numpy/core/src/multiarray/scalarapi.c
index fa783dabf..41ba4c6ae 100644
--- a/numpy/core/src/multiarray/scalarapi.c
+++ b/numpy/core/src/multiarray/scalarapi.c
@@ -19,6 +19,7 @@
 #include "scalartypes.h"
 
 #include "common.h"
+#include "na_object.h"
 
 static PyArray_Descr *
 _descr_from_subtype(PyObject *type)
@@ -423,7 +424,7 @@ PyArray_DescrFromTypeObject(PyObject *type)
 
     /* if it's a builtin type, then use the typenumber */
     typenum = _typenum_fromtypeobj(type,1);
-    if (typenum != PyArray_NOTYPE) {
+    if (typenum != NPY_NOTYPE) {
         new = PyArray_DescrFromType(typenum);
         return new;
     }
@@ -432,24 +433,24 @@ PyArray_DescrFromTypeObject(PyObject *type)
     if ((type == (PyObject *) &PyNumberArrType_Type) ||
             (type == (PyObject *) &PyInexactArrType_Type) ||
             (type == (PyObject *) &PyFloatingArrType_Type)) {
-        typenum = PyArray_DOUBLE;
+        typenum = NPY_DOUBLE;
     }
     else if (type == (PyObject *)&PyComplexFloatingArrType_Type) {
-        typenum = PyArray_CDOUBLE;
+        typenum = NPY_CDOUBLE;
     }
     else if ((type == (PyObject *)&PyIntegerArrType_Type) ||
             (type == (PyObject *)&PySignedIntegerArrType_Type)) {
-        typenum = PyArray_LONG;
+        typenum = NPY_LONG;
     }
     else if (type == (PyObject *) &PyUnsignedIntegerArrType_Type) {
-        typenum = PyArray_ULONG;
+        typenum = NPY_ULONG;
     }
     else if (type == (PyObject *) &PyCharacterArrType_Type) {
-        typenum = PyArray_STRING;
+        typenum = NPY_STRING;
     }
     else if ((type == (PyObject *) &PyGenericArrType_Type) ||
             (type == (PyObject *) &PyFlexibleArrType_Type)) {
-        typenum = PyArray_VOID;
+        typenum = NPY_VOID;
     }
 
     if (typenum != PyArray_NOTYPE) {
@@ -463,7 +464,7 @@ PyArray_DescrFromTypeObject(PyObject *type)
 
     /* Do special thing for VOID sub-types */
     if (PyType_IsSubtype((PyTypeObject *)type, &PyVoidArrType_Type)) {
-        new = PyArray_DescrNewFromType(PyArray_VOID);
+        new = PyArray_DescrNewFromType(NPY_VOID);
         conv = _arraydescr_fromobj(type);
         if (conv) {
             new->fields = conv->fields;
@@ -814,7 +815,18 @@ PyArray_Return(PyArrayObject *mp)
     }
     if (PyArray_NDIM(mp) == 0) {
         PyObject *ret;
-        ret = PyArray_ToScalar(PyArray_DATA(mp), mp);
+        if (PyArray_HASMASKNA(mp)) {
+            npy_mask maskvalue = (npy_mask)(*PyArray_MASKNA_DATA(mp));
+            if (NpyMaskValue_IsExposed(maskvalue)) {
+                ret = PyArray_ToScalar(PyArray_DATA(mp), mp);
+            }
+            else {
+                ret = (PyObject *)NpyNA_FromObject((PyObject *)mp, 0);
+            }
+        }
+        else {
+            ret = PyArray_ToScalar(PyArray_DATA(mp), mp);
+        }
         Py_DECREF(mp);
         return ret;
     }
diff --git a/numpy/core/src/multiarray/sequence.c b/numpy/core/src/multiarray/sequence.c
index 354dcfa2f..2d1437df2 100644
--- a/numpy/core/src/multiarray/sequence.c
+++ b/numpy/core/src/multiarray/sequence.c
@@ -72,7 +72,8 @@ array_slice(PyArrayObject *self, Py_ssize_t ilow, Py_ssize_t ihigh)
     ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(self), dtype,
                              PyArray_NDIM(self), shape,
                              PyArray_STRIDES(self), data,
-                             PyArray_FLAGS(self), (PyObject *)self);
+             PyArray_FLAGS(self) & ~(NPY_ARRAY_MASKNA | NPY_ARRAY_OWNMASKNA),
+                             (PyObject *)self);
     if (ret == NULL) {
         return NULL;
     }
@@ -82,6 +83,28 @@ array_slice(PyArrayObject *self, Py_ssize_t ilow, Py_ssize_t ihigh)
         return NULL;
     }
     PyArray_UpdateFlags(ret, NPY_ARRAY_UPDATE_ALL);
+
+    /* Also take a view of the NA mask if it exists */
+    if (PyArray_HASMASKNA(self)) {
+        PyArrayObject_fields *fret = (PyArrayObject_fields *)ret;
+
+        fret->maskna_dtype = PyArray_MASKNA_DTYPE(self);
+        Py_INCREF(fret->maskna_dtype);
+
+        data = PyArray_MASKNA_DATA(self);
+        if (ilow < ihigh) {
+            data += ilow * PyArray_MASKNA_STRIDES(self)[0];
+        }
+        fret->maskna_data = data;
+
+        memcpy(fret->maskna_strides, PyArray_MASKNA_STRIDES(self),
+                        PyArray_NDIM(self) * sizeof(npy_intp));
+
+        /* This view doesn't own the mask */
+        fret->flags |= NPY_ARRAY_MASKNA;
+        fret->flags &= ~NPY_ARRAY_OWNMASKNA;
+    }
+
     return (PyObject *)ret;
 }
 
@@ -102,7 +125,8 @@ array_ass_slice(PyArrayObject *self, Py_ssize_t ilow,
                         "array is not writeable");
         return -1;
     }
-    if ((tmp = (PyArrayObject *)array_slice(self, ilow, ihigh)) == NULL) {
+    tmp = (PyArrayObject *)array_slice(self, ilow, ihigh);
+    if (tmp == NULL) {
         return -1;
     }
     ret = PyArray_CopyObject(tmp, v);
diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c
index 3754e6a1e..a0cf5a6aa 100644
--- a/numpy/core/src/multiarray/shape.c
+++ b/numpy/core/src/multiarray/shape.c
@@ -18,17 +18,17 @@
 
 #include "shape.h"
 
-#define PyAO PyArrayObject
-
 static int
-_check_ones(PyArrayObject *self, int newnd, intp* newdims, intp *strides);
+_check_ones(PyArrayObject *self, int newnd,
+                npy_intp* newdims, npy_intp *strides, npy_intp *masknastrides);
 
 static int
-_fix_unknown_dimension(PyArray_Dims *newshape, intp s_original);
+_fix_unknown_dimension(PyArray_Dims *newshape, npy_intp s_original);
 
 static int
-_attempt_nocopy_reshape(PyArrayObject *self, int newnd, intp* newdims,
-                        intp *newstrides, int is_f_order);
+_attempt_nocopy_reshape(PyArrayObject *self, int newnd, npy_intp* newdims,
+                        npy_intp *newstrides, npy_intp *newmasknastrides,
+                        int is_f_order);
 
 static void
 _putzero(char *optr, PyObject *zero, PyArray_Descr *dtype);
@@ -43,15 +43,15 @@ NPY_NO_EXPORT PyObject *
 PyArray_Resize(PyArrayObject *self, PyArray_Dims *newshape, int refcheck,
                NPY_ORDER order)
 {
-    intp oldsize, newsize;
+    npy_intp oldsize, newsize;
     int new_nd=newshape->len, k, n, elsize;
     int refcnt;
-    intp* new_dimensions=newshape->ptr;
-    intp new_strides[MAX_DIMS];
+    npy_intp* new_dimensions=newshape->ptr;
+    npy_intp new_strides[NPY_MAXDIMS];
     size_t sd;
-    intp *dimptr;
+    npy_intp *dimptr;
     char *new_data;
-    intp largest;
+    npy_intp largest;
 
     if (!PyArray_ISONESEGMENT(self)) {
         PyErr_SetString(PyExc_ValueError,
@@ -97,7 +97,7 @@ PyArray_Resize(PyArrayObject *self, PyArray_Dims *newshape, int refcheck,
         }
         if ((refcnt > 2)
                 || (PyArray_BASE(self) != NULL)
-                || (((PyArrayObject_fieldaccess *)self)->weakreflist != NULL)) {
+                || (((PyArrayObject_fields *)self)->weakreflist != NULL)) {
             PyErr_SetString(PyExc_ValueError,
                     "cannot resize an array references or is referenced\n"\
                     "by another array in this way.  Use the resize function");
@@ -117,7 +117,7 @@ PyArray_Resize(PyArrayObject *self, PyArray_Dims *newshape, int refcheck,
                     "cannot allocate memory for array");
             return NULL;
         }
-        ((PyArrayObject_fieldaccess *)self)->data = new_data;
+        ((PyArrayObject_fields *)self)->data = new_data;
     }
 
     if ((newsize > oldsize) && PyArray_ISWRITEABLE(self)) {
@@ -141,24 +141,25 @@ PyArray_Resize(PyArrayObject *self, PyArray_Dims *newshape, int refcheck,
 
     if (PyArray_NDIM(self) != new_nd) {
         /* Different number of dimensions. */
-        ((PyArrayObject_fieldaccess *)self)->nd = new_nd;
+        ((PyArrayObject_fields *)self)->nd = new_nd;
         /* Need new dimensions and strides arrays */
-        dimptr = PyDimMem_RENEW(PyArray_DIMS(self), 2*new_nd);
+        dimptr = PyDimMem_RENEW(PyArray_DIMS(self), 3*new_nd);
         if (dimptr == NULL) {
             PyErr_SetString(PyExc_MemoryError,
                     "cannot allocate memory for array");
             return NULL;
         }
-        ((PyArrayObject_fieldaccess *)self)->dimensions = dimptr;
-        ((PyArrayObject_fieldaccess *)self)->strides = dimptr + new_nd;
+        ((PyArrayObject_fields *)self)->dimensions = dimptr;
+        ((PyArrayObject_fields *)self)->strides = dimptr + new_nd;
+        ((PyArrayObject_fields *)self)->maskna_strides = dimptr + 2*new_nd;
     }
 
     /* make new_strides variable */
     sd = (size_t) PyArray_DESCR(self)->elsize;
     sd = (size_t) _array_fill_strides(new_strides, new_dimensions, new_nd, sd,
-            PyArray_FLAGS(self), &(((PyArrayObject_fieldaccess *)self)->flags));
-    memmove(PyArray_DIMS(self), new_dimensions, new_nd*sizeof(intp));
-    memmove(PyArray_STRIDES(self), new_strides, new_nd*sizeof(intp));
+            PyArray_FLAGS(self), &(((PyArrayObject_fields *)self)->flags));
+    memmove(PyArray_DIMS(self), new_dimensions, new_nd*sizeof(npy_intp));
+    memmove(PyArray_STRIDES(self), new_strides, new_nd*sizeof(npy_intp));
     Py_INCREF(Py_None);
     return Py_None;
 }
@@ -177,23 +178,24 @@ NPY_NO_EXPORT PyObject *
 PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
                  NPY_ORDER order)
 {
-    intp i;
-    intp *dimensions = newdims->ptr;
+    npy_intp i;
+    npy_intp *dimensions = newdims->ptr;
     PyArrayObject *ret;
-    int n = newdims->len;
-    Bool same, incref = TRUE;
-    intp *strides = NULL;
-    intp newstrides[MAX_DIMS];
-    int flags;
+    int ndim = newdims->len;
+    npy_bool same, incref = TRUE;
+    npy_intp *strides = NULL;
+    npy_intp newstrides[NPY_MAXDIMS];
+    npy_intp newmasknastrides[NPY_MAXDIMS];
+    int flags, build_maskna_strides = 0;
 
     if (order == NPY_ANYORDER) {
         order = PyArray_ISFORTRAN(self);
     }
     /*  Quick check to make sure anything actually needs to be done */
-    if (n == PyArray_NDIM(self)) {
+    if (ndim == PyArray_NDIM(self)) {
         same = TRUE;
         i = 0;
-        while (same && i < n) {
+        while (same && i < ndim) {
             if (PyArray_DIM(self,i) != dimensions[i]) {
                 same=FALSE;
             }
@@ -212,11 +214,12 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
      * In this case we don't need to do anything but update strides and
      * dimensions.  So, we can handle non single-segment cases.
      */
-    i = _check_ones(self, n, dimensions, newstrides);
+    i = _check_ones(self, ndim, dimensions, newstrides, newmasknastrides);
     if (i == 0) {
         strides = newstrides;
     }
-    flags = PyArray_FLAGS(self);
+    flags = PyArray_FLAGS(self) & ~(NPY_ARRAY_OWNMASKNA |
+                                    NPY_ARRAY_MASKNA);
 
     if (strides == NULL) {
         /*
@@ -238,29 +241,33 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
               (PyArray_CHKFLAGS(self, NPY_ARRAY_F_CONTIGUOUS) &&
                   order == NPY_CORDER)) && (PyArray_NDIM(self) > 1))) {
             int success = 0;
-            success = _attempt_nocopy_reshape(self,n,dimensions,
-                                              newstrides,order);
+            success = _attempt_nocopy_reshape(self, ndim, dimensions,
+                                          newstrides, newmasknastrides, order);
             if (success) {
                 /* no need to copy the array after all */
                 strides = newstrides;
-                flags = PyArray_FLAGS(self);
             }
             else {
-                PyObject *new;
-                new = PyArray_NewCopy(self, order);
-                if (new == NULL) {
+                PyObject *newcopy;
+                newcopy = PyArray_NewCopy(self, order);
+                if (newcopy == NULL) {
                     return NULL;
                 }
                 incref = FALSE;
-                self = (PyArrayObject *)new;
-                flags = PyArray_FLAGS(self);
+                self = (PyArrayObject *)newcopy;
+                build_maskna_strides = 1;
             }
+            flags = PyArray_FLAGS(self) & ~(NPY_ARRAY_OWNMASKNA |
+                                            NPY_ARRAY_MASKNA);
+        }
+        else {
+            build_maskna_strides = 1;
         }
 
         /* We always have to interpret the contiguous buffer correctly */
 
         /* Make sure the flags argument is set. */
-        if (n > 1) {
+        if (ndim > 1) {
             if (order == NPY_FORTRANORDER) {
                 flags &= ~NPY_ARRAY_C_CONTIGUOUS;
                 flags |= NPY_ARRAY_F_CONTIGUOUS;
@@ -271,7 +278,7 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
             }
         }
     }
-    else if (n > 0) {
+    else if (ndim > 0) {
         /*
          * replace any 0-valued strides with
          * appropriate value to preserve contiguousness
@@ -280,17 +287,17 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
             if (strides[0] == 0) {
                 strides[0] = PyArray_DESCR(self)->elsize;
             }
-            for (i = 1; i < n; i++) {
+            for (i = 1; i < ndim; i++) {
                 if (strides[i] == 0) {
                     strides[i] = strides[i-1] * dimensions[i-1];
                 }
             }
         }
         else {
-            if (strides[n-1] == 0) {
-                strides[n-1] = PyArray_DESCR(self)->elsize;
+            if (strides[ndim-1] == 0) {
+                strides[ndim-1] = PyArray_DESCR(self)->elsize;
             }
-            for (i = n - 2; i > -1; i--) {
+            for (i = ndim - 2; i > -1; i--) {
                 if (strides[i] == 0) {
                     strides[i] = strides[i+1] * dimensions[i+1];
                 }
@@ -299,9 +306,9 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
     }
 
     Py_INCREF(PyArray_DESCR(self));
-    ret = (PyAO *)PyArray_NewFromDescr(Py_TYPE(self),
+    ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(self),
                                        PyArray_DESCR(self),
-                                       n, dimensions,
+                                       ndim, dimensions,
                                        strides,
                                        PyArray_DATA(self),
                                        flags, (PyObject *)self);
@@ -309,6 +316,7 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
     if (ret == NULL) {
         goto fail;
     }
+
     if (incref) {
         Py_INCREF(self);
     }
@@ -316,6 +324,35 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
         Py_DECREF(ret);
         return NULL;
     }
+
+    /* If there's an NA mask, make sure to view it too */
+    if (PyArray_HASMASKNA(self)) {
+        PyArrayObject_fields *fa = (PyArrayObject_fields *)ret;
+        fa->maskna_dtype = PyArray_MASKNA_DTYPE(self);
+        Py_INCREF(fa->maskna_dtype);
+        fa->maskna_data = PyArray_MASKNA_DATA(self);
+        if (build_maskna_strides) {
+            npy_intp stride = 1;
+            if (order == NPY_FORTRANORDER) {
+                for (i = 0; i < ndim; ++i) {
+                    fa->maskna_strides[i] = stride;
+                    stride *= fa->dimensions[i];
+                }
+            }
+            else {
+                for (i = ndim-1; i >= 0; --i) {
+                    fa->maskna_strides[i] = stride;
+                    stride *= fa->dimensions[i];
+                }
+            }
+        }
+        else {
+            memcpy(fa->maskna_strides, newmasknastrides,
+                                fa->nd * sizeof(npy_intp));
+        }
+        fa->flags |= NPY_ARRAY_MASKNA;
+    }
+
     PyArray_UpdateFlags(ret, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS);
     return (PyObject *)ret;
 
@@ -349,12 +386,14 @@ PyArray_Reshape(PyArrayObject *self, PyObject *shape)
 
 /* inserts 0 for strides where dimension will be 1 */
 static int
-_check_ones(PyArrayObject *self, int newnd, intp* newdims, intp *strides)
+_check_ones(PyArrayObject *self, int newnd,
+                npy_intp* newdims, npy_intp *strides, npy_intp *masknastrides)
 {
     int nd;
-    intp *dims;
-    Bool done=FALSE;
+    npy_intp *dims;
+    npy_bool done=FALSE;
     int j, k;
+    int has_maskna = PyArray_HASMASKNA(self);
 
     nd = PyArray_NDIM(self);
     dims = PyArray_DIMS(self);
@@ -362,11 +401,17 @@ _check_ones(PyArrayObject *self, int newnd, intp* newdims, intp *strides)
     for (k = 0, j = 0; !done && (j < nd || k < newnd);) {
         if ((j<nd) && (k<newnd) && (newdims[k] == dims[j])) {
             strides[k] = PyArray_STRIDES(self)[j];
+            if (has_maskna) {
+                masknastrides[k] = PyArray_MASKNA_STRIDES(self)[j];
+            }
             j++;
             k++;
         }
         else if ((k < newnd) && (newdims[k] == 1)) {
             strides[k] = 0;
+            if (has_maskna) {
+                masknastrides[k] = 0;
+            }
             k++;
         }
         else if ((j<nd) && (dims[j] == 1)) {
@@ -430,20 +475,25 @@ _putzero(char *optr, PyObject *zero, PyArray_Descr *dtype)
  * stride of the next-fastest index.
  */
 static int
-_attempt_nocopy_reshape(PyArrayObject *self, int newnd, intp* newdims,
-                        intp *newstrides, int is_f_order)
+_attempt_nocopy_reshape(PyArrayObject *self, int newnd, npy_intp* newdims,
+                        npy_intp *newstrides, npy_intp *newmasknastrides,
+                        int is_f_order)
 {
     int oldnd;
-    intp olddims[MAX_DIMS];
-    intp oldstrides[MAX_DIMS];
+    npy_intp olddims[NPY_MAXDIMS];
+    npy_intp oldstrides[NPY_MAXDIMS], oldmasknastrides[NPY_MAXDIMS];
     int oi, oj, ok, ni, nj, nk;
     int np, op;
+    int has_maskna = PyArray_HASMASKNA(self);
 
     oldnd = 0;
     for (oi = 0; oi < PyArray_NDIM(self); oi++) {
         if (PyArray_DIMS(self)[oi]!= 1) {
             olddims[oldnd] = PyArray_DIMS(self)[oi];
             oldstrides[oldnd] = PyArray_STRIDES(self)[oi];
+            if (has_maskna) {
+                oldmasknastrides[oldnd] = PyArray_MASKNA_STRIDES(self)[oi];
+            }
             oldnd++;
         }
     }
@@ -494,14 +544,18 @@ _attempt_nocopy_reshape(PyArrayObject *self, int newnd, intp* newdims,
 
         for (ok = oi; ok < oj - 1; ok++) {
             if (is_f_order) {
-                if (oldstrides[ok+1] != olddims[ok]*oldstrides[ok]) {
+                if (oldstrides[ok+1] != olddims[ok]*oldstrides[ok] ||
+                       (has_maskna && oldmasknastrides[ok+1] !=
+                                        olddims[ok]*oldmasknastrides[ok])) {
                      /* not contiguous enough */
                     return 0;
                 }
             }
             else {
                 /* C order */
-                if (oldstrides[ok] != olddims[ok+1]*oldstrides[ok+1]) {
+                if (oldstrides[ok] != olddims[ok+1]*oldstrides[ok+1] ||
+                        (has_maskna && oldmasknastrides[ok] !=
+                                    olddims[ok+1]*oldmasknastrides[ok+1])) {
                     /* not contiguous enough */
                     return 0;
                 }
@@ -513,6 +567,13 @@ _attempt_nocopy_reshape(PyArrayObject *self, int newnd, intp* newdims,
             for (nk = ni + 1; nk < nj; nk++) {
                 newstrides[nk] = newstrides[nk - 1]*newdims[nk - 1];
             }
+            if (has_maskna) {
+                newmasknastrides[ni] = oldmasknastrides[oi];
+                for (nk = ni + 1; nk < nj; nk++) {
+                    newmasknastrides[nk] =
+                                    newmasknastrides[nk - 1]*newdims[nk - 1];
+                }
+            }
         }
         else {
             /* C order */
@@ -520,6 +581,13 @@ _attempt_nocopy_reshape(PyArrayObject *self, int newnd, intp* newdims,
             for (nk = nj - 1; nk > ni; nk--) {
                 newstrides[nk - 1] = newstrides[nk]*newdims[nk];
             }
+            if (has_maskna) {
+                newmasknastrides[nj - 1] = oldmasknastrides[oj - 1];
+                for (nk = nj - 1; nk > ni; nk--) {
+                    newmasknastrides[nk - 1] =
+                                    newmasknastrides[nk]*newdims[nk];
+                }
+            }
         }
         ni = nj++;
         oi = oj++;
@@ -539,10 +607,10 @@ _attempt_nocopy_reshape(PyArrayObject *self, int newnd, intp* newdims,
 }
 
 static int
-_fix_unknown_dimension(PyArray_Dims *newshape, intp s_original)
+_fix_unknown_dimension(PyArray_Dims *newshape, npy_intp s_original)
 {
-    intp *dimensions;
-    intp i_unknown, s_known;
+    npy_intp *dimensions;
+    npy_intp i_unknown, s_known;
     int i, n;
     static char msg[] = "total size of new array must be unchanged";
 
@@ -593,45 +661,104 @@ _fix_unknown_dimension(PyArray_Dims *newshape, intp s_original)
 NPY_NO_EXPORT PyObject *
 PyArray_Squeeze(PyArrayObject *self)
 {
-    int nd = PyArray_NDIM(self);
-    int newnd = nd;
-    intp dimensions[MAX_DIMS];
-    intp strides[MAX_DIMS];
-    int i, j;
     PyArrayObject *ret;
-    PyArray_Descr *dtype;
+    npy_bool unit_dims[NPY_MAXDIMS];
+    int idim, ndim, any_ones;
+    npy_intp *shape;
+
+    ndim = PyArray_NDIM(self);
+    shape = PyArray_SHAPE(self);
+
+    any_ones = 0;
+    for (idim = 0; idim < ndim; ++idim) {
+        if (shape[idim] == 1) {
+            unit_dims[idim] = 1;
+            any_ones = 1;
+        }
+        else {
+            unit_dims[idim] = 0;
+        }
+    }
 
-    if (nd == 0) {
+    /* If there were no ones to squeeze out, return the same array */
+    if (!any_ones) {
         Py_INCREF(self);
         return (PyObject *)self;
     }
-    for (j = 0, i = 0; i < nd; i++) {
-        if (PyArray_DIMS(self)[i] == 1) {
-            newnd -= 1;
-        }
-        else {
-            dimensions[j] = PyArray_DIMS(self)[i];
-            strides[j++] = PyArray_STRIDES(self)[i];
+
+    ret = (PyArrayObject *)PyArray_View(self, NULL, &PyArray_Type);
+    if (ret == NULL) {
+        return NULL;
+    }
+
+    PyArray_RemoveAxesInPlace(ret, unit_dims);
+
+    /*
+     * If self isn't a base class ndarray, call its
+     * __array_wrap__ method
+     */
+    if (Py_TYPE(self) != &PyArray_Type) {
+        PyArrayObject *tmp = PyArray_SubclassWrap(self, ret);
+        Py_DECREF(ret);
+        ret = tmp;
+    }
+
+    return (PyObject *)ret;
+}
+
+/*
+ * Just like PyArray_Squeeze, but allows the caller to select
+ * a subset of the size-one dimensions to squeeze out.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_SqueezeSelected(PyArrayObject *self, npy_bool *axis_flags)
+{
+    PyArrayObject *ret;
+    int idim, ndim, any_ones;
+    npy_intp *shape;
+
+    ndim = PyArray_NDIM(self);
+    shape = PyArray_SHAPE(self);
+
+    /* Verify that the axes requested are all of size one */
+    any_ones = 0;
+    for (idim = 0; idim < ndim; ++idim) {
+        if (axis_flags[idim] != 0) {
+            if (shape[idim] == 1) {
+                any_ones = 1;
+            }
+            else {
+                PyErr_SetString(PyExc_ValueError,
+                        "cannot select an axis to squeeze out "
+                        "which has size greater than one");
+                return NULL;
+            }
         }
     }
 
-    dtype = PyArray_DESCR(self);
-    Py_INCREF(dtype);
-    ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(self),
-                               dtype,
-                               newnd, dimensions,
-                               strides, PyArray_DATA(self),
-                               PyArray_FLAGS(self),
-                               (PyObject *)self);
+    /* If there were no axes to squeeze out, return the same array */
+    if (!any_ones) {
+        Py_INCREF(self);
+        return (PyObject *)self;
+    }
+
+    ret = (PyArrayObject *)PyArray_View(self, NULL, &PyArray_Type);
     if (ret == NULL) {
         return NULL;
     }
-    PyArray_CLEARFLAGS(ret, NPY_ARRAY_OWNDATA);
-    Py_INCREF(self);
-    if (PyArray_SetBaseObject(ret, (PyObject *)self) < 0) {
+
+    PyArray_RemoveAxesInPlace(ret, axis_flags);
+
+    /*
+     * If self isn't a base class ndarray, call its
+     * __array_wrap__ method
+     */
+    if (Py_TYPE(self) != &PyArray_Type) {
+        PyArrayObject *tmp = PyArray_SubclassWrap(self, ret);
         Py_DECREF(ret);
-        return NULL;
+        ret = tmp;
     }
+
     return (PyObject *)ret;
 }
 
@@ -642,7 +769,7 @@ NPY_NO_EXPORT PyObject *
 PyArray_SwapAxes(PyArrayObject *ap, int a1, int a2)
 {
     PyArray_Dims new_axes;
-    intp dims[MAX_DIMS];
+    npy_intp dims[NPY_MAXDIMS];
     int n, i, val;
     PyObject *ret;
 
@@ -698,10 +825,11 @@ PyArray_SwapAxes(PyArrayObject *ap, int a1, int a2)
 NPY_NO_EXPORT PyObject *
 PyArray_Transpose(PyArrayObject *ap, PyArray_Dims *permute)
 {
-    intp *axes, axis;
-    intp i, n;
-    intp permutation[MAX_DIMS], reverse_permutation[MAX_DIMS];
+    npy_intp *axes, axis;
+    npy_intp i, n;
+    npy_intp permutation[NPY_MAXDIMS], reverse_permutation[NPY_MAXDIMS];
     PyArrayObject *ret = NULL;
+    int flags;
 
     if (permute == NULL) {
         n = PyArray_NDIM(ap);
@@ -738,20 +866,21 @@ PyArray_Transpose(PyArrayObject *ap, PyArray_Dims *permute)
             reverse_permutation[axis] = i;
             permutation[i] = axis;
         }
-        for (i = 0; i < n; i++) {
-        }
     }
 
+    flags = PyArray_FLAGS(ap);
+
     /*
      * this allocates memory for dimensions and strides (but fills them
      * incorrectly), sets up descr, and points data at PyArray_DATA(ap).
      */
     Py_INCREF(PyArray_DESCR(ap));
-    ret = (PyArrayObject *)\
+    ret = (PyArrayObject *)
         PyArray_NewFromDescr(Py_TYPE(ap),
                              PyArray_DESCR(ap),
                              n, PyArray_DIMS(ap),
-                             NULL, PyArray_DATA(ap), PyArray_FLAGS(ap),
+                             NULL, PyArray_DATA(ap),
+                             flags & ~(NPY_ARRAY_MASKNA | NPY_ARRAY_OWNMASKNA),
                              (PyObject *)ap);
     if (ret == NULL) {
         return NULL;
@@ -763,6 +892,21 @@ PyArray_Transpose(PyArrayObject *ap, PyArray_Dims *permute)
         return NULL;
     }
 
+    /* Take a view of the NA mask as well if necessary */
+    if (flags & NPY_ARRAY_MASKNA) {
+        PyArrayObject_fields *fa = (PyArrayObject_fields *)ret;
+
+        fa->maskna_dtype = PyArray_MASKNA_DTYPE(ap);
+        Py_INCREF(fa->maskna_dtype);
+        fa->maskna_data = PyArray_MASKNA_DATA(ap);
+
+        for (i = 0; i < n; i++) {
+            fa->maskna_strides[i] =
+                        PyArray_MASKNA_STRIDES(ap)[permutation[i]];
+        }
+        fa->flags |= NPY_ARRAY_MASKNA;
+    }
+
     /* fix the dimensions and strides of the return-array */
     for (i = 0; i < n; i++) {
         PyArray_DIMS(ret)[i] = PyArray_DIMS(ap)[permutation[i]];
@@ -778,8 +922,8 @@ PyArray_Transpose(PyArrayObject *ap, PyArray_Dims *permute)
  */
 int _npy_stride_sort_item_comparator(const void *a, const void *b)
 {
-    npy_intp astride = ((_npy_stride_sort_item *)a)->stride,
-            bstride = ((_npy_stride_sort_item *)b)->stride;
+    npy_intp astride = ((npy_stride_sort_item *)a)->stride,
+            bstride = ((npy_stride_sort_item *)b)->stride;
 
     /* Sort the absolute value of the strides */
     if (astride < 0) {
@@ -789,123 +933,248 @@ int _npy_stride_sort_item_comparator(const void *a, const void *b)
         bstride = -bstride;
     }
 
-    if (astride > bstride) {
-        return -1;
-    }
-    else if (astride == bstride) {
+    if (astride == bstride || astride == 0 || bstride == 0) {
         /*
          * Make the qsort stable by next comparing the perm order.
          * (Note that two perm entries will never be equal)
          */
-        npy_intp aperm = ((_npy_stride_sort_item *)a)->perm,
-                bperm = ((_npy_stride_sort_item *)b)->perm;
+        npy_intp aperm = ((npy_stride_sort_item *)a)->perm,
+                bperm = ((npy_stride_sort_item *)b)->perm;
         return (aperm < bperm) ? -1 : 1;
     }
+    if (astride > bstride) {
+        return -1;
+    }
     else {
         return 1;
     }
 }
 
-/*
- * This function populates the first PyArray_NDIM(arr) elements
+/*NUMPY_API
+ *
+ * This function populates the first ndim elements
  * of strideperm with sorted descending by their absolute values.
  * For example, the stride array (4, -2, 12) becomes
  * [(2, 12), (0, 4), (1, -2)].
  */
 NPY_NO_EXPORT void
-PyArray_CreateSortedStridePerm(PyArrayObject *arr,
-                           _npy_stride_sort_item *strideperm)
+PyArray_CreateSortedStridePerm(int ndim, npy_intp *shape,
+                        npy_intp *strides,
+                        npy_stride_sort_item *out_strideperm)
 {
-    int i, ndim = PyArray_NDIM(arr);
+    int i;
 
     /* Set up the strideperm values */
     for (i = 0; i < ndim; ++i) {
-        strideperm[i].perm = i;
-        strideperm[i].stride = PyArray_STRIDE(arr, i);
+        out_strideperm[i].perm = i;
+        if (shape[i] == 1) {
+            out_strideperm[i].stride = 0;
+        }
+        else {
+            out_strideperm[i].stride = strides[i];
+        }
     }
 
     /* Sort them */
-    qsort(strideperm, ndim, sizeof(_npy_stride_sort_item),
+    qsort(out_strideperm, ndim, sizeof(npy_stride_sort_item),
                                     &_npy_stride_sort_item_comparator);
 }
 
+static NPY_INLINE npy_intp
+s_intp_abs(npy_intp x)
+{
+    return (x < 0) ? -x : x;
+}
+
+
+
+/*
+ * Creates a sorted stride perm matching the KEEPORDER behavior
+ * of the NpyIter object. Because this operates based on multiple
+ * input strides, the 'stride' member of the npy_stride_sort_item
+ * would be useless and we simply argsort a list of indices instead.
+ *
+ * The caller should have already validated that 'ndim' matches for
+ * every array in the arrays list.
+ */
+NPY_NO_EXPORT void
+PyArray_CreateMultiSortedStridePerm(int narrays, PyArrayObject **arrays,
+                        int ndim, int *out_strideperm)
+{
+    int i0, i1, ipos, j0, j1, iarrays;
+
+    /* Initialize the strideperm values to the identity. */
+    for (i0 = 0; i0 < ndim; ++i0) {
+        out_strideperm[i0] = i0;
+    }
+
+    /*
+     * This is the same as the custom stable insertion sort in
+     * the NpyIter object, but sorting in the reverse order as
+     * in the iterator. The iterator sorts from smallest stride
+     * to biggest stride (Fortran order), whereas here we sort
+     * from biggest stride to smallest stride (C order).
+     */
+    for (i0 = 1; i0 < ndim; ++i0) {
+
+        ipos = i0;
+        j0 = out_strideperm[i0];
+
+        for (i1 = i0 - 1; i1 >= 0; --i1) {
+            int ambig = 1, shouldswap = 0;
+
+            j1 = out_strideperm[i1];
+
+            for (iarrays = 0; iarrays < narrays; ++iarrays) {
+                if (PyArray_SHAPE(arrays[iarrays])[j0] != 1 &&
+                            PyArray_SHAPE(arrays[iarrays])[j1] != 1) {
+                    if (s_intp_abs(PyArray_STRIDES(arrays[iarrays])[j0]) <=
+                            s_intp_abs(PyArray_STRIDES(arrays[iarrays])[j1])) {
+                        /*
+                         * Clear shouldswap even if an earlier operand
+                         * already resolved the ambiguity: when operands
+                         * disagree, C-order wins.
+                         */
+                        shouldswap = 0;
+                    }
+                    else {
+                        /* Only set swap if it's still ambiguous */
+                        if (ambig) {
+                            shouldswap = 1;
+                        }
+                    }
+
+                    /*
+                     * A comparison has been done, so it's
+                     * no longer ambiguous
+                     */
+                    ambig = 0;
+                }
+            }
+            /*
+             * If the comparison was unambiguous, either shift
+             * 'ipos' to 'i1' or stop looking for an insertion point
+             */
+            if (!ambig) {
+                if (shouldswap) {
+                    ipos = i1;
+                }
+                else {
+                    break;
+                }
+            }
+        }
+
+        /* Insert out_strideperm[i0] into the right place */
+        if (ipos != i0) {
+            for (i1 = i0; i1 > ipos; --i1) {
+                out_strideperm[i1] = out_strideperm[i1-1];
+            }
+            out_strideperm[ipos] = j0;
+        }
+    }
+}
+
 /*NUMPY_API
  * Ravel
  * Returns a contiguous array
  */
 NPY_NO_EXPORT PyObject *
-PyArray_Ravel(PyArrayObject *a, NPY_ORDER order)
+PyArray_Ravel(PyArrayObject *arr, NPY_ORDER order)
 {
     PyArray_Dims newdim = {NULL,1};
-    intp val[1] = {-1};
+    npy_intp val[1] = {-1};
 
     newdim.ptr = val;
 
     if (order == NPY_ANYORDER) {
-        order = PyArray_ISFORTRAN(a) ? NPY_FORTRANORDER : NPY_CORDER;
+        order = PyArray_ISFORTRAN(arr) ? NPY_FORTRANORDER : NPY_CORDER;
     }
     else if (order == NPY_KEEPORDER) {
-        if (PyArray_IS_C_CONTIGUOUS(a)) {
+        if (PyArray_IS_C_CONTIGUOUS(arr)) {
             order = NPY_CORDER;
         }
-        else if (PyArray_IS_F_CONTIGUOUS(a)) {
+        else if (PyArray_IS_F_CONTIGUOUS(arr)) {
             order = NPY_FORTRANORDER;
         }
     }
 
-    if (order == NPY_CORDER && PyArray_ISCONTIGUOUS(a)) {
-        return PyArray_Newshape(a, &newdim, NPY_CORDER);
+    if (order == NPY_CORDER && PyArray_ISCONTIGUOUS(arr)) {
+        return PyArray_Newshape(arr, &newdim, NPY_CORDER);
     }
-    else if (order == NPY_FORTRANORDER && PyArray_ISFORTRAN(a)) {
-        return PyArray_Newshape(a, &newdim, NPY_FORTRANORDER);
+    else if (order == NPY_FORTRANORDER && PyArray_ISFORTRAN(arr)) {
+        return PyArray_Newshape(arr, &newdim, NPY_FORTRANORDER);
     }
     /* For KEEPORDER, check if we can make a flattened view */
     else if (order == NPY_KEEPORDER) {
-        _npy_stride_sort_item strideperm[NPY_MAXDIMS];
+        npy_stride_sort_item strideperm[NPY_MAXDIMS];
         npy_intp stride;
-        int i, ndim = PyArray_NDIM(a);
+        int i, ndim = PyArray_NDIM(arr);
 
-        PyArray_CreateSortedStridePerm(a, strideperm);
+        PyArray_CreateSortedStridePerm(PyArray_NDIM(arr), PyArray_SHAPE(arr),
+                                PyArray_STRIDES(arr), strideperm);
 
-        stride = PyArray_DESCR(a)->elsize;
+        stride = strideperm[ndim-1].stride;
         for (i = ndim-1; i >= 0; --i) {
             if (strideperm[i].stride != stride) {
                 break;
             }
-            stride *= PyArray_DIM(a, strideperm[i].perm);
+            stride *= PyArray_DIM(arr, strideperm[i].perm);
         }
 
         /* If all the strides matched a contiguous layout, return a view */
         if (i < 0) {
             PyArrayObject *ret;
-            npy_intp stride = PyArray_DESCR(a)->elsize;
+            npy_intp stride = strideperm[ndim-1].stride;
 
-            val[0] = PyArray_SIZE(a);
+            val[0] = PyArray_SIZE(arr);
 
-            Py_INCREF(PyArray_DESCR(a));
-            ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(a),
-                               PyArray_DESCR(a),
+            Py_INCREF(PyArray_DESCR(arr));
+            ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(arr),
+                               PyArray_DESCR(arr),
                                1, val,
                                &stride,
-                               PyArray_BYTES(a),
-                               PyArray_FLAGS(a),
-                               (PyObject *)a);
-
-            if (ret != NULL) {
-                PyArray_UpdateFlags(ret,
-                            NPY_ARRAY_C_CONTIGUOUS|NPY_ARRAY_F_CONTIGUOUS);
-                Py_INCREF(a);
-                if (PyArray_SetBaseObject(ret, (PyObject *)a) < 0) {
-                    Py_DECREF(ret);
-                    ret = NULL;
-                }
+                               PyArray_BYTES(arr),
+                PyArray_FLAGS(arr) & ~(NPY_ARRAY_OWNMASKNA | NPY_ARRAY_MASKNA),
+                               (PyObject *)arr);
+            if (ret == NULL) {
+                return NULL;
+            }
+
+            /* Take a view of the NA mask as well if necessary */
+            if (PyArray_HASMASKNA(arr)) {
+                PyArrayObject_fields *fa =
+                                    (PyArrayObject_fields *)ret;
+
+                fa->maskna_dtype = PyArray_MASKNA_DTYPE(arr);
+                Py_INCREF(fa->maskna_dtype);
+                fa->maskna_data = PyArray_MASKNA_DATA(arr);
+
+                /*
+                 * Because the strides of the NA mask always match up
+                 * layout-wise with the strides of the data, we don't
+                 * have to also check them the same way. This is due
+                 * to the fact that PyArray_AllocateMaskNA is the only
+                 * mechanism ever used to create an NA mask.
+                 */
+                fa->maskna_strides[0] =
+                        PyArray_MASKNA_STRIDES(arr)[strideperm[ndim-1].perm];
+                fa->flags |= NPY_ARRAY_MASKNA;
+            }
+
+            PyArray_UpdateFlags(ret,
+                        NPY_ARRAY_C_CONTIGUOUS|NPY_ARRAY_F_CONTIGUOUS);
+            Py_INCREF(arr);
+            if (PyArray_SetBaseObject(ret, (PyObject *)arr) < 0) {
+                Py_DECREF(ret);
+                return NULL;
             }
+
             return (PyObject *)ret;
         }
-
     }
 
-    return PyArray_Flatten(a, order);
+    return PyArray_Flatten(arr, order);
 }
 
 /*NUMPY_API
@@ -915,7 +1184,7 @@ NPY_NO_EXPORT PyObject *
 PyArray_Flatten(PyArrayObject *a, NPY_ORDER order)
 {
     PyArrayObject *ret;
-    intp size;
+    npy_intp size;
 
     if (order == NPY_ANYORDER) {
         order = PyArray_ISFORTRAN(a) ? NPY_FORTRANORDER : NPY_CORDER;
@@ -929,13 +1198,116 @@ PyArray_Flatten(PyArrayObject *a, NPY_ORDER order)
                                NULL,
                                NULL,
                                0, (PyObject *)a);
-
     if (ret == NULL) {
         return NULL;
     }
+
+    if (PyArray_HASMASKNA(a)) {
+        if (PyArray_AllocateMaskNA(ret, 1, 0, 1) < 0) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+    }
+
     if (PyArray_CopyAsFlat(ret, a, order) < 0) {
         Py_DECREF(ret);
         return NULL;
     }
     return (PyObject *)ret;
 }
+
+/*
+ * Builds a string such as "(3,newaxis,5)" from the 'n' entries of 'vals'.
+ * See shape.h for parameters documentation.
+ *
+ * Returns a new reference, or NULL if creating or joining any piece
+ * of the string failed.
+ */
+NPY_NO_EXPORT PyObject *
+build_shape_string(npy_intp n, npy_intp *vals)
+{
+    npy_intp i;
+    PyObject *ret, *tmp;
+
+    /*
+     * Negative dimension indicates "newaxis", which can
+     * be discarded for printing if it's a leading dimension.
+     * Find the first non-"newaxis" dimension.
+     */
+    i = 0;
+    while (i < n && vals[i] < 0) {
+        ++i;
+    }
+
+    /* Nothing but leading "newaxis" entries (or no entries at all) */
+    if (i == n) {
+        return PyUString_FromFormat("()");
+    }
+    else {
+        ret = PyUString_FromFormat("(%" NPY_INTP_FMT, vals[i++]);
+        if (ret == NULL) {
+            return NULL;
+        }
+    }
+
+    for (; i < n; ++i) {
+        if (vals[i] < 0) {
+            tmp = PyUString_FromString(",newaxis");
+        }
+        else {
+            tmp = PyUString_FromFormat(",%" NPY_INTP_FMT, vals[i]);
+        }
+        if (tmp == NULL) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+
+        PyUString_ConcatAndDel(&ret, tmp);
+        if (ret == NULL) {
+            return NULL;
+        }
+    }
+
+    /*
+     * Check the closing piece the same way as the pieces in the loop,
+     * so a failed allocation is never handed to the concatenation.
+     */
+    tmp = PyUString_FromFormat(")");
+    if (tmp == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyUString_ConcatAndDel(&ret, tmp);
+    return ret;
+}
+
+/*NUMPY_API
+ *
+ * Drops, in place, every axis of 'arr' whose entry in 'flags' is True.
+ * When a dropped axis has a shape entry bigger than one, the effect
+ * is the same as selecting index zero along that axis.
+ *
+ * WARNING: If a dropped axis has a shape equal to zero, the array
+ *          will point to invalid memory. The caller must
+ *          validate this!
+ *
+ * A typical use is removing the reduction axes from a reduction
+ * result once its computation is complete.
+ */
+NPY_NO_EXPORT void
+PyArray_RemoveAxesInPlace(PyArrayObject *arr, npy_bool *flags)
+{
+    PyArrayObject_fields *fields = (PyArrayObject_fields *)arr;
+    npy_intp *dims = fields->dimensions;
+    npy_intp *data_strides = fields->strides;
+    int old_ndim = fields->nd;
+    int src, kept;
+
+    /* Slide the surviving shape/stride entries down to the front */
+    kept = 0;
+    for (src = 0; src < old_ndim; ++src) {
+        if (flags[src]) {
+            continue;
+        }
+        dims[kept] = dims[src];
+        data_strides[kept] = data_strides[src];
+        ++kept;
+    }
+
+    /* An NA mask carries its own strides; compact them the same way */
+    if (PyArray_HASMASKNA(arr)) {
+        npy_intp *mask_strides = fields->maskna_strides;
+
+        kept = 0;
+        for (src = 0; src < old_ndim; ++src) {
+            if (flags[src]) {
+                continue;
+            }
+            mask_strides[kept] = mask_strides[src];
+            ++kept;
+        }
+    }
+
+    /* Only the surviving axes remain */
+    fields->nd = kept;
+}
diff --git a/numpy/core/src/multiarray/shape.h b/numpy/core/src/multiarray/shape.h
index 8038a9f25..0451a463e 100644
--- a/numpy/core/src/multiarray/shape.h
+++ b/numpy/core/src/multiarray/shape.h
@@ -1,18 +1,31 @@
 #ifndef _NPY_ARRAY_SHAPE_H_
 #define _NPY_ARRAY_SHAPE_H_
 
-typedef struct {
-    npy_intp perm, stride;
-} _npy_stride_sort_item;
+/*
+ * Builds a string representation of the shape given in 'vals'.
+ * A negative value in 'vals' gets interpreted as newaxis.
+ */
+NPY_NO_EXPORT PyObject *
+build_shape_string(npy_intp n, npy_intp *vals);
 
 /*
- * This function populates the first PyArray_NDIM(arr) elements
- * of strideperm with sorted descending by their absolute values.
- * For example, the stride array (4, -2, 12) becomes
- * [(2, 12), (0, 4), (1, -2)].
+ * Creates a sorted stride perm matching the KEEPORDER behavior
+ * of the NpyIter object. Because this operates based on multiple
+ * input strides, the 'stride' member of the npy_stride_sort_item
+ * would be useless and we simply argsort a list of indices instead.
+ *
+ * The caller should have already validated that 'ndim' matches for
+ * every array in the arrays list.
  */
 NPY_NO_EXPORT void
-PyArray_CreateSortedStridePerm(PyArrayObject *arr,
-                           _npy_stride_sort_item *strideperm);
+PyArray_CreateMultiSortedStridePerm(int narrays, PyArrayObject **arrays,
+                        int ndim, int *out_strideperm);
+
+/*
+ * Just like PyArray_Squeeze, but allows the caller to select
+ * a subset of the size-one dimensions to squeeze out.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_SqueezeSelected(PyArrayObject *self, npy_bool *axis_flags);
 
 #endif
diff --git a/numpy/core/src/private/lowlevel_strided_loops.h b/numpy/core/src/private/lowlevel_strided_loops.h
index b6b53ba45..6ffa36ee9 100644
--- a/numpy/core/src/private/lowlevel_strided_loops.h
+++ b/numpy/core/src/private/lowlevel_strided_loops.h
@@ -8,12 +8,13 @@
  */
 
 /*
- * This function pointer is for functions that transfer an arbitrarily strided
- * input to a an arbitrarily strided output.  It may be a fully general
- * function, or a specialized function when the strides or item size
- * have special values.
+ * This function pointer is for unary operations that input an
+ * arbitrarily strided one-dimensional array segment and output
+ * an arbitrarily strided array segment of the same size.
+ * It may be a fully general function, or a specialized function
+ * when the strides or item size have particular known values.
  *
- * Examples of transfer functions are a straight copy, a byte-swap,
+ * Examples of unary operations are a straight copy, a byte-swap,
  * and a casting operation,
  *
  * The 'transferdata' parameter is slightly special, following a
@@ -21,26 +22,46 @@
  * Use NPY_AUXDATA_CLONE and NPY_AUXDATA_FREE to deal with this data.
  *
  */
-typedef void (PyArray_StridedTransferFn)(char *dst, npy_intp dst_stride,
+typedef void (PyArray_StridedUnaryOp)(char *dst, npy_intp dst_stride,
                                     char *src, npy_intp src_stride,
                                     npy_intp N, npy_intp src_itemsize,
                                     NpyAuxData *transferdata);
 
 /*
  * This is for pointers to functions which behave exactly as
- * for PyArray_StridedTransferFn, but with an additional mask controlling
- * which values are transferred.
+ * for PyArray_StridedUnaryOp, but with an additional mask controlling
+ * which values are transformed.
  *
- * In particular, the 'i'-th element is transfered if and only if
- * NpyMask_IsExposed(mask[i*mask_stride]).
+ * In particular, the 'i'-th element is operated on if and only if
+ * NpyMaskValue_IsExposed(mask[i*mask_stride]).
  */
-typedef void (PyArray_MaskedStridedTransferFn)(char *dst, npy_intp dst_stride,
+typedef void (PyArray_MaskedStridedUnaryOp)(char *dst, npy_intp dst_stride,
                                     char *src, npy_intp src_stride,
                                     npy_mask *mask, npy_intp mask_stride,
                                     npy_intp N, npy_intp src_itemsize,
                                     NpyAuxData *transferdata);
 
 /*
+ * This function pointer is for binary operations that input two
+ * arbitrarily strided one-dimensional array segments and output
+ * an arbitrarily strided array segment of the same size.
+ * It may be a fully general function, or a specialized function
+ * when the strides or item size have particular known values.
+ *
+ * Examples of binary operations are the basic arithmetic operations,
+ * logical operators AND, OR, and many others.
+ *
+ * The 'transferdata' parameter is slightly special, following a
+ * generic auxiliary data pattern defined in ndarraytypes.h
+ * Use NPY_AUXDATA_CLONE and NPY_AUXDATA_FREE to deal with this data.
+ *
+ */
+typedef void (PyArray_StridedBinaryOp)(char *dst, npy_intp dst_stride,
+                                    char *src0, npy_intp src0_stride,
+                                    char *src1, npy_intp src1_stride,
+                                    npy_intp N, NpyAuxData *transferdata);
+
+/*
  * Gives back a function pointer to a specialized function for copying
  * strided memory.  Returns NULL if there is a problem with the inputs.
  *
@@ -57,7 +78,7 @@ typedef void (PyArray_MaskedStridedTransferFn)(char *dst, npy_intp dst_stride,
  *      Should be the item size if it will always be the same, 0 otherwise.
  *
  */
-NPY_NO_EXPORT PyArray_StridedTransferFn *
+NPY_NO_EXPORT PyArray_StridedUnaryOp *
 PyArray_GetStridedCopyFn(int aligned,
                         npy_intp src_stride, npy_intp dst_stride,
                         npy_intp itemsize);
@@ -72,7 +93,7 @@ PyArray_GetStridedCopyFn(int aligned,
  *
  * Parameters are as for PyArray_GetStridedCopyFn.
  */
-NPY_NO_EXPORT PyArray_StridedTransferFn *
+NPY_NO_EXPORT PyArray_StridedUnaryOp *
 PyArray_GetStridedCopySwapFn(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             npy_intp itemsize);
@@ -87,7 +108,7 @@ PyArray_GetStridedCopySwapFn(int aligned,
  *
  * Parameters are as for PyArray_GetStridedCopyFn.
  */
-NPY_NO_EXPORT PyArray_StridedTransferFn *
+NPY_NO_EXPORT PyArray_StridedUnaryOp *
 PyArray_GetStridedCopySwapPairFn(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             npy_intp itemsize);
@@ -106,7 +127,7 @@ NPY_NO_EXPORT int
 PyArray_GetStridedZeroPadCopyFn(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             npy_intp src_itemsize, npy_intp dst_itemsize,
-                            PyArray_StridedTransferFn **outstransfer,
+                            PyArray_StridedUnaryOp **outstransfer,
                             NpyAuxData **outtransferdata);
 
 /*
@@ -115,7 +136,7 @@ PyArray_GetStridedZeroPadCopyFn(int aligned,
  * to dst_type_num.  If a conversion is unsupported, returns NULL
  * without setting a Python exception.
  */
-NPY_NO_EXPORT PyArray_StridedTransferFn *
+NPY_NO_EXPORT PyArray_StridedUnaryOp *
 PyArray_GetStridedNumericCastFn(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             int src_type_num, int dst_type_num);
@@ -130,14 +151,14 @@ NPY_NO_EXPORT int
 PyArray_GetDTypeCopySwapFn(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             PyArray_Descr *dtype,
-                            PyArray_StridedTransferFn **outstransfer,
+                            PyArray_StridedUnaryOp **outstransfer,
                             NpyAuxData **outtransferdata);
 
 /*
  * If it's possible, gives back a transfer function which casts and/or
  * byte swaps data with the dtype 'src_dtype' into data with the dtype
  * 'dst_dtype'.  If the outtransferdata is populated with a non-NULL value,
- * it must be deallocated with the ``PyArray_FreeStridedTransferData``
+ * it must be deallocated with the NPY_AUXDATA_FREE
  * function when the transfer function is no longer required.
  *
  * aligned:
@@ -165,7 +186,7 @@ PyArray_GetDTypeCopySwapFn(int aligned,
  * out_transferdata:
  *      The auxiliary data for the transfer function is placed here.
  *      When finished with the transfer function, the caller must call
- *      ``PyArray_FreeStridedTransferData`` on this data.
+ *      NPY_AUXDATA_FREE on this data.
  * out_needs_api:
  *      If this is non-NULL, and the transfer function produced needs
  *      to call into the (Python) API, this gets set to 1.  This
@@ -183,7 +204,7 @@ PyArray_GetDTypeTransferFunction(int aligned,
                             npy_intp src_stride, npy_intp dst_stride,
                             PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
                             int move_references,
-                            PyArray_StridedTransferFn **out_stransfer,
+                            PyArray_StridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api);
 
@@ -211,7 +232,7 @@ PyArray_GetMaskedDTypeTransferFunction(int aligned,
                             PyArray_Descr *dst_dtype,
                             PyArray_Descr *mask_dtype,
                             int move_references,
-                            PyArray_MaskedStridedTransferFn **out_stransfer,
+                            PyArray_MaskedStridedUnaryOp **out_stransfer,
                             NpyAuxData **out_transferdata,
                             int *out_needs_api);
 
@@ -220,7 +241,7 @@ PyArray_GetMaskedDTypeTransferFunction(int aligned,
  * 'src_dtype' to 'dst' with 'dst_dtype'. See
  * PyArray_GetDTypeTransferFunction for more details.
  *
- * returns NPY_SUCCEED or NPY_FAIL.
+ * Returns NPY_SUCCEED or NPY_FAIL.
  */
 NPY_NO_EXPORT int
 PyArray_CastRawArrays(npy_intp count,
@@ -230,6 +251,20 @@ PyArray_CastRawArrays(npy_intp count,
                       int move_references);
 
 /*
+ * Casts the elements from one n-dimensional array to another n-dimensional
+ * array with identical shape but possibly different strides and dtypes.
+ * Does not account for overlap.
+ *
+ * Returns NPY_SUCCEED or NPY_FAIL.
+ */
+NPY_NO_EXPORT int
+PyArray_CastRawNDimArrays(int ndim, npy_intp *shape,
+                      char *src, char *dst,
+                      npy_intp *src_strides, npy_intp *dst_strides,
+                      PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+                      int move_references);
+
+/*
  * These two functions copy or convert the data of an n-dimensional array
  * to/from a 1-dimensional strided buffer.  These functions will only call
  * 'stransfer' with the provided dst_stride/src_stride and
@@ -278,7 +313,7 @@ PyArray_TransferNDimToStrided(npy_intp ndim,
                 npy_intp *coords, npy_intp coords_inc,
                 npy_intp *shape, npy_intp shape_inc,
                 npy_intp count, npy_intp src_itemsize,
-                PyArray_StridedTransferFn *stransfer,
+                PyArray_StridedUnaryOp *stransfer,
                 NpyAuxData *transferdata);
 
 NPY_NO_EXPORT npy_intp
@@ -288,7 +323,7 @@ PyArray_TransferStridedToNDim(npy_intp ndim,
                 npy_intp *coords, npy_intp coords_inc,
                 npy_intp *shape, npy_intp shape_inc,
                 npy_intp count, npy_intp src_itemsize,
-                PyArray_StridedTransferFn *stransfer,
+                PyArray_StridedUnaryOp *stransfer,
                 NpyAuxData *transferdata);
 
 NPY_NO_EXPORT npy_intp
@@ -299,9 +334,191 @@ PyArray_TransferMaskedStridedToNDim(npy_intp ndim,
                 npy_intp *coords, npy_intp coords_inc,
                 npy_intp *shape, npy_intp shape_inc,
                 npy_intp count, npy_intp src_itemsize,
-                PyArray_MaskedStridedTransferFn *stransfer,
+                PyArray_MaskedStridedUnaryOp *stransfer,
                 NpyAuxData *data);
 
+/*
+ * Prepares shape and strides for a simple raw array iteration.
+ * This sorts the strides into FORTRAN order, reverses any negative
+ * strides, then coalesces axes where possible. The results are
+ * filled in the output parameters.
+ *
+ * This is intended for simple, lightweight iteration over arrays
+ * where no buffering of any kind is needed, and the array may
+ * not be stored as a PyArrayObject. For example, to iterate over
+ * the NA mask of an array.
+ *
+ * You can use this together with NPY_RAW_ITER_START and
+ * NPY_RAW_ITER_ONE_NEXT to handle the looping boilerplate of everything
+ * but the innermost loop (which is for idim == 0).
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_PrepareOneRawArrayIter(int ndim, npy_intp *shape,
+                            char *data, npy_intp *strides,
+                            int *out_ndim, npy_intp *out_shape,
+                            char **out_data, npy_intp *out_strides);
+
+/*
+ * The same as PyArray_PrepareOneRawArrayIter, but for two
+ * operands instead of one. Any broadcasting of the two operands
+ * should have already been done before calling this function,
+ * as the ndim and shape are only specified once for both operands.
+ *
+ * Only the strides of the first operand are used to reorder
+ * the dimensions, no attempt to consider all the strides together
+ * is made, as is done in the NpyIter object.
+ *
+ * You can use this together with NPY_RAW_ITER_START and
+ * NPY_RAW_ITER_TWO_NEXT to handle the looping boilerplate of everything
+ * but the innermost loop (which is for idim == 0).
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_PrepareTwoRawArrayIter(int ndim, npy_intp *shape,
+                            char *dataA, npy_intp *stridesA,
+                            char *dataB, npy_intp *stridesB,
+                            int *out_ndim, npy_intp *out_shape,
+                            char **out_dataA, npy_intp *out_stridesA,
+                            char **out_dataB, npy_intp *out_stridesB);
+
+/*
+ * The same as PyArray_PrepareOneRawArrayIter, but for three
+ * operands instead of one. Any broadcasting of the three operands
+ * should have already been done before calling this function,
+ * as the ndim and shape are only specified once for all operands.
+ *
+ * Only the strides of the first operand are used to reorder
+ * the dimensions, no attempt to consider all the strides together
+ * is made, as is done in the NpyIter object.
+ *
+ * You can use this together with NPY_RAW_ITER_START and
+ * NPY_RAW_ITER_THREE_NEXT to handle the looping boilerplate of everything
+ * but the innermost loop (which is for idim == 0).
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_PrepareThreeRawArrayIter(int ndim, npy_intp *shape,
+                            char *dataA, npy_intp *stridesA,
+                            char *dataB, npy_intp *stridesB,
+                            char *dataC, npy_intp *stridesC,
+                            int *out_ndim, npy_intp *out_shape,
+                            char **out_dataA, npy_intp *out_stridesA,
+                            char **out_dataB, npy_intp *out_stridesB,
+                            char **out_dataC, npy_intp *out_stridesC);
+
+/*
+ * The same as PyArray_PrepareOneRawArrayIter, but for four
+ * operands instead of one. Any broadcasting of the four operands
+ * should have already been done before calling this function,
+ * as the ndim and shape are only specified once for all operands.
+ *
+ * Only the strides of the first operand are used to reorder
+ * the dimensions, no attempt to consider all the strides together
+ * is made, as is done in the NpyIter object.
+ *
+ * You can use this together with NPY_RAW_ITER_START and
+ * NPY_RAW_ITER_FOUR_NEXT to handle the looping boilerplate of everything
+ * but the innermost loop (which is for idim == 0).
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_PrepareFourRawArrayIter(int ndim, npy_intp *shape,
+                            char *dataA, npy_intp *stridesA,
+                            char *dataB, npy_intp *stridesB,
+                            char *dataC, npy_intp *stridesC,
+                            char *dataD, npy_intp *stridesD,
+                            int *out_ndim, npy_intp *out_shape,
+                            char **out_dataA, npy_intp *out_stridesA,
+                            char **out_dataB, npy_intp *out_stridesB,
+                            char **out_dataC, npy_intp *out_stridesC,
+                            char **out_dataD, npy_intp *out_stridesD);
+
+/*
+ * Start raw iteration: zeroes the first 'ndim' entries of 'coord' and
+ * opens a 'do {' block, which has to be closed by one of the
+ * NPY_RAW_ITER_*_NEXT macros. The 'idim' and 'shape' arguments are
+ * accepted but not used by this macro.
+ *
+ * 'coord' is parenthesized inside sizeof so that an expression
+ * argument (for example 'buffer + 1') still measures one element.
+ */
+#define NPY_RAW_ITER_START(idim, ndim, coord, shape) \
+        memset((coord), 0, (ndim) * sizeof((coord)[0])); \
+        do {
+
+/*
+ * Increment to the next n-dimensional coordinate for one raw array.
+ *
+ * Closes the 'do {' opened by NPY_RAW_ITER_START. Dimension 0 is never
+ * advanced here; the code placed between the two macros is expected to
+ * process it. Scanning from dimension 1 upwards, a coordinate that
+ * reaches its shape entry is reset to zero, 'data' is moved back to the
+ * start of that dimension, and the carry continues into the next
+ * dimension. Otherwise 'data' is advanced by that dimension's stride
+ * and the scan stops. The enclosing loop ends when the carry runs past
+ * the last dimension, i.e. when 'idim' reaches 'ndim'.
+ *
+ * 'idim' and 'data' are assigned to, so both must be modifiable lvalues.
+ */
+#define NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides) \
+            for ((idim) = 1; (idim) < (ndim); ++(idim)) { \
+                if (++(coord)[idim] == (shape)[idim]) { \
+                    (coord)[idim] = 0; \
+                    (data) -= ((shape)[idim] - 1) * (strides)[idim]; \
+                } \
+                else { \
+                    (data) += (strides)[idim]; \
+                    break; \
+                } \
+            } \
+        } while ((idim) < (ndim))
+
+/*
+ * Increment to the next n-dimensional coordinate for two raw arrays.
+ *
+ * Closes the 'do {' opened by NPY_RAW_ITER_START. Dimension 0 is not
+ * advanced here. From dimension 1 upwards, a coordinate that reaches
+ * its shape entry is reset to zero and both data pointers are moved
+ * back to the start of that dimension; otherwise both pointers are
+ * advanced by their own stride for that dimension and the scan stops.
+ * The enclosing loop ends when 'idim' reaches 'ndim'.
+ *
+ * 'idim', 'dataA' and 'dataB' are assigned to, so they must be
+ * modifiable lvalues.
+ */
+#define NPY_RAW_ITER_TWO_NEXT(idim, ndim, coord, shape, \
+                              dataA, stridesA, dataB, stridesB) \
+            for ((idim) = 1; (idim) < (ndim); ++(idim)) { \
+                if (++(coord)[idim] == (shape)[idim]) { \
+                    (coord)[idim] = 0; \
+                    (dataA) -= ((shape)[idim] - 1) * (stridesA)[idim]; \
+                    (dataB) -= ((shape)[idim] - 1) * (stridesB)[idim]; \
+                } \
+                else { \
+                    (dataA) += (stridesA)[idim]; \
+                    (dataB) += (stridesB)[idim]; \
+                    break; \
+                } \
+            } \
+        } while ((idim) < (ndim))
+
+/*
+ * Increment to the next n-dimensional coordinate for three raw arrays.
+ *
+ * Closes the 'do {' opened by NPY_RAW_ITER_START. Dimension 0 is not
+ * advanced here. From dimension 1 upwards, a coordinate that reaches
+ * its shape entry is reset to zero and all three data pointers are
+ * moved back to the start of that dimension; otherwise each pointer is
+ * advanced by its own stride for that dimension and the scan stops.
+ * The enclosing loop ends when 'idim' reaches 'ndim'.
+ *
+ * 'idim', 'dataA', 'dataB' and 'dataC' are assigned to, so they must
+ * be modifiable lvalues.
+ */
+#define NPY_RAW_ITER_THREE_NEXT(idim, ndim, coord, shape, \
+                              dataA, stridesA, \
+                              dataB, stridesB, \
+                              dataC, stridesC) \
+            for ((idim) = 1; (idim) < (ndim); ++(idim)) { \
+                if (++(coord)[idim] == (shape)[idim]) { \
+                    (coord)[idim] = 0; \
+                    (dataA) -= ((shape)[idim] - 1) * (stridesA)[idim]; \
+                    (dataB) -= ((shape)[idim] - 1) * (stridesB)[idim]; \
+                    (dataC) -= ((shape)[idim] - 1) * (stridesC)[idim]; \
+                } \
+                else { \
+                    (dataA) += (stridesA)[idim]; \
+                    (dataB) += (stridesB)[idim]; \
+                    (dataC) += (stridesC)[idim]; \
+                    break; \
+                } \
+            } \
+        } while ((idim) < (ndim))
+
+/*
+ * Increment to the next n-dimensional coordinate for four raw arrays.
+ *
+ * Closes the 'do {' opened by NPY_RAW_ITER_START. Dimension 0 is not
+ * advanced here. From dimension 1 upwards, a coordinate that reaches
+ * its shape entry is reset to zero and all four data pointers are
+ * moved back to the start of that dimension; otherwise each pointer is
+ * advanced by its own stride for that dimension and the scan stops.
+ * The enclosing loop ends when 'idim' reaches 'ndim'.
+ *
+ * 'idim', 'dataA', 'dataB', 'dataC' and 'dataD' are assigned to, so
+ * they must be modifiable lvalues.
+ */
+#define NPY_RAW_ITER_FOUR_NEXT(idim, ndim, coord, shape, \
+                              dataA, stridesA, \
+                              dataB, stridesB, \
+                              dataC, stridesC, \
+                              dataD, stridesD) \
+            for ((idim) = 1; (idim) < (ndim); ++(idim)) { \
+                if (++(coord)[idim] == (shape)[idim]) { \
+                    (coord)[idim] = 0; \
+                    (dataA) -= ((shape)[idim] - 1) * (stridesA)[idim]; \
+                    (dataB) -= ((shape)[idim] - 1) * (stridesB)[idim]; \
+                    (dataC) -= ((shape)[idim] - 1) * (stridesC)[idim]; \
+                    (dataD) -= ((shape)[idim] - 1) * (stridesD)[idim]; \
+                } \
+                else { \
+                    (dataA) += (stridesA)[idim]; \
+                    (dataB) += (stridesB)[idim]; \
+                    (dataC) += (stridesC)[idim]; \
+                    (dataD) += (stridesD)[idim]; \
+                    break; \
+                } \
+            } \
+        } while ((idim) < (ndim))
+
 
 /*
  *            TRIVIAL ITERATION
diff --git a/numpy/core/src/umath/funcs.inc.src b/numpy/core/src/umath/funcs.inc.src
index 5dc58e990..a76adaa39 100644
--- a/numpy/core/src/umath/funcs.inc.src
+++ b/numpy/core/src/umath/funcs.inc.src
@@ -78,6 +78,83 @@ npy_Object@Kind@(PyObject *i1, PyObject *i2)
 }
 /**end repeat**/
 
+/*
+ * Emulates Python's 'a or b' behavior.
+ *
+ * A NULL operand is treated as missing rather than as an error: when
+ * one operand is NULL the other one is returned, and when both are
+ * NULL the result is NULL. Otherwise returns i1 if it is truthy and
+ * i2 if it is not. A non-NULL result is a new reference. NULL is also
+ * returned when PyObject_IsTrue reports an error (-1).
+ */
+static PyObject *
+npy_ObjectLogicalOr(PyObject *i1, PyObject *i2)
+{
+    if (i1 == NULL) {
+        /* i2 may be NULL as well, hence the X variant */
+        Py_XINCREF(i2);
+        return i2;
+    }
+    else if (i2 == NULL) {
+        /* Only i1 exists here: the new reference belongs on i1, not NULL */
+        Py_INCREF(i1);
+        return i1;
+    }
+    else {
+        int retcode = PyObject_IsTrue(i1);
+        if (retcode == -1) {
+            return NULL;
+        }
+        else if (retcode) {
+            Py_INCREF(i1);
+            return i1;
+        }
+        else {
+            Py_INCREF(i2);
+            return i2;
+        }
+    }
+}
+
+/*
+ * Mirrors the Python expression 'a and b' for object operands.
+ *
+ * Yields i1 when it is falsy and i2 otherwise, as a new reference.
+ * Gives back NULL if either operand is NULL or if the truth test
+ * on i1 reports an error.
+ */
+static PyObject *
+npy_ObjectLogicalAnd(PyObject *i1, PyObject *i2)
+{
+    PyObject *chosen;
+    int truth;
+
+    if (i1 == NULL || i2 == NULL) {
+        return NULL;
+    }
+
+    truth = PyObject_IsTrue(i1);
+    if (truth == -1) {
+        return NULL;
+    }
+
+    /* A falsy left operand short-circuits; otherwise the right one wins */
+    chosen = truth ? i2 : i1;
+    Py_INCREF(chosen);
+    return chosen;
+}
+
+
+/*
+ * Mirrors the Python expression 'not b' for an object operand.
+ *
+ * Gives back a new reference to Py_True or Py_False, or NULL when
+ * the operand is NULL or PyObject_Not reports an error.
+ */
+static PyObject *
+npy_ObjectLogicalNot(PyObject *i1)
+{
+    PyObject *answer;
+    int negated;
+
+    if (i1 == NULL) {
+        return NULL;
+    }
+
+    negated = PyObject_Not(i1);
+    if (negated == -1) {
+        return NULL;
+    }
+
+    answer = negated ? Py_True : Py_False;
+    Py_INCREF(answer);
+    return answer;
+}
 
 /*
  *****************************************************************************
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 0d1841b47..f4b0fc0df 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -611,7 +611,7 @@ BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
 /**end repeat**/
 
 NPY_NO_EXPORT void
-BOOL_ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+BOOL__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
 {
     OUTPUT_LOOP {
         *((Bool *)op1) = 1;
@@ -642,7 +642,7 @@ BOOL_ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNU
 #define @S@@TYPE@_fmin @S@@TYPE@_minimum
 
 NPY_NO_EXPORT void
-@S@@TYPE@_ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+@S@@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
 {
     OUTPUT_LOOP {
         *((@s@@type@ *)op1) = 1;
@@ -985,7 +985,7 @@ TIMEDELTA_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNU
  */
 
 NPY_NO_EXPORT void
-@TYPE@_ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
 {
     OUTPUT_LOOP {
         *((@type@ *)op1) = 1;
@@ -1458,7 +1458,7 @@ NPY_NO_EXPORT void
 }
 
 NPY_NO_EXPORT void
-@TYPE@_ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
 {
     OUTPUT_LOOP {
         *((@type@ *)op1) = 1;
@@ -1762,7 +1762,7 @@ HALF_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UN
 }
 
 NPY_NO_EXPORT void
-HALF_ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+HALF__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
 {
     OUTPUT_LOOP {
         *((npy_half *)op1) = NPY_HALF_ONE;
@@ -2093,7 +2093,7 @@ C@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
 }
 
 NPY_NO_EXPORT void
-C@TYPE@_ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+C@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
 {
     OUTPUT_LOOP {
         ((@type@ *)op1)[0] = 1;
diff --git a/numpy/core/src/umath/loops.h b/numpy/core/src/umath/loops.h
index 2a792bf5b..9b16ed3fc 100644
--- a/numpy/core/src/umath/loops.h
+++ b/numpy/core/src/umath/loops.h
@@ -161,7 +161,7 @@ BOOL_logical_not(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(fu
 
 
 NPY_NO_EXPORT void
-BOOL_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+BOOL__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 /*
  *****************************************************************************
@@ -178,7 +178,7 @@ BOOL_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data
 #define BYTE_fmin BYTE_minimum
 
 NPY_NO_EXPORT void
-BYTE_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+BYTE__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 BYTE_square(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
@@ -301,7 +301,7 @@ BYTE_fmod(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func));
 #define UBYTE_fmin UBYTE_minimum
 
 NPY_NO_EXPORT void
-UBYTE_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+UBYTE__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 UBYTE_square(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
@@ -451,7 +451,7 @@ UBYTE_remainder(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(fun
 #define SHORT_fmin SHORT_minimum
 
 NPY_NO_EXPORT void
-SHORT_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+SHORT__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 SHORT_square(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
@@ -574,7 +574,7 @@ SHORT_fmod(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func));
 #define USHORT_fmin USHORT_minimum
 
 NPY_NO_EXPORT void
-USHORT_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+USHORT__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 USHORT_square(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
@@ -724,7 +724,7 @@ USHORT_remainder(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(fu
 #define INT_fmin INT_minimum
 
 NPY_NO_EXPORT void
-INT_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+INT__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 INT_square(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
@@ -847,7 +847,7 @@ INT_fmod(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func));
 #define UINT_fmin UINT_minimum
 
 NPY_NO_EXPORT void
-UINT_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+UINT__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 UINT_square(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
@@ -997,7 +997,7 @@ UINT_remainder(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func
 #define LONG_fmin LONG_minimum
 
 NPY_NO_EXPORT void
-LONG_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+LONG__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 LONG_square(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
@@ -1120,7 +1120,7 @@ LONG_fmod(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func));
 #define ULONG_fmin ULONG_minimum
 
 NPY_NO_EXPORT void
-ULONG_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+ULONG__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 ULONG_square(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
@@ -1270,7 +1270,7 @@ ULONG_remainder(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(fun
 #define LONGLONG_fmin LONGLONG_minimum
 
 NPY_NO_EXPORT void
-LONGLONG_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+LONGLONG__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 LONGLONG_square(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
@@ -1393,7 +1393,7 @@ LONGLONG_fmod(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func)
 #define ULONGLONG_fmin ULONGLONG_minimum
 
 NPY_NO_EXPORT void
-ULONGLONG_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+ULONGLONG__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 ULONGLONG_square(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
@@ -1662,7 +1662,7 @@ HALF_reciprocal(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(dat
 
 
 NPY_NO_EXPORT void
-HALF_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+HALF__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 HALF_conjugate(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func));
@@ -1816,7 +1816,7 @@ FLOAT_reciprocal(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(da
 
 
 NPY_NO_EXPORT void
-FLOAT_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+FLOAT__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 FLOAT_conjugate(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func));
@@ -1970,7 +1970,7 @@ DOUBLE_reciprocal(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(d
 
 
 NPY_NO_EXPORT void
-DOUBLE_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+DOUBLE__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 DOUBLE_conjugate(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func));
@@ -2124,7 +2124,7 @@ LONGDOUBLE_reciprocal(char **args, intp *dimensions, intp *steps, void *NPY_UNUS
 
 
 NPY_NO_EXPORT void
-LONGDOUBLE_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+LONGDOUBLE__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 LONGDOUBLE_conjugate(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func));
@@ -2254,7 +2254,7 @@ NPY_NO_EXPORT void
 CFLOAT_reciprocal(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
-CFLOAT_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+CFLOAT__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 CFLOAT_conjugate(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func));
@@ -2370,7 +2370,7 @@ NPY_NO_EXPORT void
 CDOUBLE_reciprocal(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
-CDOUBLE_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+CDOUBLE__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 CDOUBLE_conjugate(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func));
@@ -2486,7 +2486,7 @@ NPY_NO_EXPORT void
 CLONGDOUBLE_reciprocal(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
-CLONGDOUBLE_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+CLONGDOUBLE__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 CLONGDOUBLE_conjugate(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func));
@@ -2547,7 +2547,7 @@ TIMEDELTA_sign(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func
 #line 432
 
 NPY_NO_EXPORT void
-DATETIME_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+DATETIME__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 #line 440
 NPY_NO_EXPORT void
@@ -2587,7 +2587,7 @@ DATETIME_minimum(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(fu
 #line 432
 
 NPY_NO_EXPORT void
-TIMEDELTA_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+TIMEDELTA__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 #line 440
 NPY_NO_EXPORT void
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index fdedc1933..0b59b3095 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -65,7 +65,7 @@ BOOL_@kind@(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func));
 /**end repeat**/
 
 NPY_NO_EXPORT void
-BOOL_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+BOOL__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 /*
  *****************************************************************************
@@ -90,7 +90,7 @@ BOOL_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data
 #define @S@@TYPE@_fmin @S@@TYPE@_minimum
 
 NPY_NO_EXPORT void
-@S@@TYPE@_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+@S@@TYPE@__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 @S@@TYPE@_square(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
@@ -255,7 +255,7 @@ NPY_NO_EXPORT void
 
 
 NPY_NO_EXPORT void
-@TYPE@_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+@TYPE@__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 @TYPE@_conjugate(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func));
@@ -369,7 +369,7 @@ NPY_NO_EXPORT void
 C@TYPE@_reciprocal(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
-C@TYPE@_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+C@TYPE@__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 C@TYPE@_conjugate(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func));
@@ -431,7 +431,7 @@ TIMEDELTA_sign(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func
  */
 
 NPY_NO_EXPORT void
-@TYPE@_ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
+@TYPE@__ones_like(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(data));
 
 /**begin repeat1
  * #kind = equal, not_equal, greater, greater_equal, less, less_equal#
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 1aca37bc7..273aea996 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -39,6 +39,7 @@
 #include "numpy/noprefix.h"
 #include "numpy/ufuncobject.h"
 #include "lowlevel_strided_loops.h"
+#include "ufunc_type_resolution.h"
 
 #include "ufunc_object.h"
 
@@ -46,10 +47,10 @@
 #define NPY_UF_DBG_TRACING 0
 
 #if NPY_UF_DBG_TRACING
-#define NPY_UF_DBG_PRINT(s) printf("%s", s)
-#define NPY_UF_DBG_PRINT1(s, p1) printf(s, p1)
-#define NPY_UF_DBG_PRINT2(s, p1, p2) printf(s, p1, p2)
-#define NPY_UF_DBG_PRINT3(s, p1, p2, p3) printf(s, p1, p2, p3)
+#define NPY_UF_DBG_PRINT(s) do {printf("%s", (s));fflush(stdout);} while (0) /* do/while(0): expands to exactly one statement */
+#define NPY_UF_DBG_PRINT1(s, p1) do {printf((s), (p1));fflush(stdout);} while (0) /* s is the printf format string */
+#define NPY_UF_DBG_PRINT2(s, p1, p2) do {printf((s), (p1), (p2));fflush(stdout);} while (0) /* s is the printf format string */
+#define NPY_UF_DBG_PRINT3(s, p1, p2, p3) do {printf((s), (p1), (p2), (p3));fflush(stdout);} while (0) /* s is the printf format string */
 #else
 #define NPY_UF_DBG_PRINT(s)
 #define NPY_UF_DBG_PRINT1(s, p1)
@@ -215,18 +216,6 @@ PyUFunc_clearfperr()
     PyUFunc_getfperr();
 }
 
-
-#define NO_UFUNCLOOP        0
-#define ZERO_EL_REDUCELOOP  0
-#define ONE_UFUNCLOOP       1
-#define ONE_EL_REDUCELOOP   1
-#define NOBUFFER_UFUNCLOOP  2
-#define NOBUFFER_REDUCELOOP 2
-#define BUFFER_UFUNCLOOP    3
-#define BUFFER_REDUCELOOP   3
-#define SIGNATURE_NOBUFFER_UFUNCLOOP 4
-
-
 /*
  * This function analyzes the input arguments
  * and determines an appropriate __array_prepare__ function to call
@@ -556,11 +545,11 @@ _is_same_name(const char* s1, const char* s2)
 
 /*
  * Sets core_num_dim_ix, core_num_dims, core_dim_ixs, core_offsets,
- * and core_signature in PyUFuncObject "self".  Returns 0 unless an
+ * and core_signature in PyUFuncObject "ufunc".  Returns 0 unless an
  * error occured.
  */
 static int
-_parse_signature(PyUFuncObject *self, const char *signature)
+_parse_signature(PyUFuncObject *ufunc, const char *signature)
 {
     size_t len;
     char const **var_names;
@@ -577,9 +566,9 @@ _parse_signature(PyUFuncObject *self, const char *signature)
     }
 
     len = strlen(signature);
-    self->core_signature = PyArray_malloc(sizeof(char) * (len+1));
-    if (self->core_signature) {
-        strcpy(self->core_signature, signature);
+    ufunc->core_signature = PyArray_malloc(sizeof(char) * (len+1));
+    if (ufunc->core_signature) {
+        strcpy(ufunc->core_signature, signature);
     }
     /* Allocate sufficient memory to store pointers to all dimension names */
     var_names = PyArray_malloc(sizeof(char const*) * len);
@@ -588,13 +577,13 @@ _parse_signature(PyUFuncObject *self, const char *signature)
         return -1;
     }
 
-    self->core_enabled = 1;
-    self->core_num_dim_ix = 0;
-    self->core_num_dims = PyArray_malloc(sizeof(int) * self->nargs);
-    self->core_dim_ixs = PyArray_malloc(sizeof(int) * len); /* shrink this later */
-    self->core_offsets = PyArray_malloc(sizeof(int) * self->nargs);
-    if (self->core_num_dims == NULL || self->core_dim_ixs == NULL
-        || self->core_offsets == NULL) {
+    ufunc->core_enabled = 1;
+    ufunc->core_num_dim_ix = 0;
+    ufunc->core_num_dims = PyArray_malloc(sizeof(int) * ufunc->nargs);
+    ufunc->core_dim_ixs = PyArray_malloc(sizeof(int) * len); /* shrink this later */
+    ufunc->core_offsets = PyArray_malloc(sizeof(int) * ufunc->nargs);
+    if (ufunc->core_num_dims == NULL || ufunc->core_dim_ixs == NULL
+        || ufunc->core_offsets == NULL) {
         PyErr_NoMemory();
         goto fail;
     }
@@ -602,7 +591,7 @@ _parse_signature(PyUFuncObject *self, const char *signature)
     i = _next_non_white_space(signature, 0);
     while (signature[i] != '\0') {
         /* loop over input/output arguments */
-        if (cur_arg == self->nin) {
+        if (cur_arg == ufunc->nin) {
             /* expect "->" */
             if (signature[i] != '-' || signature[i+1] != '>') {
                 parse_error = "expect '->'";
@@ -627,17 +616,17 @@ _parse_signature(PyUFuncObject *self, const char *signature)
                 parse_error = "expect dimension name";
                 goto fail;
             }
-            while (j < self->core_num_dim_ix) {
+            while (j < ufunc->core_num_dim_ix) {
                 if (_is_same_name(signature+i, var_names[j])) {
                     break;
                 }
                 j++;
             }
-            if (j >= self->core_num_dim_ix) {
+            if (j >= ufunc->core_num_dim_ix) {
                 var_names[j] = signature+i;
-                self->core_num_dim_ix++;
+                ufunc->core_num_dim_ix++;
             }
-            self->core_dim_ixs[cur_core_dim] = j;
+            ufunc->core_dim_ixs[cur_core_dim] = j;
             cur_core_dim++;
             nd++;
             i = _get_end_of_name(signature, i);
@@ -655,13 +644,13 @@ _parse_signature(PyUFuncObject *self, const char *signature)
                 }
             }
         }
-        self->core_num_dims[cur_arg] = nd;
-        self->core_offsets[cur_arg] = cur_core_dim-nd;
+        ufunc->core_num_dims[cur_arg] = nd;
+        ufunc->core_offsets[cur_arg] = cur_core_dim-nd;
         cur_arg++;
         nd = 0;
 
         i = _next_non_white_space(signature, i + 1);
-        if (cur_arg != self->nin && cur_arg != self->nargs) {
+        if (cur_arg != ufunc->nin && cur_arg != ufunc->nargs) {
             /*
              * The list of input arguments (or output arguments) was
              * only read partially
@@ -673,15 +662,15 @@ _parse_signature(PyUFuncObject *self, const char *signature)
             i = _next_non_white_space(signature, i + 1);
         }
     }
-    if (cur_arg != self->nargs) {
+    if (cur_arg != ufunc->nargs) {
         parse_error = "incomplete signature: not all arguments found";
         goto fail;
     }
-    self->core_dim_ixs = PyArray_realloc(self->core_dim_ixs,
+    ufunc->core_dim_ixs = PyArray_realloc(ufunc->core_dim_ixs,
             sizeof(int)*cur_core_dim);
     /* check for trivial core-signature, e.g. "(),()->()" */
     if (cur_core_dim == 0) {
-        self->core_enabled = 0;
+        ufunc->core_enabled = 0;
     }
     PyArray_free((void*)var_names);
     return 0;
@@ -713,7 +702,7 @@ fail:
  * non-zero references in out_op.  This
  * function does not do its own clean-up.
  */
-static int get_ufunc_arguments(PyUFuncObject *self,
+static int get_ufunc_arguments(PyUFuncObject *ufunc,
                 PyObject *args, PyObject *kwds,
                 PyArrayObject **out_op,
                 NPY_ORDER *out_order,
@@ -721,16 +710,18 @@ static int get_ufunc_arguments(PyUFuncObject *self,
                 PyObject **out_extobj,
                 PyObject **out_typetup,
                 int *out_subok,
-                PyArrayObject **out_wheremask)
+                PyArrayObject **out_wheremask,
+                int *out_use_maskna)
 {
-    npy_intp i, nargs, nin = self->nin;
+    int i, nargs, nin = ufunc->nin, nout = ufunc->nout;
     PyObject *obj, *context;
     PyObject *str_key_obj = NULL;
     char *ufunc_name;
 
     int any_flexible = 0, any_object = 0;
+    int any_non_maskna_out = 0, any_maskna_out = 0;
 
-    ufunc_name = self->name ? self->name : "<unnamed ufunc>";
+    ufunc_name = ufunc->name ? ufunc->name : "<unnamed ufunc>";
 
     *out_extobj = NULL;
     *out_typetup = NULL;
@@ -740,11 +731,14 @@ static int get_ufunc_arguments(PyUFuncObject *self,
 
     /* Check number of arguments */
     nargs = PyTuple_Size(args);
-    if ((nargs < nin) || (nargs > self->nargs)) {
+    if ((nargs < nin) || (nargs > ufunc->nargs)) {
         PyErr_SetString(PyExc_ValueError, "invalid number of arguments");
         return -1;
     }
 
+    /* Need USE_MASKNA mode if any input has an NA mask */
+    *out_use_maskna = 0;
+
     /* Get input arguments */
     for(i = 0; i < nin; ++i) {
         obj = PyTuple_GET_ITEM(args, i);
@@ -753,7 +747,7 @@ static int get_ufunc_arguments(PyUFuncObject *self,
              * TODO: There should be a comment here explaining what
              *       context does.
              */
-            context = Py_BuildValue("OOi", self, args, i);
+            context = Py_BuildValue("OOi", ufunc, args, i);
             if (context == NULL) {
                 return -1;
             }
@@ -762,11 +756,15 @@ static int get_ufunc_arguments(PyUFuncObject *self,
             context = NULL;
         }
         out_op[i] = (PyArrayObject *)PyArray_FromAny(obj,
-                                        NULL, 0, 0, 0, context);
+                                    NULL, 0, 0, NPY_ARRAY_ALLOWNA, context);
         Py_XDECREF(context);
         if (out_op[i] == NULL) {
             return -1;
         }
+        /* If the array has an NA mask, enable USE_MASKNA mode */
+        if (PyArray_HASMASKNA(out_op[i])) {
+            *out_use_maskna = 1;
+        }
         if (!any_flexible &&
                 PyTypeNum_ISFLEXIBLE(PyArray_DESCR(out_op[i])->type_num)) {
             any_flexible = 1;
@@ -804,6 +802,13 @@ static int get_ufunc_arguments(PyUFuncObject *self,
             }
             Py_INCREF(obj);
             out_op[i] = (PyArrayObject *)obj;
+
+            if (PyArray_HASMASKNA((PyArrayObject *)obj)) {
+                any_maskna_out = 1;
+            }
+            else {
+                any_non_maskna_out = 1;
+            }
         }
         else {
             PyErr_SetString(PyExc_TypeError,
@@ -896,6 +901,13 @@ static int get_ufunc_arguments(PyUFuncObject *self,
                             }
                             Py_INCREF(value);
                             out_op[nin] = (PyArrayObject *)value;
+
+                            if (PyArray_HASMASKNA((PyArrayObject *)value)) {
+                                any_maskna_out = 1;
+                            }
+                            else {
+                                any_non_maskna_out = 1;
+                            }
                         }
                         else {
                             PyErr_SetString(PyExc_TypeError,
@@ -965,8 +977,46 @@ static int get_ufunc_arguments(PyUFuncObject *self,
             }
         }
     }
-
     Py_XDECREF(str_key_obj);
+
+    /*
+     * If NA mask support is enabled and there are non-maskNA outputs,
+     * only proceed if all the inputs contain no NA values.
+     */
+    if (*out_use_maskna && any_non_maskna_out) {
+        /* Check all the inputs for NA */
+        for(i = 0; i < nin; ++i) {
+            if (PyArray_HASMASKNA(out_op[i])) {
+                int containsna = PyArray_ContainsNA(out_op[i], NULL, NULL);
+                if (containsna == -1) {
+                    return -1;
+                }
+                else if (containsna) {
+                    PyErr_SetString(PyExc_ValueError,
+                            "Cannot assign NA value to an array which "
+                            "does not support NAs");
+                    return -1;
+                }
+            }
+        }
+
+        /* Disable MASKNA - the inner loop uses NPY_ITER_IGNORE_MASKNA */
+        *out_use_maskna = 0;
+    }
+    /*
+     * If we're not using a masked loop, but an output has an NA mask,
+     * set it to all exposed.
+     */
+    else if (!(*out_use_maskna) && any_maskna_out) {
+        for (i = nin; i < nin+nout; ++i) {
+            if (PyArray_HASMASKNA(out_op[i])) {
+                if (PyArray_AssignMaskNA(out_op[i], 1, NULL, 0, NULL) < 0) {
+                    return -1;
+                }
+            }
+        }
+    }
+
     return 0;
 
 fail:
@@ -991,12 +1041,12 @@ fail:
  * -1 if there is an error.
  */
 static int
-check_for_trivial_loop(PyUFuncObject *self,
+check_for_trivial_loop(PyUFuncObject *ufunc,
                         PyArrayObject **op,
                         PyArray_Descr **dtype,
                         npy_intp buffersize)
 {
-    npy_intp i, nin = self->nin, nop = nin + self->nout;
+    npy_intp i, nin = ufunc->nin, nop = nin + ufunc->nout;
 
     for (i = 0; i < nop; ++i) {
         /*
@@ -1107,7 +1157,7 @@ trivial_three_operand_loop(PyArrayObject **op,
  * exactly the same, which may be more strict than before.
  */
 static int
-prepare_ufunc_output(PyUFuncObject *self,
+prepare_ufunc_output(PyUFuncObject *ufunc,
                     PyArrayObject **op,
                     PyObject *arr_prep,
                     PyObject *arr_prep_args,
@@ -1118,7 +1168,7 @@ prepare_ufunc_output(PyUFuncObject *self,
         PyArrayObject *arr;
 
         res = PyObject_CallFunction(arr_prep, "O(OOi)",
-                    *op, self, arr_prep_args, i);
+                    *op, ufunc, arr_prep_args, i);
         if ((res == NULL) || (res == Py_None) || !PyArray_Check(res)) {
             if (!PyErr_Occurred()){
                 PyErr_SetString(PyExc_TypeError,
@@ -1162,7 +1212,7 @@ prepare_ufunc_output(PyUFuncObject *self,
 }
 
 static int
-iterator_loop(PyUFuncObject *self,
+iterator_loop(PyUFuncObject *ufunc,
                     PyArrayObject **op,
                     PyArray_Descr **dtype,
                     NPY_ORDER order,
@@ -1172,7 +1222,7 @@ iterator_loop(PyUFuncObject *self,
                     PyUFuncGenericFunction innerloop,
                     void *innerloopdata)
 {
-    npy_intp i, nin = self->nin, nout = self->nout;
+    npy_intp i, nin = ufunc->nin, nout = ufunc->nout;
     npy_intp nop = nin + nout;
     npy_uint32 op_flags[NPY_MAXARGS];
     NpyIter *iter;
@@ -1233,7 +1283,7 @@ iterator_loop(PyUFuncObject *self,
 
     /* Call the __array_prepare__ functions where necessary */
     for (i = 0; i < nout; ++i) {
-        if (prepare_ufunc_output(self, &op[nin+i],
+        if (prepare_ufunc_output(ufunc, &op[nin+i],
                             arr_prep[i], arr_prep_args, i) < 0) {
             NpyIter_Deallocate(iter);
             return -1;
@@ -1296,18 +1346,28 @@ iterator_loop(PyUFuncObject *self,
  * innerloopdata   - data to pass to the inner loop
  */
 static int
-execute_ufunc_loop(PyUFuncObject *self,
+execute_legacy_ufunc_loop(PyUFuncObject *ufunc,
                     int trivial_loop_ok,
                     PyArrayObject **op,
-                    PyArray_Descr **dtype,
+                    PyArray_Descr **dtypes,
                     NPY_ORDER order,
                     npy_intp buffersize,
                     PyObject **arr_prep,
-                    PyObject *arr_prep_args,
-                    PyUFuncGenericFunction innerloop,
-                    void *innerloopdata)
+                    PyObject *arr_prep_args)
 {
-    npy_intp nin = self->nin, nout = self->nout;
+    npy_intp nin = ufunc->nin, nout = ufunc->nout;
+    PyUFuncGenericFunction innerloop;
+    void *innerloopdata;
+    int needs_api = 0;
+
+    if (ufunc->legacy_inner_loop_selector(ufunc, dtypes,
+                    &innerloop, &innerloopdata, &needs_api) < 0) {
+        return -1;
+    }
+    /* If the loop wants the arrays, provide them. */
+    if (_does_loop_use_arrays(innerloopdata)) {
+        innerloopdata = (void*)op;
+    }
 
     /* First check for the trivial cases that don't need an iterator */
     if (trivial_loop_ok) {
@@ -1315,9 +1375,9 @@ execute_ufunc_loop(PyUFuncObject *self,
             if (op[1] == NULL &&
                         (order == NPY_ANYORDER || order == NPY_KEEPORDER) &&
                         PyArray_TRIVIALLY_ITERABLE(op[0])) {
-                Py_INCREF(dtype[1]);
+                Py_INCREF(dtypes[1]);
                 op[1] = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type,
-                             dtype[1],
+                             dtypes[1],
                              PyArray_NDIM(op[0]),
                              PyArray_DIMS(op[0]),
                              NULL, NULL,
@@ -1326,7 +1386,7 @@ execute_ufunc_loop(PyUFuncObject *self,
                              NULL);
 
                 /* Call the __prepare_array__ if necessary */
-                if (prepare_ufunc_output(self, &op[1],
+                if (prepare_ufunc_output(ufunc, &op[1],
                                     arr_prep[0], arr_prep_args, 0) < 0) {
                     return -1;
                 }
@@ -1341,7 +1401,7 @@ execute_ufunc_loop(PyUFuncObject *self,
                         PyArray_TRIVIALLY_ITERABLE_PAIR(op[0], op[1])) {
 
                 /* Call the __prepare_array__ if necessary */
-                if (prepare_ufunc_output(self, &op[1],
+                if (prepare_ufunc_output(ufunc, &op[1],
                                     arr_prep[0], arr_prep_args, 0) < 0) {
                     return -1;
                 }
@@ -1367,9 +1427,9 @@ execute_ufunc_loop(PyUFuncObject *self,
                 else {
                     tmp = op[1];
                 }
-                Py_INCREF(dtype[2]);
+                Py_INCREF(dtypes[2]);
                 op[2] = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type,
-                                 dtype[2],
+                                 dtypes[2],
                                  PyArray_NDIM(tmp),
                                  PyArray_DIMS(tmp),
                                  NULL, NULL,
@@ -1378,7 +1438,7 @@ execute_ufunc_loop(PyUFuncObject *self,
                                  NULL);
 
                 /* Call the __prepare_array__ if necessary */
-                if (prepare_ufunc_output(self, &op[2],
+                if (prepare_ufunc_output(ufunc, &op[2],
                                     arr_prep[0], arr_prep_args, 0) < 0) {
                     return -1;
                 }
@@ -1394,7 +1454,7 @@ execute_ufunc_loop(PyUFuncObject *self,
                     PyArray_TRIVIALLY_ITERABLE_TRIPLE(op[0], op[1], op[2])) {
 
                 /* Call the __prepare_array__ if necessary */
-                if (prepare_ufunc_output(self, &op[2],
+                if (prepare_ufunc_output(ufunc, &op[2],
                                     arr_prep[0], arr_prep_args, 0) < 0) {
                     return -1;
                 }
@@ -1413,7 +1473,7 @@ execute_ufunc_loop(PyUFuncObject *self,
      */
 
     NPY_UF_DBG_PRINT("iterator loop\n");
-    if (iterator_loop(self, op, dtype, order,
+    if (iterator_loop(ufunc, op, dtypes, order,
                     buffersize, arr_prep, arr_prep_args,
                     innerloop, innerloopdata) < 0) {
         return -1;
@@ -1423,9 +1483,43 @@ execute_ufunc_loop(PyUFuncObject *self,
 }
 
 /*
+ * This function combines the 'nin' input masks together, copying the
+ * result into each of the 'nout' output masks.
+ */
+static void
+combine_ufunc_maskna(char **masks, npy_intp *strides, npy_intp count,
+                        int nin, int nout)
+{
+    char *masks_copies[NPY_MAXARGS];  /* one moving cursor per operand */
+    npy_intp i;
+    int iop;
+
+    /* Copy the mask pointers so 'masks' itself is never advanced */
+    memcpy(masks_copies, masks, (nin + nout) * sizeof(char *));
+
+    /*
+     * TODO: This code only works for NPY_BOOL masks; it will need to
+     *       be generalized for multi-NA.
+     */
+    for (i = 0; i < count; ++i) {  /* one mask element per pass */
+        char maskvalue = *masks_copies[0];  /* operand 0 is always read */
+        masks_copies[0] += strides[0];
+        for (iop = 1; iop < nin; ++iop) {
+            maskvalue &= *masks_copies[iop];  /* bitwise AND of the inputs */
+            masks_copies[iop] += strides[iop];  /* strides are in bytes */
+        }
+        for (iop = nin; iop < nin + nout; ++iop) {
+            *masks_copies[iop] = maskvalue;  /* same value to every output */
+            masks_copies[iop] += strides[iop];
+        }
+    }
+}
+
+/*
  * nin             - number of inputs
  * nout            - number of outputs
  * wheremask       - if not NULL, the 'where=' parameter to the ufunc.
+ * use_maskna      - if non-zero, flag USE_MASKNA for all the operands
  * op              - the operands (nin + nout of them)
  * order           - the loop execution order/output memory order
  * buffersize      - how big of a buffer to use
@@ -1434,28 +1528,27 @@ execute_ufunc_loop(PyUFuncObject *self,
  * innerloopdata   - data to pass to the inner loop
  */
 static int
-execute_ufunc_masked_loop(PyUFuncObject *self,
+execute_ufunc_masked_loop(PyUFuncObject *ufunc,
                     PyArrayObject *wheremask,
+                    int use_maskna,
                     PyArrayObject **op,
-                    PyArray_Descr **dtype,
+                    PyArray_Descr **dtypes,
                     NPY_ORDER order,
                     npy_intp buffersize,
                     PyObject **arr_prep,
-                    PyObject *arr_prep_args,
-                    PyUFuncGenericMaskedFunction innerloop,
-                    NpyAuxData *innerloopdata)
+                    PyObject *arr_prep_args)
 {
-    npy_intp i, nin = self->nin, nout = self->nout;
-    npy_intp nop = nin + nout;
+    int i, nin = ufunc->nin, nout = ufunc->nout;
+    int nop = nin + nout;
     npy_uint32 op_flags[NPY_MAXARGS];
     NpyIter *iter;
-    char *baseptrs[NPY_MAXARGS];
     int needs_api;
+    npy_intp default_op_in_flags = 0, default_op_out_flags = 0;
 
     NpyIter_IterNextFunc *iternext;
     char **dataptr;
-    npy_intp *stride;
-    npy_intp *count_ptr;
+    npy_intp *strides;
+    npy_intp *countptr;
 
     PyArrayObject **op_it;
 
@@ -1468,23 +1561,41 @@ execute_ufunc_masked_loop(PyUFuncObject *self,
             return -1;
         }
         op[nop] = wheremask;
-        dtype[nop] = NULL;
+        dtypes[nop] = NULL;
+        default_op_out_flags |= NPY_ITER_WRITEMASKED;
+    }
+
+    if (use_maskna) {
+        default_op_in_flags |= NPY_ITER_USE_MASKNA;
+        default_op_out_flags |= NPY_ITER_USE_MASKNA;
+    }
+    /*
+     * Some operands may still have NA masks, but they will
+     * have been checked to ensure they have no NAs using
+     * PyArray_ContainsNA. Thus we flag to ignore MASKNA here.
+     */
+    else {
+        default_op_in_flags |= NPY_ITER_IGNORE_MASKNA;
+        default_op_out_flags |= NPY_ITER_IGNORE_MASKNA;
     }
 
     /* Set up the flags */
     for (i = 0; i < nin; ++i) {
-        op_flags[i] = NPY_ITER_READONLY|
+        op_flags[i] = default_op_in_flags |
+                      NPY_ITER_READONLY |
                       NPY_ITER_ALIGNED;
     }
     for (i = nin; i < nop; ++i) {
-        op_flags[i] = NPY_ITER_WRITEONLY|
-                      NPY_ITER_ALIGNED|
-                      NPY_ITER_ALLOCATE|
-                      NPY_ITER_NO_BROADCAST|
-                      NPY_ITER_NO_SUBTYPE|
-                      NPY_ITER_WRITEMASKED;
+        op_flags[i] = default_op_out_flags |
+                      NPY_ITER_WRITEONLY |
+                      NPY_ITER_ALIGNED |
+                      NPY_ITER_ALLOCATE |
+                      NPY_ITER_NO_BROADCAST |
+                      NPY_ITER_NO_SUBTYPE;
+    }
+    if (wheremask != NULL) {
+        op_flags[nop] = NPY_ITER_READONLY | NPY_ITER_ARRAYMASK;
     }
-    op_flags[nop] = NPY_ITER_READONLY|NPY_ITER_ARRAYMASK;
 
     NPY_UF_DBG_PRINT("Making iterator\n");
 
@@ -1494,14 +1605,13 @@ execute_ufunc_masked_loop(PyUFuncObject *self,
      * is faster to calculate.
      */
     iter = NpyIter_AdvancedNew(nop + ((wheremask != NULL) ? 1 : 0), op,
-                        NPY_ITER_EXTERNAL_LOOP|
-                        NPY_ITER_REFS_OK|
-                        NPY_ITER_ZEROSIZE_OK|
-                        NPY_ITER_BUFFERED|
-                        NPY_ITER_GROWINNER|
-                        NPY_ITER_DELAY_BUFALLOC,
+                        NPY_ITER_EXTERNAL_LOOP |
+                        NPY_ITER_REFS_OK |
+                        NPY_ITER_ZEROSIZE_OK |
+                        NPY_ITER_BUFFERED |
+                        NPY_ITER_GROWINNER,
                         order, NPY_UNSAFE_CASTING,
-                        op_flags, dtype,
+                        op_flags, dtypes,
                         0, NULL, NULL, buffersize);
     if (iter == NULL) {
         return -1;
@@ -1522,7 +1632,7 @@ execute_ufunc_masked_loop(PyUFuncObject *self,
 
     /* Call the __array_prepare__ functions where necessary */
     for (i = 0; i < nout; ++i) {
-        if (prepare_ufunc_output(self, &op[nin+i],
+        if (prepare_ufunc_output(ufunc, &op[nin+i],
                             arr_prep[i], arr_prep_args, i) < 0) {
             NpyIter_Deallocate(iter);
             return -1;
@@ -1531,23 +1641,38 @@ execute_ufunc_masked_loop(PyUFuncObject *self,
 
     /* Only do the loop if the iteration size is non-zero */
     if (NpyIter_GetIterSize(iter) != 0) {
+        PyUFunc_MaskedStridedInnerLoopFunc *innerloop;
+        NpyAuxData *innerloopdata;
+        npy_intp fixed_strides[2*NPY_MAXARGS];
+        PyArray_Descr **iter_dtypes;
 
-        /* Reset the iterator with the base pointers from the wrapped outputs */
-        for (i = 0; i < nin; ++i) {
-            baseptrs[i] = PyArray_BYTES(op_it[i]);
-        }
+        /* Validate that the prepare_ufunc_output didn't mess with pointers */
         for (i = nin; i < nop; ++i) {
-            baseptrs[i] = PyArray_BYTES(op[i]);
-        }
-        if (wheremask != NULL) {
-            baseptrs[nop] = PyArray_BYTES(op[nop]);
+            if (PyArray_BYTES(op[i]) != PyArray_BYTES(op_it[i])) {
+                PyErr_SetString(PyExc_ValueError,
+                        "The __array_prepare__ functions modified the data "
+                        "pointer addresses in an invalid fashion");
+                NpyIter_Deallocate(iter);
+                return -1;
+            }
         }
-        NPY_UF_DBG_PRINT("reset base pointers call:\n");
-        if (NpyIter_ResetBasePointers(iter, baseptrs, NULL) != NPY_SUCCEED) {
+
+        /*
+         * Get the inner loop, with the possibility of specialization
+         * based on the fixed strides.
+         */
+        NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+        iter_dtypes = NpyIter_GetDescrArray(iter);
+        if (ufunc->masked_inner_loop_selector(ufunc, dtypes,
+                        wheremask != NULL ? iter_dtypes[nop]
+                                          : iter_dtypes[nop + nin],
+                        fixed_strides,
+                        wheremask != NULL ? fixed_strides[nop]
+                                          : fixed_strides[nop + nin],
+                        &innerloop, &innerloopdata, &needs_api) < 0) {
             NpyIter_Deallocate(iter);
             return -1;
         }
-        NPY_UF_DBG_PRINT("finished reset base pointers call\n");
 
         /* Get the variables needed for the loop */
         iternext = NpyIter_GetIterNext(iter, NULL);
@@ -1556,8 +1681,8 @@ execute_ufunc_masked_loop(PyUFuncObject *self,
             return -1;
         }
         dataptr = NpyIter_GetDataPtrArray(iter);
-        stride = NpyIter_GetInnerStrideArray(iter);
-        count_ptr = NpyIter_GetInnerLoopSizePtr(iter);
+        strides = NpyIter_GetInnerStrideArray(iter);
+        countptr = NpyIter_GetInnerLoopSizePtr(iter);
 
         if (!needs_api) {
             NPY_BEGIN_THREADS;
@@ -1565,14 +1690,32 @@ execute_ufunc_masked_loop(PyUFuncObject *self,
 
         NPY_UF_DBG_PRINT("Actual inner loop:\n");
         /* Execute the loop */
-        do {
-            NPY_UF_DBG_PRINT1("iterator loop count %d\n", (int)*count_ptr);
-            innerloop(dataptr, count_ptr, stride, innerloopdata);
-        } while (iternext(iter));
+        if (wheremask != NULL) {
+            do {
+                NPY_UF_DBG_PRINT1("iterator loop count %d\n", (int)*countptr);
+                innerloop(dataptr, strides,
+                            dataptr[nop], strides[nop],
+                            *countptr, innerloopdata);
+            } while (iternext(iter));
+        }
+        else {
+            do {
+                NPY_UF_DBG_PRINT1("iterator loop count %d\n", (int)*countptr);
+                /* Combine the input NA masks for the output */
+                combine_ufunc_maskna(&dataptr[nop], &strides[nop], *countptr,
+                                        nin, nout);
+                /* Evaluate the ufunc wherever the NA mask says */
+                innerloop(dataptr, strides,
+                            dataptr[nop + nin], strides[nop + nin],
+                            *countptr, innerloopdata);
+            } while (iternext(iter));
+        }
 
         if (!needs_api) {
             NPY_END_THREADS;
         }
+
+        NPY_AUXDATA_FREE(innerloopdata);
     }
 
     NpyIter_Deallocate(iter);
@@ -1618,7 +1761,7 @@ make_arr_prep_args(npy_intp nin, PyObject *args, PyObject *kwds)
 }
 
 static int
-PyUFunc_GeneralizedFunction(PyUFuncObject *self,
+PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
                         PyObject *args, PyObject *kwds,
                         PyArrayObject **op)
 {
@@ -1626,8 +1769,9 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self,
     int i, idim, nop;
     char *ufunc_name;
     int retval = -1, subok = 1;
+    int needs_api = 0;
 
-    PyArray_Descr *dtype[NPY_MAXARGS];
+    PyArray_Descr *dtypes[NPY_MAXARGS];
 
     /* Use remapped axes for generalized ufunc */
     int broadcast_ndim, op_ndim;
@@ -1653,6 +1797,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self,
 
     npy_intp *inner_strides_tmp, *ax_strides_tmp[NPY_MAXDIMS];
     int core_dim_ixs_size, *core_dim_ixs;
+    int use_maskna = 0;
 
     /* The __array_prepare__ function to call for each output */
     PyObject *arr_prep[NPY_MAXARGS];
@@ -1663,52 +1808,56 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self,
     PyObject *arr_prep_args = NULL;
 
     NPY_ORDER order = NPY_KEEPORDER;
-    /*
-     * Currently trying out SAME_KIND casting rule by default.
-     */
-    NPY_CASTING casting = NPY_SAME_KIND_CASTING;
+    /* Use the default assignment casting rule */
+    NPY_CASTING casting = NPY_DEFAULT_ASSIGN_CASTING;
     /* When provided, extobj and typetup contain borrowed references */
     PyObject *extobj = NULL, *type_tup = NULL;
 
-    if (self == NULL) {
+    if (ufunc == NULL) {
         PyErr_SetString(PyExc_ValueError, "function not supported");
         return -1;
     }
 
-    nin = self->nin;
-    nout = self->nout;
+    nin = ufunc->nin;
+    nout = ufunc->nout;
     nop = nin + nout;
 
-    ufunc_name = self->name ? self->name : "<unnamed ufunc>";
+    ufunc_name = ufunc->name ? ufunc->name : "<unnamed ufunc>";
 
     NPY_UF_DBG_PRINT1("\nEvaluating ufunc %s\n", ufunc_name);
 
     /* Initialize all the operands and dtypes to NULL */
     for (i = 0; i < nop; ++i) {
         op[i] = NULL;
-        dtype[i] = NULL;
+        dtypes[i] = NULL;
         arr_prep[i] = NULL;
     }
 
     NPY_UF_DBG_PRINT("Getting arguments\n");
 
     /* Get all the arguments */
-    retval = get_ufunc_arguments(self, args, kwds,
+    retval = get_ufunc_arguments(ufunc, args, kwds,
                 op, &order, &casting, &extobj,
-                &type_tup, &subok, NULL);
+                &type_tup, &subok, NULL, &use_maskna);
     if (retval < 0) {
         goto fail;
     }
 
+    if (use_maskna) {
+        PyErr_SetString(PyExc_ValueError,
+                "Generalized ufuncs do not support ndarrays with NA masks");
+        goto fail;
+    }
+
     /* Figure out the number of dimensions needed by the iterator */
     broadcast_ndim = 0;
     for (i = 0; i < nin; ++i) {
-        int n = PyArray_NDIM(op[i]) - self->core_num_dims[i];
+        int n = PyArray_NDIM(op[i]) - ufunc->core_num_dims[i];
         if (n > broadcast_ndim) {
             broadcast_ndim = n;
         }
     }
-    op_ndim = broadcast_ndim + self->core_num_dim_ix;
+    op_ndim = broadcast_ndim + ufunc->core_num_dim_ix;
     if (op_ndim > NPY_MAXDIMS) {
         PyErr_Format(PyExc_ValueError,
                     "too many dimensions for generalized ufunc %s",
@@ -1719,7 +1868,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self,
 
     /* Fill in op_axes for all the operands */
     core_dim_ixs_size = 0;
-    core_dim_ixs = self->core_dim_ixs;
+    core_dim_ixs = ufunc->core_dim_ixs;
     for (i = 0; i < nop; ++i) {
         int n;
         if (op[i]) {
@@ -1727,7 +1876,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self,
              * Note that n may be negative if broadcasting
              * extends into the core dimensions.
              */
-            n = PyArray_NDIM(op[i]) - self->core_num_dims[i];
+            n = PyArray_NDIM(op[i]) - ufunc->core_num_dims[i];
         }
         else {
             n = broadcast_ndim;
@@ -1745,7 +1894,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self,
         for (idim = broadcast_ndim; idim < op_ndim; ++idim) {
             op_axes_arrays[i][idim] = -1;
         }
-        for (idim = 0; idim < self->core_num_dims[i]; ++idim) {
+        for (idim = 0; idim < ufunc->core_num_dims[i]; ++idim) {
             if (n + idim >= 0) {
                 op_axes_arrays[i][broadcast_ndim + core_dim_ixs[idim]] =
                                                                     n + idim;
@@ -1754,8 +1903,8 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self,
                 op_axes_arrays[i][broadcast_ndim + core_dim_ixs[idim]] = -1;
             }
         }
-        core_dim_ixs_size += self->core_num_dims[i];
-        core_dim_ixs += self->core_num_dims[i];
+        core_dim_ixs_size += ufunc->core_num_dims[i];
+        core_dim_ixs += ufunc->core_num_dims[i];
         op_axes[i] = op_axes_arrays[i];
     }
 
@@ -1778,39 +1927,47 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self,
     NPY_UF_DBG_PRINT("Finding inner loop\n");
 
 
-    retval = self->type_resolution_function(self, casting,
-                        op, type_tup, dtype, &innerloop, &innerloopdata);
+    retval = ufunc->type_resolver(ufunc, casting,
+                            op, type_tup, dtypes);
+    if (retval < 0) {
+        goto fail;
+    }
+    /* For the generalized ufunc, we get the loop right away too */
+    retval = ufunc->legacy_inner_loop_selector(ufunc, dtypes,
+                                    &innerloop, &innerloopdata, &needs_api);
     if (retval < 0) {
         goto fail;
     }
 
     /*
      * FAIL with NotImplemented if the other object has
-     * the __r<op>__ method and has __array_priority__ as
-     * an attribute (signalling it can handle ndarray's)
-     * and is not already an ndarray or a subtype of the same type.
+     * the __r<op>__ method and has a higher priority than
+     * the current op (signalling it can handle ndarray's).
     */
-    if (nin == 2 && nout == 1 && dtype[1]->type_num == NPY_OBJECT) {
+    if (nin == 2 && nout == 1 && dtypes[1]->type_num == NPY_OBJECT) {
         PyObject *_obj = PyTuple_GET_ITEM(args, 1);
-        if (!PyArray_CheckExact(_obj)
-               /* If both are same subtype of object arrays, then proceed */
-                && !(Py_TYPE(_obj) == Py_TYPE(PyTuple_GET_ITEM(args, 0)))
-                && PyObject_HasAttrString(_obj, "__array_priority__")
-                && _has_reflected_op(_obj, ufunc_name)) {
-            retval = -2;
-            goto fail;
+        if (!PyArray_CheckExact(_obj)) {
+            double self_prio, other_prio;
+            self_prio = PyArray_GetPriority(PyTuple_GET_ITEM(args, 0),
+                                                        NPY_SCALAR_PRIORITY);
+            other_prio = PyArray_GetPriority(_obj, NPY_SCALAR_PRIORITY);
+            if (self_prio < other_prio &&
+                            _has_reflected_op(_obj, ufunc_name)) {
+                retval = -2;
+                goto fail;
+            }
         }
     }
 
 #if NPY_UF_DBG_TRACING
     printf("input types:\n");
     for (i = 0; i < nin; ++i) {
-        PyObject_Print((PyObject *)dtype[i], stdout, 0);
+        PyObject_Print((PyObject *)dtypes[i], stdout, 0);
         printf(" ");
     }
     printf("\noutput types:\n");
     for (i = nin; i < nop; ++i) {
-        PyObject_Print((PyObject *)dtype[i], stdout, 0);
+        PyObject_Print((PyObject *)dtypes[i], stdout, 0);
         printf(" ");
     }
     printf("\n");
@@ -1859,7 +2016,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self,
                                       NPY_ITER_REFS_OK|
                                       NPY_ITER_REDUCE_OK,
                            order, NPY_UNSAFE_CASTING, op_flags,
-                           dtype, op_ndim, op_axes, NULL, 0);
+                           dtypes, op_ndim, op_axes, NULL, 0);
     if (iter == NULL) {
         retval = -1;
         goto fail;
@@ -1880,9 +2037,9 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self,
     inner_strides = (npy_intp *)PyArray_malloc(
                         NPY_SIZEOF_INTP * (nop+core_dim_ixs_size));
     /* The strides after the first nop match core_dim_ixs */
-    core_dim_ixs = self->core_dim_ixs;
+    core_dim_ixs = ufunc->core_dim_ixs;
     inner_strides_tmp = inner_strides + nop;
-    for (idim = 0; idim < self->core_num_dim_ix; ++idim) {
+    for (idim = 0; idim < ufunc->core_num_dim_ix; ++idim) {
         ax_strides_tmp[idim] = NpyIter_GetAxisStrideArray(iter,
                                                 broadcast_ndim+idim);
         if (ax_strides_tmp[idim] == NULL) {
@@ -1891,12 +2048,12 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self,
         }
     }
     for (i = 0; i < nop; ++i) {
-        for (idim = 0; idim < self->core_num_dims[i]; ++idim) {
+        for (idim = 0; idim < ufunc->core_num_dims[i]; ++idim) {
             inner_strides_tmp[idim] = ax_strides_tmp[core_dim_ixs[idim]][i];
         }
 
-        core_dim_ixs += self->core_num_dims[i];
-        inner_strides_tmp += self->core_num_dims[i];
+        core_dim_ixs += ufunc->core_num_dims[i];
+        inner_strides_tmp += ufunc->core_num_dims[i];
     }
 
     /* Set up the inner dimensions array */
@@ -1906,10 +2063,10 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self,
     }
     /* Move the core dimensions to start at the second element */
     memmove(&inner_dimensions[1], &inner_dimensions[broadcast_ndim],
-                        NPY_SIZEOF_INTP * self->core_num_dim_ix);
+                        NPY_SIZEOF_INTP * ufunc->core_num_dim_ix);
 
     /* Remove all the core dimensions from the iterator */
-    for (i = 0; i < self->core_num_dim_ix; ++i) {
+    for (i = 0; i < ufunc->core_num_dim_ix; ++i) {
         if (NpyIter_RemoveAxis(iter, broadcast_ndim) != NPY_SUCCEED) {
             retval = -1;
             goto fail;
@@ -1977,7 +2134,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *self,
     NpyIter_Deallocate(iter);
     /* The caller takes ownership of all the references in op */
     for (i = 0; i < nop; ++i) {
-        Py_XDECREF(dtype[i]);
+        Py_XDECREF(dtypes[i]);
         Py_XDECREF(arr_prep[i]);
     }
     Py_XDECREF(errobj);
@@ -1999,7 +2156,7 @@ fail:
     for (i = 0; i < nop; ++i) {
         Py_XDECREF(op[i]);
         op[i] = NULL;
-        Py_XDECREF(dtype[i]);
+        Py_XDECREF(dtypes[i]);
         Py_XDECREF(arr_prep[i]);
     }
     Py_XDECREF(errobj);
@@ -2017,7 +2174,7 @@ fail:
  * 'op' is an array of at least NPY_MAXARGS PyArrayObject *.
  */
 NPY_NO_EXPORT int
-PyUFunc_GenericFunction(PyUFuncObject *self,
+PyUFunc_GenericFunction(PyUFuncObject *ufunc,
                         PyObject *args, PyObject *kwds,
                         PyArrayObject **op)
 {
@@ -2027,24 +2184,13 @@ PyUFunc_GenericFunction(PyUFuncObject *self,
     int retval = -1, subok = 1;
     int usemaskedloop = 0;
 
-    PyArray_Descr *dtype[NPY_MAXARGS];
+    PyArray_Descr *dtypes[NPY_MAXARGS];
 
     /* These parameters come from extobj= or from a TLS global */
     int buffersize = 0, errormask = 0;
     PyObject *errobj = NULL;
     int first_error = 1;
 
-    /* The selected inner loop */
-    PyUFuncGenericFunction innerloop = NULL;
-    void *innerloopdata = NULL;
-
-    /*
-     * The selected masked inner loop, when the 'where='
-     * parameter or arrays with missing values are in op.
-     */
-    PyUFuncGenericMaskedFunction masked_innerloop = NULL;
-    NpyAuxData *masked_innerloopdata = NULL;
-
     /* The mask provided in the 'where=' parameter */
     PyArrayObject *wheremask = NULL;
 
@@ -2056,56 +2202,65 @@ PyUFunc_GenericFunction(PyUFuncObject *self,
      */
     PyObject *arr_prep_args = NULL;
 
-    int trivial_loop_ok = 0;
+    int trivial_loop_ok = 0, use_maskna = 0;
 
     NPY_ORDER order = NPY_KEEPORDER;
-    /*
-     * Currently trying out SAME_KIND casting rule by default.
-     */
-    NPY_CASTING casting = NPY_SAME_KIND_CASTING;
+    /* Use the default assignment casting rule */
+    NPY_CASTING casting = NPY_DEFAULT_ASSIGN_CASTING;
     /* When provided, extobj and typetup contain borrowed references */
     PyObject *extobj = NULL, *type_tup = NULL;
 
-    if (self == NULL) {
+    if (ufunc == NULL) {
         PyErr_SetString(PyExc_ValueError, "function not supported");
         return -1;
     }
 
-    if (self->core_enabled) {
-        return PyUFunc_GeneralizedFunction(self, args, kwds, op);
+    if (ufunc->core_enabled) {
+        return PyUFunc_GeneralizedFunction(ufunc, args, kwds, op);
     }
 
-    nin = self->nin;
-    nout = self->nout;
+    nin = ufunc->nin;
+    nout = ufunc->nout;
     nop = nin + nout;
 
-    ufunc_name = self->name ? self->name : "<unnamed ufunc>";
+    ufunc_name = ufunc->name ? ufunc->name : "<unnamed ufunc>";
 
     NPY_UF_DBG_PRINT1("\nEvaluating ufunc %s\n", ufunc_name);
 
     /* Initialize all the operands and dtypes to NULL */
     for (i = 0; i < nop; ++i) {
         op[i] = NULL;
-        dtype[i] = NULL;
+        dtypes[i] = NULL;
         arr_prep[i] = NULL;
     }
 
     NPY_UF_DBG_PRINT("Getting arguments\n");
 
     /* Get all the arguments */
-    retval = get_ufunc_arguments(self, args, kwds,
+    retval = get_ufunc_arguments(ufunc, args, kwds,
                 op, &order, &casting, &extobj,
-                &type_tup, &subok, &wheremask);
+                &type_tup, &subok, &wheremask, &use_maskna);
     if (retval < 0) {
         goto fail;
     }
 
     /*
-     * For now just the where mask triggers this, but later arrays
-     * with missing data will trigger it as well.
+     * Use the masked loop if either an input had an NA mask or a wheremask
+     * was specified.
      */
-    if (wheremask != NULL) {
+    if (wheremask != NULL || use_maskna) {
         usemaskedloop = 1;
+
+        /*
+         * TODO: Implement support for this (requires more work in the
+         *       iterator first)
+         */
+        if (wheremask && use_maskna) {
+            PyErr_SetString(PyExc_RuntimeError,
+                    "Ufuncs do not work with NA masked arrays and "
+                    "the where= parameter at the same time yet");
+            goto fail;
+        }
     }
 
     /* Get the buffersize, errormask, and error object globals */
@@ -2126,30 +2281,19 @@ PyUFunc_GenericFunction(PyUFuncObject *self,
 
     NPY_UF_DBG_PRINT("Finding inner loop\n");
 
-    if (usemaskedloop) {
-        retval = self->type_resolution_masked_function(self, casting,
-                            op, type_tup, dtype,
-                            &masked_innerloop, &masked_innerloopdata);
-        if (retval < 0) {
-            goto fail;
-        }
+    retval = ufunc->type_resolver(ufunc, casting,
+                            op, type_tup, dtypes);
+    if (retval < 0) {
+        goto fail;
     }
-    else {
-        retval = self->type_resolution_function(self, casting,
-                            op, type_tup, dtype,
-                            &innerloop, &innerloopdata);
-        if (retval < 0) {
-            goto fail;
-        }
 
+    /* Only do the trivial loop check for the unmasked version. */
+    if (!usemaskedloop) {
         /*
-         * This checks whether a trivial loop is ok,
-         * making copies of scalar and one dimensional operands if that will
-         * help.
-         *
-         * Only do the trivial loop check for the unmasked version.
+         * This checks whether a trivial loop is ok, making copies of
+         * scalar and one dimensional operands if that will help.
          */
-        trivial_loop_ok = check_for_trivial_loop(self, op, dtype, buffersize);
+        trivial_loop_ok = check_for_trivial_loop(ufunc, op, dtypes, buffersize);
         if (trivial_loop_ok < 0) {
             goto fail;
         }
@@ -2161,28 +2305,30 @@ PyUFunc_GenericFunction(PyUFuncObject *self,
      * an attribute (signalling it can handle ndarray's)
      * and is not already an ndarray or a subtype of the same type.
     */
-    if (nin == 2 && nout == 1 && dtype[1]->type_num == NPY_OBJECT) {
+    if (nin == 2 && nout == 1 && dtypes[1]->type_num == NPY_OBJECT) {
         PyObject *_obj = PyTuple_GET_ITEM(args, 1);
-        if (!PyArray_CheckExact(_obj)
-               /* If both are same subtype of object arrays, then proceed */
-                && !(Py_TYPE(_obj) == Py_TYPE(PyTuple_GET_ITEM(args, 0)))
-                && PyObject_HasAttrString(_obj, "__array_priority__")
-                && _has_reflected_op(_obj, ufunc_name)) {
-            retval = -2;
-            goto fail;
+        if (!PyArray_CheckExact(_obj)) {
+            double self_prio, other_prio;
+            self_prio = PyArray_GetPriority(PyTuple_GET_ITEM(args, 0),
+                                                        NPY_SCALAR_PRIORITY);
+            other_prio = PyArray_GetPriority(_obj, NPY_SCALAR_PRIORITY);
+            if (self_prio < other_prio &&
+                            _has_reflected_op(_obj, ufunc_name)) {
+                retval = -2;
+                goto fail;
+            }
         }
     }
 
-
 #if NPY_UF_DBG_TRACING
     printf("input types:\n");
     for (i = 0; i < nin; ++i) {
-        PyObject_Print((PyObject *)dtype[i], stdout, 0);
+        PyObject_Print((PyObject *)dtypes[i], stdout, 0);
         printf(" ");
     }
     printf("\noutput types:\n");
     for (i = nin; i < nop; ++i) {
-        PyObject_Print((PyObject *)dtype[i], stdout, 0);
+        PyObject_Print((PyObject *)dtypes[i], stdout, 0);
         printf(" ");
     }
     printf("\n");
@@ -2204,17 +2350,6 @@ PyUFunc_GenericFunction(PyUFuncObject *self,
         }
     }
 
-    /*
-     * If the loop wants the arrays, provide them.
-     *
-     * TODO: Remove this, since this is already basically broken
-     *       with the addition of the masked inner loops and
-     *       not worth fixing.
-     */
-    if (!usemaskedloop && _does_loop_use_arrays(innerloopdata)) {
-        innerloopdata = (void*)op;
-    }
-
     /* Start with the floating-point exception flags cleared */
     PyUFunc_clearfperr();
 
@@ -2222,18 +2357,29 @@ PyUFunc_GenericFunction(PyUFuncObject *self,
     if (usemaskedloop) {
         NPY_UF_DBG_PRINT("Executing masked inner loop\n");
 
-        retval = execute_ufunc_masked_loop(self, wheremask,
-                            op, dtype, order,
-                            buffersize, arr_prep, arr_prep_args,
-                            masked_innerloop, masked_innerloopdata);
+        retval = execute_ufunc_masked_loop(ufunc, wheremask, use_maskna,
+                            op, dtypes, order,
+                            buffersize, arr_prep, arr_prep_args);
     }
     else {
         NPY_UF_DBG_PRINT("Executing unmasked inner loop\n");
 
-        retval = execute_ufunc_loop(self, trivial_loop_ok,
-                            op, dtype, order,
-                            buffersize, arr_prep, arr_prep_args,
-                            innerloop, innerloopdata);
+        if (ufunc->legacy_inner_loop_selector != NULL) {
+            retval = execute_legacy_ufunc_loop(ufunc, trivial_loop_ok,
+                                op, dtypes, order,
+                                buffersize, arr_prep, arr_prep_args);
+        }
+        else {
+            /*
+             * TODO: When this is supported, it should be preferred over
+             * the legacy_inner_loop_selector
+             */
+            PyErr_SetString(PyExc_RuntimeError,
+                    "usage of the new inner_loop_selector isn't "
+                    "implemented yet");
+            retval = -1;
+            goto fail;
+        }
     }
     if (retval < 0) {
         goto fail;
@@ -2248,7 +2394,7 @@ PyUFunc_GenericFunction(PyUFuncObject *self,
 
     /* The caller takes ownership of all the references in op */
     for (i = 0; i < nop; ++i) {
-        Py_XDECREF(dtype[i]);
+        Py_XDECREF(dtypes[i]);
         Py_XDECREF(arr_prep[i]);
     }
     Py_XDECREF(errobj);
@@ -2265,7 +2411,7 @@ fail:
     for (i = 0; i < nop; ++i) {
         Py_XDECREF(op[i]);
         op[i] = NULL;
-        Py_XDECREF(dtype[i]);
+        Py_XDECREF(dtypes[i]);
         Py_XDECREF(arr_prep[i]);
     }
     Py_XDECREF(errobj);
@@ -2284,7 +2430,7 @@ fail:
  * Returns 0 on success, -1 on failure.
  */
 static int
-get_binary_op_function(PyUFuncObject *self, int *otype,
+get_binary_op_function(PyUFuncObject *ufunc, int *otype,
                         PyUFuncGenericFunction *out_innerloop,
                         void **out_innerloopdata)
 {
@@ -2295,13 +2441,13 @@ get_binary_op_function(PyUFuncObject *self, int *otype,
                                 *otype);
 
     /* If the type is custom and there are userloops, search for it here */
-    if (self->userloops != NULL && PyTypeNum_ISUSERDEF(*otype)) {
+    if (ufunc->userloops != NULL && PyTypeNum_ISUSERDEF(*otype)) {
         PyObject *key, *obj;
         key = PyInt_FromLong(*otype);
         if (key == NULL) {
             return -1;
         }
-        obj = PyDict_GetItem(self->userloops, key);
+        obj = PyDict_GetItem(ufunc->userloops, key);
         Py_DECREF(key);
         if (obj != NULL) {
             funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj);
@@ -2321,8 +2467,8 @@ get_binary_op_function(PyUFuncObject *self, int *otype,
     }
 
     /* Search for a function with compatible inputs */
-    for (i = 0; i < self->ntypes; ++i) {
-        char *types = self->types + i*self->nargs;
+    for (i = 0; i < ufunc->ntypes; ++i) {
+        char *types = ufunc->types + i*ufunc->nargs;
 
         NPY_UF_DBG_PRINT3("Trying loop with signature %d %d -> %d\n",
                                 types[0], types[1], types[2]);
@@ -2332,8 +2478,8 @@ get_binary_op_function(PyUFuncObject *self, int *otype,
                     (*otype == NPY_OBJECT || types[0] != NPY_OBJECT)) {
             /* If the signature is "xx->x", we found the loop */
             if (types[2] == types[0]) {
-                *out_innerloop = self->functions[i];
-                *out_innerloopdata = self->data[i];
+                *out_innerloop = ufunc->functions[i];
+                *out_innerloopdata = ufunc->data[i];
                 *otype = types[0];
                 return 0;
             }
@@ -2349,16 +2495,16 @@ get_binary_op_function(PyUFuncObject *self, int *otype,
     }
 
     /* Search for the exact function */
-    for (i = 0; i < self->ntypes; ++i) {
-        char *types = self->types + i*self->nargs;
+    for (i = 0; i < ufunc->ntypes; ++i) {
+        char *types = ufunc->types + i*ufunc->nargs;
 
         if (PyArray_CanCastSafely(*otype, types[0]) &&
                     types[0] == types[1] &&
                     types[1] == types[2] &&
                     (*otype == NPY_OBJECT || types[0] != NPY_OBJECT)) {
             /* Since the signature is "xx->x", we found the loop */
-            *out_innerloop = self->functions[i];
-            *out_innerloopdata = self->data[i];
+            *out_innerloop = ufunc->functions[i];
+            *out_innerloopdata = ufunc->data[i];
             *otype = types[0];
             return 0;
         }
@@ -2367,6 +2513,273 @@ get_binary_op_function(PyUFuncObject *self, int *otype,
     return -1;
 }
 
+/*
+ * Runs the ufunc's type resolver on the operands (arr, arr, NULL) to find
+ * the dtype a reduction operates on. Returns 0 and passes the output dtype
+ * reference to '*out_dtype', or returns -1 and leaves '*out_dtype' NULL.
+ */
+static int
+reduce_type_resolver(PyUFuncObject *ufunc, PyArrayObject *arr,
+                        PyArray_Descr *odtype, PyArray_Descr **out_dtype)
+{
+    int i, retcode;
+    PyArrayObject *op[3] = {arr, arr, NULL};
+    PyArray_Descr *dtypes[3] = {NULL, NULL, NULL};
+    char *ufunc_name = ufunc->name ? ufunc->name : "(unknown)";
+    PyObject *type_tup = NULL;
+
+    *out_dtype = NULL;
+
+    /* If odtype is specified, make a type tuple for the type resolution */
+    if (odtype != NULL) {
+        type_tup = Py_BuildValue("OOO", odtype, odtype, Py_None);
+        if (type_tup == NULL) {
+            return -1;
+        }
+    }
+
+    /* Use the type resolution function to find our loop */
+    retcode = ufunc->type_resolver(ufunc, NPY_UNSAFE_CASTING,
+                                    op, type_tup, dtypes);
+    /* type_tup is still NULL when no odtype was given, so XDECREF it */
+    Py_XDECREF(type_tup);
+    if (retcode == -1) {
+        return -1;
+    }
+    else if (retcode == -2) {
+        PyErr_Format(PyExc_RuntimeError,
+                "type resolution returned NotImplemented to "
+                "reduce ufunc %s", ufunc_name);
+        return -1;
+    }
+
+    /*
+     * The first two types should be equivalent. For historical reasons the
+     * return type may differ, and it is the type on which reduction occurs.
+     */
+    if (!PyArray_EquivTypes(dtypes[0], dtypes[1])) {
+        for (i = 0; i < 3; ++i) {
+            Py_DECREF(dtypes[i]);
+        }
+        PyErr_Format(PyExc_RuntimeError,
+                "could not find a type resolution appropriate for "
+                "reduce ufunc %s", ufunc_name);
+        return -1;
+    }
+
+    Py_DECREF(dtypes[0]);
+    Py_DECREF(dtypes[1]);
+    *out_dtype = dtypes[2];
+
+    return 0;
+}
+/* Forwards 'result' and 'preservena' to PyArray_AssignZero; ignores 'data' */
+static int
+assign_reduce_identity_zero(PyArrayObject *result, int preservena, void *data)
+{
+    return PyArray_AssignZero(result, NULL, preservena, NULL);
+}
+/* Forwards 'result' and 'preservena' to PyArray_AssignOne; ignores 'data' */
+static int
+assign_reduce_identity_one(PyArrayObject *result, int preservena, void *data)
+{
+    return PyArray_AssignOne(result, NULL, preservena, NULL);
+}
+/* Reduction inner-loop driver: runs op0 = ufunc(op0, op1) over 'iter'. */
+static int
+reduce_loop(NpyIter *iter, char **dataptrs, npy_intp *strides,
+            npy_intp *countptr, NpyIter_IterNextFunc *iternext,
+            int needs_api, npy_intp skip_first_count, void *data)
+{
+    PyArray_Descr *dtypes[3], **iter_dtypes;
+    PyUFuncObject *ufunc = (PyUFuncObject *)data;
+    char *dataptrs_copy[3];
+    npy_intp strides_copy[3];
+
+    /* The selected (unmasked) legacy inner loop and its data pointer */
+    PyUFuncGenericFunction innerloop = NULL;
+    void *innerloopdata = NULL;
+
+    NPY_BEGIN_THREADS_DEF;
+
+    /* Select the inner loop for (op0, op1) -> op0; may update needs_api */
+    iter_dtypes = NpyIter_GetDescrArray(iter);
+    dtypes[0] = iter_dtypes[0];
+    dtypes[1] = iter_dtypes[1];
+    dtypes[2] = iter_dtypes[0];
+    if (ufunc->legacy_inner_loop_selector(ufunc, dtypes,
+                            &innerloop, &innerloopdata, &needs_api) < 0) {
+        return -1;
+    }
+
+    if (!needs_api) {
+        NPY_BEGIN_THREADS;
+    }
+
+    if (skip_first_count > 0) {
+        do {
+            npy_intp count = *countptr;
+
+            /* Skip first-visit elements: one if op0 has stride 0, else all */
+            if (NpyIter_IsFirstVisit(iter, 0)) {
+                if (strides[0] == 0) {
+                    --count;
+                    --skip_first_count;
+                    dataptrs[1] += strides[1];
+                }
+                else {
+                    skip_first_count -= count;
+                    count = 0;
+                }
+            }
+
+            /* Expand (op0, op1) to (op0, op1, op0) for the binary loop */
+            dataptrs_copy[0] = dataptrs[0];
+            dataptrs_copy[1] = dataptrs[1];
+            dataptrs_copy[2] = dataptrs[0];
+            strides_copy[0] = strides[0];
+            strides_copy[1] = strides[1];
+            strides_copy[2] = strides[0];
+            innerloop(dataptrs_copy, &count,
+                        strides_copy, innerloopdata);
+
+            /* Once skipping is done, advance and use the faster loop */
+            if (skip_first_count == 0) {
+                if (iternext(iter)) {
+                    break;
+                }
+                else {
+                    goto finish_loop;
+                }
+            }
+        } while (iternext(iter));
+    }
+    do {
+        /* Expand (op0, op1) to (op0, op1, op0) for the binary loop */
+        dataptrs_copy[0] = dataptrs[0];
+        dataptrs_copy[1] = dataptrs[1];
+        dataptrs_copy[2] = dataptrs[0];
+        strides_copy[0] = strides[0];
+        strides_copy[1] = strides[1];
+        strides_copy[2] = strides[0];
+        innerloop(dataptrs_copy, countptr,
+                    strides_copy, innerloopdata);
+    } while (iternext(iter));
+
+finish_loop:
+    if (!needs_api) {
+        NPY_END_THREADS;
+    }
+    /* Report -1 only if the API was needed and an error is set */
+    return (needs_api && PyErr_Occurred()) ? -1 : 0;
+}
+
+static int
+masked_reduce_loop(NpyIter *iter, char **dataptrs, npy_intp *strides,
+            npy_intp *countptr, NpyIter_IterNextFunc *iternext,
+            int needs_api, npy_intp skip_first_count, void *data)
+{
+    PyArray_Descr *dtypes[3], **iter_dtypes;
+    npy_intp fixed_strides[3], fixed_mask_stride;
+    PyUFuncObject *ufunc = (PyUFuncObject *)data; /* 'data' is the ufunc */
+    char *dataptrs_copy[3];
+    npy_intp strides_copy[3];
+
+    /* Masked inner loop filled in by the ufunc's selector, and its auxdata */
+    PyUFunc_MaskedStridedInnerLoopFunc *innerloop = NULL;
+    NpyAuxData *innerloopdata = NULL;
+
+    NPY_BEGIN_THREADS_DEF;
+
+    /* Get the inner loop; operand 2's fixed stride/dtype describe the mask */
+    NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+    fixed_mask_stride = fixed_strides[2];
+    fixed_strides[2] = fixed_strides[0]; /* slot 2 mirrors operand 0 */
+    iter_dtypes = NpyIter_GetDescrArray(iter);
+    dtypes[0] = iter_dtypes[0];
+    dtypes[1] = iter_dtypes[1];
+    dtypes[2] = iter_dtypes[0];
+    if (ufunc->masked_inner_loop_selector(ufunc, dtypes, iter_dtypes[2],
+                            fixed_strides, fixed_mask_stride,
+                            &innerloop, &innerloopdata, &needs_api) < 0) {
+        return -1;
+    }
+
+    if (!needs_api) { /* needs_api may have been updated by the selector */
+        NPY_BEGIN_THREADS;
+    }
+
+    if (skip_first_count > 0) {
+        do {
+            npy_intp count = *countptr;
+
+            /* First-visit elements of operand 0 are not passed to the loop */
+            if (NpyIter_IsFirstVisit(iter, 0)) {
+                if (strides[0] == 0) { /* skip only the first item */
+                    --count;
+                    --skip_first_count;
+                    dataptrs[1] += strides[1];
+                    dataptrs[2] += strides[2];
+                }
+                else { /* skip the whole inner run */
+                    skip_first_count -= count;
+                    count = 0;
+                }
+            }
+
+            /* 2 operands -> 3 loop slots; operand 0 fills slots 0 and 2 */
+            dataptrs_copy[0] = dataptrs[0];
+            dataptrs_copy[1] = dataptrs[1];
+            dataptrs_copy[2] = dataptrs[0];
+            strides_copy[0] = strides[0];
+            strides_copy[1] = strides[1];
+            strides_copy[2] = strides[0];
+            /*
+             * dataptrs[2]/strides[2] is the mask operand: with skipna=True
+             * that is the mask of 'arr', otherwise the mask of 'result'.
+             */
+            innerloop(dataptrs_copy, strides_copy,
+                        dataptrs[2], strides[2],
+                        count, innerloopdata);
+
+            /* Skipping done: advance once, then use the plain loop below */
+            if (skip_first_count == 0) {
+                if (iternext(iter)) {
+                    break;
+                }
+                else {
+                    goto finish_loop; /* iteration is exhausted */
+                }
+            }
+        } while (iternext(iter));
+    }
+    do {
+        /* 2 operands -> 3 loop slots; operand 0 fills slots 0 and 2 */
+        dataptrs_copy[0] = dataptrs[0];
+        dataptrs_copy[1] = dataptrs[1];
+        dataptrs_copy[2] = dataptrs[0];
+        strides_copy[0] = strides[0];
+        strides_copy[1] = strides[1];
+        strides_copy[2] = strides[0];
+        /*
+         * dataptrs[2]/strides[2] is the mask operand: with skipna=True
+         * that is the mask of 'arr', otherwise the mask of 'result'.
+         */
+        innerloop(dataptrs_copy, strides_copy,
+                    dataptrs[2], strides[2],
+                    *countptr, innerloopdata);
+    } while (iternext(iter));
+
+finish_loop:
+    if (!needs_api) {
+        NPY_END_THREADS;
+    }
+
+    NPY_AUXDATA_FREE(innerloopdata); /* aux data filled in by the selector */
+
+    return (needs_api && PyErr_Occurred()) ? -1 : 0;
+}
+
 /*
  * The implementation of the reduction operators with the new iterator
  * turned into a bit of a long function here, but I think the design
@@ -2381,19 +2794,98 @@ get_binary_op_function(PyUFuncObject *self, int *otype,
  * >>> timeit einsum("i->",a)
  * 100000 loops, best of 3: 13.5 us per loop
  *
+ * The axes must already be bounds-checked by the calling function;
+ * this function does not validate them.
  */
+static PyArrayObject *
+PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
+        int naxes, int *axes, PyArray_Descr *odtype, int skipna, int keepdims)
+{
+    int iaxes, reorderable, ndim;
+    npy_bool axis_flags[NPY_MAXDIMS];
+    PyArray_Descr *dtype;
+    PyArrayObject *result;
+    PyArray_AssignReduceIdentityFunc *assign_identity = NULL;
+    char *ufunc_name = ufunc->name ? ufunc->name : "(unknown)";
+    /* Set by PyUFunc_GetPyValues below; they come from a TLS global */
+    int buffersize = 0, errormask = 0;
+    PyObject *errobj = NULL;
+
+    NPY_UF_DBG_PRINT1("\nEvaluating ufunc %s.reduce\n", ufunc_name);
+
+    ndim = PyArray_NDIM(arr);
+
+    /* Flag each axis listed in 'axes'; a repeated axis is a ValueError */
+    memset(axis_flags, 0, ndim);
+    for (iaxes = 0; iaxes < naxes; ++iaxes) {
+        int axis = axes[iaxes]; /* not bounds-checked here */
+        if (axis_flags[axis]) {
+            PyErr_SetString(PyExc_ValueError,
+                    "duplicate value in 'axis'");
+            return NULL;
+        }
+        axis_flags[axis] = 1;
+    }
+
+    switch (ufunc->identity) { /* sets assign_identity, reorderable */
+        case PyUFunc_Zero:
+            assign_identity = &assign_reduce_identity_zero;
+            reorderable = 1;
+            break;
+        case PyUFunc_One:
+            assign_identity = &assign_reduce_identity_one;
+            reorderable = 1;
+            break;
+        case PyUFunc_None: /* no identity: assign_identity stays NULL */
+            reorderable = 0;
+            break;
+        case PyUFunc_ReorderableNone: /* assign_identity stays NULL too */
+            reorderable = 1;
+            break;
+        default:
+            PyErr_Format(PyExc_ValueError,
+                    "ufunc %s has an invalid identity for reduction",
+                    ufunc_name);
+            return NULL;
+    }
+
+    if (PyUFunc_GetPyValues("reduce", &buffersize, &errormask, &errobj) < 0) {
+        return NULL;
+    }
+
+    /* Get the reduction dtype from arr/odtype; it is DECREF'd below */
+    if (reduce_type_resolver(ufunc, arr, odtype, &dtype) < 0) {
+        Py_XDECREF(errobj);
+        return NULL;
+    }
+
+    result = PyArray_ReduceWrapper(arr, out, NULL, dtype, dtype,
+                                NPY_UNSAFE_CASTING,
+                                axis_flags, reorderable,
+                                skipna, NULL, keepdims, 0,
+                                assign_identity,
+                                reduce_loop,
+                                masked_reduce_loop,
+                                NULL,
+                                ufunc, buffersize, ufunc_name);
+
+    Py_DECREF(dtype);
+    Py_XDECREF(errobj);
+    return result; /* whatever PyArray_ReduceWrapper returned */
+}
+
+
 static PyObject *
-PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr,
-                    PyArrayObject *out,
-                    int axis, int otype, int operation, char *opname)
+PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
+                   int axis, int otype, int skipna)
 {
     PyArrayObject *op[2];
     PyArray_Descr *op_dtypes[2] = {NULL, NULL};
     int op_axes_arrays[2][NPY_MAXDIMS];
     int *op_axes[2] = {op_axes_arrays[0], op_axes_arrays[1]};
     npy_uint32 op_flags[2];
-    int i, idim, ndim, otype_final;
-    int needs_api, need_outer_iterator;
+    int idim, ndim, otype_final;
+    int needs_api, need_outer_iterator, use_maskna = 0;
 
     NpyIter *iter = NULL, *iter_inner = NULL;
 
@@ -2401,7 +2893,7 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr,
     PyUFuncGenericFunction innerloop = NULL;
     void *innerloopdata = NULL;
 
-    char *ufunc_name = self->name ? self->name : "(unknown)";
+    char *ufunc_name = ufunc->name ? ufunc->name : "(unknown)";
 
     /* These parameters come from extobj= or from a TLS global */
     int buffersize = 0, errormask = 0;
@@ -2409,15 +2901,26 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr,
 
     NPY_BEGIN_THREADS_DEF;
 
-    NPY_UF_DBG_PRINT2("\nEvaluating ufunc %s.%s\n", ufunc_name, opname);
+    NPY_UF_DBG_PRINT1("\nEvaluating ufunc %s.accumulate\n", ufunc_name);
 
 #if 0
-    printf("Doing %s.%s on array with dtype :  ", ufunc_name, opname);
+    printf("Doing %s.accumulate on array with dtype :  ", ufunc_name);
     PyObject_Print((PyObject *)PyArray_DESCR(arr), stdout, 0);
     printf("\n");
 #endif
 
-    if (PyUFunc_GetPyValues(opname, &buffersize, &errormask, &errobj) < 0) {
+    use_maskna = PyArray_HASMASKNA(arr);
+    if (use_maskna) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "ufunc accumulate doesn't support NA masked arrays yet");
+        return NULL;
+    }
+    /* If there's no NA mask, there are no NAs to skip */
+    else {
+        skipna = 0;
+    }
+
+    if (PyUFunc_GetPyValues("accumulate", &buffersize, &errormask, &errobj) < 0) {
         return NULL;
     }
 
@@ -2425,13 +2928,13 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr,
     Py_XINCREF(out);
 
     otype_final = otype;
-    if (get_binary_op_function(self, &otype_final,
+    if (get_binary_op_function(ufunc, &otype_final,
                                 &innerloop, &innerloopdata) < 0) {
         PyArray_Descr *dtype = PyArray_DescrFromType(otype);
         PyErr_Format(PyExc_ValueError,
-                     "could not find a matching type for %s.%s, "
+                     "could not find a matching type for %s.accumulate, "
                      "requested type has type code '%c'",
-                            ufunc_name, opname, dtype ? dtype->type : '-');
+                            ufunc_name, dtype ? dtype->type : '-');
         Py_XDECREF(dtype);
         goto fail;
     }
@@ -2461,52 +2964,39 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr,
     }
 
 #if NPY_UF_DBG_TRACING
-    printf("Found %s.%s inner loop with dtype :  ", ufunc_name, opname);
+    printf("Found %s.accumulate inner loop with dtype :  ", ufunc_name);
     PyObject_Print((PyObject *)op_dtypes[0], stdout, 0);
     printf("\n");
 #endif
 
     /* Set up the op_axes for the outer loop */
-    if (operation == UFUNC_REDUCE) {
-        for (i = 0, idim = 0; idim < ndim; ++idim) {
-            if (idim != axis) {
-                op_axes_arrays[0][i] = i;
-                op_axes_arrays[1][i] = idim;
-                i++;
-            }
-        }
-    }
-    else if (operation == UFUNC_ACCUMULATE) {
-        for (idim = 0; idim < ndim; ++idim) {
-            op_axes_arrays[0][idim] = idim;
-            op_axes_arrays[1][idim] = idim;
-        }
-    }
-    else {
-        PyErr_Format(PyExc_RuntimeError,
-                    "invalid reduction operation %s.%s", ufunc_name, opname);
-        goto fail;
+    for (idim = 0; idim < ndim; ++idim) {
+        op_axes_arrays[0][idim] = idim;
+        op_axes_arrays[1][idim] = idim;
     }
 
     /* The per-operand flags for the outer loop */
-    op_flags[0] = NPY_ITER_READWRITE|
-                  NPY_ITER_NO_BROADCAST|
-                  NPY_ITER_ALLOCATE|
+    op_flags[0] = NPY_ITER_READWRITE |
+                  NPY_ITER_NO_BROADCAST |
+                  NPY_ITER_ALLOCATE |
                   NPY_ITER_NO_SUBTYPE;
     op_flags[1] = NPY_ITER_READONLY;
 
+    if (use_maskna) {
+        op_flags[0] |= NPY_ITER_USE_MASKNA;
+        op_flags[1] |= NPY_ITER_USE_MASKNA;
+    }
+
     op[0] = out;
     op[1] = arr;
 
     need_outer_iterator = (ndim > 1);
-    if (operation == UFUNC_ACCUMULATE) {
-        /* This is because we can't buffer, so must do UPDATEIFCOPY */
-        if (!PyArray_ISALIGNED(arr) || (out && !PyArray_ISALIGNED(out)) ||
-                !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(arr)) ||
-                (out &&
-                 !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(out)))) {
-            need_outer_iterator = 1;
-        }
+    /* We can't buffer, so must do UPDATEIFCOPY */
+    if (!PyArray_ISALIGNED(arr) || (out && !PyArray_ISALIGNED(out)) ||
+            !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(arr)) ||
+            (out &&
+             !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(out)))) {
+        need_outer_iterator = 1;
     }
 
     if (need_outer_iterator) {
@@ -2515,25 +3005,17 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr,
                            NPY_ITER_REFS_OK;
         PyArray_Descr **op_dtypes_param = NULL;
 
-        if (operation == UFUNC_REDUCE) {
-            ndim_iter = ndim - 1;
-            if (out == NULL) {
-                op_dtypes_param = op_dtypes;
-            }
-        }
-        else if (operation == UFUNC_ACCUMULATE) {
-            /*
-             * The way accumulate is set up, we can't do buffering,
-             * so make a copy instead when necessary.
-             */
-            ndim_iter = ndim;
-            flags |= NPY_ITER_MULTI_INDEX;
-            /* Add some more flags */
-            op_flags[0] |= NPY_ITER_UPDATEIFCOPY|NPY_ITER_ALIGNED;
-            op_flags[1] |= NPY_ITER_COPY|NPY_ITER_ALIGNED;
-            op_dtypes_param = op_dtypes;
-            op_dtypes[1] = op_dtypes[0];
-        }
+        /*
+         * The way accumulate is set up, we can't do buffering,
+         * so make a copy instead when necessary.
+         */
+        ndim_iter = ndim;
+        flags |= NPY_ITER_MULTI_INDEX;
+        /* Add some more flags */
+        op_flags[0] |= NPY_ITER_UPDATEIFCOPY|NPY_ITER_ALIGNED;
+        op_flags[1] |= NPY_ITER_COPY|NPY_ITER_ALIGNED;
+        op_dtypes_param = op_dtypes;
+        op_dtypes[1] = op_dtypes[0];
         NPY_UF_DBG_PRINT("Allocating outer iterator\n");
         iter = NpyIter_AdvancedNew(2, op, flags,
                                    NPY_KEEPORDER, NPY_UNSAFE_CASTING,
@@ -2544,25 +3026,23 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr,
             goto fail;
         }
 
-        if (operation == UFUNC_ACCUMULATE) {
-            /* In case COPY or UPDATEIFCOPY occurred */
-            op[0] = NpyIter_GetOperandArray(iter)[0];
-            op[1] = NpyIter_GetOperandArray(iter)[1];
+        /* In case COPY or UPDATEIFCOPY occurred */
+        op[0] = NpyIter_GetOperandArray(iter)[0];
+        op[1] = NpyIter_GetOperandArray(iter)[1];
 
-            if (PyArray_SIZE(op[0]) == 0) {
-                if (out == NULL) {
-                    out = op[0];
-                    Py_INCREF(out);
-                }
-                goto finish;
+        if (PyArray_SIZE(op[0]) == 0) {
+            if (out == NULL) {
+                out = op[0];
+                Py_INCREF(out);
             }
+            goto finish;
+        }
 
-            if (NpyIter_RemoveAxis(iter, axis) != NPY_SUCCEED) {
-                goto fail;
-            }
-            if (NpyIter_RemoveMultiIndex(iter) != NPY_SUCCEED) {
-                goto fail;
-            }
+        if (NpyIter_RemoveAxis(iter, axis) != NPY_SUCCEED) {
+            goto fail;
+        }
+        if (NpyIter_RemoveMultiIndex(iter) != NPY_SUCCEED) {
+            goto fail;
         }
     }
 
@@ -2575,102 +3055,38 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr,
         else {
             PyArray_Descr *dtype = op_dtypes[0];
             Py_INCREF(dtype);
-            if (operation == UFUNC_REDUCE) {
-                op[0] = out = (PyArrayObject *)PyArray_NewFromDescr(
-                                        &PyArray_Type, dtype,
-                                        0, NULL, NULL, NULL,
-                                        0, NULL);
-            }
-            else if (operation == UFUNC_ACCUMULATE) {
-                op[0] = out = (PyArrayObject *)PyArray_NewFromDescr(
-                                        &PyArray_Type, dtype,
-                                        ndim, PyArray_DIMS(op[1]), NULL, NULL,
-                                        0, NULL);
-            }
+            op[0] = out = (PyArrayObject *)PyArray_NewFromDescr(
+                                    &PyArray_Type, dtype,
+                                    ndim, PyArray_DIMS(op[1]), NULL, NULL,
+                                    0, NULL);
             if (out == NULL) {
                 goto fail;
             }
+
+            if (use_maskna) {
+                if (PyArray_AllocateMaskNA(out, 1, 0, 1) < 0) {
+                    goto fail;
+                }
+            }
         }
     }
 
     /*
-     * If the reduction unit has size zero, either return the reduction
+     * If the reduction axis has size zero, either return the reduction
      * unit for UFUNC_REDUCE, or return the zero-sized output array
      * for UFUNC_ACCUMULATE.
      */
     if (PyArray_DIM(op[1], axis) == 0) {
-        if (operation == UFUNC_REDUCE) {
-            if (self->identity == PyUFunc_None) {
-                PyErr_Format(PyExc_ValueError,
-                             "zero-size array to %s.%s "
-                             "without identity", ufunc_name, opname);
-                goto fail;
-            }
-            if (self->identity == PyUFunc_One) {
-                PyObject *obj = PyInt_FromLong((long) 1);
-                if (obj == NULL) {
-                    goto fail;
-                }
-                PyArray_FillWithScalar(op[0], obj);
-                Py_DECREF(obj);
-            } else {
-                PyObject *obj = PyInt_FromLong((long) 0);
-                if (obj == NULL) {
-                    goto fail;
-                }
-                PyArray_FillWithScalar(op[0], obj);
-                Py_DECREF(obj);
-            }
-        }
-
         goto finish;
     }
     else if (PyArray_SIZE(op[0]) == 0) {
         goto finish;
     }
 
-    /* Only allocate an inner iterator if it's necessary */
-    if (!PyArray_ISALIGNED(op[1]) || !PyArray_ISALIGNED(op[0]) ||
-                !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(op[1])) ||
-                !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(op[0]))) {
-        /* Also set the dtype for buffering arr */
-        op_dtypes[1] = op_dtypes[0];
-
-        NPY_UF_DBG_PRINT("Allocating inner iterator\n");
-        if (operation == UFUNC_REDUCE) {
-            /* The per-operand flags for the inner loop */
-            op_flags[0] = NPY_ITER_READWRITE|
-                          NPY_ITER_ALIGNED;
-            op_flags[1] = NPY_ITER_READONLY|
-                          NPY_ITER_ALIGNED;
-
-            op_axes[0][0] = -1;
-            op_axes[1][0] = axis;
-
-            iter_inner = NpyIter_AdvancedNew(2, op, NPY_ITER_EXTERNAL_LOOP|
-                                       NPY_ITER_BUFFERED|
-                                       NPY_ITER_DELAY_BUFALLOC|
-                                       NPY_ITER_GROWINNER|
-                                       NPY_ITER_REDUCE_OK|
-                                       NPY_ITER_REFS_OK,
-                                       NPY_CORDER, NPY_UNSAFE_CASTING,
-                                       op_flags, op_dtypes,
-                                       1, op_axes, NULL, buffersize);
-        }
-        /* Should never get an inner iterator for ACCUMULATE */
-        else {
-            PyErr_SetString(PyExc_RuntimeError,
-                "internal ufunc reduce error, should not need inner iterator");
-            goto fail;
-        }
-        if (iter_inner == NULL) {
-            goto fail;
-        }
-    }
-
     if (iter && NpyIter_GetIterSize(iter) != 0) {
         char *dataptr_copy[3];
         npy_intp stride_copy[3];
+        npy_intp count_m1, stride0, stride1;
 
         NpyIter_IterNextFunc *iternext;
         char **dataptr;
@@ -2685,131 +3101,54 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr,
         dataptr = NpyIter_GetDataPtrArray(iter);
 
 
-        /* Execute the loop with two nested iterators */
-        if (iter_inner) {
-            /* Only UFUNC_REDUCE uses iter_inner */
-            NpyIter_IterNextFunc *iternext_inner;
-            char **dataptr_inner;
-            npy_intp *stride_inner;
-            npy_intp count, *count_ptr_inner;
+        /* Execute the loop with just the outer iterator */
+        count_m1 = PyArray_DIM(op[1], axis)-1;
+        stride0 = 0, stride1 = PyArray_STRIDE(op[1], axis);
 
-            NPY_UF_DBG_PRINT("UFunc: Reduce loop with two nested iterators\n");
-            iternext_inner = NpyIter_GetIterNext(iter_inner, NULL);
-            if (iternext_inner == NULL) {
-                goto fail;
-            }
-            dataptr_inner = NpyIter_GetDataPtrArray(iter_inner);
-            stride_inner = NpyIter_GetInnerStrideArray(iter_inner);
-            count_ptr_inner = NpyIter_GetInnerLoopSizePtr(iter_inner);
+        NPY_UF_DBG_PRINT("UFunc: Reduce loop with just outer iterator\n");
 
-            needs_api = NpyIter_IterationNeedsAPI(iter) ||
-                        NpyIter_IterationNeedsAPI(iter_inner);
+        stride0 = PyArray_STRIDE(op[0], axis);
 
-            if (!needs_api) {
-                NPY_BEGIN_THREADS;
-            }
+        stride_copy[0] = stride0;
+        stride_copy[1] = stride1;
+        stride_copy[2] = stride0;
 
-            do {
-                int first = 1;
+        needs_api = NpyIter_IterationNeedsAPI(iter);
 
-                /* Reset the inner iterator to the outer's data */
-                if (NpyIter_ResetBasePointers(iter_inner, dataptr, NULL)
-                                                != NPY_SUCCEED) {
-                    goto fail;
-                }
+        if (!needs_api) {
+            NPY_BEGIN_THREADS;
+        }
 
-                /* Copy the first element to start the reduction */
-                if (otype == NPY_OBJECT) {
-                    Py_XDECREF(*(PyObject **)dataptr_inner[0]);
-                    *(PyObject **)dataptr_inner[0] =
-                                        *(PyObject **)dataptr_inner[1];
-                    Py_XINCREF(*(PyObject **)dataptr_inner[0]);
-                }
-                else {
-                    memcpy(dataptr_inner[0], dataptr_inner[1], itemsize);
-                }
+        do {
 
-                stride_copy[0] = 0;
-                stride_copy[2] = 0;
-                do {
-                    count = *count_ptr_inner;
-                    /* Turn the two items into three for the inner loop */
-                    dataptr_copy[0] = dataptr_inner[0];
-                    dataptr_copy[1] = dataptr_inner[1];
-                    dataptr_copy[2] = dataptr_inner[0];
-                    if (first) {
-                        --count;
-                        dataptr_copy[1] += stride_inner[1];
-                        first = 0;
-                    }
-                    stride_copy[1] = stride_inner[1];
-                    NPY_UF_DBG_PRINT1("iterator loop count %d\n", (int)count);
-                    innerloop(dataptr_copy, &count,
-                                stride_copy, innerloopdata);
-                } while(iternext_inner(iter_inner));
-            } while (iternext(iter));
+            dataptr_copy[0] = dataptr[0];
+            dataptr_copy[1] = dataptr[1];
+            dataptr_copy[2] = dataptr[0];
 
-            if (!needs_api) {
-                NPY_END_THREADS;
+            /* Copy the first element to start the reduction */
+            if (otype == NPY_OBJECT) {
+                Py_XDECREF(*(PyObject **)dataptr_copy[0]);
+                *(PyObject **)dataptr_copy[0] =
+                                    *(PyObject **)dataptr_copy[1];
+                Py_XINCREF(*(PyObject **)dataptr_copy[0]);
             }
-        }
-        /* Execute the loop with just the outer iterator */
-        else {
-            npy_intp count_m1 = PyArray_DIM(op[1], axis)-1;
-            npy_intp stride0 = 0, stride1 = PyArray_STRIDE(op[1], axis);
-
-            NPY_UF_DBG_PRINT("UFunc: Reduce loop with just outer iterator\n");
-
-            if (operation == UFUNC_ACCUMULATE) {
-                stride0 = PyArray_STRIDE(op[0], axis);
+            else {
+                memcpy(dataptr_copy[0], dataptr_copy[1], itemsize);
             }
 
-            stride_copy[0] = stride0;
-            stride_copy[1] = stride1;
-            stride_copy[2] = stride0;
-
-            needs_api = NpyIter_IterationNeedsAPI(iter);
-
-            if (!needs_api) {
-                NPY_BEGIN_THREADS;
+            if (count_m1 > 0) {
+                /* Turn the two items into three for the inner loop */
+                dataptr_copy[1] += stride1;
+                dataptr_copy[2] += stride0;
+                NPY_UF_DBG_PRINT1("iterator loop count %d\n",
+                                                (int)count_m1);
+                innerloop(dataptr_copy, &count_m1,
+                            stride_copy, innerloopdata);
             }
+        } while (iternext(iter));
 
-            do {
-
-                dataptr_copy[0] = dataptr[0];
-                dataptr_copy[1] = dataptr[1];
-                dataptr_copy[2] = dataptr[0];
-
-                /* Copy the first element to start the reduction */
-                if (otype == NPY_OBJECT) {
-                    Py_XDECREF(*(PyObject **)dataptr_copy[0]);
-                    *(PyObject **)dataptr_copy[0] =
-                                        *(PyObject **)dataptr_copy[1];
-                    Py_XINCREF(*(PyObject **)dataptr_copy[0]);
-                }
-                else {
-                    memcpy(dataptr_copy[0], dataptr_copy[1], itemsize);
-                }
-
-                if (count_m1 > 0) {
-                    /* Turn the two items into three for the inner loop */
-                    if (operation == UFUNC_REDUCE) {
-                        dataptr_copy[1] += stride1;
-                    }
-                    else if (operation == UFUNC_ACCUMULATE) {
-                        dataptr_copy[1] += stride1;
-                        dataptr_copy[2] += stride0;
-                    }
-                    NPY_UF_DBG_PRINT1("iterator loop count %d\n",
-                                                    (int)count_m1);
-                    innerloop(dataptr_copy, &count_m1,
-                                stride_copy, innerloopdata);
-                }
-            } while (iternext(iter));
-
-            if (!needs_api) {
-                NPY_END_THREADS;
-            }
+        if (!needs_api) {
+            NPY_END_THREADS;
         }
     }
     else if (iter == NULL) {
@@ -2818,142 +3157,61 @@ PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr,
 
         int itemsize = op_dtypes[0]->elsize;
 
-        /* Execute the loop with just the inner iterator */
-        if (iter_inner) {
-            /* Only UFUNC_REDUCE uses iter_inner */
-            NpyIter_IterNextFunc *iternext_inner;
-            char **dataptr_inner;
-            npy_intp *stride_inner;
-            npy_intp count, *count_ptr_inner;
-            int first = 1;
-
-            NPY_UF_DBG_PRINT("UFunc: Reduce loop with just inner iterator\n");
-
-            iternext_inner = NpyIter_GetIterNext(iter_inner, NULL);
-            if (iternext_inner == NULL) {
-                goto fail;
-            }
-            dataptr_inner = NpyIter_GetDataPtrArray(iter_inner);
-            stride_inner = NpyIter_GetInnerStrideArray(iter_inner);
-            count_ptr_inner = NpyIter_GetInnerLoopSizePtr(iter_inner);
-
-            /* Reset the inner iterator to prepare the buffers */
-            if (NpyIter_Reset(iter_inner, NULL) != NPY_SUCCEED) {
-                goto fail;
-            }
+        /* Execute the loop with no iterators */
+        npy_intp count = PyArray_DIM(op[1], axis);
+        npy_intp stride0 = 0, stride1 = PyArray_STRIDE(op[1], axis);
 
-            needs_api = NpyIter_IterationNeedsAPI(iter_inner);
+        NPY_UF_DBG_PRINT("UFunc: Reduce loop with no iterators\n");
 
-            if (!needs_api) {
-                NPY_BEGIN_THREADS;
-            }
+        if (PyArray_NDIM(op[0]) != PyArray_NDIM(op[1]) ||
+                !PyArray_CompareLists(PyArray_DIMS(op[0]),
+                                      PyArray_DIMS(op[1]),
+                                      PyArray_NDIM(op[0]))) {
+            PyErr_SetString(PyExc_ValueError,
+                    "provided out is the wrong size "
+                    "for the reduction");
+            goto fail;
+        }
+        stride0 = PyArray_STRIDE(op[0], axis);
 
-            /* Copy the first element to start the reduction */
-            if (otype == NPY_OBJECT) {
-                Py_XDECREF(*(PyObject **)dataptr_inner[0]);
-                *(PyObject **)dataptr_inner[0] =
-                                    *(PyObject **)dataptr_inner[1];
-                Py_XINCREF(*(PyObject **)dataptr_inner[0]);
-            }
-            else {
-                memcpy(dataptr_inner[0], dataptr_inner[1], itemsize);
-            }
+        stride_copy[0] = stride0;
+        stride_copy[1] = stride1;
+        stride_copy[2] = stride0;
 
-            stride_copy[0] = 0;
-            stride_copy[2] = 0;
-            do {
-                count = *count_ptr_inner;
-                /* Turn the two items into three for the inner loop */
-                dataptr_copy[0] = dataptr_inner[0];
-                dataptr_copy[1] = dataptr_inner[1];
-                dataptr_copy[2] = dataptr_inner[0];
-                if (first) {
-                    --count;
-                    dataptr_copy[1] += stride_inner[1];
-                    first = 0;
-                }
-                stride_copy[1] = stride_inner[1];
-                NPY_UF_DBG_PRINT1("iterator loop count %d\n", (int)count);
-                innerloop(dataptr_copy, &count,
-                            stride_copy, innerloopdata);
-            } while(iternext_inner(iter_inner));
+        /* Turn the two items into three for the inner loop */
+        dataptr_copy[0] = PyArray_BYTES(op[0]);
+        dataptr_copy[1] = PyArray_BYTES(op[1]);
+        dataptr_copy[2] = PyArray_BYTES(op[0]);
 
-            if (!needs_api) {
-                NPY_END_THREADS;
-            }
+        /* Copy the first element to start the reduction */
+        if (otype == NPY_OBJECT) {
+            Py_XDECREF(*(PyObject **)dataptr_copy[0]);
+            *(PyObject **)dataptr_copy[0] =
+                                *(PyObject **)dataptr_copy[1];
+            Py_XINCREF(*(PyObject **)dataptr_copy[0]);
         }
-        /* Execute the loop with no iterators */
         else {
-            npy_intp count = PyArray_DIM(op[1], axis);
-            npy_intp stride0 = 0, stride1 = PyArray_STRIDE(op[1], axis);
-
-            NPY_UF_DBG_PRINT("UFunc: Reduce loop with no iterators\n");
+            memcpy(dataptr_copy[0], dataptr_copy[1], itemsize);
+        }
 
-            if (operation == UFUNC_REDUCE) {
-                if (PyArray_NDIM(op[0]) != 0) {
-                    PyErr_SetString(PyExc_ValueError,
-                            "provided out is the wrong size "
-                            "for the reduction");
-                    goto fail;
-                }
-            }
-            else if (operation == UFUNC_ACCUMULATE) {
-                if (PyArray_NDIM(op[0]) != PyArray_NDIM(op[1]) ||
-                        !PyArray_CompareLists(PyArray_DIMS(op[0]),
-                                              PyArray_DIMS(op[1]),
-                                              PyArray_NDIM(op[0]))) {
-                    PyErr_SetString(PyExc_ValueError,
-                            "provided out is the wrong size "
-                            "for the reduction");
-                    goto fail;
-                }
-                stride0 = PyArray_STRIDE(op[0], axis);
-            }
+        if (count > 1) {
+            --count;
+            dataptr_copy[1] += stride1;
+            dataptr_copy[2] += stride0;
 
-            stride_copy[0] = stride0;
-            stride_copy[1] = stride1;
-            stride_copy[2] = stride0;
+            NPY_UF_DBG_PRINT1("iterator loop count %d\n", (int)count);
 
-            /* Turn the two items into three for the inner loop */
-            dataptr_copy[0] = PyArray_BYTES(op[0]);
-            dataptr_copy[1] = PyArray_BYTES(op[1]);
-            dataptr_copy[2] = PyArray_BYTES(op[0]);
+            needs_api = PyDataType_REFCHK(op_dtypes[0]);
 
-            /* Copy the first element to start the reduction */
-            if (otype == NPY_OBJECT) {
-                Py_XDECREF(*(PyObject **)dataptr_copy[0]);
-                *(PyObject **)dataptr_copy[0] =
-                                    *(PyObject **)dataptr_copy[1];
-                Py_XINCREF(*(PyObject **)dataptr_copy[0]);
-            }
-            else {
-                memcpy(dataptr_copy[0], dataptr_copy[1], itemsize);
+            if (!needs_api) {
+                NPY_BEGIN_THREADS;
             }
 
-            if (count > 1) {
-                --count;
-                if (operation == UFUNC_REDUCE) {
-                    dataptr_copy[1] += stride1;
-                }
-                else if (operation == UFUNC_ACCUMULATE) {
-                    dataptr_copy[1] += stride1;
-                    dataptr_copy[2] += stride0;
-                }
-
-                NPY_UF_DBG_PRINT1("iterator loop count %d\n", (int)count);
-
-                needs_api = PyDataType_REFCHK(op_dtypes[0]);
-
-                if (!needs_api) {
-                    NPY_BEGIN_THREADS;
-                }
+            innerloop(dataptr_copy, &count,
+                        stride_copy, innerloopdata);
 
-                innerloop(dataptr_copy, &count,
-                            stride_copy, innerloopdata);
-
-                if (!needs_api) {
-                    NPY_END_THREADS;
-                }
+            if (!needs_api) {
+                NPY_END_THREADS;
             }
         }
     }
@@ -2988,30 +3246,6 @@ fail:
 }
 
 /*
- * We have two basic kinds of loops. One is used when arr is not-swapped
- * and aligned and output type is the same as input type.  The other uses
- * buffers when one of these is not satisfied.
- *
- *  Zero-length and one-length axes-to-be-reduced are handled separately.
- */
-static PyObject *
-PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
-        int axis, int otype)
-{
-    return PyUFunc_ReductionOp(self, arr, out, axis, otype,
-                                UFUNC_REDUCE, "reduce");
-}
-
-
-static PyObject *
-PyUFunc_Accumulate(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
-                   int axis, int otype)
-{
-    return PyUFunc_ReductionOp(self, arr, out, axis, otype,
-                                UFUNC_ACCUMULATE, "accumulate");
-}
-
-/*
  * Reduceat performs a reduce over an axis using the indices as a guide
  *
  * op.reduceat(array,indices)  computes
@@ -3031,8 +3265,8 @@ PyUFunc_Accumulate(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
  * output shape is based on the size of indices
  */
 static PyObject *
-PyUFunc_Reduceat(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *ind,
-                 PyArrayObject *out, int axis, int otype)
+PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
+                 PyArrayObject *out, int axis, int otype, int skipna)
 {
     PyArrayObject *op[3];
     PyArray_Descr *op_dtypes[3] = {NULL, NULL, NULL};
@@ -3052,7 +3286,7 @@ PyUFunc_Reduceat(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *ind,
     PyUFuncGenericFunction innerloop = NULL;
     void *innerloopdata = NULL;
 
-    char *ufunc_name = self->name ? self->name : "(unknown)";
+    char *ufunc_name = ufunc->name ? ufunc->name : "(unknown)";
     char *opname = "reduceat";
 
     /* These parameters come from extobj= or from a TLS global */
@@ -3061,6 +3295,12 @@ PyUFunc_Reduceat(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *ind,
 
     NPY_BEGIN_THREADS_DEF;
 
+    if (PyArray_HASMASKNA(arr)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                    "ufunc reduceat doesn't support NA masked arrays yet");
+        return NULL;
+    }
+
     reduceat_ind = (npy_intp *)PyArray_DATA(ind);
     ind_size = PyArray_DIM(ind, 0);
     red_axis_size = PyArray_DIM(arr, axis);
@@ -3092,7 +3332,7 @@ PyUFunc_Reduceat(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *ind,
     Py_XINCREF(out);
 
     otype_final = otype;
-    if (get_binary_op_function(self, &otype_final,
+    if (get_binary_op_function(ufunc, &otype_final,
                                 &innerloop, &innerloopdata) < 0) {
         PyArray_Descr *dtype = PyArray_DescrFromType(otype);
         PyErr_Format(PyExc_ValueError,
@@ -3393,38 +3633,43 @@ fail:
  * but they are handled separately for speed)
  */
 static PyObject *
-PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args,
+PyUFunc_GenericReduction(PyUFuncObject *ufunc, PyObject *args,
                          PyObject *kwds, int operation)
 {
-    int axis=0;
+    int i, naxes=0, ndim;
+    int axes[NPY_MAXDIMS];
+    PyObject *axes_in = NULL;
     PyArrayObject *mp, *ret = NULL;
     PyObject *op, *res = NULL;
     PyObject *obj_ind, *context;
     PyArrayObject *indices = NULL;
     PyArray_Descr *otype = NULL;
     PyArrayObject *out = NULL;
-    static char *kwlist1[] = {"array", "axis", "dtype", "out", NULL};
-    static char *kwlist2[] = {"array", "indices", "axis", "dtype", "out", NULL};
+    int skipna = 0, keepdims = 0;
+    static char *kwlist1[] = {"array", "axis", "dtype",
+                                "out", "skipna", "keepdims", NULL};
+    static char *kwlist2[] = {"array", "indices", "axis",
+                                "dtype", "out", "skipna", NULL};
     static char *_reduce_type[] = {"reduce", "accumulate", "reduceat", NULL};
 
-    if (self == NULL) {
+    if (ufunc == NULL) {
         PyErr_SetString(PyExc_ValueError, "function not supported");
         return NULL;
     }
-    if (self->core_enabled) {
+    if (ufunc->core_enabled) {
         PyErr_Format(PyExc_RuntimeError,
                      "Reduction not defined on ufunc with signature");
         return NULL;
     }
-    if (self->nin != 2) {
+    if (ufunc->nin != 2) {
         PyErr_Format(PyExc_ValueError,
                      "%s only supported for binary functions",
                      _reduce_type[operation]);
         return NULL;
     }
-    if (self->nout != 1) {
+    if (ufunc->nout != 1) {
         PyErr_Format(PyExc_ValueError,
-                     "%s only supported for functions " \
+                     "%s only supported for functions "
                      "returning a single value",
                      _reduce_type[operation]);
         return NULL;
@@ -3433,10 +3678,13 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args,
     if (operation == UFUNC_REDUCEAT) {
         PyArray_Descr *indtype;
         indtype = PyArray_DescrFromType(PyArray_INTP);
-        if(!PyArg_ParseTupleAndKeywords(args, kwds, "OO|iO&O&", kwlist2,
-                                        &op, &obj_ind, &axis,
+        if(!PyArg_ParseTupleAndKeywords(args, kwds, "OO|OO&O&i", kwlist2,
+                                        &op,
+                                        &obj_ind,
+                                        &axes_in,
                                         PyArray_DescrConverter2, &otype,
-                                        PyArray_OutputConverter, &out)) {
+                                        PyArray_OutputConverter, &out,
+                                        &skipna)) {
             Py_XDECREF(otype);
             return NULL;
         }
@@ -3447,35 +3695,46 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args,
             return NULL;
         }
     }
+    else if (operation == UFUNC_ACCUMULATE) {
+        if(!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO&O&ii", kwlist1,
+                                        &op,
+                                        &axes_in,
+                                        PyArray_DescrConverter2, &otype,
+                                        PyArray_OutputConverter, &out,
+                                        &skipna,
+                                        &keepdims)) {
+            Py_XDECREF(otype);
+            return NULL;
+        }
+    }
     else {
-        if(!PyArg_ParseTupleAndKeywords(args, kwds, "O|iO&O&", kwlist1,
-                                        &op, &axis,
+        if(!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO&O&ii", kwlist1,
+                                        &op,
+                                        &axes_in,
                                         PyArray_DescrConverter2, &otype,
-                                        PyArray_OutputConverter, &out)) {
+                                        PyArray_OutputAllowNAConverter, &out,
+                                        &skipna,
+                                        &keepdims)) {
             Py_XDECREF(otype);
             return NULL;
         }
     }
     /* Ensure input is an array */
     if (!PyArray_Check(op) && !PyArray_IsScalar(op, Generic)) {
-        context = Py_BuildValue("O(O)i", self, op, 0);
+        context = Py_BuildValue("O(O)i", ufunc, op, 0);
     }
     else {
         context = NULL;
     }
-    mp = (PyArrayObject *)PyArray_FromAny(op, NULL, 0, 0, 0, context);
+    mp = (PyArrayObject *)PyArray_FromAny(op, NULL, 0, 0,
+                                            NPY_ARRAY_ALLOWNA, context);
     Py_XDECREF(context);
     if (mp == NULL) {
         return NULL;
     }
-    /* Check to see if input is zero-dimensional */
-    if (PyArray_NDIM(mp) == 0) {
-        PyErr_Format(PyExc_TypeError, "cannot %s on a scalar",
-                     _reduce_type[operation]);
-        Py_XDECREF(otype);
-        Py_DECREF(mp);
-        return NULL;
-    }
+
+    ndim = PyArray_NDIM(mp);
+
     /* Check to see that type (and otype) is not FLEXIBLE */
     if (PyArray_ISFLEXIBLE(mp) ||
         (otype && PyTypeNum_ISFLEXIBLE(otype->type_num))) {
@@ -3487,15 +3746,110 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args,
         return NULL;
     }
 
-    if (axis < 0) {
-        axis += PyArray_NDIM(mp);
+    /* Convert the 'axis' parameter into a list of axes */
+    if (axes_in == NULL) {
+        naxes = 1;
+        axes[0] = 0;
+    }
+    /* Convert 'None' into all the axes */
+    else if (axes_in == Py_None) {
+        naxes = ndim;
+        for (i = 0; i < naxes; ++i) {
+            axes[i] = i;
+        }
+    }
+    else if (PyTuple_Check(axes_in)) {
+        naxes = PyTuple_Size(axes_in);
+        if (naxes < 0 || naxes > NPY_MAXDIMS) {
+            PyErr_SetString(PyExc_ValueError,
+                    "too many values for 'axis'");
+            Py_XDECREF(otype);
+            Py_DECREF(mp);
+            return NULL;
+        }
+        for (i = 0; i < naxes; ++i) {
+            PyObject *tmp = PyTuple_GET_ITEM(axes_in, i);
+            long axis = PyInt_AsLong(tmp);
+            if (axis == -1 && PyErr_Occurred()) {
+                Py_XDECREF(otype);
+                Py_DECREF(mp);
+                return NULL;
+            }
+            if (axis < 0) {
+                axis += ndim;
+            }
+            if (axis < 0 || axis >= ndim) {
+                PyErr_SetString(PyExc_ValueError,
+                        "'axis' entry is out of bounds");
+                Py_XDECREF(otype);
+                Py_DECREF(mp);
+                return NULL;
+            }
+            axes[i] = (int)axis;
+        }
+    }
+    /* Try to interpret axis as an integer */
+    else {
+        long axis = PyInt_AsLong(axes_in);
+        /* TODO: PyNumber_Index would be good to use here */
+        if (axis == -1 && PyErr_Occurred()) {
+            Py_XDECREF(otype);
+            Py_DECREF(mp);
+            return NULL;
+        }
+        if (axis < 0) {
+            axis += ndim;
+        }
+        /* Special case letting axis={0 or -1} slip through for scalars */
+        if (ndim == 0 && (axis == 0 || axis == -1)) {
+            axis = 0;
+        }
+        else if (axis < 0 || axis >= ndim) {
+            PyErr_SetString(PyExc_ValueError,
+                    "'axis' entry is out of bounds");
+            Py_XDECREF(otype);
+            Py_DECREF(mp);
+            return NULL;
+        }
+        axes[0] = (int)axis;
+        naxes = 1;
     }
-    if (axis < 0 || axis >= PyArray_NDIM(mp)) {
-        PyErr_SetString(PyExc_ValueError, "axis not in array");
+
+    /* Check to see if input is zero-dimensional. */
+    if (ndim == 0) {
+        /*
+         * A reduction with no axes is still valid but trivial.
+         * As a special case for backwards compatibility in 'sum',
+         * 'prod', et al, also allow a reduction where axis=0, even
+         * though this is technically incorrect.
+         */
+        if (operation == UFUNC_REDUCE &&
+                    (naxes == 0 || (naxes == 1 && axes[0] == 0))) {
+            Py_XDECREF(otype);
+            /* If there's an output parameter, copy the value */
+            if (out != NULL) {
+                if (PyArray_CopyInto(out, mp) < 0) {
+                    Py_DECREF(mp);
+                    return NULL;
+                }
+                else {
+                    Py_DECREF(mp);
+                    Py_INCREF(out);
+                    return (PyObject *)out;
+                }
+            }
+            /* Otherwise return the array unscathed */
+            else {
+                return PyArray_Return(mp);
+            }
+        }
+        PyErr_Format(PyExc_TypeError, "cannot %s on a scalar",
+                     _reduce_type[operation]);
         Py_XDECREF(otype);
         Py_DECREF(mp);
         return NULL;
     }
+
      /*
       * If out is specified it determines otype
       * unless otype already specified.
@@ -3511,17 +3865,17 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args,
          */
         int typenum = PyArray_TYPE(mp);
         if ((PyTypeNum_ISBOOL(typenum) || PyTypeNum_ISINTEGER(typenum))
-            && ((strcmp(self->name,"add") == 0)
-                || (strcmp(self->name,"multiply") == 0))) {
+            && ((strcmp(ufunc->name,"add") == 0)
+                || (strcmp(ufunc->name,"multiply") == 0))) {
             if (PyTypeNum_ISBOOL(typenum)) {
-                typenum = PyArray_LONG;
+                typenum = NPY_LONG;
             }
             else if ((size_t)PyArray_DESCR(mp)->elsize < sizeof(long)) {
                 if (PyTypeNum_ISUNSIGNED(typenum)) {
-                    typenum = PyArray_ULONG;
+                    typenum = NPY_ULONG;
                 }
                 else {
-                    typenum = PyArray_LONG;
+                    typenum = NPY_LONG;
                 }
             }
         }
@@ -3531,24 +3885,45 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args,
 
     switch(operation) {
     case UFUNC_REDUCE:
-        ret = (PyArrayObject *)PyUFunc_Reduce(self, mp, out, axis,
-                                              otype->type_num);
+        ret = PyUFunc_Reduce(ufunc, mp, out, naxes, axes,
+                                          otype, skipna, keepdims);
         break;
     case UFUNC_ACCUMULATE:
-        ret = (PyArrayObject *)PyUFunc_Accumulate(self, mp, out, axis,
-                                                  otype->type_num);
+        if (naxes != 1) {
+            PyErr_SetString(PyExc_ValueError,
+                        "accumulate does not allow multiple axes");
+            Py_XDECREF(otype);
+            Py_DECREF(mp);
+            return NULL;
+        }
+        ret = (PyArrayObject *)PyUFunc_Accumulate(ufunc, mp, out, axes[0],
+                                                  otype->type_num, skipna);
         break;
     case UFUNC_REDUCEAT:
-        ret = (PyArrayObject *)PyUFunc_Reduceat(self, mp, indices, out,
-                                                axis, otype->type_num);
+        if (naxes != 1) {
+            PyErr_SetString(PyExc_ValueError,
+                        "reduceat does not allow multiple axes");
+            Py_XDECREF(otype);
+            Py_DECREF(mp);
+            return NULL;
+        }
+        ret = (PyArrayObject *)PyUFunc_Reduceat(ufunc, mp, indices, out,
+                                            axes[0], otype->type_num, skipna);
         Py_DECREF(indices);
         break;
     }
     Py_DECREF(mp);
     Py_DECREF(otype);
+
     if (ret == NULL) {
         return NULL;
     }
+
+    /* If an output parameter was provided, don't wrap it */
+    if (out != NULL) {
+        return (PyObject *)ret;
+    }
+
     if (Py_TYPE(op) != Py_TYPE(ret)) {
         res = PyObject_CallMethod(op, "__array_wrap__", "O", ret);
         if (res == NULL) {
@@ -3704,7 +4079,7 @@ _find_array_wrap(PyObject *args, PyObject *kwds,
 
 
 static PyObject *
-ufunc_generic_call(PyUFuncObject *self, PyObject *args, PyObject *kwds)
+ufunc_generic_call(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
 {
     int i;
     PyTupleObject *ret;
@@ -3718,19 +4093,19 @@ ufunc_generic_call(PyUFuncObject *self, PyObject *args, PyObject *kwds)
      * Initialize all array objects to NULL to make cleanup easier
      * if something goes wrong.
      */
-    for(i = 0; i < self->nargs; i++) {
+    for(i = 0; i < ufunc->nargs; i++) {
         mps[i] = NULL;
     }
 
-    errval = PyUFunc_GenericFunction(self, args, kwds, mps);
+    errval = PyUFunc_GenericFunction(ufunc, args, kwds, mps);
     if (errval < 0) {
-        for (i = 0; i < self->nargs; i++) {
+        for (i = 0; i < ufunc->nargs; i++) {
             PyArray_XDECREF_ERR(mps[i]);
         }
         if (errval == -1) {
             return NULL;
         }
-        else if (self->nin == 2 && self->nout == 1) {
+        else if (ufunc->nin == 2 && ufunc->nout == 1) {
             /* To allow the other argument to be given a chance */
             Py_INCREF(Py_NotImplemented);
             return Py_NotImplemented;
@@ -3743,7 +4118,7 @@ ufunc_generic_call(PyUFuncObject *self, PyObject *args, PyObject *kwds)
     }
 
     /* Free the input references */
-    for (i = 0; i < self->nin; i++) {
+    for (i = 0; i < ufunc->nin; i++) {
         Py_XDECREF(mps[i]);
     }
 
@@ -3764,11 +4139,11 @@ ufunc_generic_call(PyUFuncObject *self, PyObject *args, PyObject *kwds)
      * None --- array-object passed in don't call PyArray_Return
      * method --- the __array_wrap__ method to call.
      */
-    _find_array_wrap(args, kwds, wraparr, self->nin, self->nout);
+    _find_array_wrap(args, kwds, wraparr, ufunc->nin, ufunc->nout);
 
     /* wrap outputs */
-    for (i = 0; i < self->nout; i++) {
-        int j = self->nin+i;
+    for (i = 0; i < ufunc->nout; i++) {
+        int j = ufunc->nin+i;
         PyObject *wrap = wraparr[i];
 
         if (wrap != NULL) {
@@ -3777,7 +4152,7 @@ ufunc_generic_call(PyUFuncObject *self, PyObject *args, PyObject *kwds)
                 retobj[i] = (PyObject *)mps[j];
                 continue;
             }
-            res = PyObject_CallFunction(wrap, "O(OOi)", mps[j], self, args, i);
+            res = PyObject_CallFunction(wrap, "O(OOi)", mps[j], ufunc, args, i);
             if (res == NULL && PyErr_ExceptionMatches(PyExc_TypeError)) {
                 PyErr_Clear();
                 res = PyObject_CallFunctionObjArgs(wrap, mps[j], NULL);
@@ -3795,23 +4170,26 @@ ufunc_generic_call(PyUFuncObject *self, PyObject *args, PyObject *kwds)
                 continue;
             }
         }
-        /* default behavior */
-        retobj[i] = PyArray_Return(mps[j]);
+        else {
+            /* default behavior */
+            retobj[i] = PyArray_Return(mps[j]);
+        }
+
     }
 
-    if (self->nout == 1) {
+    if (ufunc->nout == 1) {
         return retobj[0];
     }
     else {
-        ret = (PyTupleObject *)PyTuple_New(self->nout);
-        for (i = 0; i < self->nout; i++) {
+        ret = (PyTupleObject *)PyTuple_New(ufunc->nout);
+        for (i = 0; i < ufunc->nout; i++) {
             PyTuple_SET_ITEM(ret, i, retobj[i]);
         }
         return (PyObject *)ret;
     }
 
 fail:
-    for (i = self->nin; i < self->nargs; i++) {
+    for (i = ufunc->nin; i < ufunc->nargs; i++) {
         Py_XDECREF(mps[i]);
     }
     return NULL;
@@ -3967,59 +4345,61 @@ PyUFunc_FromFuncAndDataAndSignature(PyUFuncGenericFunction *func, void **data,
                                      char *name, char *doc,
                                      int check_return, const char *signature)
 {
-    PyUFuncObject *self;
+    PyUFuncObject *ufunc;
 
-    self = PyArray_malloc(sizeof(PyUFuncObject));
-    if (self == NULL) {
+    ufunc = PyArray_malloc(sizeof(PyUFuncObject));
+    if (ufunc == NULL) {
         return NULL;
     }
-    PyObject_Init((PyObject *)self, &PyUFunc_Type);
+    PyObject_Init((PyObject *)ufunc, &PyUFunc_Type);
 
-    self->nin = nin;
-    self->nout = nout;
-    self->nargs = nin+nout;
-    self->identity = identity;
+    ufunc->nin = nin;
+    ufunc->nout = nout;
+    ufunc->nargs = nin+nout;
+    ufunc->identity = identity;
 
-    self->functions = func;
-    self->data = data;
-    self->types = types;
-    self->ntypes = ntypes;
-    self->check_return = check_return;
-    self->ptr = NULL;
-    self->obj = NULL;
-    self->userloops=NULL;
+    ufunc->functions = func;
+    ufunc->data = data;
+    ufunc->types = types;
+    ufunc->ntypes = ntypes;
+    ufunc->check_return = check_return;
+    ufunc->ptr = NULL;
+    ufunc->obj = NULL;
+    ufunc->userloops=NULL;
 
-    self->type_resolution_function = &PyUFunc_DefaultTypeResolution;
-    self->type_resolution_masked_function =
-                            &PyUFunc_DefaultTypeResolutionMasked;
+    /* Type resolution and inner loop selection functions */
+    ufunc->type_resolver = &PyUFunc_DefaultTypeResolver;
+    ufunc->legacy_inner_loop_selector = &PyUFunc_DefaultLegacyInnerLoopSelector;
+    ufunc->inner_loop_selector = NULL;
+    ufunc->masked_inner_loop_selector = &PyUFunc_DefaultMaskedInnerLoopSelector;
 
     if (name == NULL) {
-        self->name = "?";
+        ufunc->name = "?";
     }
     else {
-        self->name = name;
+        ufunc->name = name;
     }
     if (doc == NULL) {
-        self->doc = "NULL";
+        ufunc->doc = "NULL";
     }
     else {
-        self->doc = doc;
+        ufunc->doc = doc;
     }
 
     /* generalized ufunc */
-    self->core_enabled = 0;
-    self->core_num_dim_ix = 0;
-    self->core_num_dims = NULL;
-    self->core_dim_ixs = NULL;
-    self->core_offsets = NULL;
-    self->core_signature = NULL;
+    ufunc->core_enabled = 0;
+    ufunc->core_num_dim_ix = 0;
+    ufunc->core_num_dims = NULL;
+    ufunc->core_dim_ixs = NULL;
+    ufunc->core_offsets = NULL;
+    ufunc->core_signature = NULL;
     if (signature != NULL) {
-        if (_parse_signature(self, signature) != 0) {
-            Py_DECREF(self);
+        if (_parse_signature(ufunc, signature) != 0) {
+            Py_DECREF(ufunc);
             return NULL;
         }
     }
-    return (PyObject *)self;
+    return (PyObject *)ufunc;
 }
 
 /* Specify that the loop specified by the given index should use the array of
@@ -4037,8 +4417,14 @@ PyUFunc_SetUsesArraysAsData(void **data, size_t i)
  * Return 1 if the given data pointer for the loop specifies that it needs the
  * arrays as the data pointer.
  *
- * NOTE: This is easier to specify with the type_resolution_function
+ * NOTE: This is easier to specify with the type_resolver
  *       in the ufunc object.
+ *
+ * TODO: Remove this. It is already effectively broken by the
+ *       addition of the masked inner loops, and is not worth
+ *       fixing because the new loop selection functions have
+ *       access to the full dtypes and can dynamically allocate
+ *       arbitrary auxiliary data.
  */
 static int
 _does_loop_use_arrays(void *data)
@@ -4232,34 +4618,34 @@ PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc,
 
 
 static void
-ufunc_dealloc(PyUFuncObject *self)
+ufunc_dealloc(PyUFuncObject *ufunc)
 {
-    if (self->core_num_dims) {
-        PyArray_free(self->core_num_dims);
+    if (ufunc->core_num_dims) {
+        PyArray_free(ufunc->core_num_dims);
     }
-    if (self->core_dim_ixs) {
-        PyArray_free(self->core_dim_ixs);
+    if (ufunc->core_dim_ixs) {
+        PyArray_free(ufunc->core_dim_ixs);
     }
-    if (self->core_offsets) {
-        PyArray_free(self->core_offsets);
+    if (ufunc->core_offsets) {
+        PyArray_free(ufunc->core_offsets);
     }
-    if (self->core_signature) {
-        PyArray_free(self->core_signature);
+    if (ufunc->core_signature) {
+        PyArray_free(ufunc->core_signature);
     }
-    if (self->ptr) {
-        PyArray_free(self->ptr);
+    if (ufunc->ptr) {
+        PyArray_free(ufunc->ptr);
     }
-    Py_XDECREF(self->userloops);
-    Py_XDECREF(self->obj);
-    PyArray_free(self);
+    Py_XDECREF(ufunc->userloops);
+    Py_XDECREF(ufunc->obj);
+    PyArray_free(ufunc);
 }
 
 static PyObject *
-ufunc_repr(PyUFuncObject *self)
+ufunc_repr(PyUFuncObject *ufunc)
 {
     char buf[100];
 
-    sprintf(buf, "<ufunc '%.50s'>", self->name);
+    sprintf(buf, "<ufunc '%.50s'>", ufunc->name);
     return PyUString_FromString(buf);
 }
 
@@ -4276,7 +4662,7 @@ ufunc_repr(PyUFuncObject *self)
  * The result has dimensions a.ndim + b.ndim
  */
 static PyObject *
-ufunc_outer(PyUFuncObject *self, PyObject *args, PyObject *kwds)
+ufunc_outer(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
 {
     int i;
     PyObject *ret;
@@ -4284,14 +4670,14 @@ ufunc_outer(PyUFuncObject *self, PyObject *args, PyObject *kwds)
     PyObject *new_args, *tmp;
     PyObject *shape1, *shape2, *newshape;
 
-    if (self->core_enabled) {
+    if (ufunc->core_enabled) {
         PyErr_Format(PyExc_TypeError,
                      "method outer is not allowed in ufunc with non-trivial"\
                      " signature");
         return NULL;
     }
 
-    if(self->nin != 2) {
+    if(ufunc->nin != 2) {
         PyErr_SetString(PyExc_ValueError,
                         "outer product only supported "\
                         "for binary functions");
@@ -4354,7 +4740,7 @@ ufunc_outer(PyUFuncObject *self, PyObject *args, PyObject *kwds)
     Py_DECREF(ap1);
     Py_DECREF(ap2);
     Py_DECREF(ap_new);
-    ret = ufunc_generic_call(self, new_args, kwds);
+    ret = ufunc_generic_call(ufunc, new_args, kwds);
     Py_DECREF(new_args);
     return ret;
 
@@ -4367,21 +4753,21 @@ ufunc_outer(PyUFuncObject *self, PyObject *args, PyObject *kwds)
 
 
 static PyObject *
-ufunc_reduce(PyUFuncObject *self, PyObject *args, PyObject *kwds)
+ufunc_reduce(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
 {
-    return PyUFunc_GenericReduction(self, args, kwds, UFUNC_REDUCE);
+    return PyUFunc_GenericReduction(ufunc, args, kwds, UFUNC_REDUCE);
 }
 
 static PyObject *
-ufunc_accumulate(PyUFuncObject *self, PyObject *args, PyObject *kwds)
+ufunc_accumulate(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
 {
-    return PyUFunc_GenericReduction(self, args, kwds, UFUNC_ACCUMULATE);
+    return PyUFunc_GenericReduction(ufunc, args, kwds, UFUNC_ACCUMULATE);
 }
 
 static PyObject *
-ufunc_reduceat(PyUFuncObject *self, PyObject *args, PyObject *kwds)
+ufunc_reduceat(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
 {
-    return PyUFunc_GenericReduction(self, args, kwds, UFUNC_REDUCEAT);
+    return PyUFunc_GenericReduction(ufunc, args, kwds, UFUNC_REDUCEAT);
 }
 
 
@@ -4442,7 +4828,7 @@ _typecharfromnum(int num) {
 }
 
 static PyObject *
-ufunc_get_doc(PyUFuncObject *self)
+ufunc_get_doc(PyUFuncObject *ufunc)
 {
     /*
      * Put docstring first or FindMethod finds it... could so some
@@ -4451,20 +4837,20 @@ ufunc_get_doc(PyUFuncObject *self)
      * construct name(x1, x2, ...,[ out1, out2, ...]) __doc__
      */
     PyObject *outargs, *inargs, *doc;
-    outargs = _makeargs(self->nout, "out", 1);
-    inargs = _makeargs(self->nin, "x", 0);
+    outargs = _makeargs(ufunc->nout, "out", 1);
+    inargs = _makeargs(ufunc->nin, "x", 0);
     if (outargs == NULL) {
         doc = PyUString_FromFormat("%s(%s)\n\n%s",
-                                   self->name,
+                                   ufunc->name,
                                    PyString_AS_STRING(inargs),
-                                   self->doc);
+                                   ufunc->doc);
     }
     else {
         doc = PyUString_FromFormat("%s(%s[, %s])\n\n%s",
-                                   self->name,
+                                   ufunc->name,
                                    PyString_AS_STRING(inargs),
                                    PyString_AS_STRING(outargs),
-                                   self->doc);
+                                   ufunc->doc);
         Py_DECREF(outargs);
     }
     Py_DECREF(inargs);
@@ -4472,38 +4858,38 @@ ufunc_get_doc(PyUFuncObject *self)
 }
 
 static PyObject *
-ufunc_get_nin(PyUFuncObject *self)
+ufunc_get_nin(PyUFuncObject *ufunc)
 {
-    return PyInt_FromLong(self->nin);
+    return PyInt_FromLong(ufunc->nin);
 }
 
 static PyObject *
-ufunc_get_nout(PyUFuncObject *self)
+ufunc_get_nout(PyUFuncObject *ufunc)
 {
-    return PyInt_FromLong(self->nout);
+    return PyInt_FromLong(ufunc->nout);
 }
 
 static PyObject *
-ufunc_get_nargs(PyUFuncObject *self)
+ufunc_get_nargs(PyUFuncObject *ufunc)
 {
-    return PyInt_FromLong(self->nargs);
+    return PyInt_FromLong(ufunc->nargs);
 }
 
 static PyObject *
-ufunc_get_ntypes(PyUFuncObject *self)
+ufunc_get_ntypes(PyUFuncObject *ufunc)
 {
-    return PyInt_FromLong(self->ntypes);
+    return PyInt_FromLong(ufunc->ntypes);
 }
 
 static PyObject *
-ufunc_get_types(PyUFuncObject *self)
+ufunc_get_types(PyUFuncObject *ufunc)
 {
     /* return a list with types grouped input->output */
     PyObject *list;
     PyObject *str;
-    int k, j, n, nt = self->ntypes;
-    int ni = self->nin;
-    int no = self->nout;
+    int k, j, n, nt = ufunc->ntypes;
+    int ni = ufunc->nin;
+    int no = ufunc->nout;
     char *t;
     list = PyList_New(nt);
     if (list == NULL) {
@@ -4513,13 +4899,13 @@ ufunc_get_types(PyUFuncObject *self)
     n = 0;
     for (k = 0; k < nt; k++) {
         for (j = 0; j<ni; j++) {
-            t[j] = _typecharfromnum(self->types[n]);
+            t[j] = _typecharfromnum(ufunc->types[n]);
             n++;
         }
         t[ni] = '-';
         t[ni+1] = '>';
         for (j = 0; j < no; j++) {
-            t[ni + 2 + j] = _typecharfromnum(self->types[n]);
+            t[ni + 2 + j] = _typecharfromnum(ufunc->types[n]);
             n++;
         }
         str = PyUString_FromStringAndSize(t, no + ni + 2);
@@ -4530,15 +4916,15 @@ ufunc_get_types(PyUFuncObject *self)
 }
 
 static PyObject *
-ufunc_get_name(PyUFuncObject *self)
+ufunc_get_name(PyUFuncObject *ufunc)
 {
-    return PyUString_FromString(self->name);
+    return PyUString_FromString(ufunc->name);
 }
 
 static PyObject *
-ufunc_get_identity(PyUFuncObject *self)
+ufunc_get_identity(PyUFuncObject *ufunc)
 {
-    switch(self->identity) {
+    switch(ufunc->identity) {
     case PyUFunc_One:
         return PyInt_FromLong(1);
     case PyUFunc_Zero:
@@ -4548,12 +4934,12 @@ ufunc_get_identity(PyUFuncObject *self)
 }
 
 static PyObject *
-ufunc_get_signature(PyUFuncObject *self)
+ufunc_get_signature(PyUFuncObject *ufunc)
 {
-    if (!self->core_enabled) {
+    if (!ufunc->core_enabled) {
         Py_RETURN_NONE;
     }
-    return PyUString_FromString(self->core_signature);
+    return PyUString_FromString(ufunc->core_signature);
 }
 
 #undef _typecharfromnum
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index fb7352070..0d6cf19f1 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -122,19 +122,16 @@ ensure_dtype_nbo(PyArray_Descr *type)
 /*UFUNC_API
  *
  * This function applies the default type resolution rules
- * for the provided ufunc, filling out_dtypes, out_innerloop,
- * and out_innerloopdata.
+ * for the provided ufunc.
  *
  * Returns 0 on success, -1 on error.
  */
 NPY_NO_EXPORT int
-PyUFunc_DefaultTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_DefaultTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata)
+                                PyArray_Descr **out_dtypes)
 {
     int i, nop = ufunc->nin + ufunc->nout;
     int retval = 0, any_object = 0;
@@ -151,20 +148,20 @@ PyUFunc_DefaultTypeResolution(PyUFuncObject *ufunc,
     /*
      * Decide the casting rules for inputs and outputs.  We want
      * NPY_SAFE_CASTING or stricter, so that the loop selection code
-     * doesn't choose an integer loop for float inputs, for example.
+     * doesn't choose an integer loop for float inputs, or a float32
+     * loop for float64 inputs.
      */
     input_casting = (casting > NPY_SAFE_CASTING) ? NPY_SAFE_CASTING : casting;
 
     if (type_tup == NULL) {
         /* Find the best ufunc inner loop, and fill in the dtypes */
-        retval = find_best_ufunc_inner_loop(ufunc, operands,
+        retval = linear_search_type_resolver(ufunc, operands,
                         input_casting, casting, any_object,
-                        out_dtypes, out_innerloop, out_innerloopdata);
+                        out_dtypes);
     } else {
         /* Find the specified ufunc inner loop, and fill in the dtypes */
-        retval = find_specified_ufunc_inner_loop(ufunc, type_tup,
-                        operands, casting, any_object, out_dtypes,
-                        out_innerloop, out_innerloopdata);
+        retval = type_tuple_type_resolver(ufunc, type_tup,
+                        operands, casting, any_object, out_dtypes);
     }
 
     return retval;
@@ -176,22 +173,16 @@ PyUFunc_DefaultTypeResolution(PyUFuncObject *ufunc,
  * PyArray_ResultType instead of a linear search to get the best
  * loop.
  *
- * Note that a simpler linear search through the functions loop
- * is still done, but switching to a simple array lookup for
- * built-in types would be better at some point.
- *
  * Returns 0 on success, -1 on error.
  */
 NPY_NO_EXPORT int
-PyUFunc_SimpleBinaryComparisonTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_SimpleBinaryComparisonTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata)
+                                PyArray_Descr **out_dtypes)
 {
-    int i, type_num, type_num1, type_num2;
+    int i, type_num1, type_num2;
     char *ufunc_name;
 
     ufunc_name = ufunc->name ? ufunc->name : "<unnamed ufunc>";
@@ -212,8 +203,8 @@ PyUFunc_SimpleBinaryComparisonTypeResolution(PyUFuncObject *ufunc,
     type_num2 = PyArray_DESCR(operands[1])->type_num;
     if (type_num1 >= NPY_NTYPES || type_num2 >= NPY_NTYPES ||
             type_num1 == NPY_OBJECT || type_num2 == NPY_OBJECT) {
-        return PyUFunc_DefaultTypeResolution(ufunc, casting, operands,
-                type_tup, out_dtypes, out_innerloop, out_innerloopdata);
+        return PyUFunc_DefaultTypeResolver(ufunc, casting, operands,
+                type_tup, out_dtypes);
     }
 
     if (type_tup == NULL) {
@@ -226,24 +217,30 @@ PyUFunc_SimpleBinaryComparisonTypeResolution(PyUFuncObject *ufunc,
         Py_INCREF(out_dtypes[1]);
     }
     else {
+        PyObject *item;
+        PyArray_Descr *dtype = NULL;
+
         /*
          * If the type tuple isn't a single-element tuple, let the
          * default type resolution handle this one.
          */
         if (!PyTuple_Check(type_tup) || PyTuple_GET_SIZE(type_tup) != 1) {
-            return PyUFunc_DefaultTypeResolution(ufunc, casting,
-                    operands, type_tup, out_dtypes,
-                    out_innerloop, out_innerloopdata);
+            return PyUFunc_DefaultTypeResolver(ufunc, casting,
+                    operands, type_tup, out_dtypes);
         }
 
-        if (!PyArray_DescrCheck(PyTuple_GET_ITEM(type_tup, 0))) {
+        item = PyTuple_GET_ITEM(type_tup, 0);
+
+        if (item == Py_None) {
             PyErr_SetString(PyExc_ValueError,
                     "require data type in the type tuple");
             return -1;
         }
+        else if (!PyArray_DescrConverter(item, &dtype)) {
+            return -1;
+        }
 
-        out_dtypes[0] = ensure_dtype_nbo(
-                            (PyArray_Descr *)PyTuple_GET_ITEM(type_tup, 0));
+        out_dtypes[0] = ensure_dtype_nbo(dtype);
         if (out_dtypes[0] == NULL) {
             return -1;
         }
@@ -270,31 +267,7 @@ PyUFunc_SimpleBinaryComparisonTypeResolution(PyUFuncObject *ufunc,
         return -1;
     }
 
-    type_num = out_dtypes[0]->type_num;
-
-    /* If we have a built-in type, search in the functions list */
-    if (type_num < NPY_NTYPES) {
-        char *types = ufunc->types;
-        int n = ufunc->ntypes;
-
-        for (i = 0; i < n; ++i) {
-            if (types[3*i] == type_num) {
-                *out_innerloop = ufunc->functions[i];
-                *out_innerloopdata = ufunc->data[i];
-                return 0;
-            }
-        }
-
-        PyErr_Format(PyExc_TypeError,
-                "ufunc '%s' not supported for the input types",
-                ufunc_name);
-        return -1;
-    }
-    else {
-        PyErr_SetString(PyExc_RuntimeError,
-                "user type shouldn't have resulted from type promotion");
-        return -1;
-    }
+    return 0;
 }
 
 /*
@@ -309,15 +282,13 @@ PyUFunc_SimpleBinaryComparisonTypeResolution(PyUFuncObject *ufunc,
  * Returns 0 on success, -1 on error.
  */
 NPY_NO_EXPORT int
-PyUFunc_SimpleUnaryOperationTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_SimpleUnaryOperationTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata)
+                                PyArray_Descr **out_dtypes)
 {
-    int i, type_num, type_num1;
+    int i, type_num1;
     char *ufunc_name;
 
     ufunc_name = ufunc->name ? ufunc->name : "<unnamed ufunc>";
@@ -336,8 +307,8 @@ PyUFunc_SimpleUnaryOperationTypeResolution(PyUFuncObject *ufunc,
      */
     type_num1 = PyArray_DESCR(operands[0])->type_num;
     if (type_num1 >= NPY_NTYPES || type_num1 == NPY_OBJECT) {
-        return PyUFunc_DefaultTypeResolution(ufunc, casting, operands,
-                type_tup, out_dtypes, out_innerloop, out_innerloopdata);
+        return PyUFunc_DefaultTypeResolver(ufunc, casting, operands,
+                type_tup, out_dtypes);
     }
 
     if (type_tup == NULL) {
@@ -350,24 +321,30 @@ PyUFunc_SimpleUnaryOperationTypeResolution(PyUFuncObject *ufunc,
         Py_INCREF(out_dtypes[1]);
     }
     else {
+        PyObject *item;
+        PyArray_Descr *dtype = NULL;
+
         /*
          * If the type tuple isn't a single-element tuple, let the
          * default type resolution handle this one.
          */
         if (!PyTuple_Check(type_tup) || PyTuple_GET_SIZE(type_tup) != 1) {
-            return PyUFunc_DefaultTypeResolution(ufunc, casting,
-                    operands, type_tup, out_dtypes,
-                    out_innerloop, out_innerloopdata);
+            return PyUFunc_DefaultTypeResolver(ufunc, casting,
+                    operands, type_tup, out_dtypes);
         }
 
-        if (!PyArray_DescrCheck(PyTuple_GET_ITEM(type_tup, 0))) {
+        item = PyTuple_GET_ITEM(type_tup, 0);
+
+        if (item == Py_None) {
             PyErr_SetString(PyExc_ValueError,
                     "require data type in the type tuple");
             return -1;
         }
+        else if (!PyArray_DescrConverter(item, &dtype)) {
+            return -1;
+        }
 
-        out_dtypes[0] = ensure_dtype_nbo(
-                            (PyArray_Descr *)PyTuple_GET_ITEM(type_tup, 0));
+        out_dtypes[0] = ensure_dtype_nbo(dtype);
         if (out_dtypes[0] == NULL) {
             return -1;
         }
@@ -384,31 +361,7 @@ PyUFunc_SimpleUnaryOperationTypeResolution(PyUFuncObject *ufunc,
         return -1;
     }
 
-    type_num = out_dtypes[0]->type_num;
-
-    /* If we have a built-in type, search in the functions list */
-    if (type_num < NPY_NTYPES) {
-        char *types = ufunc->types;
-        int n = ufunc->ntypes;
-
-        for (i = 0; i < n; ++i) {
-            if (types[2*i] == type_num) {
-                *out_innerloop = ufunc->functions[i];
-                *out_innerloopdata = ufunc->data[i];
-                return 0;
-            }
-        }
-
-        PyErr_Format(PyExc_TypeError,
-                "ufunc '%s' not supported for the input types",
-                ufunc_name);
-        return -1;
-    }
-    else {
-        PyErr_SetString(PyExc_RuntimeError,
-                "user type shouldn't have resulted from type promotion");
-        return -1;
-    }
+    return 0;
 }
 
 /*
@@ -417,18 +370,15 @@ PyUFunc_SimpleUnaryOperationTypeResolution(PyUFuncObject *ufunc,
  * casting.
  */
 NPY_NO_EXPORT int
-PyUFunc_OnesLikeTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_OnesLikeTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING NPY_UNUSED(casting),
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata)
+                                PyArray_Descr **out_dtypes)
 {
-    return PyUFunc_SimpleUnaryOperationTypeResolution(ufunc,
+    return PyUFunc_SimpleUnaryOperationTypeResolver(ufunc,
                         NPY_UNSAFE_CASTING,
-                        operands, type_tup, out_dtypes,
-                        out_innerloop, out_innerloopdata);
+                        operands, type_tup, out_dtypes);
 }
 
 
@@ -445,15 +395,13 @@ PyUFunc_OnesLikeTypeResolution(PyUFuncObject *ufunc,
  * Returns 0 on success, -1 on error.
  */
 NPY_NO_EXPORT int
-PyUFunc_SimpleBinaryOperationTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_SimpleBinaryOperationTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata)
+                                PyArray_Descr **out_dtypes)
 {
-    int i, type_num, type_num1, type_num2;
+    int i, type_num1, type_num2;
     char *ufunc_name;
 
     ufunc_name = ufunc->name ? ufunc->name : "<unnamed ufunc>";
@@ -474,8 +422,8 @@ PyUFunc_SimpleBinaryOperationTypeResolution(PyUFuncObject *ufunc,
     type_num2 = PyArray_DESCR(operands[1])->type_num;
     if (type_num1 >= NPY_NTYPES || type_num2 >= NPY_NTYPES ||
             type_num1 == NPY_OBJECT || type_num2 == NPY_OBJECT) {
-        return PyUFunc_DefaultTypeResolution(ufunc, casting, operands,
-                type_tup, out_dtypes, out_innerloop, out_innerloopdata);
+        return PyUFunc_DefaultTypeResolver(ufunc, casting, operands,
+                type_tup, out_dtypes);
     }
 
     if (type_tup == NULL) {
@@ -490,24 +438,30 @@ PyUFunc_SimpleBinaryOperationTypeResolution(PyUFuncObject *ufunc,
         Py_INCREF(out_dtypes[2]);
     }
     else {
+        PyObject *item;
+        PyArray_Descr *dtype = NULL;
+
         /*
          * If the type tuple isn't a single-element tuple, let the
          * default type resolution handle this one.
          */
         if (!PyTuple_Check(type_tup) || PyTuple_GET_SIZE(type_tup) != 1) {
-            return PyUFunc_DefaultTypeResolution(ufunc, casting,
-                    operands, type_tup, out_dtypes,
-                    out_innerloop, out_innerloopdata);
+            return PyUFunc_DefaultTypeResolver(ufunc, casting,
+                    operands, type_tup, out_dtypes);
         }
 
-        if (!PyArray_DescrCheck(PyTuple_GET_ITEM(type_tup, 0))) {
+        item = PyTuple_GET_ITEM(type_tup, 0);
+
+        if (item == Py_None) {
             PyErr_SetString(PyExc_ValueError,
                     "require data type in the type tuple");
             return -1;
         }
+        else if (!PyArray_DescrConverter(item, &dtype)) {
+            return -1;
+        }
 
-        out_dtypes[0] = ensure_dtype_nbo(
-                            (PyArray_Descr *)PyTuple_GET_ITEM(type_tup, 0));
+        out_dtypes[0] = ensure_dtype_nbo(dtype);
         if (out_dtypes[0] == NULL) {
             return -1;
         }
@@ -526,31 +480,7 @@ PyUFunc_SimpleBinaryOperationTypeResolution(PyUFuncObject *ufunc,
         return -1;
     }
 
-    type_num = out_dtypes[0]->type_num;
-
-    /* If we have a built-in type, search in the functions list */
-    if (type_num < NPY_NTYPES) {
-        char *types = ufunc->types;
-        int n = ufunc->ntypes;
-
-        for (i = 0; i < n; ++i) {
-            if (types[3*i] == type_num) {
-                *out_innerloop = ufunc->functions[i];
-                *out_innerloopdata = ufunc->data[i];
-                return 0;
-            }
-        }
-
-        PyErr_Format(PyExc_TypeError,
-                "ufunc '%s' not supported for the input types",
-                ufunc_name);
-        return -1;
-    }
-    else {
-        PyErr_SetString(PyExc_RuntimeError,
-                "user type shouldn't have resulted from type promotion");
-        return -1;
-    }
+    return 0;
 }
 
 /*
@@ -561,23 +491,20 @@ PyUFunc_SimpleBinaryOperationTypeResolution(PyUFuncObject *ufunc,
  * Returns 0 on success, -1 on error.
  */
 NPY_NO_EXPORT int
-PyUFunc_AbsoluteTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_AbsoluteTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata)
+                                PyArray_Descr **out_dtypes)
 {
     /* Use the default for complex types, to find the loop producing float */
     if (PyTypeNum_ISCOMPLEX(PyArray_DESCR(operands[0])->type_num)) {
-        return PyUFunc_DefaultTypeResolution(ufunc, casting, operands,
-                    type_tup, out_dtypes, out_innerloop, out_innerloopdata);
+        return PyUFunc_DefaultTypeResolver(ufunc, casting, operands,
+                    type_tup, out_dtypes);
     }
     else {
-        return PyUFunc_SimpleUnaryOperationTypeResolution(ufunc, casting,
-                    operands, type_tup, out_dtypes,
-                    out_innerloop, out_innerloopdata);
+        return PyUFunc_SimpleUnaryOperationTypeResolver(ufunc, casting,
+                    operands, type_tup, out_dtypes);
     }
 }
 
@@ -668,17 +595,14 @@ timedelta_dtype_with_copied_meta(PyArray_Descr *dtype)
  *    m8[Y|M|B] + M8[<A>]
  */
 NPY_NO_EXPORT int
-PyUFunc_AdditionTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_AdditionTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata)
+                                PyArray_Descr **out_dtypes)
 {
     int type_num1, type_num2;
-    char *types;
-    int i, n;
+    int i;
     char *ufunc_name;
 
     ufunc_name = ufunc->name ? ufunc->name : "<unnamed ufunc>";
@@ -688,8 +612,8 @@ PyUFunc_AdditionTypeResolution(PyUFuncObject *ufunc,
 
     /* Use the default when datetime and timedelta are not involved */
     if (!PyTypeNum_ISDATETIME(type_num1) && !PyTypeNum_ISDATETIME(type_num2)) {
-        return PyUFunc_DefaultTypeResolution(ufunc, casting, operands,
-                    type_tup, out_dtypes, out_innerloop, out_innerloopdata);
+        return PyUFunc_SimpleBinaryOperationTypeResolver(ufunc, casting,
+                    operands, type_tup, out_dtypes);
     }
 
     if (type_num1 == NPY_TIMEDELTA) {
@@ -831,22 +755,7 @@ PyUFunc_AdditionTypeResolution(PyUFuncObject *ufunc,
         return -1;
     }
 
-    /* Search in the functions list */
-    types = ufunc->types;
-    n = ufunc->ntypes;
-
-    for (i = 0; i < n; ++i) {
-        if (types[3*i] == type_num1 && types[3*i+1] == type_num2) {
-            *out_innerloop = ufunc->functions[i];
-            *out_innerloopdata = ufunc->data[i];
-            return 0;
-        }
-    }
-
-    PyErr_Format(PyExc_TypeError,
-            "internal error: could not find appropriate datetime "
-            "inner loop in %s ufunc", ufunc_name);
-    return -1;
+    return 0;
 
 type_reso_error: {
         PyObject *errmsg;
@@ -875,17 +784,14 @@ type_reso_error: {
  *    M8[<A>] - m8[Y|M|B]
  */
 NPY_NO_EXPORT int
-PyUFunc_SubtractionTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_SubtractionTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata)
+                                PyArray_Descr **out_dtypes)
 {
     int type_num1, type_num2;
-    char *types;
-    int i, n;
+    int i;
     char *ufunc_name;
 
     ufunc_name = ufunc->name ? ufunc->name : "<unnamed ufunc>";
@@ -895,8 +801,8 @@ PyUFunc_SubtractionTypeResolution(PyUFuncObject *ufunc,
 
     /* Use the default when datetime and timedelta are not involved */
     if (!PyTypeNum_ISDATETIME(type_num1) && !PyTypeNum_ISDATETIME(type_num2)) {
-        return PyUFunc_DefaultTypeResolution(ufunc, casting, operands,
-                    type_tup, out_dtypes, out_innerloop, out_innerloopdata);
+        return PyUFunc_SimpleBinaryOperationTypeResolver(ufunc, casting,
+                    operands, type_tup, out_dtypes);
     }
 
     if (type_num1 == NPY_TIMEDELTA) {
@@ -1019,22 +925,7 @@ PyUFunc_SubtractionTypeResolution(PyUFuncObject *ufunc,
         return -1;
     }
 
-    /* Search in the functions list */
-    types = ufunc->types;
-    n = ufunc->ntypes;
-
-    for (i = 0; i < n; ++i) {
-        if (types[3*i] == type_num1 && types[3*i+1] == type_num2) {
-            *out_innerloop = ufunc->functions[i];
-            *out_innerloopdata = ufunc->data[i];
-            return 0;
-        }
-    }
-
-    PyErr_Format(PyExc_TypeError,
-            "internal error: could not find appropriate datetime "
-            "inner loop in %s ufunc", ufunc_name);
-    return -1;
+    return 0;
 
 type_reso_error: {
         PyObject *errmsg;
@@ -1060,17 +951,14 @@ type_reso_error: {
  *    m8[<A>] * float## => m8[<A>] * float64
  */
 NPY_NO_EXPORT int
-PyUFunc_MultiplicationTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_MultiplicationTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata)
+                                PyArray_Descr **out_dtypes)
 {
     int type_num1, type_num2;
-    char *types;
-    int i, n;
+    int i;
     char *ufunc_name;
 
     ufunc_name = ufunc->name ? ufunc->name : "<unnamed ufunc>";
@@ -1080,8 +968,8 @@ PyUFunc_MultiplicationTypeResolution(PyUFuncObject *ufunc,
 
     /* Use the default when datetime and timedelta are not involved */
     if (!PyTypeNum_ISDATETIME(type_num1) && !PyTypeNum_ISDATETIME(type_num2)) {
-        return PyUFunc_DefaultTypeResolution(ufunc, casting, operands,
-                    type_tup, out_dtypes, out_innerloop, out_innerloopdata);
+        return PyUFunc_SimpleBinaryOperationTypeResolver(ufunc, casting,
+                    operands, type_tup, out_dtypes);
     }
 
     if (type_num1 == NPY_TIMEDELTA) {
@@ -1180,22 +1068,7 @@ PyUFunc_MultiplicationTypeResolution(PyUFuncObject *ufunc,
         return -1;
     }
 
-    /* Search in the functions list */
-    types = ufunc->types;
-    n = ufunc->ntypes;
-
-    for (i = 0; i < n; ++i) {
-        if (types[3*i] == type_num1 && types[3*i+1] == type_num2) {
-            *out_innerloop = ufunc->functions[i];
-            *out_innerloopdata = ufunc->data[i];
-            return 0;
-        }
-    }
-
-    PyErr_Format(PyExc_TypeError,
-            "internal error: could not find appropriate datetime "
-            "inner loop in %s ufunc", ufunc_name);
-    return -1;
+    return 0;
 
 type_reso_error: {
         PyObject *errmsg;
@@ -1220,17 +1093,14 @@ type_reso_error: {
  *    m8[<A>] / float## to m8[<A>] / float64 -> m8[<A>]
  */
 NPY_NO_EXPORT int
-PyUFunc_DivisionTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_DivisionTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata)
+                                PyArray_Descr **out_dtypes)
 {
     int type_num1, type_num2;
-    char *types;
-    int i, n;
+    int i;
     char *ufunc_name;
 
     ufunc_name = ufunc->name ? ufunc->name : "<unnamed ufunc>";
@@ -1240,8 +1110,8 @@ PyUFunc_DivisionTypeResolution(PyUFuncObject *ufunc,
 
     /* Use the default when datetime and timedelta are not involved */
     if (!PyTypeNum_ISDATETIME(type_num1) && !PyTypeNum_ISDATETIME(type_num2)) {
-        return PyUFunc_DefaultTypeResolution(ufunc, casting, operands,
-                    type_tup, out_dtypes, out_innerloop, out_innerloopdata);
+        return PyUFunc_DefaultTypeResolver(ufunc, casting, operands,
+                    type_tup, out_dtypes);
     }
 
     if (type_num1 == NPY_TIMEDELTA) {
@@ -1317,22 +1187,7 @@ PyUFunc_DivisionTypeResolution(PyUFuncObject *ufunc,
         return -1;
     }
 
-    /* Search in the functions list */
-    types = ufunc->types;
-    n = ufunc->ntypes;
-
-    for (i = 0; i < n; ++i) {
-        if (types[3*i] == type_num1 && types[3*i+1] == type_num2) {
-            *out_innerloop = ufunc->functions[i];
-            *out_innerloopdata = ufunc->data[i];
-            return 0;
-        }
-    }
-
-    PyErr_Format(PyExc_TypeError,
-            "internal error: could not find appropriate datetime "
-            "inner loop in %s ufunc", ufunc_name);
-    return -1;
+    return 0;
 
 type_reso_error: {
         PyObject *errmsg;
@@ -1349,6 +1204,122 @@ type_reso_error: {
     }
 }
 
+static int
+find_userloop(PyUFuncObject *ufunc,
+                PyArray_Descr **dtypes,
+                PyUFuncGenericFunction *out_innerloop,
+                void **out_innerloopdata)
+{
+    npy_intp i, nin = ufunc->nin, j, nargs = nin + ufunc->nout;
+    PyUFunc_Loop1d *funcdata;
+
+    /* Use this to try to avoid repeating the same userdef loop search */
+    int last_userdef = -1;
+
+    for (i = 0; i < nin; ++i) {
+        int type_num = dtypes[i]->type_num;
+        if (type_num != last_userdef && PyTypeNum_ISUSERDEF(type_num)) {
+            PyObject *key, *obj;
+
+            last_userdef = type_num;
+
+            key = PyInt_FromLong(type_num);
+            if (key == NULL) {
+                return -1;
+            }
+            obj = PyDict_GetItem(ufunc->userloops, key);
+            Py_DECREF(key);
+            if (obj == NULL) {
+                continue;
+            }
+            funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj);
+            while (funcdata != NULL) {
+                int *types = funcdata->arg_types;
+
+                for (j = 0; j < nargs; ++j) {
+                    if (types[j] != dtypes[j]->type_num) {
+                        break;
+                    }
+                }
+                /* It matched */
+                if (j == nargs) {
+                    *out_innerloop = funcdata->func;
+                    *out_innerloopdata = funcdata->data;
+                    return 1;
+                }
+
+                funcdata = funcdata->next;
+            }
+        }
+    }
+
+    /* Didn't find a match */
+    return 0;
+}
+
+NPY_NO_EXPORT int
+PyUFunc_DefaultLegacyInnerLoopSelector(PyUFuncObject *ufunc,
+                                PyArray_Descr **dtypes,
+                                PyUFuncGenericFunction *out_innerloop,
+                                void **out_innerloopdata,
+                                int *out_needs_api)
+{
+    int nargs = ufunc->nargs;
+    char *types;
+    const char *ufunc_name;
+    PyObject *errmsg;
+    int i, j;
+
+    ufunc_name = ufunc->name ? ufunc->name : "(unknown)";
+
+    /*
+     * If there are user-loops search them first.
+     * TODO: There needs to be a loop selection acceleration structure,
+     *       like a hash table.
+     */
+    if (ufunc->userloops) {
+        switch (find_userloop(ufunc, dtypes,
+                    out_innerloop, out_innerloopdata)) {
+            /* Error */
+            case -1:
+                return -1;
+            /* Found a loop */
+            case 1:
+                return 0;
+        }
+    }
+
+    types = ufunc->types;
+    for (i = 0; i < ufunc->ntypes; ++i) {
+        /* Compare this loop's type signature against the requested dtypes */
+        for (j = 0; j < nargs; ++j) {
+            if (types[j] != dtypes[j]->type_num) {
+                break;
+            }
+        }
+        if (j == nargs) {
+            *out_innerloop = ufunc->functions[i];
+            *out_innerloopdata = ufunc->data[i];
+            return 0;
+        }
+
+        types += nargs;
+    }
+
+    errmsg = PyUString_FromFormat("ufunc '%s' did not contain a loop "
+                    "with signature matching types ", ufunc_name);
+    for (i = 0; i < nargs; ++i) {
+        PyUString_ConcatAndDel(&errmsg,
+                PyObject_Repr((PyObject *)dtypes[i]));
+        if (i < nargs - 1) {
+            PyUString_ConcatAndDel(&errmsg, PyUString_FromString(" "));
+        }
+    }
+    PyErr_SetObject(PyExc_TypeError, errmsg);
+
+    return -1;
+}
+
 typedef struct {
     NpyAuxData base;
     PyUFuncGenericFunction unmasked_innerloop;
@@ -1380,40 +1351,34 @@ ufunc_masker_data_clone(NpyAuxData *data)
  */
 static void
 unmasked_ufunc_loop_as_masked(
-             char **args,
-             npy_intp *dimensions,
-             npy_intp *steps,
+             char **dataptrs, npy_intp *strides,
+             char *mask, npy_intp mask_stride,
+             npy_intp loopsize,
              NpyAuxData *innerloopdata)
 {
     _ufunc_masker_data *data;
     int iargs, nargs;
     PyUFuncGenericFunction unmasked_innerloop;
     void *unmasked_innerloopdata;
-    npy_intp loopsize, subloopsize;
-    char *mask;
-    npy_intp mask_stride;
+    npy_intp subloopsize;
 
     /* Put the aux data into local variables */
     data = (_ufunc_masker_data *)innerloopdata;
     unmasked_innerloop = data->unmasked_innerloop;
     unmasked_innerloopdata = data->unmasked_innerloopdata;
     nargs = data->nargs;
-    loopsize = *dimensions;
-    mask = args[nargs];
-    mask_stride = steps[nargs];
-
 
     /* Process the data as runs of unmasked values */
     do {
         /* Skip masked values */
         subloopsize = 0;
         while (subloopsize < loopsize &&
-                        !NpyMask_IsExposed(*(npy_mask *)mask)) {
+                        !NpyMaskValue_IsExposed(*(npy_mask *)mask)) {
             ++subloopsize;
             mask += mask_stride;
         }
         for (iargs = 0; iargs < nargs; ++iargs) {
-            args[iargs] += subloopsize * steps[iargs];
+            dataptrs[iargs] += subloopsize * strides[iargs];
         }
         loopsize -= subloopsize;
         /*
@@ -1422,39 +1387,53 @@ unmasked_ufunc_loop_as_masked(
          */
         subloopsize = 0;
         while (subloopsize < loopsize &&
-                        NpyMask_IsExposed(*(npy_mask *)mask)) {
+                        NpyMaskValue_IsExposed(*(npy_mask *)mask)) {
             ++subloopsize;
             mask += mask_stride;
         }
-        unmasked_innerloop(args, &subloopsize, steps, unmasked_innerloopdata);
+        unmasked_innerloop(dataptrs, &subloopsize, strides,
+                                        unmasked_innerloopdata);
         for (iargs = 0; iargs < nargs; ++iargs) {
-            args[iargs] += subloopsize * steps[iargs];
+            dataptrs[iargs] += subloopsize * strides[iargs];
         }
         loopsize -= subloopsize;
     } while (loopsize > 0);
 }
 
 
-/*UFUNC_API
- *
- * This function calls the unmasked type resolution function of the
- * ufunc, then wraps it with a function which only calls the inner
- * loop where the mask is True.
+/*
+ * This function wraps a legacy inner loop so it becomes masked.
  *
  * Returns 0 on success, -1 on error.
  */
 NPY_NO_EXPORT int
-PyUFunc_DefaultTypeResolutionMasked(PyUFuncObject *ufunc,
-                                NPY_CASTING casting,
-                                PyArrayObject **operands,
-                                PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericMaskedFunction *out_innerloop,
-                                NpyAuxData **out_innerloopdata)
+PyUFunc_DefaultMaskedInnerLoopSelector(PyUFuncObject *ufunc,
+                            PyArray_Descr **dtypes,
+                            PyArray_Descr *mask_dtype,
+                            npy_intp *NPY_UNUSED(fixed_strides),
+                            npy_intp NPY_UNUSED(fixed_mask_stride),
+                            PyUFunc_MaskedStridedInnerLoopFunc **out_innerloop,
+                            NpyAuxData **out_innerloopdata,
+                            int *out_needs_api)
 {
     int retcode;
     _ufunc_masker_data *data;
 
+    if (ufunc->legacy_inner_loop_selector == NULL) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "the ufunc default masked inner loop selector doesn't "
+                "yet support wrapping the new inner loop selector, it "
+                "still only wraps the legacy inner loop selector");
+        return -1;
+    }
+
+    if (mask_dtype->type_num != NPY_BOOL) {
+        PyErr_SetString(PyExc_ValueError,
+                "only boolean masks are supported in ufunc inner loops "
+                "presently");
+        return -1;
+    }
+
     /* Create a new NpyAuxData object for the masker data */
     data = (_ufunc_masker_data *)PyArray_malloc(sizeof(_ufunc_masker_data));
     if (data == NULL) {
@@ -1467,9 +1446,9 @@ PyUFunc_DefaultTypeResolutionMasked(PyUFuncObject *ufunc,
     data->nargs = ufunc->nin + ufunc->nout;
 
     /* Get the unmasked ufunc inner loop */
-    retcode = ufunc->type_resolution_function(ufunc, casting,
-                    operands, type_tup, out_dtypes,
-                    &data->unmasked_innerloop, &data->unmasked_innerloopdata);
+    retcode = ufunc->legacy_inner_loop_selector(ufunc, dtypes,
+                    &data->unmasked_innerloop, &data->unmasked_innerloopdata,
+                    out_needs_api);
     if (retcode < 0) {
         PyArray_free(data);
         return retcode;
@@ -1576,39 +1555,66 @@ ufunc_loop_matches(PyUFuncObject *self,
 
 static int
 set_ufunc_loop_data_types(PyUFuncObject *self, PyArrayObject **op,
-                    PyArray_Descr **out_dtype,
-                    int *types)
+                    PyArray_Descr **out_dtypes,
+                    int *type_nums)
 {
     int i, nin = self->nin, nop = nin + self->nout;
 
-    /* Fill the dtypes array */
+    /*
+     * Fill the dtypes array.
+     * Each operand reuses its own dtype when the type_num matches.
+     * Outputs additionally fall back to the dtype of op[0], so that
+     * metadata is preserved instead of creating a plain new descr.
+     */
     for (i = 0; i < nop; ++i) {
-        out_dtype[i] = PyArray_DescrFromType(types[i]);
-        if (out_dtype[i] == NULL) {
-            while (--i >= 0) {
-                Py_DECREF(out_dtype[i]);
-                out_dtype[i] = NULL;
-            }
-            return -1;
+        /*
+         * Copy the dtype from 'op' if the type_num matches (keeps metadata).
+         * NOTE(review): if ensure_dtype_nbo returns a new ref, XINCREF leaks.
+         */
+        if (op[i] != NULL && PyArray_DESCR(op[i])->type_num == type_nums[i]) {
+            out_dtypes[i] = ensure_dtype_nbo(PyArray_DESCR(op[i]));
+            Py_XINCREF(out_dtypes[i]);
+        }
+        /*
+         * For outputs, copy the dtype from op[0] if the type_num
+         * matches, similarly to preserve metadata.
+         */
+        else if (i >= nin && op[0] != NULL &&
+                            PyArray_DESCR(op[0])->type_num == type_nums[i]) {
+            out_dtypes[i] = ensure_dtype_nbo(PyArray_DESCR(op[0]));
+            Py_XINCREF(out_dtypes[i]);
+        }
+        /* Otherwise create a plain descr from the type number */
+        else {
+            out_dtypes[i] = PyArray_DescrFromType(type_nums[i]);
+        }
+
+        if (out_dtypes[i] == NULL) {
+            goto fail;
         }
     }
 
     return 0;
+
+fail:
+    while (--i >= 0) {
+        Py_DECREF(out_dtypes[i]);
+        out_dtypes[i] = NULL;
+    }
+    return -1;
 }
 
 /*
  * Does a search through the arguments and the loops
  */
 static int
-find_ufunc_matching_userloop(PyUFuncObject *self,
+linear_search_userloop_type_resolver(PyUFuncObject *self,
                         PyArrayObject **op,
                         NPY_CASTING input_casting,
                         NPY_CASTING output_casting,
                         int any_object,
                         int use_min_scalar,
                         PyArray_Descr **out_dtype,
-                        PyUFuncGenericFunction *out_innerloop,
-                        void **out_innerloopdata,
                         int *out_no_castable_output,
                         char *out_err_src_typecode,
                         char *out_err_dst_typecode)
@@ -1650,11 +1656,6 @@ find_ufunc_matching_userloop(PyUFuncObject *self,
                     /* Found a match */
                     case 1:
                         set_ufunc_loop_data_types(self, op, out_dtype, types);
-
-                        /* Save the inner loop and its data */
-                        *out_innerloop = funcdata->func;
-                        *out_innerloopdata = funcdata->data;
-
                         return 0;
                 }
 
@@ -1671,16 +1672,14 @@ find_ufunc_matching_userloop(PyUFuncObject *self,
  * Does a search through the arguments and the loops
  */
 static int
-find_ufunc_specified_userloop(PyUFuncObject *self,
+type_tuple_userloop_type_resolver(PyUFuncObject *self,
                         int n_specified,
                         int *specified_types,
                         PyArrayObject **op,
                         NPY_CASTING casting,
                         int any_object,
                         int use_min_scalar,
-                        PyArray_Descr **out_dtype,
-                        PyUFuncGenericFunction *out_innerloop,
-                        void **out_innerloopdata)
+                        PyArray_Descr **out_dtype)
 {
     int i, j, nin = self->nin, nop = nin + self->nout;
     PyUFunc_Loop1d *funcdata;
@@ -1714,7 +1713,8 @@ find_ufunc_specified_userloop(PyUFuncObject *self,
 
                 if (n_specified == nop) {
                     for (j = 0; j < nop; ++j) {
-                        if (types[j] != specified_types[j]) {
+                        if (types[j] != specified_types[j] &&
+                                    specified_types[j] != NPY_NOTYPE) {
                             matched = 0;
                             break;
                         }
@@ -1737,11 +1737,6 @@ find_ufunc_specified_userloop(PyUFuncObject *self,
                     /* It works */
                     case 1:
                         set_ufunc_loop_data_types(self, op, out_dtype, types);
-
-                        /* Save the inner loop and its data */
-                        *out_innerloop = funcdata->func;
-                        *out_innerloopdata = funcdata->data;
-
                         return 0;
                     /* Didn't match */
                     case 0:
@@ -1841,14 +1836,12 @@ should_use_min_scalar(PyArrayObject **op, int nop)
  * references in out_dtype.  This function does not do its own clean-up.
  */
 NPY_NO_EXPORT int
-find_best_ufunc_inner_loop(PyUFuncObject *self,
+linear_search_type_resolver(PyUFuncObject *self,
                         PyArrayObject **op,
                         NPY_CASTING input_casting,
                         NPY_CASTING output_casting,
                         int any_object,
-                        PyArray_Descr **out_dtype,
-                        PyUFuncGenericFunction *out_innerloop,
-                        void **out_innerloopdata)
+                        PyArray_Descr **out_dtype)
 {
     npy_intp i, j, nin = self->nin, nop = nin + self->nout;
     int types[NPY_MAXARGS];
@@ -1864,10 +1857,9 @@ find_best_ufunc_inner_loop(PyUFuncObject *self,
 
     /* If the ufunc has userloops, search for them. */
     if (self->userloops) {
-        switch (find_ufunc_matching_userloop(self, op,
+        switch (linear_search_userloop_type_resolver(self, op,
                                 input_casting, output_casting,
-                                any_object, use_min_scalar,
-                                out_dtype, out_innerloop, out_innerloopdata,
+                                any_object, use_min_scalar, out_dtype,
                                 &no_castable_output, &err_src_typecode,
                                 &err_dst_typecode)) {
             /* Error */
@@ -1916,14 +1908,8 @@ find_best_ufunc_inner_loop(PyUFuncObject *self,
             /* Found a match */
             case 1:
                 set_ufunc_loop_data_types(self, op, out_dtype, types);
-
-                /* Save the inner loop and its data */
-                *out_innerloop = self->functions[i];
-                *out_innerloopdata = self->data[i];
-
                 return 0;
         }
-
     }
 
     /* If no function was found, throw an error */
@@ -1958,14 +1944,12 @@ find_best_ufunc_inner_loop(PyUFuncObject *self,
  * references in out_dtype.  This function does not do its own clean-up.
  */
 NPY_NO_EXPORT int
-find_specified_ufunc_inner_loop(PyUFuncObject *self,
+type_tuple_type_resolver(PyUFuncObject *self,
                         PyObject *type_tup,
                         PyArrayObject **op,
                         NPY_CASTING casting,
                         int any_object,
-                        PyArray_Descr **out_dtype,
-                        PyUFuncGenericFunction *out_innerloop,
-                        void **out_innerloopdata)
+                        PyArray_Descr **out_dtype)
 {
     npy_intp i, j, n, nin = self->nin, nop = nin + self->nout;
     int n_specified = 0;
@@ -1982,23 +1966,37 @@ find_specified_ufunc_inner_loop(PyUFuncObject *self,
 
     /* Fill in specified_types from the tuple or string */
     if (PyTuple_Check(type_tup)) {
+        int nonecount = 0;
         n = PyTuple_GET_SIZE(type_tup);
         if (n != 1 && n != nop) {
             PyErr_Format(PyExc_ValueError,
-                         "a type-tuple must be specified " \
+                         "a type-tuple must be specified "
                          "of length 1 or %d for ufunc '%s'", (int)nop,
                          self->name ? self->name : "(unknown)");
             return -1;
         }
 
         for (i = 0; i < n; ++i) {
-            PyArray_Descr *dtype = NULL;
-            if (!PyArray_DescrConverter(PyTuple_GET_ITEM(type_tup, i),
-                                                                &dtype)) {
-                return -1;
+            PyObject *item = PyTuple_GET_ITEM(type_tup, i);
+            if (item == Py_None) {
+                specified_types[i] = NPY_NOTYPE;
+                ++nonecount;
+            }
+            else {
+                PyArray_Descr *dtype = NULL;
+                if (!PyArray_DescrConverter(item, &dtype)) {
+                    return -1;
+                }
+                specified_types[i] = dtype->type_num;
+                Py_DECREF(dtype);
             }
-            specified_types[i] = dtype->type_num;
-            Py_DECREF(dtype);
+        }
+
+        if (nonecount == n) {
+            PyErr_SetString(PyExc_ValueError,
+                    "the type-tuple provided to the ufunc "
+                    "must specify at least one none-None dtype");
+            return -1;
         }
 
         n_specified = n;
@@ -2064,11 +2062,11 @@ find_specified_ufunc_inner_loop(PyUFuncObject *self,
 
     /* If the ufunc has userloops, search for them. */
     if (self->userloops) {
-        switch (find_ufunc_specified_userloop(self,
+        switch (type_tuple_userloop_type_resolver(self,
                         n_specified, specified_types,
                         op, casting,
                         any_object, use_min_scalar,
-                        out_dtype, out_innerloop, out_innerloopdata)) {
+                        out_dtype)) {
             /* Error */
             case -1:
                 return -1;
@@ -2089,7 +2087,8 @@ find_specified_ufunc_inner_loop(PyUFuncObject *self,
 
         if (n_specified == nop) {
             for (j = 0; j < nop; ++j) {
-                if (types[j] != specified_types[j]) {
+                if (types[j] != specified_types[j] &&
+                                specified_types[j] != NPY_NOTYPE) {
                     matched = 0;
                     break;
                 }
@@ -2115,11 +2114,6 @@ find_specified_ufunc_inner_loop(PyUFuncObject *self,
             /* It worked */
             case 1:
                 set_ufunc_loop_data_types(self, op, out_dtype, types);
-
-                /* Save the inner loop and its data */
-                *out_innerloop = self->functions[i];
-                *out_innerloopdata = self->data[i];
-
                 return 0;
             /* Didn't work */
             case 0:
@@ -2131,7 +2125,6 @@ find_specified_ufunc_inner_loop(PyUFuncObject *self,
                      ufunc_name);
                 return -1;
         }
-
     }
 
     /* If no function was found, throw an error */
diff --git a/numpy/core/src/umath/ufunc_type_resolution.h b/numpy/core/src/umath/ufunc_type_resolution.h
index f1ded2e9b..a1241827e 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.h
+++ b/numpy/core/src/umath/ufunc_type_resolution.h
@@ -2,84 +2,67 @@
 #define _NPY_PRIVATE__UFUNC_TYPE_RESOLUTION_H_
 
 NPY_NO_EXPORT int
-PyUFunc_SimpleBinaryComparisonTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_SimpleBinaryComparisonTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata);
+                                PyArray_Descr **out_dtypes);
 
 NPY_NO_EXPORT int
-PyUFunc_SimpleUnaryOperationTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_SimpleUnaryOperationTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata);
+                                PyArray_Descr **out_dtypes);
 
 NPY_NO_EXPORT int
-PyUFunc_OnesLikeTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_OnesLikeTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata);
+                                PyArray_Descr **out_dtypes);
 
 NPY_NO_EXPORT int
-PyUFunc_SimpleBinaryOperationTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_SimpleBinaryOperationTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata);
+                                PyArray_Descr **out_dtypes);
 
 NPY_NO_EXPORT int
-PyUFunc_AbsoluteTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_AbsoluteTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata);
+                                PyArray_Descr **out_dtypes);
 
 NPY_NO_EXPORT int
-PyUFunc_AdditionTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_AdditionTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata);
+                                PyArray_Descr **out_dtypes);
 
 NPY_NO_EXPORT int
-PyUFunc_SubtractionTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_SubtractionTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata);
+                                PyArray_Descr **out_dtypes);
 
 NPY_NO_EXPORT int
-PyUFunc_MultiplicationTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_MultiplicationTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata);
+                                PyArray_Descr **out_dtypes);
+
 NPY_NO_EXPORT int
-PyUFunc_DivisionTypeResolution(PyUFuncObject *ufunc,
+PyUFunc_DivisionTypeResolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata);
+                                PyArray_Descr **out_dtypes);
 
 /*
  * Does a linear search for the best inner loop of the ufunc.
@@ -88,14 +71,12 @@ PyUFunc_DivisionTypeResolution(PyUFuncObject *ufunc,
  * references in out_dtype.  This function does not do its own clean-up.
  */
 NPY_NO_EXPORT int
-find_best_ufunc_inner_loop(PyUFuncObject *self,
+linear_search_type_resolver(PyUFuncObject *self,
                         PyArrayObject **op,
                         NPY_CASTING input_casting,
                         NPY_CASTING output_casting,
                         int any_object,
-                        PyArray_Descr **out_dtype,
-                        PyUFuncGenericFunction *out_innerloop,
-                        void **out_innerloopdata);
+                        PyArray_Descr **out_dtype);
 
 /*
  * Does a linear search for the inner loop of the ufunc specified by type_tup.
@@ -104,13 +85,29 @@ find_best_ufunc_inner_loop(PyUFuncObject *self,
  * references in out_dtype.  This function does not do its own clean-up.
  */
 NPY_NO_EXPORT int
-find_specified_ufunc_inner_loop(PyUFuncObject *self,
+type_tuple_type_resolver(PyUFuncObject *self,
                         PyObject *type_tup,
                         PyArrayObject **op,
                         NPY_CASTING casting,
                         int any_object,
-                        PyArray_Descr **out_dtype,
-                        PyUFuncGenericFunction *out_innerloop,
-                        void **out_innerloopdata);
+                        PyArray_Descr **out_dtype);
+
+NPY_NO_EXPORT int
+PyUFunc_DefaultLegacyInnerLoopSelector(PyUFuncObject *ufunc,
+                                PyArray_Descr **dtypes,
+                                PyUFuncGenericFunction *out_innerloop,
+                                void **out_innerloopdata,
+                                int *out_needs_api);
+
+NPY_NO_EXPORT int
+PyUFunc_DefaultMaskedInnerLoopSelector(PyUFuncObject *ufunc,
+                            PyArray_Descr **dtypes,
+                            PyArray_Descr *mask_dtype, /* NPY_BOOL only */
+                            npy_intp *NPY_UNUSED(fixed_strides),
+                            npy_intp NPY_UNUSED(fixed_mask_stride),
+                            PyUFunc_MaskedStridedInnerLoopFunc **out_innerloop,
+                            NpyAuxData **out_innerloopdata,
+                            int *out_needs_api);
+
 
 #endif
diff --git a/numpy/core/src/umath/umathmodule.c.src b/numpy/core/src/umath/umathmodule.c.src
index 52dcd4c1b..9843b0eba 100644
--- a/numpy/core/src/umath/umathmodule.c.src
+++ b/numpy/core/src/umath/umathmodule.c.src
@@ -46,28 +46,38 @@
 static PyUFuncGenericFunction pyfunc_functions[] = {PyUFunc_On_Om};
 
 static int
-object_ufunc_type_resolution(PyUFuncObject *ufunc,
+object_ufunc_type_resolver(PyUFuncObject *ufunc,
                                 NPY_CASTING casting,
                                 PyArrayObject **operands,
                                 PyObject *type_tup,
-                                PyArray_Descr **out_dtypes,
-                                PyUFuncGenericFunction *out_innerloop,
-                                void **out_innerloopdata)
+                                PyArray_Descr **out_dtypes)
 {
     int i, nop = ufunc->nin + ufunc->nout;
+    PyArray_Descr *obj_dtype;
 
-    out_dtypes[0] = PyArray_DescrFromType(NPY_OBJECT);
-    if (out_dtypes[0] == NULL) {
+    obj_dtype = PyArray_DescrFromType(NPY_OBJECT);
+    if (obj_dtype == NULL) {
         return -1;
     }
 
-    for (i = 1; i < nop; ++i) {
-        out_dtypes[i] = out_dtypes[0];
-        Py_INCREF(out_dtypes[0]);
+    for (i = 0; i < nop; ++i) {
+        Py_INCREF(obj_dtype);
+        out_dtypes[i] = obj_dtype;
     }
 
+    return 0;
+}
+
+static int
+object_ufunc_loop_selector(PyUFuncObject *ufunc,
+                            PyArray_Descr **NPY_UNUSED(dtypes),
+                            PyUFuncGenericFunction *out_innerloop,
+                            void **out_innerloopdata,
+                            int *out_needs_api)
+{
     *out_innerloop = ufunc->functions[0];
     *out_innerloopdata = ufunc->data[0];
+    *out_needs_api = 1;
 
     return 0;
 }
@@ -114,7 +124,8 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUS
     self->core_offsets = NULL;
     self->core_signature = NULL;
 
-    self->type_resolution_function = &object_ufunc_type_resolution;
+    self->type_resolver = &object_ufunc_type_resolver;
+    self->legacy_inner_loop_selector = &object_ufunc_loop_selector;
 
     pyname = PyObject_GetAttrString(function, "__name__");
     if (pyname) {
diff --git a/numpy/core/tests/test_api.py b/numpy/core/tests/test_api.py
index d2d8241f2..55d3c4ea8 100644
--- a/numpy/core/tests/test_api.py
+++ b/numpy/core/tests/test_api.py
@@ -84,6 +84,53 @@ def test_array_astype():
     assert_(not (a is b))
     assert_(type(b) != np.matrix)
 
+def test_copyto_fromscalar():
+    a = np.arange(6, dtype='f4').reshape(2,3)
+
+    # Simple copy of a scalar, directly and through a transposed view
+    np.copyto(a, 1.5)
+    assert_equal(a, 1.5)
+    np.copyto(a.T, 2.5)
+    assert_equal(a, 2.5)
+
+    # Where-masked copy: only elements with where=True are overwritten
+    mask = np.array([[0,1,0],[0,0,1]], dtype='?')
+    np.copyto(a, 3.5, where=mask)
+    assert_equal(a, [[2.5,3.5,2.5],[2.5,2.5,3.5]])
+    mask = np.array([[0,1],[1,1],[1,0]], dtype='?')
+    np.copyto(a.T, 4.5, where=mask)
+    assert_equal(a, [[2.5,4.5,4.5],[4.5,4.5,3.5]])
+
+    # Simple copy to NA-masked (a_orig keeps access to the raw data)
+    a_orig = a
+    a = a_orig.view(maskna=True)
+    a[...] = np.NA
+    np.copyto(a, 0.5)
+    assert_equal(a, 0.5)
+    a[...] = np.NA
+    np.copyto(a.T, 1.5)
+    assert_equal(a, 1.5)
+
+    # Where-masked copy to NA-masked: written elements stop being NA
+    a[0,0] = np.NA
+    a[1,1] = np.NA
+    mask = np.array([[1,0,1],[0,0,1]], dtype='?')
+    np.copyto(a, 2.5, where=mask)
+    assert_equal(np.isna(a), [[0,0,0],[0,1,0]])
+    assert_equal(a_orig, [[2.5,1.5,2.5],[1.5,1.5,2.5]])
+
+    # Simple preservena=True copy: NA elements keep their mask and data
+    a[0,0] = np.NA
+    np.copyto(a, 3.5, preservena=True)
+    assert_equal(np.isna(a), [[1,0,0],[0,1,0]])
+    assert_equal(a_orig, [[2.5,3.5,3.5],[3.5,1.5,3.5]])
+
+    # Where-masked preservena=True copy: both restrictions apply together
+    mask = np.array([[1,0,1],[0,0,1]], dtype='?')
+    np.copyto(a, 4.5, where=mask, preservena=True)
+    assert_equal(np.isna(a), [[1,0,0],[0,1,0]])
+    assert_equal(a_orig, [[2.5,3.5,4.5],[3.5,1.5,4.5]])
+
 def test_copyto():
     a = np.arange(6, dtype='i4').reshape(2,3)
 
@@ -120,5 +167,143 @@ def test_copyto():
     # 'dst' must be an array
     assert_raises(TypeError, np.copyto, [1,2,3], [2,3,4])
 
+def test_copyto_maskna():
+    a_orig = np.zeros((2,3), dtype='f8')
+    a = a_orig.view(maskna=True)  # a_orig sees the data under the mask
+
+    # Simple copy from non-masked to NA-masked, directly and through a.T
+    a[...] = np.NA
+    np.copyto(a, np.arange(6).reshape(2,3))
+    assert_equal(a, [[0,1,2],[3,4,5]])
+    a[...] = np.NA
+    np.copyto(a.T, np.arange(6).reshape(3,2) + 1)
+    assert_equal(a, [[1,3,5],[2,4,6]])
+
+    # Simple copy from NA-masked to NA-masked (NAs in tmp propagate to a)
+    a[...] = np.NA
+    a[1,2] = 12
+    tmp = np.arange(6, maskna=True).reshape(2,3)
+    tmp[0,1] = np.NA
+    tmp[1,2] = np.NA
+    np.copyto(a, tmp)
+    assert_equal(a_orig, [[0,3,2],[3,4,12]])
+    assert_equal(np.isna(a), [[0,1,0],[0,0,1]])
+
+    # Where-masked copy from non-masked to NA-masked (exposes where=True)
+    a[...] = np.NA
+    a[0,2] = 6
+    mask = np.array([[0,0,1],[0,1,0]], dtype='?')
+    tmp = np.arange(6).reshape(2,3) + 1
+    np.copyto(a, tmp, where=mask)
+    assert_equal(a_orig, [[0,3,3],[3,5,12]])
+    assert_equal(np.isna(a), ~mask)
+
+    # Where-masked copy from NA-masked to NA-masked
+    a[1,2] = 12
+    mask = np.array([[0,1,1],[0,0,1]], dtype='?')
+    tmp = np.arange(6, maskna=True).reshape(2,3) + 3
+    tmp[0,0] = np.NA
+    tmp[0,1] = np.NA
+    tmp[1,2] = np.NA
+    np.copyto(a, tmp, where=mask)
+    assert_equal(a_orig, [[0,3,5],[3,5,12]])
+    assert_equal(np.isna(a), [[1,1,0],[1,0,1]])
+
+    # Preserve-NA copy from non-masked to NA-masked (NAs in a untouched)
+    np.copyto(a, np.arange(6).reshape(2,3), preservena=True)
+    assert_equal(a_orig, [[0,3,2],[3,4,12]])
+    assert_equal(np.isna(a), [[1,1,0],[1,0,1]])
+
+    # Preserve-NA copy from NA-masked to NA-masked
+    tmp = np.arange(6, maskna=True).reshape(2,3) + 1
+    tmp[0,0] = np.NA
+    tmp[1,1] = np.NA
+    np.copyto(a, tmp, preservena=True)
+    assert_equal(a_orig, [[0,3,3],[3,4,12]])
+    assert_equal(np.isna(a), [[1,1,0],[1,1,1]])
+
+    # Where-masked preserve-NA copy from non-masked to NA-masked
+    tmp = np.arange(6).reshape(2,3) + 3
+    a[1,2] = 12
+    mask = np.array([[0,1,1],[0,1,0]], dtype='?')
+    np.copyto(a, tmp, where=mask, preservena=True)
+    assert_equal(a_orig, [[0,3,5],[3,4,12]])
+    assert_equal(np.isna(a), [[1,1,0],[1,1,0]])
+
+    # Where-masked preserve-NA copy from NA-masked to NA-masked
+    a[0,0] = 0
+    mask = np.array([[0,1,1],[1,0,1]], dtype='?')
+    tmp = np.arange(6, maskna=True).reshape(2,3) + 1
+    tmp[1,0] = np.NA
+    tmp[1,2] = np.NA
+    np.copyto(a, tmp, where=mask, preservena=True)
+    assert_equal(a_orig, [[0,3,3],[3,4,12]])
+    assert_equal(np.isna(a), [[0,1,0],[1,1,1]])
+
+def test_copy_order():  # Layout of a.copy()/np.copy() for order C, F and K
+    a = np.arange(24).reshape(2,3,4)
+    b = a.copy(order='F')
+    c = np.arange(24).reshape(2,4,3).swapaxes(1,2)
+
+    def check_copy_result(x, y, ccontig, fcontig, strides=False):
+        assert_(not (x is y))
+        assert_equal(x, y)
+        assert_equal(x.flags.c_contiguous, ccontig)  # flags of the copy x,
+        assert_equal(x.flags.f_contiguous, fcontig)  # not the outer 'res'
+        if strides:
+            assert_equal(x.strides, y.strides)
+        else:
+            assert_(x.strides != y.strides)
+
+    # Validate the initial state of a, b, and c
+    assert_(a.flags.c_contiguous)
+    assert_(not a.flags.f_contiguous)
+    assert_(not b.flags.c_contiguous)
+    assert_(b.flags.f_contiguous)
+    assert_(not c.flags.c_contiguous)
+    assert_(not c.flags.f_contiguous)
+
+    # Copy with order='C'
+    res = a.copy(order='C')
+    check_copy_result(res, a, ccontig=True, fcontig=False, strides=True)
+    res = b.copy(order='C')
+    check_copy_result(res, b, ccontig=True, fcontig=False, strides=False)
+    res = c.copy(order='C')
+    check_copy_result(res, c, ccontig=True, fcontig=False, strides=False)
+    res = np.copy(a, order='C')
+    check_copy_result(res, a, ccontig=True, fcontig=False, strides=True)
+    res = np.copy(b, order='C')
+    check_copy_result(res, b, ccontig=True, fcontig=False, strides=False)
+    res = np.copy(c, order='C')
+    check_copy_result(res, c, ccontig=True, fcontig=False, strides=False)
+
+    # Copy with order='F'
+    res = a.copy(order='F')
+    check_copy_result(res, a, ccontig=False, fcontig=True, strides=False)
+    res = b.copy(order='F')
+    check_copy_result(res, b, ccontig=False, fcontig=True, strides=True)
+    res = c.copy(order='F')
+    check_copy_result(res, c, ccontig=False, fcontig=True, strides=False)
+    res = np.copy(a, order='F')
+    check_copy_result(res, a, ccontig=False, fcontig=True, strides=False)
+    res = np.copy(b, order='F')
+    check_copy_result(res, b, ccontig=False, fcontig=True, strides=True)
+    res = np.copy(c, order='F')
+    check_copy_result(res, c, ccontig=False, fcontig=True, strides=False)
+
+    # Copy with order='K'
+    res = a.copy(order='K')
+    check_copy_result(res, a, ccontig=True, fcontig=False, strides=True)
+    res = b.copy(order='K')
+    check_copy_result(res, b, ccontig=False, fcontig=True, strides=True)
+    res = c.copy(order='K')
+    check_copy_result(res, c, ccontig=False, fcontig=False, strides=True)
+    res = np.copy(a, order='K')
+    check_copy_result(res, a, ccontig=True, fcontig=False, strides=True)
+    res = np.copy(b, order='K')
+    check_copy_result(res, b, ccontig=False, fcontig=True, strides=True)
+    res = np.copy(c, order='K')
+    check_copy_result(res, c, ccontig=False, fcontig=False, strides=True)
+
 if __name__ == "__main__":
     run_module_suite()
diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py
index a624768b6..44ef60ee3 100644
--- a/numpy/core/tests/test_datetime.py
+++ b/numpy/core/tests/test_datetime.py
@@ -1310,7 +1310,7 @@ class TestDateTime(TestCase):
                          '2010-02-15')
         except ImportError:
             import warnings
-            warnings.warn("Need pytz library to test datetime timezones")
+            warnings.warn("pytz not found, pytz compatibility tests skipped")
 
     def test_datetime_arange(self):
         # With two datetimes provided as strings
diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py
index 6dc3e7554..3625d58b4 100644
--- a/numpy/core/tests/test_dtype.py
+++ b/numpy/core/tests/test_dtype.py
@@ -37,7 +37,8 @@ class TestBuiltin(TestCase):
         # Make sure invalid type strings raise exceptions
         for typestr in ['O3', 'O5', 'O7', 'b3', 'h4', 'I5', 'l4', 'l8',
                         'L4', 'L8', 'q8', 'q16', 'Q8', 'Q16', 'e3',
-                        'f5', 'd8', 't8', 'g12', 'g16']:
+                        'f5', 'd8', 't8', 'g12', 'g16',
+                        'NA[u4,0xffffffff]']:
             #print typestr
             assert_raises(TypeError, np.dtype, typestr)
 
diff --git a/numpy/core/tests/test_einsum.py b/numpy/core/tests/test_einsum.py
index d7136db51..b58890655 100644
--- a/numpy/core/tests/test_einsum.py
+++ b/numpy/core/tests/test_einsum.py
@@ -480,7 +480,7 @@ class TestEinSum(TestCase):
 
     def test_einsum_misc(self):
         # This call used to crash because of a bug in
-        # PyArray_FillWithZero
+        # PyArray_AssignZero
         a = np.ones((1,2))
         b = np.ones((2,2,1))
         assert_equal(np.einsum('ij...,j...->i...',a,b), [[[2],[2]]])
diff --git a/numpy/core/tests/test_indexing.py b/numpy/core/tests/test_indexing.py
new file mode 100644
index 000000000..da19d2a2a
--- /dev/null
+++ b/numpy/core/tests/test_indexing.py
@@ -0,0 +1,25 @@
+import numpy as np
+from numpy.compat import asbytes
+from numpy.testing import *
+import sys, warnings
+
+# The C implementation of fancy indexing is relatively complicated,
+# and has many seeming inconsistencies. It also appears to lack any
+# kind of test suite, making any changes to the underlying code difficult
+# because of its fragility.
+
+# This file is to remedy the test suite part a little bit,
+# but hopefully NumPy indexing can be changed to be more systematic
+# at some point in the future.
+
+def test_boolean_indexing():
+    # Indexing a 2-dimensional array with a length-1 array of 'True'
+    a = np.array([[ 0.,  0.,  0.]])
+    b = np.array([ True], dtype=bool)
+    assert_equal(a[b], a)  # the mask selects the single row
+
+    a[b] = 1.  # assigning through the same mask fills that row
+    assert_equal(a, [[1., 1., 1.]])
+
+if __name__ == "__main__":
+    run_module_suite()
diff --git a/numpy/core/tests/test_maskna.py b/numpy/core/tests/test_maskna.py
new file mode 100644
index 000000000..bbda9f5fa
--- /dev/null
+++ b/numpy/core/tests/test_maskna.py
@@ -0,0 +1,1489 @@
+import numpy as np
+from numpy.compat import asbytes
+from numpy.testing import *
+import sys, warnings
+
+def test_array_maskna_flags():
+    a = np.arange(3)
+    assert_(not a.flags.maskna)
+    assert_(not a.flags.ownmaskna)
+    assert_(not a.flags['MASKNA'])
+    assert_(not a.flags['OWNMASKNA'])
+    # Add a mask by setting the flag
+    a.flags.maskna = True
+    assert_(a.flags.maskna)
+    assert_(a.flags.ownmaskna)
+    assert_(a.flags['MASKNA'])
+    assert_(a.flags['OWNMASKNA'])
+    # Can't remove the mask once it's created
+    def setmaskna(x, v):
+        x.maskna = v
+    assert_raises(ValueError, setmaskna, a.flags, False)
+    def setownmaskna(x, v):
+        x.ownmaskna = v
+    assert_raises(ValueError, setownmaskna, a.flags, False)
+
+def test_array_maskna_construction():
+    # Construction with NA inputs
+    a = np.array([1.0, 2.0, np.NA, 7.0], maskna=True)
+    assert_equal(a.dtype, np.dtype('f8'))
+    assert_(a.flags.maskna)
+    assert_equal(type(a[2]), np.NAType)
+    # Without the 'maskna=True', still produces an NA mask if NA is there
+    a = np.array([1.0, 2.0, np.NA, 7.0])
+    assert_equal(a.dtype, np.dtype('f8'))
+    assert_(a.flags.maskna)
+    assert_equal(type(a[2]), np.NAType)
+    # Without any NAs, does not produce an NA mask
+    a = np.array([1.0, 2.0, 4.0, 7.0])
+    assert_equal(a.dtype, np.dtype('f8'))
+    assert_(not a.flags.maskna)
+
+    # From np.NA as a straight scalar
+    a = np.array(np.NA, maskna=True)
+    assert_equal(type(a), np.ndarray)
+    assert_(np.isna(a))
+
+    # As a special case, converting np.NA to an array produces
+    # a zero-dimensional masked array
+    a = np.array(np.NA)
+    assert_equal(type(a), np.ndarray)
+    assert_(np.isna(a))
+
+    # The data type defaults to the same as an empty array if all is NA
+    a = np.array([np.NA], maskna=True)
+    b = np.array([])
+    assert_equal(a.dtype, b.dtype)
+    assert_(np.isna(a))
+
+    a = np.zeros((3,))
+    assert_(not a.flags.maskna)
+    a = np.zeros((3,), maskna=True)
+    assert_(a.flags.maskna)
+    assert_equal(np.isna(a), False)
+
+    a = np.ones((3,))
+    assert_(not a.flags.maskna)
+    a = np.ones((3,), maskna=True)
+    assert_(a.flags.maskna)
+    assert_equal(np.isna(a), False)
+
+    # np.empty returns all NAs if maskna is set to True
+    a = np.empty((3,))
+    assert_(not a.flags.maskna)
+    a = np.empty((3,), maskna=True)
+    assert_(a.flags.maskna)
+    assert_equal(np.isna(a), True)
+
+    # np.empty_like returns all NAs if maskna is set to True
+    tmp = np.arange(3)
+    a = np.empty_like(tmp)
+    assert_(not a.flags.maskna)
+    a = np.empty_like(tmp, maskna=True)
+    assert_(a.flags.maskna)
+    assert_equal(np.isna(a), True)
+
+def test_array_maskna_asarray():
+    a = np.arange(6).reshape(2,3)
+
+    # Should not add an NA mask by default
+    res = np.asarray(a)
+    assert_(res is a)
+    assert_(not res.flags.maskna)
+
+    # Should add an NA mask if requested
+    res = np.asarray(a, maskna=True)
+    assert_(res.flags.maskna)
+    assert_(res.flags.ownmaskna)
+    res = np.asarray(a, ownmaskna=True)
+    assert_(res.flags.maskna)
+    assert_(res.flags.ownmaskna)
+
+    a.flags.maskna = True
+
+    # Should view or create a copy of the NA mask
+    res = np.asarray(a)
+    assert_(res is a)
+    res = np.asarray(a, maskna=True)
+    assert_(res is a)
+    res = np.asarray(a, ownmaskna=True)
+    assert_(res is a)
+
+    b = a.view()
+    assert_(not b.flags.ownmaskna)
+
+    res = np.asarray(b)
+    assert_(res is b)
+    res = np.asarray(b, maskna=True)
+    assert_(res is b)
+    res = np.asarray(b, ownmaskna=True)
+    assert_(not (res is b))
+    assert_(res.flags.ownmaskna)
+
+def test_array_maskna_copy():
+    a = np.array([1,2,3])
+    b = np.array([2,3,4], maskna=True)
+    c = np.array([3,4,np.NA], maskna=True)
+
+    # Make a copy, adding a mask
+    res = a.copy(maskna=True)
+    assert_equal(res, a)
+    assert_(res.flags.maskna)
+    assert_(res.flags.ownmaskna)
+
+    res = np.copy(a, maskna=True)
+    assert_equal(res, a)
+    assert_(res.flags.maskna)
+    assert_(res.flags.ownmaskna)
+
+    # Make a copy, removing a mask
+    res = b.copy(maskna=False)
+    assert_equal(res, b)
+    assert_(not res.flags.maskna)
+    assert_(not res.flags.ownmaskna)
+
+    res = np.copy(b, maskna=False)
+    assert_equal(res, b)
+    assert_(not res.flags.maskna)
+    assert_(not res.flags.ownmaskna)
+
+    # Copying while removing a mask fails if there are NAs
+    assert_raises(ValueError, c.copy, maskna=False)
+    assert_raises(ValueError, np.copy, c, maskna=False)
+
+    # Make a copy, preserving non-masked
+    res = a.copy()
+    assert_equal(res, a)
+    assert_(not res.flags.maskna)
+    assert_(not res.flags.ownmaskna)
+
+    res = np.copy(a)
+    assert_equal(res, a)
+    assert_(not res.flags.maskna)
+    assert_(not res.flags.ownmaskna)
+
+    # Make a copy, preserving masked
+    res = b.copy()
+    assert_equal(res, b)
+    assert_(res.flags.maskna)
+    assert_(res.flags.ownmaskna)
+
+    res = np.copy(b)
+    assert_equal(res, b)
+    assert_(res.flags.maskna)
+    assert_(res.flags.ownmaskna)
+
+    # Make a copy, preserving masked with an NA
+    res = c.copy()
+    assert_array_equal(res, c)
+    assert_(res.flags.maskna)
+    assert_(res.flags.ownmaskna)
+
+    res = np.copy(c)
+    assert_array_equal(res, c)
+    assert_(res.flags.maskna)
+    assert_(res.flags.ownmaskna)
+
+def test_array_maskna_repr():
+    # Test some simple reprs with NA in them
+    a = np.array(np.NA, maskna=True)
+    assert_equal(repr(a), 'array(NA, dtype=float64)')
+    a = np.array(3, maskna=True)
+    assert_equal(repr(a), 'array(3, maskna=True)')
+    a = np.array([np.NA, 3], maskna=True)
+    assert_equal(repr(a), 'array([NA, 3])')
+    a = np.array([np.NA, np.NA])
+    assert_equal(repr(a), 'array([ NA,  NA], dtype=float64)')
+    a = np.array([3.5, np.NA], maskna=True)
+    assert_equal(repr(a), 'array([ 3.5,   NA])')
+    a = np.array([3.75, 6.25], maskna=True)
+    assert_equal(repr(a), 'array([ 3.75,  6.25], maskna=True)')
+    a = np.array([3.75, 6.25], maskna=True, dtype='f4')
+    assert_equal(repr(a), 'array([ 3.75,  6.25], maskna=True, dtype=float32)')
+
+def test_isna():
+    # Objects which are not np.NA or ndarray all return False
+    assert_equal(np.isna(True), False)
+    assert_equal(np.isna("abc"), False)
+    assert_equal(np.isna([1,2,3]), False)
+    assert_equal(np.isna({3:5}), False)
+    # Various NA values return True
+    assert_equal(np.isna(np.NA), True)
+    assert_equal(np.isna(np.NA()), True)
+    assert_equal(np.isna(np.NA(5)), True)
+    assert_equal(np.isna(np.NA(dtype='f4')), True)
+    assert_equal(np.isna(np.NA(12,dtype='f4')), True)
+
+def test_array_maskna_item():
+    # With a zero-dimensional array
+    a = np.array(np.NA, maskna=True)
+
+    # Should return NA as the item
+    assert_equal(type(a.item()), np.NAType)
+
+    # Should be able to set the item
+    a.itemset(1.5)
+    assert_(not np.isna(a))
+    assert_equal(a, 1.5)
+    a.itemset(np.NA)
+    assert_(np.isna(a))
+
+    # With a one-dimensional array
+    a = np.array([1, np.NA, 2, np.NA], maskna=True)
+
+    # Should return the scalar or NA as the item
+    assert_(not np.isna(a.item(0)))
+    assert_equal(type(a.item(1)), np.NAType)
+
+    # Should be able to set the items
+    a.itemset(0, np.NA)
+    assert_(np.isna(a[0]))
+    a.itemset(1, 12)
+    assert_(not np.isna(a[1]))
+    assert_equal(a[1], 12)
+
+    # With a two-dimensional array
+    a = np.arange(6, maskna=True).reshape(2,3)
+    a[0,1] = np.NA
+    # Should return the scalar or NA as the item
+    assert_(not np.isna(a.item((0,0))))
+    assert_equal(type(a.item((0,1))), np.NAType)
+
+    # Should be able to set the items
+    a.itemset((0,1), 8)
+    assert_(not np.isna(a[0,1]))
+    assert_equal(a[0,1], 8)
+    a.itemset((1,1), np.NA)
+    assert_(np.isna(a[1,1]))
+
+def test_array_maskna_payload():
+    # Single numbered index
+    a = np.zeros((2,), maskna=True)
+    a[0] = np.NA
+    assert_equal(a[0].payload, None)
+
+    # Tuple index
+    a = np.zeros((2,3), maskna=True)
+    a[1,1] = np.NA
+    assert_equal(a[1,1].payload, None)
+
+def test_array_maskna_isna_1D():
+    a = np.arange(10)
+
+    # With no mask, it returns all False
+    assert_equal(np.isna(a), False)
+    assert_equal(np.isna(a).shape, (10,))
+
+    # With a mask but no NAs, it still returns all False
+    a.flags.maskna = True
+    assert_equal(np.isna(a), False)
+    assert_equal(np.isna(a).shape, (10,))
+
+    # Checking isna of a single value
+    assert_equal(np.isna(a[4]), False)
+    # Assigning NA to a single value
+    a[3] = np.NA
+    assert_equal(np.isna(a), [0,0,0,1,0,0,0,0,0,0])
+    # Checking isna of a single value
+    assert_equal(np.isna(a[3]), True)
+
+    # Checking isna of a slice
+    assert_equal(np.isna(a[1:6]), [0,0,1,0,0])
+    # Assigning NA to a slice
+    a[5:7] = np.NA
+    assert_equal(np.isna(a), [0,0,0,1,0,1,1,0,0,0])
+
+    # Checking isna of a strided slice
+    assert_equal(np.isna(a[1:8:2]), [0,1,1,0])
+    # Assigning NA to a strided slice
+    a[2:10:3] = np.NA
+    assert_equal(np.isna(a), [0,0,1,1,0,1,1,0,1,0])
+
+    # Checking isna of a boolean mask index
+    mask = np.array([1,1,0,0,0,1,0,1,1,0], dtype='?')
+    assert_equal(np.isna(a[mask]), [0,0,1,0,1])
+    # Assigning NA to a boolean masked index
+    a[mask] = np.NA
+    assert_equal(np.isna(a), [1,1,1,1,0,1,1,1,1,0])
+
+    # TODO: fancy indexing is next...
+
+def test_array_maskna_isna_2D():
+    a = np.zeros((3,4))
+
+    # With no mask, it returns all False
+    assert_equal(np.isna(a), False)
+    assert_equal(np.isna(a).shape, (3,4))
+
+    # With a mask but no NAs, it still returns all False
+    a.flags.maskna = True
+    assert_equal(np.isna(a), False)
+    assert_equal(np.isna(a).shape, (3,4))
+
+    # Checking isna of a single value
+    assert_equal(np.isna(a[1,2]), False)
+    # Assigning NA to a single value
+    a[1,2] = np.NA
+    assert_equal(np.isna(a), [[0,0,0,0],[0,0,1,0],[0,0,0,0]])
+    # Checking isna of a single value
+    assert_equal(np.isna(a[1,2]), True)
+
+    # Checking isna of a slice
+    assert_equal(np.isna(a[1:4,1:3]), [[0,1],[0,0]])
+    # Assigning NA to a slice
+    a[1:3,0:2] = np.NA
+    assert_equal(np.isna(a), [[0,0,0,0],[1,1,1,0],[1,1,0,0]])
+
+    # Checking isna of a strided slice
+    assert_equal(np.isna(a[1:,1:5:2]), [[1,0],[1,0]])
+    # Assigning NA to a strided slice
+    a[::2,::2] = np.NA
+    assert_equal(np.isna(a), [[1,0,1,0],[1,1,1,0],[1,1,1,0]])
+
+    # Checking isna of a boolean mask index
+    mask = np.array([[1,1,0,0],[0,1,0,1],[0,0,1,0]], dtype='?')
+    assert_equal(np.isna(a[mask]), [1,0,1,0,1])
+    # Assigning NA to a boolean masked index
+    a[mask] = np.NA
+    assert_equal(np.isna(a), [[1,1,1,0],[1,1,1,1],[1,1,1,0]])
+
+    # TODO: fancy indexing is next...
+
+def test_array_maskna_to_nomask():
+    # Assignment from an array with NAs to a non-masked array,
+    # excluding the NAs with a mask
+    a = np.array([[2,np.NA,5],[1,6,np.NA]], maskna=True)
+    mask = np.array([[1,0,0],[1,1,0]], dtype='?')
+    badmask = np.array([[1,0,0],[0,1,1]], dtype='?')
+    expected = np.array([[2,1,2],[1,6,5]])
+
+    # With masked indexing
+    b = np.arange(6).reshape(2,3)
+    b[mask] = a[mask]
+    assert_array_equal(b, expected)
+
+    # With copyto
+    b = np.arange(6).reshape(2,3)
+    np.copyto(b, a, where=mask)
+    assert_array_equal(b, expected)
+
+    # With masked indexing, a mask that selects an NA must raise
+    b = np.arange(6).reshape(2,3)
+    def asn():
+        b[badmask] = a[badmask]
+    assert_raises(ValueError, asn)
+
+    # With copyto, a mask that selects an NA must raise
+    b = np.arange(6).reshape(2,3)
+    assert_raises(ValueError, np.copyto, b, a, where=badmask)
+
+def test_array_maskna_view_function():
+    a = np.arange(10)
+
+    # Taking a view of a non-masked array, making sure there's a mask
+    b = a.view(maskna=True)
+    assert_(not a.flags.maskna)
+    assert_(b.flags.maskna)
+    assert_(b.flags.ownmaskna)
+
+    # Taking a view of a non-masked array, making sure there's an owned mask
+    b = a.view(ownmaskna=True)
+    assert_(not a.flags.maskna)
+    assert_(b.flags.maskna)
+    assert_(b.flags.ownmaskna)
+
+    # Taking a view of a masked array
+    c = b.view()
+    assert_(b.flags.maskna)
+    assert_(b.flags.ownmaskna)
+    assert_(c.flags.maskna)
+    assert_(not c.flags.ownmaskna)
+
+    # Taking a view of a masked array with maskna=False is invalid
+    assert_raises(ValueError, b.view, maskna=False)
+
+    # Taking a view of a masked array, making sure there's a mask
+    c = b.view(maskna = True)
+    assert_(b.flags.maskna)
+    assert_(b.flags.ownmaskna)
+    assert_(c.flags.maskna)
+    assert_(not c.flags.ownmaskna)
+
+    # Taking a view of a masked array, making sure there's an owned mask
+    c = b.view(ownmaskna = True)
+    assert_(b.flags.maskna)
+    assert_(b.flags.ownmaskna)
+    assert_(c.flags.maskna)
+    assert_(c.flags.ownmaskna)
+
+def test_array_maskna_array_function_1D():
+    a = np.arange(10)
+    a_ref = a.copy()
+    b = a.view(maskna=True)
+    b[3:10:2] = np.NA
+    b_view = b.view()
+
+    # Ensure the setup is correct
+    assert_(not a.flags.maskna)
+    assert_(b.flags.maskna)
+    assert_(b.flags.ownmaskna)
+    assert_(b_view.flags.maskna)
+    assert_(not b_view.flags.ownmaskna)
+
+    # Should be able to add a mask with 'maskna='
+    c = np.array(a, maskna=True)
+    assert_(c.flags.maskna)
+    assert_(c.flags.ownmaskna)
+    assert_(not (c is b))
+
+    # Should be able to add a mask with 'ownmaskna='
+    c = np.array(a, ownmaskna=True)
+    assert_(c.flags.maskna)
+    assert_(c.flags.ownmaskna)
+    assert_(not (c is b))
+
+    # Should propagate mask
+    c = np.array(b)
+    assert_(c.flags.maskna)
+    assert_(c.flags.ownmaskna)
+    assert_equal(np.isna(b), np.isna(c))
+    assert_(not (c is b))
+
+    # Should propagate mask with 'maskna=True'
+    c = np.array(b, maskna=True)
+    assert_(c.flags.maskna)
+    assert_(c.flags.ownmaskna)
+    assert_equal(np.isna(b), np.isna(c))
+    assert_(not (c is b))
+
+    # Should propagate mask with 'ownmaskna=True'
+    c = np.array(b, ownmaskna=True)
+    assert_(c.flags.maskna)
+    assert_(c.flags.ownmaskna)
+    assert_equal(np.isna(b), np.isna(c))
+    assert_(not (c is b))
+
+    # Should be able to pass it through
+    c = np.array(b, copy=False)
+    assert_(c is b)
+
+    # Should be able to pass it through with 'maskna=True'
+    c = np.array(b, copy=False, maskna=True)
+    assert_(c is b)
+
+    # Should be able to pass a view through with 'maskna=True'
+    c = np.array(b_view, copy=False, maskna=True)
+    assert_(c is b_view)
+
+    # Should be able to pass an owned mask through with 'ownmaskna=True'
+    c = np.array(b, copy=False, ownmaskna=True)
+    assert_(c is b)
+
+    # Should produce a view with an owned mask with 'ownmaskna=True'
+    c = np.array(b_view, copy=False, ownmaskna=True)
+    assert_(c.base is b_view)
+    assert_(c.flags.ownmaskna)
+    assert_(not (c is b_view))
+
+    # Should produce a view whose base is 'c', because 'c' owns
+    # the data for its mask
+    d = c.view()
+    assert_(d.base is c)
+    assert_(d.flags.maskna)
+    assert_(not d.flags.ownmaskna)
+
+def test_array_maskna_setasflat():
+    # Copy from a C to an F array with some NAs
+    a_orig = np.empty((2,3), order='C')
+    b_orig = np.empty((3,2), order='F')
+    a = a_orig.view(maskna=True)
+    b = b_orig.view(maskna=True)
+    a[...] = 1
+    a[0,1] = np.NA
+    a[1,2] = np.NA
+    b[...] = 2
+    b.setasflat(a)
+    assert_equal(np.isna(a), [[0,1,0],[0,0,1]])
+    assert_equal(b_orig, [[1,2],[1,1],[1,2]])
+    assert_equal(np.isna(b), [[0,1],[0,0],[0,1]])
+
+def test_array_maskna_ravel():
+    # From a C array
+    a = np.zeros((2,3), maskna=True, order='C')
+    a[0,1] = np.NA
+    a[1,2] = np.NA
+
+    # Ravel in C order returns a view
+    b = np.ravel(a)
+    assert_(b.base is a)
+    assert_equal(b.shape, (6,))
+    assert_(b.flags.maskna)
+    assert_(not b.flags.ownmaskna)
+    assert_equal(np.isna(b), [0,1,0,0,0,1])
+
+    # Ravel in F order returns a copy
+    b = np.ravel(a, order='F')
+    assert_(b.base is None)
+    assert_equal(b.shape, (6,))
+    assert_(b.flags.maskna)
+    assert_(b.flags.ownmaskna)
+    assert_equal(np.isna(b), [0,0,1,0,0,1])
+
+    a = np.arange(12, maskna=True).reshape(2,3,2).swapaxes(1,2)
+    assert_equal(a.ravel(order='K'), np.arange(12))
+
+def test_array_maskna_reshape():
+    # Simple reshape 1D -> 2D
+    a = np.arange(6, maskna=True)
+    a[1] = np.NA
+    a[5] = np.NA
+
+    # Reshape from 1D to C order
+    b = a.reshape(2,3)
+    assert_(b.base is a)
+    assert_equal(b.shape, (2,3))
+    assert_(b.flags.maskna)
+    assert_(not b.flags.ownmaskna)
+    assert_equal(np.isna(b), [[0,1,0],[0,0,1]])
+
+    # Reshape from 1D to F order
+    b = a.reshape(2,3,order='F')
+    assert_(b.base is a)
+    assert_equal(b.shape, (2,3))
+    assert_(b.flags.maskna)
+    assert_(not b.flags.ownmaskna)
+    assert_equal(np.isna(b), [[0,0,0],[1,0,1]])
+
+    # Add a new axis using 'newaxis'
+    a = np.array(np.NA, maskna=True)
+    assert_equal(np.isna(a[np.newaxis]), [True])
+
+def test_array_maskna_view_NA_assignment_1D():
+    a = np.arange(10)
+    a_ref = a.copy()
+
+    # Make sure that assigning NA doesn't affect the original data
+    b = a.view(maskna=True)
+    b[...] = np.NA
+    assert_equal(np.isna(b), True)
+    assert_equal(a, a_ref)
+
+    b = a.view(maskna=True)
+    b[:] = np.NA
+    assert_equal(np.isna(b), True)
+    assert_equal(a, a_ref)
+
+    b = a.view(maskna=True)
+    b[3:5] = np.NA
+    assert_equal(np.isna(b), [0,0,0,1,1,0,0,0,0,0])
+    assert_equal(a, a_ref)
+
+    b = a.view(maskna=True)
+    b[3:10:3] = np.NA
+    assert_equal(np.isna(b), [0,0,0,1,0,0,1,0,0,1])
+    assert_equal(a, a_ref)
+
+    b = a.view(maskna=True)
+    b[3] = np.NA
+    assert_equal(np.isna(b), [0,0,0,1,0,0,0,0,0,0])
+    assert_equal(a, a_ref)
+
+    b = a.view(maskna=True)
+    mask = np.array([0,1,0,1,1,0,0,0,1,1], dtype='?')
+    b[mask] = np.NA
+    assert_equal(np.isna(b), mask)
+    assert_equal(a, a_ref)
+
+    # TODO: fancy indexing is next...
+
+def test_array_maskna_view_NA_assignment_2D():
+    a = np.arange(6).reshape(2,3)
+    a_ref = a.copy()
+
+    # Make sure that assigning NA doesn't affect the original data
+    b = a.view(maskna=True)
+    b[...] = np.NA
+    assert_equal(np.isna(b), True)
+    assert_equal(a, a_ref)
+
+    b = a.view(maskna=True)
+    b[:] = np.NA
+    assert_equal(np.isna(b), True)
+    assert_equal(a, a_ref)
+
+    b = a.view(maskna=True)
+    b[0,:] = np.NA
+    assert_equal(np.isna(b[0]), True)
+    assert_equal(np.isna(b[1]), False)
+    assert_equal(a, a_ref)
+
+    b = a.view(maskna=True)
+    b[1:,1:3] = np.NA
+    assert_equal(np.isna(b), [[0,0,0],[0,1,1]])
+    assert_equal(a, a_ref)
+
+    b = a.view(maskna=True)
+    b[1,::2] = np.NA
+    assert_equal(np.isna(b), [[0,0,0],[1,0,1]])
+    assert_equal(a, a_ref)
+
+    b = a.view(maskna=True)
+    b[0,2] = np.NA
+    assert_equal(np.isna(b), [[0,0,1],[0,0,0]])
+    assert_equal(a, a_ref)
+
+    b = a.view(maskna=True)
+    mask = np.array([[1,0,1],[1,1,0]], dtype='?')
+    b[mask] = np.NA
+    assert_equal(np.isna(b), mask)
+    assert_equal(a, a_ref)
+
+    # TODO: fancy indexing is next...
+
+def test_array_maskna_view_array_assignment_1D():
+    a = np.arange(5)
+    b = a.view(maskna=True)
+
+    # Assigning a constant scalar should unmask the values
+    b[...] = np.NA
+    b[...] = 3
+    assert_equal(a, 3)
+    assert_equal(np.isna(b), False)
+
+    # Assigning from a length-1 list should broadcast and unmask the values
+    b[...] = np.NA
+    b[...] = [2]
+    assert_equal(a, [2,2,2,2,2])
+    assert_equal(np.isna(b), False)
+
+    # Assigning from a list should unmask the values
+    b[...] = np.NA
+    b[...] = [2,3,4,5,6]
+    assert_equal(a, [2,3,4,5,6])
+    assert_equal(np.isna(b), False)
+
+    # Assigning from a list with NAs should unmask the non-NA values
+    b[...] = np.NA
+    b[...] = [7,np.NA,2,0,np.NA]
+    assert_equal(a, [7,3,2,0,6])
+    assert_equal(np.isna(b), [0,1,0,0,1])
+
+    # Assigning from an unmasked array should unmask the values
+    b[...] = np.NA
+    b[...] = np.arange(5)
+    assert_equal(a, np.arange(5))
+    assert_equal(np.isna(b), False)
+
+    # Assigning from a masked array with no NAs should unmask the values
+    b[...] = np.NA
+    tmp = np.arange(5) + 1
+    tmp.flags.maskna = True
+    b[...] = tmp
+    assert_equal(a, np.arange(5) + 1)
+    assert_equal(np.isna(b), False)
+
+    # Assigning from a masked array with some NAs should unmask most
+    # of the values, and leave the values behind the NAs untouched
+    b[...] = np.NA
+    tmp = np.arange(5) + 5
+    tmp.flags.maskna = True
+    tmp[2] = np.NA
+    b[...] = tmp
+    assert_equal(a, [5,6,3,8,9])
+    assert_equal(np.isna(b), [0,0,1,0,0])
+
+    # Assigning to a single element should unmask the value
+    b[...] = np.NA
+    b[2] = 10
+    assert_equal(a, [5,6,10,8,9])
+    assert_equal(np.isna(b), [1,1,0,1,1])
+
+    # Assigning to a simple slice should unmask the values
+    b[...] = np.NA
+    b[2:] = 4
+    assert_equal(a, [5,6,4,4,4])
+    assert_equal(np.isna(b), [1,1,0,0,0])
+
+    # Assigning to a strided slice should unmask the values
+    b[...] = np.NA
+    b[3::-2] = 12
+    assert_equal(a, [5,12,4,12,4])
+    assert_equal(np.isna(b), [1,0,1,0,1])
+
+    # Assigning to a boolean index should unmask the values
+    b[...] = np.NA
+    mask = np.array([0,1,1,0,1], dtype='?')
+    b[mask] = 7
+    assert_equal(a, [5,7,7,12,7])
+    assert_equal(np.isna(b), [1,0,0,1,0])
+
+    # Assigning a list to a boolean index should unmask the values
+    b[...] = np.NA
+    mask = np.array([1,0,0,0,1], dtype='?')
+    b[mask] = [8,1]
+    assert_equal(a, [8,7,7,12,1])
+    assert_equal(np.isna(b), [0,1,1,1,0])
+
+    # Assigning a list with NA to a boolean index should unmask non-NA values
+    b[...] = np.NA
+    mask = np.array([0,1,1,0,0], dtype='?')
+    b[mask] = np.array([8,np.NA], maskna=True)
+    assert_equal(a, [8,8,7,12,1])
+    assert_equal(np.isna(b), [1,0,1,1,1])
+
+    # TODO: fancy indexing is next...
+
+def test_maskna_nonzero_1D():
+    a = np.zeros((5,), maskna=True)
+
+    # The nonzeros without any NAs
+    assert_equal(np.count_nonzero(a), 0)
+    assert_equal(np.nonzero(a)[0], [])
+    a[2] = 3
+    assert_equal(np.count_nonzero(a), 1)
+    assert_equal(np.nonzero(a)[0], [2])
+    a[3:] = 2
+    assert_equal(np.count_nonzero(a), 3)
+    assert_equal(np.nonzero(a)[0], [2,3,4])
+
+    # The nonzeros with an NA
+    a[2] = np.NA
+    assert_(np.isna(np.count_nonzero(a)))
+    assert_raises(ValueError, np.nonzero, a)
+
+def test_maskna_take_1D():
+    a = np.arange(5, maskna=True)
+    b = np.arange(3)
+    c = b.view(maskna=True)
+
+    # Take without any NAs
+    assert_equal(a.take([0,2,4]), [0,2,4])
+
+    # Take without any NAs, into non-NA output parameter
+    a.take([0,2,4], out=b)
+    assert_equal(b, [0,2,4])
+
+    # Take without any NAs, into NA output parameter
+    b[...] = 1
+    c[...] = np.NA
+    a.take([0,2,4], out=c)
+    assert_equal(c, [0,2,4])
+
+    # Take with some NAs
+    a[2] = np.NA
+    a[3] = np.NA
+    ret = a.take([0,2,4])
+    assert_equal([ret[0], ret[2]], [0,4])
+    assert_equal(np.isna(ret), [0,1,0])
+
+    # Take with some NAs, into NA output parameter
+    b[...] = 1
+    c[...] = np.NA
+    a.take([0,2,4], out=c)
+    assert_equal(b, [0,1,4])
+    assert_equal([c[0], c[2]], [0,4])
+    assert_equal(np.isna(c), [0,1,0])
+
+    c[...] = 1
+    a.take([0,2,4], out=c)
+    assert_equal(b, [0,1,4])
+    assert_equal([c[0], c[2]], [0,4])
+    assert_equal(np.isna(c), [0,1,0])
+
+    # Take with an NA just at the start
+    a = np.arange(5, maskna=True)
+    a[0] = np.NA
+    res = a.take([1,2,3,4])
+    assert_equal(res, [1,2,3,4])
+
+def test_maskna_ufunc_1D():
+    a_orig = np.arange(3)
+    a = a_orig.view(maskna=True)
+    b_orig = np.array([5,4,3])
+    b = b_orig.view(maskna=True)
+    c_orig = np.array([0,0,0])
+    c = c_orig.view(maskna=True)
+
+    # An NA mask is produced if an operand has one
+    res = a + b_orig
+    assert_(res.flags.maskna)
+    assert_equal(res, [5,5,5])
+
+    res = b_orig + a
+    assert_(res.flags.maskna)
+    assert_equal(res, [5,5,5])
+
+    # Can still output to a non-NA array if there are no NAs
+    np.add(a, b, out=c_orig)
+    assert_equal(c_orig, [5,5,5])
+
+    # Should unmask everything if the output has NA support but
+    # the inputs don't
+    c_orig[...] = 0
+    c[...] = np.NA
+    np.add(a_orig, b_orig, out=c)
+    assert_equal(c, [5,5,5])
+
+    # If the input has NA support but an output parameter doesn't,
+    # should work as long as the inputs contain no NAs
+    c_orig[...] = 0
+    np.add(a, b, out=c_orig)
+    assert_equal(c_orig, [5,5,5])
+
+    # An NA is produced if either operand has one
+    a[0] = np.NA
+    b[1] = np.NA
+    res = a + b
+    assert_equal(np.isna(res), [1,1,0])
+    assert_equal(res[2], 5)
+
+    # If the output contains NA, can't have out= parameter without
+    # NA support
+    assert_raises(ValueError, np.add, a, b, out=c_orig)
+
+    # Divide in-place with NA
+    a_orig = np.array([[3], [12.]])
+    a = a_orig.view(maskna=True)
+    a[0,0] = np.NA
+    a /= 3
+    # Shouldn't have touched the masked element
+    assert_array_equal(a_orig, [[3], [4.]])
+    assert_array_equal(a, [[np.NA], [4.]])
+    # double-check assertions
+    assert_equal(np.isna(a), [[1], [0]])
+    assert_equal(a[~np.isna(a)], [4.])
+
+def test_maskna_ufunc_sum_1D():
+    check_maskna_ufunc_sum_1D(np.sum)
+
+def test_maskna_ufunc_add_reduce_1D():
+    check_maskna_ufunc_sum_1D(np.add.reduce)
+
+def check_maskna_ufunc_sum_1D(sum_func):
+    a = np.arange(3.0, maskna=True)
+    b = np.array(0.5)
+    c_orig = np.array(0.5)
+    c = c_orig.view(maskna=True)
+
+    # Since 'a' has no NA values, this should work
+    sum_func(a, out=b)
+    assert_equal(b, 3.0)
+    b[...] = 7
+    sum_func(a, skipna=True, out=b)
+    assert_equal(b, 3.0)
+
+    ret = sum_func(a)
+    assert_equal(ret, 3.0)
+    ret = sum_func(a, skipna=True)
+    assert_equal(ret, 3.0)
+
+    # With an NA value, the reduce should throw with the non-NA output param
+    a[1] = np.NA
+    assert_raises(ValueError, sum_func, a, out=b)
+
+    # With an NA value, the output parameter can still be an NA-array
+    c_orig[...] = 0.5
+    sum_func(a, out=c)
+    assert_equal(c_orig, 0.5)
+    assert_(np.isna(c))
+
+    # Should not touch the out= element when assigning NA
+    b[...] = 1.0
+    d = b.view(maskna=True)
+    sum_func(a, out=d)
+    assert_(np.isna(d))
+    assert_equal(b, 1.0)
+
+    # Without an output parameter, return NA
+    ret = sum_func(a)
+    assert_(np.isna(ret))
+
+    # With 'skipna=True'
+    ret = sum_func(a, skipna=True)
+    assert_equal(ret, 2.0)
+
+    # With 'skipna=True', and out= parameter
+    b[...] = 0.5
+    sum_func(a, skipna=True, out=b)
+    assert_equal(b, 2.0)
+
+    # With 'skipna=True', and out= parameter with a mask
+    c[...] = 0.5
+    c[...] = np.NA
+    sum_func(a, skipna=True, out=c)
+    assert_(not np.isna(c))
+    assert_equal(c, 2.0)
+
+def test_ufunc_max_1D():
+    check_ufunc_max_1D(np.max)
+
+def test_ufunc_maximum_reduce_1D():
+    check_ufunc_max_1D(np.maximum.reduce)
+
+def check_ufunc_max_1D(max_func):
+    a_orig = np.array([0, 3, 2, 10, -1, 5, 7, -2])
+    a = a_orig.view(maskna=True)
+
+    # Straightforward reduce with no NAs
+    b = max_func(a)
+    assert_equal(b, 10)
+
+    # Set the biggest value to NA
+    a[3] = np.NA
+    b = max_func(a)
+    assert_(np.isna(b))
+
+    # Skip the NA
+    b = max_func(a, skipna=True)
+    assert_(not b.flags.maskna)
+    assert_(not np.isna(b))
+    assert_equal(b, 7)
+
+    # Set the first value to NA
+    a[0] = np.NA
+    b = max_func(a, skipna=True)
+    assert_(not b.flags.maskna)
+    assert_(not np.isna(b))
+    assert_equal(b, 7)
+
+    # Set all the values to NA - should raise the same error as
+    # for an empty array
+    a[...] = np.NA
+    assert_raises(ValueError, max_func, a, skipna=True)
+
+def test_ufunc_skipna_max_3D():
+    check_ufunc_skipna_max_3D(np.max)
+
+def test_ufunc_skipna_maximum_reduce_3D():
+    check_ufunc_skipna_max_3D(np.maximum.reduce)
+
+def check_ufunc_skipna_max_3D(max_func):
+    a_orig = np.array([[[29,  6, 24, 11, 24],
+                    [17, 26, 10, 29, 21],
+                    [ 4,  4,  7,  9, 30],
+                    [ 9, 20,  5, 12, 23]],
+                   [[ 8,  9, 10, 31, 22],
+                    [ 5, 20,  2, 29, 27],
+                    [21, 22, 13, 30, 20],
+                    [24, 27,  9, 20, 31]],
+                   [[14,  0, 13, 11, 22],
+                    [ 0, 16, 16, 14,  2],
+                    [ 0,  2,  1, 29, 12],
+                    [24, 25, 12, 11,  9]]])
+    a = a_orig.view(maskna=True)
+    b = a_orig.copy()
+
+    def check_all_axis_combos(x, y, badaxes=()):
+        if 0 not in badaxes:
+            res = max_func(x, axis=0, skipna=True)
+            assert_array_equal(res, max_func(y, axis=0, skipna=True))
+        if 1 not in badaxes:
+            res = max_func(x, axis=1, skipna=True)
+            assert_array_equal(res, max_func(y, axis=1, skipna=True))
+        if 2 not in badaxes:
+            res = max_func(x, axis=2, skipna=True)
+            assert_array_equal(res, max_func(y, axis=2, skipna=True))
+        res = max_func(x, axis=(0,1), skipna=True)
+        assert_array_equal(res, max_func(y, axis=(0,1), skipna=True))
+        res = max_func(x, axis=(0,2), skipna=True)
+        assert_array_equal(res, max_func(y, axis=(0,2), skipna=True))
+        res = max_func(x, axis=(1,2), skipna=True)
+        assert_array_equal(res, max_func(y, axis=(1,2), skipna=True))
+        res = max_func(x, axis=(0,1,2), skipna=True)
+        assert_array_equal(res, max_func(y, axis=(0,1,2), skipna=True))
+
+    # Straightforward reduce with no NAs
+    check_all_axis_combos(a, b)
+
+    # Set a few values in 'a' to NA, and set the corresponding
+    # values in 'b' to -1 to definitely eliminate them from the maximum
+    for coord in [(0,1,2), (1,2,2), (0,2,4), (2,1,0)]:
+        a[coord] = np.NA
+        b[coord] = -1
+    check_all_axis_combos(a, b)
+
+    # Set a few more values in 'a' to NA
+    for coord in [(2,1,1), (2,1,2), (2,1,3), (0,0,4), (0,3,4)]:
+        a[coord] = np.NA
+        b[coord] = -1
+    check_all_axis_combos(a, b)
+
+    # Set it so that there's a full set of NAs along the third dimension
+    for coord in [(2,1,4)]:
+        a[coord] = np.NA
+        b[coord] = -1
+    check_all_axis_combos(a, b, badaxes=(2,))
+    assert_raises(ValueError, max_func, a, axis=2, skipna=True)
+
+    # Set it so that there's a full set of NAs along the second dimension
+    for coord in [(0,1,4)]:
+        a[coord] = np.NA
+        b[coord] = -1
+    check_all_axis_combos(a, b, badaxes=(1,2))
+    assert_raises(ValueError, max_func, a, axis=1, skipna=True)
+    assert_raises(ValueError, max_func, a, axis=2, skipna=True)
+
+def test_ufunc_ndarray_any():
+    a = np.array([0,0,0,0,0], dtype='?', maskna=True)
+    assert_array_equal(a.any(), False)
+    a[0] = np.NA
+    assert_array_equal(a.any(), np.NA)
+    assert_array_equal(a.any(skipna=True), False)
+    a[0] = 0
+    a[-1] = np.NA
+    assert_array_equal(a.any(), np.NA)
+    assert_array_equal(a.any(skipna=True), False)
+    a[0] = 1
+    assert_array_equal(a.any(), True)
+    assert_array_equal(a.any(skipna=True), True)
+    a[-1] = 1
+    a[-2] = np.NA
+    assert_array_equal(a.any(), True)
+    assert_array_equal(a.any(skipna=True), True)
+
+    a = np.array([[0,0,0],[0,np.NA,0]], dtype='?')
+    assert_array_equal(a.any(axis=0), [False, np.NA, False])
+    assert_array_equal(a.any(axis=1), [False, np.NA])
+    assert_array_equal(a.any(axis=0, skipna=True), [False, False, False])
+    assert_array_equal(a.any(axis=1, skipna=True), [False, False])
+
+    a[0,1] = 1
+    assert_array_equal(a.any(axis=0), [False, True, False])
+    assert_array_equal(a.any(axis=1), [True, np.NA])
+    assert_array_equal(a.any(axis=0, skipna=True), [False, True, False])
+    assert_array_equal(a.any(axis=1, skipna=True), [True, False])
+
+    a[0,1] = np.NA
+    a[1,1] = 0
+    a[0,2] = 1
+    assert_array_equal(a.any(axis=0), [False, np.NA, True])
+    assert_array_equal(a.any(axis=1), [True, False])
+    assert_array_equal(a.any(axis=0, skipna=True), [False, False, True])
+    assert_array_equal(a.any(axis=1, skipna=True), [True, False])
+
+def test_ufunc_ndarray_all():
+    a = np.array([1,1,1,1,1], dtype='?', maskna=True)
+    assert_array_equal(a.all(), True)
+    a[0] = np.NA
+    assert_array_equal(a.all(), np.NA)
+    assert_array_equal(a.all(skipna=True), True)
+    a[0] = 1
+    a[-1] = np.NA
+    assert_array_equal(a.all(), np.NA)
+    assert_array_equal(a.all(skipna=True), True)
+    a[0] = 0
+    assert_array_equal(a.all(), False)
+    assert_array_equal(a.all(skipna=True), False)
+    a[-1] = 0
+    a[-2] = np.NA
+    assert_array_equal(a.all(), False)
+    assert_array_equal(a.all(skipna=True), False)
+
+    a = np.array([[1,1,1],[1,np.NA,1]], dtype='?')
+    assert_array_equal(a.all(axis=0), [True, np.NA, True])
+    assert_array_equal(a.all(axis=1), [True, np.NA])
+    assert_array_equal(a.all(axis=0, skipna=True), [True, True, True])
+    assert_array_equal(a.all(axis=1, skipna=True), [True, True])
+
+    a[0,1] = 0
+    assert_array_equal(a.all(axis=0), [True, False, True])
+    assert_array_equal(a.all(axis=1), [False, np.NA])
+    assert_array_equal(a.all(axis=0, skipna=True), [True, False, True])
+    assert_array_equal(a.all(axis=1, skipna=True), [False, True])
+
+    a[0,1] = np.NA
+    a[1,1] = 1
+    a[0,2] = 0
+    assert_array_equal(a.all(axis=0), [True, np.NA, False])
+    assert_array_equal(a.all(axis=1), [False, True])
+    assert_array_equal(a.all(axis=0, skipna=True), [True, True, False])
+    assert_array_equal(a.all(axis=1, skipna=True), [False, True])
+
+def test_count_reduce_items():
+    # np.count_reduce_items
+
+    # When skipna is False, it should always return the product of
+    # the lengths of the reduction axes as a NumPy intp scalar
+    a = np.zeros((2,3,4))
+
+    res = np.count_reduce_items(a)
+    assert_equal(res, 24)
+    assert_equal(type(res), np.intp)
+
+    res = np.count_reduce_items(a, axis=0)
+    assert_equal(res, 2)
+    assert_equal(type(res), np.intp)
+
+    res = np.count_reduce_items(a, axis=(1,2))
+    assert_equal(res, 12)
+    assert_equal(type(res), np.intp)
+
+    # This still holds if 'a' has an NA mask and some NA values
+    a = np.zeros((2,3,4), maskna=True)
+    a[1,2,2] = np.NA
+    a[0,1,2] = np.NA
+    a[1,0,3] = np.NA
+
+    res = np.count_reduce_items(a)
+    assert_equal(res, 24)
+    assert_equal(type(res), np.intp)
+
+    res = np.count_reduce_items(a, axis=0)
+    assert_equal(res, 2)
+    assert_equal(type(res), np.intp)
+
+    res = np.count_reduce_items(a, axis=(1,2))
+    assert_equal(res, 12)
+    assert_equal(type(res), np.intp)
+
+    # If skipna is True, but the array has no NA mask, the result should
+    # still be the product of the lengths of the reduction axes
+    a = np.zeros((2,3,4))
+
+    res = np.count_reduce_items(a, skipna=True)
+    assert_equal(res, 24)
+    assert_equal(type(res), np.intp)
+
+    res = np.count_reduce_items(a, axis=0, skipna=True)
+    assert_equal(res, 2)
+    assert_equal(type(res), np.intp)
+
+    res = np.count_reduce_items(a, axis=(1,2), skipna=True)
+    assert_equal(res, 12)
+    assert_equal(type(res), np.intp)
+
+    # Finally, when skipna is True AND the array has an NA mask,
+    # we get an array counting the non-NA items in each reduction
+    a = np.zeros((2,3,4), maskna=True)
+    a[1,2,2] = np.NA
+    a[0,1,2] = np.NA
+    a[1,0,3] = np.NA
+
+    # When doing a full reduction, we should still get a scalar
+    res = np.count_reduce_items(a, skipna=True)
+    assert_equal(res, 21)
+    assert_equal(res.dtype, np.dtype(np.intp))
+
+    res = np.count_reduce_items(a, axis=0, skipna=True)
+    assert_equal(res, [[2,2,2,1], [2,2,1,2], [2,2,1,2]])
+    assert_equal(res.dtype, np.dtype(np.intp))
+
+    res = np.count_reduce_items(a, axis=(1,2), skipna=True)
+    assert_equal(res, [11,10])
+    assert_equal(res.dtype, np.dtype(np.intp))
+
+
+def test_array_maskna_clip_method():
+    # ndarray.clip, exercised through np.clip
+    a = np.array([2, np.NA, 10, 4, np.NA, 7], maskna=True)
+
+    b = np.clip(a, 3, None)
+    assert_equal(np.isna(b), [0,1,0,0,1,0])
+    assert_equal(b[~np.isna(b)], [3, 10, 4, 7])
+
+    res = np.clip(a, None, 6)
+    assert_equal(np.isna(res), [0,1,0,0,1,0])
+    assert_equal(res[~np.isna(res)], [2, 6, 4, 6])
+
+    res = np.clip(a, 4, 7)
+    assert_equal(np.isna(res), [0,1,0,0,1,0])
+    assert_equal(res[~np.isna(res)], [4, 7, 4, 7])
+
+def test_array_maskna_max_min_ptp_methods():
+    # ndarray.max, ndarray.min, ndarray.ptp
+    a = np.array([[2, np.NA, 10],
+                  [4, 8, 7],
+                  [12, 4, np.NA]], maskna=True)
+
+    res = a.max(axis=0)
+    assert_equal(np.isna(res), [0,1,1])
+    assert_equal(res[~np.isna(res)], [12])
+
+    res = a.max(axis=-1)
+    assert_equal(np.isna(res), [1,0,1])
+    assert_equal(res[~np.isna(res)], [8])
+
+    res = a.min(axis=0)
+    assert_equal(np.isna(res), [0,1,1])
+    assert_equal(res[~np.isna(res)], [2])
+
+    res = a.min(axis=-1)
+    assert_equal(np.isna(res), [1,0,1])
+    assert_equal(res[~np.isna(res)], [4])
+
+    res = a.ptp(axis=0)
+    assert_equal(np.isna(res), [0,1,1])
+    assert_equal(res[~np.isna(res)], [10])
+
+    res = a.ptp(axis=-1)
+    assert_equal(np.isna(res), [1,0,1])
+    assert_equal(res[~np.isna(res)], [4])
+
+def test_array_maskna_sum_prod_methods():
+    # ndarray.sum, ndarray.prod
+    a = np.array([[2, np.NA, 10],
+                  [4, 8, 7],
+                  [12, 4, np.NA],
+                  [3, 2, 5]], maskna=True)
+
+    res = a.sum(axis=0)
+    assert_equal(np.isna(res), [0,1,1])
+    assert_equal(res[~np.isna(res)], [21])
+
+    res = a.sum(axis=-1)
+    assert_equal(np.isna(res), [1,0,1,0])
+    assert_equal(res[~np.isna(res)], [19,10])
+
+    res = a.prod(axis=0)
+    assert_equal(np.isna(res), [0,1,1])
+    assert_equal(res[~np.isna(res)], [2*4*12*3])
+
+    res = a.prod(axis=-1)
+    assert_equal(np.isna(res), [1,0,1,0])
+    assert_equal(res[~np.isna(res)], [4*8*7,3*2*5])
+
+    # Check also with Fortran-order
+    a = np.array([[2, np.NA, 10],
+                  [4, 8, 7],
+                  [12, 4, np.NA],
+                  [3, 2, 5]], maskna=True, order='F')
+
+    res = a.sum(axis=0)
+    assert_equal(np.isna(res), [0,1,1])
+    assert_equal(res[~np.isna(res)], [21])
+
+    res = a.sum(axis=-1)
+    assert_equal(np.isna(res), [1,0,1,0])
+    assert_equal(res[~np.isna(res)], [19,10])
+
+
+def test_array_maskna_std_mean_methods():
+    # ndarray.std, ndarray.mean
+    a = np.array([[2, np.NA, 10],
+                  [4, 8, 7],
+                  [12, 4, np.NA]], maskna=True)
+
+    res = a.mean(axis=0)
+    assert_equal(np.isna(res), [0,1,1])
+    assert_equal(res[~np.isna(res)], [np.array([2,4,12]).mean()])
+
+    res = a.mean(axis=-1)
+    assert_equal(np.isna(res), [1,0,1])
+    assert_equal(res[~np.isna(res)], [np.array([4,8,7]).mean()])
+
+    res = a.std(axis=0)
+    assert_equal(np.isna(res), [0,1,1])
+    assert_equal(res[~np.isna(res)], [np.array([2,4,12]).std()])
+
+    res = a.std(axis=-1)
+    assert_equal(np.isna(res), [1,0,1])
+    assert_equal(res[~np.isna(res)], [np.array([4,8,7]).std()])
+
+def test_array_maskna_conjugate_method():
+    # ndarray.conjugate
+    a = np.array([1j, 2+4j, np.NA, 2-1.5j, np.NA], maskna=True)
+
+    b = a.conjugate()
+    assert_equal(np.isna(b), [0,0,1,0,1])
+    assert_equal(b[~np.isna(b)], [-1j, 2-4j, 2+1.5j])
+
+def test_array_maskna_diagonal():
+    # ndarray.diagonal
+    a = np.arange(6, maskna=True)
+    a.shape = (2,3)
+    a[0,1] = np.NA
+
+    # Should produce a view into a
+    res = a.diagonal()
+    assert_(res.base is a)
+    assert_(res.flags.maskna)
+    assert_(not res.flags.ownmaskna)
+    assert_equal(res, [0, 4])
+
+    res = a.diagonal(-1)
+    assert_equal(res, [3])
+
+    res = a.diagonal(-2)
+    assert_equal(res, [])
+
+    # This diagonal has the NA
+    res = a.diagonal(1)
+    assert_equal(np.isna(res), [1,0])
+    assert_equal(res[~np.isna(res)], [5])
+
+    res = a.diagonal(2)
+    assert_equal(res, [2])
+
+    res = a.diagonal(3)
+    assert_equal(res, [])
+
+def test_array_maskna_concatenate():
+    # np.concatenate
+    a = np.arange(6, maskna=True, dtype='i4').reshape(2,3)
+    a[1,0] = np.NA
+
+    b = np.array([[12],[13]], dtype='i4')
+    res = np.concatenate([a, b], axis=1)
+    assert_equal(np.isna(res), [[0,0,0,0], [1,0,0,0]])
+    assert_equal(res[~np.isna(res)], [0,1,2,12,4,5,13])
+    assert_equal(res.strides, (16, 4))
+
+    b = np.array([[10, np.NA, 11]], maskna=True, dtype='i4')
+    res = np.concatenate([a,b], axis=0)
+    assert_equal(np.isna(res), [[0,0,0], [1,0,0], [0,1,0]])
+    assert_equal(res[~np.isna(res)], [0,1,2,4,5,10,11])
+    assert_equal(res.strides, (12, 4))
+
+    b = np.array([[np.NA, 10]], order='F', maskna=True, dtype='i4')
+    res = np.concatenate([a.T, b], axis=0)
+    assert_equal(np.isna(res), [[0,1], [0,0], [0,0], [1,0]])
+    assert_equal(res[~np.isna(res)], [0,1,4,2,5,10])
+    assert_equal(res.strides, (4, 16))
+
+def test_array_maskna_column_stack():
+    # np.column_stack
+    a = np.array((1,2,3), maskna=True)
+    b = np.array((2,3,4), maskna=True)
+    b[2] = np.NA
+    res = np.column_stack((a,b))
+    assert_equal(np.isna(res), [[0,0], [0,0], [0,1]])
+    assert_equal(res[~np.isna(res)], [1,2,2,3,3])
+
+def test_array_maskna_compress():
+    # ndarray.compress
+    a = np.arange(5., maskna=True)
+    a[0] = np.NA
+
+    mask = np.array([0,1,1,1,1], dtype='?')
+    res = a.compress(mask)
+    assert_equal(res, [1,2,3,4])
+
+def test_array_maskna_squeeze():
+    # np.squeeze
+    a = np.zeros((1,3,1,1,4,2,1), maskna=True)
+    a[0,1,0,0,3,0,0] = np.NA
+
+    res = np.squeeze(a)
+    assert_equal(res.shape, (3,4,2))
+    assert_(np.isna(res[1,3,0]))
+
+    res = np.squeeze(a, axis=(0,2,6))
+    assert_equal(res.shape, (3,1,4,2))
+    assert_(np.isna(res[1,0,3,0]))
+
+def test_array_maskna_mean():
+    # np.mean
+
+    # With an NA mask, but no NA
+    a = np.arange(6, maskna=True).reshape(2,3)
+
+    res = np.mean(a)
+    assert_equal(res, 2.5)
+    res = np.mean(a, axis=0)
+    assert_equal(res, [1.5, 2.5, 3.5])
+
+    # With an NA and skipna=False
+    a = np.arange(6, maskna=True).reshape(2,3)
+    a[0,1] = np.NA
+
+    res = np.mean(a)
+    assert_(type(res) is np.NAType)
+
+    res = np.mean(a, axis=0)
+    assert_array_equal(res, [1.5, np.NA, 3.5])
+
+    res = np.mean(a, axis=1)
+    assert_array_equal(res, [np.NA, 4.0])
+
+    # With an NA and skipna=True
+    res = np.mean(a, skipna=True)
+    assert_almost_equal(res, 2.8)
+
+    res = np.mean(a, axis=0, skipna=True)
+    assert_array_equal(res, [1.5, 4.0, 3.5])
+
+    res = np.mean(a, axis=1, skipna=True)
+    assert_array_equal(res, [1.0, 4.0])
+
+def test_array_maskna_var_std():
+    # np.var, np.std
+
+    # With an NA and skipna=False
+    a = np.arange(6, maskna=True).reshape(2,3)
+    a[0,1] = np.NA
+
+    res = np.var(a)
+    assert_(type(res) is np.NAType)
+    res = np.std(a)
+    assert_(type(res) is np.NAType)
+
+    res = np.var(a, axis=0)
+    assert_array_equal(res, [2.25, np.NA, 2.25])
+    res = np.std(a, axis=0)
+    assert_array_equal(res, [1.5, np.NA, 1.5])
+
+    res = np.var(a, axis=1)
+    assert_array_almost_equal(res, [np.NA, 0.66666666666666663])
+    res = np.std(a, axis=1)
+    assert_array_almost_equal(res, [np.NA, 0.81649658092772603])
+
+    # With an NA and skipna=True
+    a = np.arange(6, maskna=True).reshape(2,3)
+    a[0,1] = np.NA
+
+    res = np.var(a, skipna=True)
+    assert_almost_equal(res, 2.96)
+    res = np.std(a, skipna=True)
+    assert_almost_equal(res, 1.7204650534085253)
+
+    res = np.var(a, axis=0, skipna=True)
+    assert_array_equal(res, [2.25, 0, 2.25])
+    res = np.std(a, axis=0, skipna=True)
+    assert_array_equal(res, [1.5, 0, 1.5])
+
+    res = np.var(a, axis=1, skipna=True)
+    assert_array_almost_equal(res, [1.0, 0.66666666666666663])
+    res = np.std(a, axis=1, skipna=True)
+    assert_array_almost_equal(res, [1.0, 0.81649658092772603])
+
+def test_array_maskna_linspace_logspace():
+    # np.linspace, np.logspace
+
+    a = np.linspace(2.0, 3.0, num=5)
+    b = np.linspace(2.0, 3.0, num=5, maskna=True)
+    assert_equal(a, b)
+    assert_(not a.flags.maskna)
+    assert_(b.flags.maskna)
+
+    a = np.logspace(2.0, 3.0, num=4)
+    b = np.logspace(2.0, 3.0, num=4, maskna=True)
+    assert_equal(a, b)
+    assert_(not a.flags.maskna)
+    assert_(b.flags.maskna)
+
+def test_array_maskna_eye_identity():
+    # np.eye, np.identity
+
+    # By default there should be no NA mask
+    a = np.eye(3)
+    assert_(not a.flags.maskna)
+    a = np.identity(3)
+    assert_(not a.flags.maskna)
+
+    a = np.eye(3, maskna=True)
+    assert_(a.flags.maskna)
+    assert_(a.flags.ownmaskna)
+    assert_equal(a, np.eye(3))
+
+    a = np.eye(3, k=2, maskna=True)
+    assert_(a.flags.maskna)
+    assert_(a.flags.ownmaskna)
+    assert_equal(a, np.eye(3, k=2))
+
+    a = np.identity(3, maskna=True)
+    assert_(a.flags.maskna)
+    assert_(a.flags.ownmaskna)
+    assert_equal(a, np.identity(3))
+
+if __name__ == "__main__":
+    run_module_suite()
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index b9dfcddf9..f1a7a2b65 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -328,7 +328,7 @@ class TestCreation(TestCase):
             assert_equal(array(nstr, dtype=type), result, err_msg=msg)
 
     def test_void(self):
-        arr = np.array([np.void(0)], dtype='V')
+        arr = np.array([], dtype='V')
         assert_equal(arr.dtype.kind, 'V')
 
     def test_non_sequence_sequence(self):
diff --git a/numpy/core/tests/test_na.py b/numpy/core/tests/test_na.py
new file mode 100644
index 000000000..2950598f8
--- /dev/null
+++ b/numpy/core/tests/test_na.py
@@ -0,0 +1,190 @@
+import numpy as np
+from numpy.compat import asbytes
+from numpy.testing import *
+import sys, warnings
+
+def test_na_construction():
+    # construct a new NA object
+    v = np.NA()
+    assert_(not v is np.NA)
+    assert_equal(v.payload, None)
+    assert_equal(v.dtype, None)
+
+    # Construct with a payload
+    v = np.NA(3)
+    assert_equal(v.payload, 3)
+    assert_equal(v.dtype, None)
+
+    # Construct with a dtype
+    v = np.NA(dtype='f4')
+    assert_equal(v.payload, None)
+    assert_equal(v.dtype, np.dtype('f4'))
+
+    # Construct with both a payload and a dtype
+    v = np.NA(5, dtype='f4,i2')
+    assert_equal(v.payload, 5)
+    assert_equal(v.dtype, np.dtype('f4,i2'))
+
+    # min and max payload values
+    v = np.NA(0)
+    assert_equal(v.payload, 0)
+    v = np.NA(127)
+    assert_equal(v.payload, 127)
+
+    # Out of bounds payload values
+    assert_raises(ValueError, np.NA, -1)
+    assert_raises(ValueError, np.NA, 128)
+
+def test_na_str():
+    # With no payload or dtype
+    assert_equal(str(np.NA), 'NA')
+    assert_equal(str(np.NA()), 'NA')
+
+    # With a payload
+    assert_equal(str(np.NA(10)), 'NA(10)')
+
+    # With just a dtype
+    assert_equal(str(np.NA(dtype='c16')), 'NA')
+
+    # With a payload and a dtype
+    assert_equal(str(np.NA(10, dtype='f4')), 'NA(10)')
+
+def test_na_repr():
+    # With no payload or dtype
+    assert_equal(repr(np.NA), 'NA')
+    assert_equal(repr(np.NA()), 'NA')
+
+    # With a payload
+    assert_equal(repr(np.NA(10)), 'NA(10)')
+
+    # With just a dtype
+    assert_equal(repr(np.NA(dtype='?')), "NA(dtype='bool')")
+    if sys.byteorder == 'little':
+        assert_equal(repr(np.NA(dtype='<c16')), "NA(dtype='complex128')")
+        assert_equal(repr(np.NA(dtype='>c16')), "NA(dtype='>c16')")
+    else:
+        assert_equal(repr(np.NA(dtype='>c16')), "NA(dtype='complex128')")
+        assert_equal(repr(np.NA(dtype='<c16')), "NA(dtype='<c16')")
+
+    # With a payload and a dtype
+    if sys.byteorder == 'little':
+        assert_equal(repr(np.NA(10, dtype='<f4')), "NA(10, dtype='float32')")
+        assert_equal(repr(np.NA(10, dtype='>f4')), "NA(10, dtype='>f4')")
+    else:
+        assert_equal(repr(np.NA(10, dtype='>f4')), "NA(10, dtype='float32')")
+        assert_equal(repr(np.NA(10, dtype='<f4')), "NA(10, dtype='<f4')")
+
+def test_na_comparison():
+    # NA cannot be converted to a boolean
+    assert_raises(ValueError, bool, np.NA)
+
+    # Comparison results should be np.NA(dtype='bool')
+    def check_comparison_result(res):
+        assert_(np.isna(res))
+        assert_(res.dtype == np.dtype('bool'))
+
+    # Comparison with different objects produces an NA with boolean type
+    check_comparison_result(np.NA < 3)
+    check_comparison_result(np.NA <= 3)
+    check_comparison_result(np.NA == 3)
+    check_comparison_result(np.NA != 3)
+    check_comparison_result(np.NA >= 3)
+    check_comparison_result(np.NA > 3)
+
+    # Should work with NA on the other side too
+    check_comparison_result(3 < np.NA)
+    check_comparison_result(3 <= np.NA)
+    check_comparison_result(3 == np.NA)
+    check_comparison_result(3 != np.NA)
+    check_comparison_result(3 >= np.NA)
+    check_comparison_result(3 > np.NA)
+
+    # Comparison with an array should produce an array
+    a = np.array([0,1,2]) < np.NA
+    assert_equal(np.isna(a), [1,1,1])
+    assert_equal(a.dtype, np.dtype('bool'))
+    a = np.array([0,1,2]) == np.NA
+    assert_equal(np.isna(a), [1,1,1])
+    assert_equal(a.dtype, np.dtype('bool'))
+    a = np.array([0,1,2]) != np.NA
+    assert_equal(np.isna(a), [1,1,1])
+    assert_equal(a.dtype, np.dtype('bool'))
+
+    # Comparison with an array should work on the other side too
+    a = np.NA > np.array([0,1,2])
+    assert_equal(np.isna(a), [1,1,1])
+    assert_equal(a.dtype, np.dtype('bool'))
+    a = np.NA == np.array([0,1,2])
+    assert_equal(np.isna(a), [1,1,1])
+    assert_equal(a.dtype, np.dtype('bool'))
+    a = np.NA != np.array([0,1,2])
+    assert_equal(np.isna(a), [1,1,1])
+    assert_equal(a.dtype, np.dtype('bool'))
+
+def test_na_operations():
+    # The minimum payload is taken; a missing payload (None) wins
+    assert_equal((np.NA + np.NA(3)).payload, None)
+    assert_equal((np.NA(12) + np.NA()).payload, None)
+    assert_equal((np.NA(2) - np.NA(6)).payload, 2)
+    assert_equal((np.NA(5) - np.NA(1)).payload, 1)
+
+    # The dtypes are promoted like np.promote_types
+    assert_equal((np.NA(dtype='f4') * np.NA(dtype='f8')).dtype,
+                 np.dtype('f8'))
+    assert_equal((np.NA(dtype='c8') * np.NA(dtype='f8')).dtype,
+                 np.dtype('c16'))
+    assert_equal((np.NA * np.NA(dtype='i8')).dtype,
+                 np.dtype('i8'))
+    assert_equal((np.NA(dtype='i2') / np.NA).dtype,
+                 np.dtype('i2'))
+
+def test_na_other_operations():
+    # Make sure we get NAs for all these operations
+    assert_equal(type(np.NA + 3), np.NAType)
+    assert_equal(type(3 + np.NA), np.NAType)
+    assert_equal(type(np.NA - 3.0), np.NAType)
+    assert_equal(type(3.0 - np.NA), np.NAType)
+    assert_equal(type(np.NA * 2j), np.NAType)
+    assert_equal(type(2j * np.NA), np.NAType)
+    assert_equal(type(np.NA / 2j), np.NAType)
+    assert_equal(type(2j / np.NA), np.NAType)
+    assert_equal(type(np.NA // 2j), np.NAType)
+    assert_equal(type(np.NA % 6), np.NAType)
+    assert_equal(type(6 % np.NA), np.NAType)
+    assert_equal(type(np.NA ** 2), np.NAType)
+    assert_equal(type(2 ** np.NA), np.NAType)
+    assert_equal(type(np.NA & 2), np.NAType)
+    assert_equal(type(2 & np.NA), np.NAType)
+    assert_equal(type(np.NA | 2), np.NAType)
+    assert_equal(type(2 | np.NA), np.NAType)
+    assert_equal(type(np.NA << 2), np.NAType)
+    assert_equal(type(2 << np.NA), np.NAType)
+    assert_equal(type(np.NA >> 2), np.NAType)
+    assert_equal(type(2 >> np.NA), np.NAType)
+    assert_(abs(np.NA) is np.NA)
+    assert_((-np.NA) is np.NA)
+    assert_((+np.NA) is np.NA)
+    assert_((~np.NA) is np.NA)
+
+    # The NA should get the dtype from the other operand
+    assert_equal((np.NA + 3).dtype, np.array(3).dtype)
+    assert_equal((np.NA - 3.0).dtype, np.array(3.0).dtype)
+    assert_equal((np.NA * 2j).dtype, np.array(2j).dtype)
+
+    # Should have type promotion if the NA already has a dtype
+    assert_equal((np.NA(dtype='f4') ** 3.0).dtype, np.dtype('f8'))
+
+    # Bitwise and/or are specialized: NA & False is False, NA | True is True
+    # NOTE: The keywords 'and' and 'or' coerce to boolean, so we cannot
+    #       properly support them.
+    assert_equal(np.NA & False, False)
+    assert_equal(False & np.NA, False)
+    assert_equal(np.NA | True, True)
+    assert_equal(True | np.NA, True)
+    assert_equal(type(np.NA | False), np.NAType)
+    assert_equal(type(np.NA & True), np.NAType)
+    assert_equal((np.NA | False).dtype, np.array(False).dtype)
+    assert_equal((np.NA & True).dtype, np.array(True).dtype)
+
+if __name__ == "__main__":
+    run_module_suite()
diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py
index cf0a44c63..b0badc2d9 100644
--- a/numpy/core/tests/test_nditer.py
+++ b/numpy/core/tests/test_nditer.py
@@ -2295,6 +2295,23 @@ def test_iter_buffering_reduction():
     it.reset()
     assert_equal(it[0], [1,2,1,2])
 
+def test_iter_buffering_reduction_reuse_reduce_loops():
+    # There was a bug triggering reuse of the reduce loop inappropriately,
+    # which caused processing to happen in unnecessarily small chunks
+    # and overran the buffer.
+
+    a = np.zeros((2,7))
+    b = np.zeros((1,7))
+    it = np.nditer([a,b], flags=['reduce_ok', 'external_loop', 'buffered'],
+                    op_flags=[['readonly'], ['readwrite']],
+                    buffersize = 5)
+
+    bufsizes = []
+    for x, y in it:
+        bufsizes.append(x.shape[0])
+    assert_equal(bufsizes, [5,2,5,2])
+    assert_equal(sum(bufsizes), a.size)
+
 def test_iter_writemasked_badinput():
     a = np.zeros((2,3))
     b = np.zeros((3,))
@@ -2399,5 +2416,143 @@ def test_iter_writemasked():
     # were copied back
     assert_equal(a, [3,3,2.5])
 
+def test_iter_maskna():
+    a_orig = np.zeros((3,), dtype='f8')
+    b_orig = np.zeros((3,), dtype='f4')
+    a = a_orig.view(maskna=True)
+    b = b_orig.view(maskna=True)
+
+    # Default iteration with NA mask
+    a[...] = np.NA
+    it = np.nditer(a)
+    for x in it:
+        assert_equal(np.isna(x), True)
+
+    # readonly USE_MASKNA iteration of an array without an NA mask
+    # creates a virtual mask
+    it = np.nditer([a_orig,b_orig], [], [['readonly','use_maskna']]*2)
+    for x, y in it:
+        assert_(x.flags.maskna)
+        assert_(not x.flags.ownmaskna)
+        assert_(y.flags.maskna)
+        assert_(not y.flags.ownmaskna)
+        assert_equal(np.isna(x), False)
+        assert_equal(np.isna(y), False)
+
+    # buffered readonly USE_MASKNA iteration of an array without an NA mask
+    # creates a virtual mask
+    it = np.nditer([a_orig,b_orig], ['buffered'], [['readonly','use_maskna']]*2,
+                        op_dtypes=['i4','i8'], casting='unsafe')
+    for x, y in it:
+        assert_(x.flags.maskna)
+        assert_(not x.flags.ownmaskna)
+        assert_(y.flags.maskna)
+        assert_(not y.flags.ownmaskna)
+        assert_equal(np.isna(x), False)
+        assert_equal(np.isna(y), False)
+
+    # writeable USE_MASKNA iteration of an array without an NA mask
+    # is disallowed
+    assert_raises(ValueError, np.nditer, a_orig, [],
+                                [['readwrite','use_maskna']])
+    assert_raises(ValueError, np.nditer, a_orig, [],
+                                [['writeonly','use_maskna']])
+    assert_raises(ValueError, np.nditer, a_orig, ['buffered'],
+                                [['readwrite','use_maskna']])
+    assert_raises(ValueError, np.nditer, a_orig, ['buffered'],
+                                [['writeonly','use_maskna']])
+
+    # Assigning NAs and values in an iteration
+    a[...] = [0,1,2]
+    b_orig[...] = [1,2,2]
+    it = np.nditer([a,b_orig], [], [['writeonly','use_maskna'], ['readonly']])
+    for x, y in it:
+        if y == 2:
+            x[...] = np.NA
+        else:
+            x[...] = 5
+    assert_equal(a[0], 5)
+    assert_equal(np.isna(a), [0,1,1])
+
+    # Copying NA values in an iteration
+    b.flags.maskna = True
+    a[...] = [np.NA, np.NA, 1]
+    b[...] = [np.NA, 0, np.NA]
+    it = np.nditer([a,b], [], [['writeonly','use_maskna'],
+                               ['readonly','use_maskna']])
+    for x, y in it:
+        x[...] = y
+    assert_equal(a[1], 0)
+    assert_equal(np.isna(a), [1,0,1])
+
+    # Copying NA values with buffering
+    a_orig[...] = [1.5,2.5,3.5]
+    b_orig[...] = [4.5,5.5,6.5]
+    a[...] = [np.NA, np.NA, 5.5]
+    b[...] = [np.NA, 3.5, np.NA]
+    it = np.nditer([a,b], ['buffered'], [['writeonly','use_maskna'],
+                               ['readonly','use_maskna']],
+                    op_dtypes=['i4','i4'],
+                    casting='unsafe')
+    for x, y in it:
+        x[...] = y
+    # The 3.5 in b gets truncated to 3, because the iterator is processing
+    # elements as int32 values.
+    assert_equal(a[1], 3)
+    assert_equal(np.isna(a), [1,0,1])
+    assert_equal(a_orig, [1.5,3,5.5])
+
+    # Copying NA values with buffering and external_loop
+    a_orig[...] = [1.5,2.5,3.5]
+    b_orig[...] = [4.5,5.5,6.5]
+    a[...] = [np.NA, np.NA, 5.5]
+    b[...] = [np.NA, 3.5, np.NA]
+    it = np.nditer([a,b], ['buffered','external_loop'],
+                              [['writeonly','use_maskna'],
+                               ['readonly','use_maskna']],
+                    op_dtypes=['i4','i4'],
+                    casting='unsafe')
+    for x, y in it:
+        assert_equal(x.size, 3)
+        x[...] = y
+    # The 3.5 in b gets truncated to 3, because the iterator is processing
+    # elements as int32 values.
+    assert_equal(a[1], 3)
+    assert_equal(np.isna(a), [1,0,1])
+    assert_equal(a_orig, [1.5,3,5.5])
+
+    # WRITEMASKED and MASKNA aren't supported together yet
+    mask = np.array([1,1,0], dtype='?')
+    assert_raises(ValueError, np.nditer, [a,b,mask], [],
+                                [['writeonly','use_maskna','writemasked'],
+                                 ['readonly','use_maskna'],
+                                 ['readonly','arraymask']])
+    # When they are supported together, it will probably require buffering
+    assert_raises(ValueError, np.nditer, [a,b,mask], ['buffered'],
+                                [['writeonly','use_maskna','writemasked'],
+                                 ['readonly','use_maskna'],
+                                 ['readonly','arraymask']])
+
+def test_iter_maskna_default_use_maskna():
+    # The Python exposure of nditer adds the USE_MASKNA flag automatically
+    a = np.array([3, 5, np.NA, 2, 1])
+    b = np.array([1, 1.0, 4.5, 2, 0])
+
+    # The output should automatically get an NA mask
+    it = np.nditer([a,b,None])
+    for x,y,z in it:
+        z[...] = x+y
+    assert_(it.operands[2].flags.maskna)
+    assert_array_equal(it.operands[2], a+b)
+
+    # This holds even when we specify the op_flags
+    it = np.nditer([a,b.copy(),None], op_flags=[['readonly'],
+                                    ['readwrite'], ['writeonly', 'allocate']])
+    for x,y,z in it:
+        y[...] = y[...] + 1
+        z[...] = x+y
+    assert_(it.operands[2].flags.maskna)
+    assert_array_equal(it.operands[2], a+b+1)
+
 if __name__ == "__main__":
     run_module_suite()
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 75ecd398a..205bf56c4 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -549,16 +549,20 @@ class TestFromiter(TestCase):
 class TestNonzero(TestCase):
     def test_nonzero_trivial(self):
         assert_equal(np.count_nonzero(array([])), 0)
+        assert_equal(np.count_nonzero(array([], dtype='?')), 0)
         assert_equal(np.nonzero(array([])), ([],))
 
         assert_equal(np.count_nonzero(array(0)), 0)
+        assert_equal(np.count_nonzero(array(0, dtype='?')), 0)
         assert_equal(np.nonzero(array(0)), ([],))
         assert_equal(np.count_nonzero(array(1)), 1)
+        assert_equal(np.count_nonzero(array(1, dtype='?')), 1)
         assert_equal(np.nonzero(array(1)), ([0],))
 
     def test_nonzero_onedim(self):
         x = array([1,0,2,-1,0,0,8])
         assert_equal(np.count_nonzero(x), 4)
+        assert_equal(np.count_nonzero(x), 4)
         assert_equal(np.nonzero(x), ([0, 2, 3, 6],))
 
         x = array([(1,2),(0,0),(1,1),(-1,3),(0,7)],
@@ -590,6 +594,50 @@ class TestNonzero(TestCase):
         assert_equal(np.nonzero(x['a'].T), ([0,1,1,2],[1,1,2,0]))
         assert_equal(np.nonzero(x['b'].T), ([0,0,1,2,2],[0,1,2,0,2]))
 
+    def test_count_nonzero_axis(self):
+        a = array([[0,1,0],[2,3,0]])
+        assert_equal(np.count_nonzero(a, axis=()), [[0,1,0],[1,1,0]])
+        assert_equal(np.count_nonzero(a, axis=0), [1,2,0])
+        assert_equal(np.count_nonzero(a, axis=1), [1,2])
+        assert_equal(np.count_nonzero(a, axis=(0,1)), 3)
+
+        res = array([-1,-1,-1], dtype='i2')
+        np.count_nonzero(a, axis=0, out=res)
+        assert_equal(res, [1,2,0])
+
+        # A 3-dimensional array with an NA
+        a = array([[[0,1,0],[2,np.NA,0]], [[0,1,0],[2,3,0]]], maskna=True)
+
+        # Test that the NA reduces correctly
+        assert_array_equal(np.count_nonzero(a, axis=()),
+                            [[[0,1,0],[1,np.NA,0]], [[0,1,0],[1,1,0]]])
+        assert_array_equal(np.count_nonzero(a, axis=0), [[0,2,0], [2,np.NA,0]])
+        assert_array_equal(np.count_nonzero(a, axis=1), [[1,np.NA,0], [1,2,0]])
+        assert_array_equal(np.count_nonzero(a, axis=2), [[1,np.NA], [1,2]])
+        assert_array_equal(np.count_nonzero(a, axis=(0,1)), [2,np.NA,0])
+        assert_array_equal(np.count_nonzero(a, axis=(0,2)), [2,np.NA])
+        assert_array_equal(np.count_nonzero(a, axis=(1,2)), [np.NA,3])
+        assert_array_equal(np.count_nonzero(a, axis=(0,1,2)),
+                                                np.NA(dtype=np.intp))
+        assert_array_equal(np.count_nonzero(a, axis=None),
+                                                np.NA(dtype=np.intp))
+
+        # Test that the NA gets skipped correctly
+        assert_array_equal(np.count_nonzero(a, axis=(), skipna=True),
+                            [[[0,1,0],[1,0,0]], [[0,1,0],[1,1,0]]])
+        assert_array_equal(np.count_nonzero(a, axis=0, skipna=True),
+                            [[0,2,0], [2,1,0]])
+        assert_array_equal(np.count_nonzero(a, axis=1, skipna=True),
+                            [[1,1,0], [1,2,0]])
+        assert_array_equal(np.count_nonzero(a, axis=2, skipna=True),
+                            [[1,1], [1,2]])
+        assert_array_equal(np.count_nonzero(a, axis=(0,1), skipna=True),
+                            [2,3,0])
+        assert_array_equal(np.count_nonzero(a, axis=(0,2), skipna=True), [2,3])
+        assert_array_equal(np.count_nonzero(a, axis=(1,2), skipna=True), [2,3])
+        assert_array_equal(np.count_nonzero(a, axis=(0,1,2), skipna=True), 5)
+        assert_array_equal(np.count_nonzero(a, axis=None, skipna=True), 5)
+
 class TestIndex(TestCase):
     def test_boolean(self):
         a = rand(3,5,8)
@@ -599,6 +647,13 @@ class TestIndex(TestCase):
         V[g1,g2] = -V[g1,g2]
         assert_((array([a[0][V>0],a[1][V>0],a[2][V>0]]) == a[:,V>0]).all())
 
+    def test_boolean_edgecase(self):
+        a = np.array([], dtype='int32')
+        b = np.array([], dtype='bool')
+        c = a[b]
+        assert_equal(c, [])
+        assert_equal(c.dtype, np.dtype('int32'))
+
 
 class TestBinaryRepr(TestCase):
     def test_zero(self):
@@ -1269,7 +1324,7 @@ class TestLikeFuncs(TestCase):
             if not value is None:
                 assert_(all(dz == value))
 
-        # Test the 'subok' parameter'
+        # Test the 'subok' parameter
         a = np.matrix([[1,2],[3,4]])
 
         b = like_function(a)
@@ -1278,6 +1333,20 @@ class TestLikeFuncs(TestCase):
         b = like_function(a, subok=False)
         assert_(not (type(b) is np.matrix))
 
+        # Test that 'maskna=True' works
+        a = np.arange(6).reshape(2,3)
+        res = like_function(a, maskna=True)
+        assert_(res.flags.maskna)
+        assert_(res.flags.ownmaskna)
+        assert_equal(res.shape, a.shape)
+        assert_equal(res.dtype, a.dtype)
+
+        # Test that no NA mask is created when the prototype is NA-masked
+        a = np.arange(6, maskna=True).reshape(2,3)
+        assert_(a.flags.maskna)
+        res = like_function(a)
+        assert_(not res.flags.maskna)
+
     def test_ones_like(self):
         self.check_like_function(np.ones_like, 1)
 
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index 1d5bf7303..03c5f026b 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -638,8 +638,9 @@ class TestRegression(TestCase):
     def test_bool_indexing_invalid_nr_elements(self, level=rlevel):
         s = np.ones(10,dtype=float)
         x = np.array((15,),dtype=float)
-        def ia(x,s): x[(s>0)]=1.0
-        self.assertRaises(ValueError,ia,x,s)
+        def ia(x,s,v): x[(s>0)]=v
+        self.assertRaises(ValueError,ia,x,s,np.zeros(9,dtype=float))
+        self.assertRaises(ValueError,ia,x,s,np.zeros(11,dtype=float))
 
     def test_mem_scalar_indexing(self, level=rlevel):
         """Ticket #603"""
@@ -1625,5 +1626,11 @@ class TestRegression(TestCase):
             a = np.empty((100000000,), dtype='i1')
             del a
 
+    def test_ufunc_reduce_memoryleak(self):
+        a = np.arange(6)
+        acnt = sys.getrefcount(a)
+        res = np.add.reduce(a)
+        assert_equal(sys.getrefcount(a), acnt)
+
 if __name__ == "__main__":
     run_module_suite()
diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py
index f282dbe25..17a9cbbde 100644
--- a/numpy/core/tests/test_shape_base.py
+++ b/numpy/core/tests/test_shape_base.py
@@ -141,3 +141,5 @@ class TestVstack(TestCase):
         desired = array([[1,2],[1,2]])
         assert_array_equal(res,desired)
 
+if __name__ == "__main__":
+    run_module_suite()
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index 773ce9a3b..725047d75 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -120,9 +120,9 @@ class TestUfunc(TestCase):
 
         # class to use in testing object method loops
         class foo(object):
-            def logical_not(self) :
+            def conjugate(self) :
                 return np.bool_(1)
-            def logical_and(self, obj) :
+            def logical_xor(self, obj) :
                 return np.bool_(1)
 
         # check unary PyUFunc_O_O
@@ -134,7 +134,7 @@ class TestUfunc(TestCase):
         x = np.zeros(10, dtype=np.object)[0::2]
         for i in range(len(x)) :
             x[i] = foo()
-        assert_(np.all(np.logical_not(x) == True), msg)
+        assert_(np.all(np.conjugate(x) == True), msg)
 
         # check binary PyUFunc_OO_O
         msg = "PyUFunc_OO_O"
@@ -145,7 +145,7 @@ class TestUfunc(TestCase):
         x = np.zeros(10, dtype=np.object)[0::2]
         for i in range(len(x)) :
             x[i] = foo()
-        assert_(np.all(np.logical_and(x,x) == 1), msg)
+        assert_(np.all(np.logical_xor(x,x)), msg)
 
         # check PyUFunc_On_Om
         # fixme -- I don't know how to do this yet
@@ -471,6 +471,67 @@ class TestUfunc(TestCase):
 
         assert_equal(ref, True, err_msg="reference check")
 
+    def test_object_logical(self):
+        a = np.array([3, None, True, False, "test", ""], dtype=object)
+        assert_equal(np.logical_or(a, None),
+                        np.array([x or None for x in a], dtype=object))
+        assert_equal(np.logical_or(a, True),
+                        np.array([x or True for x in a], dtype=object))
+        assert_equal(np.logical_or(a, 12),
+                        np.array([x or 12 for x in a], dtype=object))
+        assert_equal(np.logical_or(a, "blah"),
+                        np.array([x or "blah" for x in a], dtype=object))
+
+        assert_equal(np.logical_and(a, None),
+                        np.array([x and None for x in a], dtype=object))
+        assert_equal(np.logical_and(a, True),
+                        np.array([x and True for x in a], dtype=object))
+        assert_equal(np.logical_and(a, 12),
+                        np.array([x and 12 for x in a], dtype=object))
+        assert_equal(np.logical_and(a, "blah"),
+                        np.array([x and "blah" for x in a], dtype=object))
+
+        assert_equal(np.logical_not(a),
+                        np.array([not x for x in a], dtype=object))
+
+        assert_equal(np.logical_or.reduce(a), 3)
+        assert_equal(np.logical_and.reduce(a), None)
+
+    def test_zerosize_reduction(self):
+        assert_equal(np.sum([]), 0)
+        assert_equal(np.prod([]), 1)
+        assert_equal(np.any([]), False)
+        assert_equal(np.all([]), True)
+        assert_raises(ValueError, np.max, [])
+        assert_raises(ValueError, np.min, [])
+
+    def test_axis_out_of_bounds(self):
+        a = np.array([False, False])
+        assert_raises(ValueError, a.all, axis=1)
+        a = np.array([False, False])
+        assert_raises(ValueError, a.all, axis=-2)
+
+        a = np.array([False, False])
+        assert_raises(ValueError, a.any, axis=1)
+        a = np.array([False, False])
+        assert_raises(ValueError, a.any, axis=-2)
+
+    def test_scalar_reduction(self):
+        # The functions 'sum', 'prod', etc allow specifying axis=0
+        # even for scalars
+        assert_equal(np.sum(3, axis=0), 3)
+        assert_equal(np.prod(3.5, axis=0), 3.5)
+        assert_equal(np.any(True, axis=0), True)
+        assert_equal(np.all(False, axis=0), False)
+        assert_equal(np.max(3, axis=0), 3)
+        assert_equal(np.min(2.5, axis=0), 2.5)
+
+        # Make sure that scalars are coming out from this operation
+        assert_(type(np.prod(np.float32(2.5), axis=0)) is np.float32)
+        assert_(type(np.sum(np.float32(2.5), axis=0)) is np.float32)
+        assert_(type(np.max(np.float32(2.5), axis=0)) is np.float32)
+        assert_(type(np.min(np.float32(2.5), axis=0)) is np.float32)
+
     def test_casting_out_param(self):
         # Test that it's possible to do casts on output
         a = np.ones((200,100), np.int64)
@@ -510,5 +571,89 @@ class TestUfunc(TestCase):
         np.add(a, b, out=c, where=[1,0,0,1,0,0,1,1,1,0])
         assert_equal(c, [2,1.5,1.5,2,1.5,1.5,2,2,2,1.5])
 
+    def check_identityless_reduction(self, a):
+        # np.minimum.reduce is an identityless reduction
+
+        # Verify that it sees the zero at various positions
+        a[...] = 1
+        a[1,0,0] = 0
+        assert_equal(np.minimum.reduce(a, axis=None), 0)
+        assert_equal(np.minimum.reduce(a, axis=(0,1)), [0,1,1,1])
+        assert_equal(np.minimum.reduce(a, axis=(0,2)), [0,1,1])
+        assert_equal(np.minimum.reduce(a, axis=(1,2)), [1,0])
+        assert_equal(np.minimum.reduce(a, axis=0),
+                                    [[0,1,1,1], [1,1,1,1], [1,1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=1),
+                                    [[1,1,1,1], [0,1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=2),
+                                    [[1,1,1], [0,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=()), a)
+
+        a[...] = 1
+        a[0,1,0] = 0
+        assert_equal(np.minimum.reduce(a, axis=None), 0)
+        assert_equal(np.minimum.reduce(a, axis=(0,1)), [0,1,1,1])
+        assert_equal(np.minimum.reduce(a, axis=(0,2)), [1,0,1])
+        assert_equal(np.minimum.reduce(a, axis=(1,2)), [0,1])
+        assert_equal(np.minimum.reduce(a, axis=0),
+                                    [[1,1,1,1], [0,1,1,1], [1,1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=1),
+                                    [[0,1,1,1], [1,1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=2),
+                                    [[1,0,1], [1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=()), a)
+
+        a[...] = 1
+        a[0,0,1] = 0
+        assert_equal(np.minimum.reduce(a, axis=None), 0)
+        assert_equal(np.minimum.reduce(a, axis=(0,1)), [1,0,1,1])
+        assert_equal(np.minimum.reduce(a, axis=(0,2)), [0,1,1])
+        assert_equal(np.minimum.reduce(a, axis=(1,2)), [0,1])
+        assert_equal(np.minimum.reduce(a, axis=0),
+                                    [[1,0,1,1], [1,1,1,1], [1,1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=1),
+                                    [[1,0,1,1], [1,1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=2),
+                                    [[0,1,1], [1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=()), a)
+
+    def test_identityless_reduction_corder(self):
+        a = np.empty((2,3,4), order='C')
+        self.check_identityless_reduction(a)
+
+    def test_identityless_reduction_forder(self):
+        a = np.empty((2,3,4), order='F')
+        self.check_identityless_reduction(a)
+
+    def test_identityless_reduction_otherorder(self):
+        a = np.empty((2,4,3), order='C').swapaxes(1,2)
+        self.check_identityless_reduction(a)
+
+    def test_identityless_reduction_noncontig(self):
+        a = np.empty((3,5,4), order='C').swapaxes(1,2)
+        a = a[1:, 1:, 1:]
+        self.check_identityless_reduction(a)
+
+    def test_identityless_reduction_noncontig_unaligned(self):
+        a = np.empty((3*4*5*8 + 1,), dtype='i1')
+        a = a[1:].view(dtype='f8')
+        a.shape = (3,4,5)
+        a = a[1:, 1:, 1:]
+        self.check_identityless_reduction(a)
+
+    def test_identityless_reduction_nonreorderable(self):
+        a = np.array([[8.0, 2.0, 2.0], [1.0, 0.5, 0.25]])
+
+        res = np.divide.reduce(a, axis=0)
+        assert_equal(res, [8.0, 4.0, 8.0])
+
+        res = np.divide.reduce(a, axis=1)
+        assert_equal(res, [2.0, 8.0])
+
+        res = np.divide.reduce(a, axis=())
+        assert_equal(res, a)
+
+        assert_raises(ValueError, np.divide.reduce, a, axis=(0,1))
+
 if __name__ == "__main__":
     run_module_suite()
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
index 98597bc46..c4441d8a5 100644
--- a/numpy/lib/arraysetops.py
+++ b/numpy/lib/arraysetops.py
@@ -229,7 +229,7 @@ def intersect1d(ar1, ar2, assume_unique=False):
         ar2 = unique(ar2)
     aux = np.concatenate( (ar1, ar2) )
     aux.sort()
-    return aux[aux[1:] == aux[:-1]]
+    return aux[:-1][aux[1:] == aux[:-1]]
 
 def setxor1d(ar1, ar2, assume_unique=False):
     """
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index caef5c709..b269d98a1 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -778,7 +778,7 @@ def select(condlist, choicelist, default=0):
             S = S*ones(asarray(pfac).shape, S.dtype)
     return choose(S, tuple(choicelist))
 
-def copy(a):
+def copy(a, order='C', maskna=None):
     """
     Return an array copy of the given object.
 
@@ -786,6 +786,15 @@ def copy(a):
     ----------
     a : array_like
         Input data.
+    order : {'C', 'F', 'A', 'K'}, optional
+        Controls the memory layout of the copy. 'C' means C-order,
+        'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous,
+        'C' otherwise. 'K' means match the layout of `a` as closely
+        as possible.
+    maskna : bool, optional
+        If specified, forces the copy to have or to not have an
+        NA mask. This is a way to remove an NA mask from an array
+        while making a copy.
 
     Returns
     -------
@@ -815,7 +824,7 @@ def copy(a):
     False
 
     """
-    return array(a, copy=True)
+    return array(a, order=order, copy=True, maskna=maskna)
 
 # Basic operations
 
@@ -3317,6 +3326,7 @@ def delete(arr, obj, axis=None):
                     "invalid entry")
         newshape[axis]-=1;
         new = empty(newshape, arr.dtype, arr.flags.fnc)
+        new.flags.maskna = arr.flags.maskna
         slobj[axis] = slice(None, obj)
         new[slobj] = arr[slobj]
         slobj[axis] = slice(obj,None)
@@ -3333,6 +3343,7 @@ def delete(arr, obj, axis=None):
                 return arr.copy()
         newshape[axis] -= numtodel
         new = empty(newshape, arr.dtype, arr.flags.fnc)
+        new.flags.maskna = arr.flags.maskna
         # copy initial chunk
         if start == 0:
             pass
@@ -3464,6 +3475,7 @@ def insert(arr, obj, values, axis=None):
                     "in dimension %d" % (obj, N, axis))
         newshape[axis] += 1;
         new = empty(newshape, arr.dtype, arr.flags.fnc)
+        new.flags.maskna = arr.flags.maskna
         slobj[axis] = slice(None, obj)
         new[slobj] = arr[slobj]
         slobj[axis] = obj
@@ -3490,6 +3502,7 @@ def insert(arr, obj, values, axis=None):
     index2 = setdiff1d(arange(numnew+N),index1)
     newshape[axis] += numnew
     new = empty(newshape, arr.dtype, arr.flags.fnc)
+    new.flags.maskna = arr.flags.maskna
     slobj2 = [slice(None)]*ndim
     slobj[axis] = index1
     slobj2[axis] = index2
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 105389d6d..0b6b1e19d 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -452,7 +452,7 @@ class TestTrapz(TestCase):
     def test_simple(self):
         r = trapz(exp(-1.0 / 2 * (arange(-10, 10, .1)) ** 2) / sqrt(2 * pi), dx=0.1)
         #check integral of normal equals 1
-        assert_almost_equal(sum(r, axis=0), 1, 7)
+        assert_almost_equal(r, 1, 7)
 
     def test_ndim(self):
         x = linspace(0, 1, 3)
diff --git a/numpy/lib/tests/test_index_tricks.py b/numpy/lib/tests/test_index_tricks.py
index e4c0bde93..2c6500a57 100644
--- a/numpy/lib/tests/test_index_tricks.py
+++ b/numpy/lib/tests/test_index_tricks.py
@@ -147,6 +147,10 @@ class TestIndexExpression(TestCase):
         assert_equal(a[:,:3,[1,2]], a[index_exp[:,:3,[1,2]]])
         assert_equal(a[:,:3,[1,2]], a[s_[:,:3,[1,2]]])
 
+def test_c_():
+    a = np.c_[np.array([[1,2,3]]), 0, 0, np.array([[4,5,6]])]
+    assert_equal(a, [[1, 2, 3, 0, 0, 4, 5, 6]])
+
 def test_fill_diagonal():
     a = zeros((3, 3),int)
     fill_diagonal(a, 5)
diff --git a/numpy/lib/tests/test_recfunctions.py b/numpy/lib/tests/test_recfunctions.py
index 0a3f1e3e3..3d2d1e983 100644
--- a/numpy/lib/tests/test_recfunctions.py
+++ b/numpy/lib/tests/test_recfunctions.py
@@ -626,7 +626,7 @@ class TestJoinBy2(TestCase):
                         dtype=[('a', int), ('b', int), ('d', int)])
 
     def test_no_r1postfix(self):
-        "Basic test of join_by"
+        "Basic test of join_by no_r1postfix"
         a, b = self.a, self.b
 
         test = join_by('a', a, b, r1postfix='', r2postfix='2', jointype='inner')
@@ -644,7 +644,7 @@ class TestJoinBy2(TestCase):
         self.assertRaises(ValueError, join_by, 'a', self.a, self.b, r1postfix='', r2postfix='')
 
     def test_no_r2postfix(self):
-        "Basic test of join_by"
+        "Basic test of join_by no_r2postfix"
         a, b = self.a, self.b
 
         test = join_by('a', a, b, r1postfix='1', r2postfix='', jointype='inner')
diff --git a/numpy/lib/tests/test_shape_base.py b/numpy/lib/tests/test_shape_base.py
index 9d6cd0551..56178e8af 100644
--- a/numpy/lib/tests/test_shape_base.py
+++ b/numpy/lib/tests/test_shape_base.py
@@ -258,6 +258,13 @@ class TestSqueeze(TestCase):
         assert_array_equal(squeeze(b),reshape(b,(20,10,20)))
         assert_array_equal(squeeze(c),reshape(c,(20,10)))
 
+        # Squeezing to 0-dim should still give an ndarray
+        a = [[[1.5]]]
+        res = squeeze(a)
+        assert_equal(res, 1.5)
+        assert_equal(res.ndim, 0)
+        assert_equal(type(res), ndarray)
+
 
 class TestKron(TestCase):
     def test_return_type(self):
diff --git a/numpy/lib/twodim_base.py b/numpy/lib/twodim_base.py
index d95a59e3f..12bba99a6 100644
--- a/numpy/lib/twodim_base.py
+++ b/numpy/lib/twodim_base.py
@@ -9,7 +9,7 @@ __all__ = ['diag','diagflat','eye','fliplr','flipud','rot90','tri','triu',
 
 from numpy.core.numeric import asanyarray, equal, subtract, arange, \
      zeros, greater_equal, multiply, ones, asarray, alltrue, where, \
-     empty
+     empty, diagonal
 
 def fliplr(m):
     """
@@ -166,7 +166,7 @@ def rot90(m, k=1):
         # k == 3
         return fliplr(m.swapaxes(0,1))
 
-def eye(N, M=None, k=0, dtype=float):
+def eye(N, M=None, k=0, dtype=float, maskna=False):
     """
     Return a 2-D array with ones on the diagonal and zeros elsewhere.
 
@@ -182,6 +182,8 @@ def eye(N, M=None, k=0, dtype=float):
       to a lower diagonal.
     dtype : data-type, optional
       Data-type of the returned array.
+    maskna : bool, optional
+      If True, the returned array will have an NA mask. Default is False.
 
     Returns
     -------
@@ -207,24 +209,20 @@ def eye(N, M=None, k=0, dtype=float):
     """
     if M is None:
         M = N
-    m = zeros((N, M), dtype=dtype)
-    if k >= M:
-        return m
-    if k >= 0:
-        i = k
-    else:
-        i = (-k) * M
-    m[:M-k].flat[i::M+1] = 1
+    m = zeros((N, M), dtype=dtype, maskna=maskna)
+    diagonal(m, k)[...] = 1
     return m
 
 def diag(v, k=0):
     """
     Extract a diagonal or construct a diagonal array.
 
+    As of NumPy 1.7, extracting a diagonal always returns a view into `v`.
+
     Parameters
     ----------
     v : array_like
-        If `v` is a 2-D array, return a copy of its `k`-th diagonal.
+        If `v` is a 2-D array, return a view of its `k`-th diagonal.
         If `v` is a 1-D array, return a 2-D array with `v` on the `k`-th
         diagonal.
     k : int, optional
@@ -278,16 +276,7 @@ def diag(v, k=0):
         res[:n-k].flat[i::n+1] = v
         return res
     elif len(s) == 2:
-        if k >= s[1]:
-            return empty(0, dtype=v.dtype)
-        if v.flags.f_contiguous:
-            # faster slicing
-            v, k, s = v.T, -k, s[::-1]
-        if k >= 0:
-            i = k
-        else:
-            i = (-k) * s[1]
-        return v[:s[1]-k].flat[i::s[1]+1]
+        return v.diagonal(k)
     else:
         raise ValueError("Input must be 1- or 2-d.")
 
diff --git a/numpy/ma/core.py b/numpy/ma/core.py
index e2e954a97..fdd5d2705 100644
--- a/numpy/ma/core.py
+++ b/numpy/ma/core.py
@@ -3040,7 +3040,7 @@ class MaskedArray(ndarray):
             mindx = mask_or(_mask[indx], mval, copy=True)
             dindx = self._data[indx]
             if dindx.size > 1:
-                dindx[~mindx] = dval
+                np.copyto(dindx, dval, where=~mindx)
             elif mindx is nomask:
                 dindx = dval
             ndarray.__setitem__(_data, indx, dindx)
diff --git a/numpy/ma/extras.py b/numpy/ma/extras.py
index 27abcb2c1..bbc17d165 100644
--- a/numpy/ma/extras.py
+++ b/numpy/ma/extras.py
@@ -1090,7 +1090,7 @@ def intersect1d(ar1, ar2, assume_unique=False):
         # Might be faster than unique( intersect1d( ar1, ar2 ) )?
         aux = ma.concatenate((unique(ar1), unique(ar2)))
     aux.sort()
-    return aux[aux[1:] == aux[:-1]]
+    return aux[:-1][aux[1:] == aux[:-1]]
 
 
 def setxor1d(ar1, ar2, assume_unique=False):
diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py
index d82f3bd81..95219beb1 100644
--- a/numpy/ma/tests/test_core.py
+++ b/numpy/ma/tests/test_core.py
@@ -1102,13 +1102,13 @@ class TestMaskedArrayArithmetic(TestCase):
             output.fill(-9999)
             result = npfunc(xm, axis=0, out=output)
             # ... the result should be the given output
-            self.assertTrue(result is output)
+            assert_(result is output)
             assert_equal(result, xmmeth(axis=0, out=output))
             #
             output = empty(4, dtype=int)
             result = xmmeth(axis=0, out=output)
-            self.assertTrue(result is output)
-            self.assertTrue(output[0] is masked)
+            assert_(result is output)
+            assert_(output[0] is masked)
 
 
     def test_eq_on_structured(self):
@@ -2034,7 +2034,7 @@ class TestMaskedArrayMethods(TestCase):
 
     def test_allany_oddities(self):
         "Some fun with all and any"
-        store = empty(1, dtype=bool)
+        store = empty((), dtype=bool)
         full = array([1, 2, 3], mask=True)
         #
         self.assertTrue(full.all() is masked)
@@ -2043,7 +2043,7 @@ class TestMaskedArrayMethods(TestCase):
         self.assertTrue(store._mask, True)
         self.assertTrue(store is not masked)
         #
-        store = empty(1, dtype=bool)
+        store = empty((), dtype=bool)
         self.assertTrue(full.any() is masked)
         full.any(out=store)
         self.assertTrue(not store)
diff --git a/numpy/matrixlib/defmatrix.py b/numpy/matrixlib/defmatrix.py
index aac123e59..dbf2909fd 100644
--- a/numpy/matrixlib/defmatrix.py
+++ b/numpy/matrixlib/defmatrix.py
@@ -375,6 +375,15 @@ class matrix(N.ndarray):
         else:
             raise ValueError("unsupported axis")
 
+    def _collapse(self, axis):
+        """A convenience function for operations that want to collapse
+        to a scalar like _align, but are using keepdims=True
+        """
+        if axis is None:
+            return self[0,0]
+        else:
+            return self
+
     # Necessary because base-class tolist expects dimension
     #  reduction by x[0]
     def tolist(self):
@@ -432,7 +441,7 @@ class matrix(N.ndarray):
                 [ 7.]])
 
         """
-        return N.ndarray.sum(self, axis, dtype, out)._align(axis)
+        return N.ndarray.sum(self, axis, dtype, out, keepdims=True)._collapse(axis)
 
     def mean(self, axis=None, dtype=None, out=None):
         """
@@ -466,7 +475,7 @@ class matrix(N.ndarray):
                 [ 9.5]])
 
         """
-        return N.ndarray.mean(self, axis, dtype, out)._align(axis)
+        return N.ndarray.mean(self, axis, dtype, out, keepdims=True)._collapse(axis)
 
     def std(self, axis=None, dtype=None, out=None, ddof=0):
         """
@@ -500,7 +509,7 @@ class matrix(N.ndarray):
                 [ 1.11803399]])
 
         """
-        return N.ndarray.std(self, axis, dtype, out, ddof)._align(axis)
+        return N.ndarray.std(self, axis, dtype, out, ddof, keepdims=True)._collapse(axis)
 
     def var(self, axis=None, dtype=None, out=None, ddof=0):
         """
@@ -534,7 +543,7 @@ class matrix(N.ndarray):
                 [ 1.25]])
 
         """
-        return N.ndarray.var(self, axis, dtype, out, ddof)._align(axis)
+        return N.ndarray.var(self, axis, dtype, out, ddof, keepdims=True)._collapse(axis)
 
     def prod(self, axis=None, dtype=None, out=None):
         """
@@ -567,7 +576,7 @@ class matrix(N.ndarray):
                 [7920]])
 
         """
-        return N.ndarray.prod(self, axis, dtype, out)._align(axis)
+        return N.ndarray.prod(self, axis, dtype, out, keepdims=True)._collapse(axis)
 
     def any(self, axis=None, out=None):
         """
@@ -590,7 +599,7 @@ class matrix(N.ndarray):
                 returns `ndarray`
 
         """
-        return N.ndarray.any(self, axis, out)._align(axis)
+        return N.ndarray.any(self, axis, out, keepdims=True)._collapse(axis)
 
     def all(self, axis=None, out=None):
         """
@@ -630,7 +639,7 @@ class matrix(N.ndarray):
                 [False]], dtype=bool)
 
         """
-        return N.ndarray.all(self, axis, out)._align(axis)
+        return N.ndarray.all(self, axis, out, keepdims=True)._collapse(axis)
 
     def max(self, axis=None, out=None):
         """
@@ -665,7 +674,7 @@ class matrix(N.ndarray):
                 [11]])
 
         """
-        return N.ndarray.max(self, axis, out)._align(axis)
+        return N.ndarray.max(self, axis, out, keepdims=True)._collapse(axis)
 
     def argmax(self, axis=None, out=None):
         """
@@ -735,7 +744,7 @@ class matrix(N.ndarray):
                 [-11]])
 
         """
-        return N.ndarray.min(self, axis, out)._align(axis)
+        return N.ndarray.min(self, axis, out, keepdims=True)._collapse(axis)
 
     def argmin(self, axis=None, out=None):
         """
diff --git a/numpy/matrixlib/tests/test_defmatrix.py b/numpy/matrixlib/tests/test_defmatrix.py
index 09a4b4892..0a181fca3 100644
--- a/numpy/matrixlib/tests/test_defmatrix.py
+++ b/numpy/matrixlib/tests/test_defmatrix.py
@@ -65,29 +65,45 @@ class TestProperties(TestCase):
         sumall = 30
         assert_array_equal(sum0, M.sum(axis=0))
         assert_array_equal(sum1, M.sum(axis=1))
-        assert_(sumall == M.sum())
+        assert_equal(sumall, M.sum())
+
+        assert_array_equal(sum0, np.sum(M, axis=0))
+        assert_array_equal(sum1, np.sum(M, axis=1))
+        assert_equal(sumall, np.sum(M))
 
 
     def test_prod(self):
         x = matrix([[1,2,3],[4,5,6]])
-        assert_(x.prod() == 720)
-        assert_(all(x.prod(0) == matrix([[4,10,18]])))
-        assert_(all(x.prod(1) == matrix([[6],[120]])))
+        assert_equal(x.prod(), 720)
+        assert_equal(x.prod(0), matrix([[4,10,18]]))
+        assert_equal(x.prod(1), matrix([[6],[120]]))
+
+        assert_equal(np.prod(x), 720)
+        assert_equal(np.prod(x, axis=0), matrix([[4,10,18]]))
+        assert_equal(np.prod(x, axis=1), matrix([[6],[120]]))
 
         y = matrix([0,1,3])
         assert_(y.prod() == 0)
 
     def test_max(self):
         x = matrix([[1,2,3],[4,5,6]])
-        assert_(x.max() == 6)
-        assert_(all(x.max(0) == matrix([[4,5,6]])))
-        assert_(all(x.max(1) == matrix([[3],[6]])))
+        assert_equal(x.max(), 6)
+        assert_equal(x.max(0), matrix([[4,5,6]]))
+        assert_equal(x.max(1), matrix([[3],[6]]))
+
+        assert_equal(np.max(x), 6)
+        assert_equal(np.max(x, axis=0), matrix([[4,5,6]]))
+        assert_equal(np.max(x, axis=1), matrix([[3],[6]]))
 
     def test_min(self):
         x = matrix([[1,2,3],[4,5,6]])
-        assert_(x.min() == 1)
-        assert_(all(x.min(0) == matrix([[1,2,3]])))
-        assert_(all(x.min(1) == matrix([[1],[4]])))
+        assert_equal(x.min(), 1)
+        assert_equal(x.min(0), matrix([[1,2,3]]))
+        assert_equal(x.min(1), matrix([[1],[4]]))
+
+        assert_equal(np.min(x), 1)
+        assert_equal(np.min(x, axis=0), matrix([[1,2,3]]))
+        assert_equal(np.min(x, axis=1), matrix([[1],[4]]))
 
     def test_ptp(self):
         x = np.arange(4).reshape((2,2))
diff --git a/numpy/testing/utils.py b/numpy/testing/utils.py
index eae343304..a0e395c45 100644
--- a/numpy/testing/utils.py
+++ b/numpy/testing/utils.py
@@ -567,7 +567,7 @@ def assert_approx_equal(actual,desired,significant=7,err_msg='',verbose=True):
 
 def assert_array_compare(comparison, x, y, err_msg='', verbose=True,
                          header=''):
-    from numpy.core import array, isnan, isinf, any
+    from numpy.core import array, isnan, isinf, isna, any, all, inf
     x = array(x, copy=False, subok=True)
     y = array(y, copy=False, subok=True)
 
@@ -598,22 +598,64 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True,
             if not cond :
                 raise AssertionError(msg)
 
-        if (isnumber(x) and isnumber(y)) and (any(isnan(x)) or any(isnan(y))):
-            x_id = isnan(x)
-            y_id = isnan(y)
-            chk_same_position(x_id, y_id, hasval='nan')
-            # If only one item, it was a nan, so just return
-            if x.size == y.size == 1:
+        if isnumber(x) and isnumber(y):
+            x_isna, y_isna = isna(x), isna(y)
+            x_isnan, y_isnan = isnan(x), isnan(y)
+            x_isinf, y_isinf = isinf(x), isinf(y)
+
+            # Remove any NAs from the isnan and isinf arrays
+            if x.ndim == 0:
+                if x_isna:
+                    x_isnan = False
+                    x_isinf = False
+            else:
+                x_isnan[x_isna] = False
+                x_isinf[x_isna] = False
+            if y.ndim == 0:
+                if y_isna:
+                    y_isnan = False
+                    y_isinf = False
+            else:
+                y_isnan[y_isna] = False
+                y_isinf[y_isna] = False
+
+
+            # Validate that the special values are in the same place
+            if any(x_isnan) or any(y_isnan):
+                chk_same_position(x_isnan, y_isnan, hasval='nan')
+            if any(x_isinf) or any(y_isinf):
+                # Check +inf and -inf separately, since they are different
+                chk_same_position(x == +inf, y == +inf, hasval='+inf')
+                chk_same_position(x == -inf, y == -inf, hasval='-inf')
+            if any(x_isna) or any(y_isna):
+                chk_same_position(x_isna, y_isna, hasval='NA')
+
+            # Combine all the special values
+            x_id, y_id = x_isnan, y_isnan
+            x_id |= x_isinf
+            y_id |= y_isinf
+            x_id |= x_isna
+            y_id |= y_isna
+
+            # Only do the comparison if actual values are left
+            if all(x_id):
                 return
-            val = comparison(x[~x_id], y[~y_id])
-        elif (isnumber(x) and isnumber(y)) and (any(isinf(x)) or any(isinf(y))):
-            x_id = isinf(x)
-            y_id = isinf(y)
-            chk_same_position(x_id, y_id, hasval='inf')
-            # If only one item, it was a inf, so just return
-            if x.size == y.size == 1:
+
+            if any(x_id):
+                val = comparison(x[~x_id], y[~y_id])
+            else:
+                val = comparison(x, y)
+        # field-NA isn't supported yet, so skip struct dtypes for this
+        elif (not x.dtype.names and not y.dtype.names) and \
+                    (any(isna(x)) or any(isna(y))):
+            x_isna, y_isna = isna(x), isna(y)
+
+            if any(x_isna) or any(y_isna):
+                chk_same_position(x_isna, y_isna, hasval='NA')
+
+            if all(x_isna):
                 return
-            val = comparison(x[~x_id], y[~y_id])
+            val = comparison(x[~x_isna], y[~y_isna])
         else:
             val = comparison(x,y)
 
@@ -634,7 +676,10 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True,
             if not cond :
                 raise AssertionError(msg)
     except ValueError, e:
-        header = 'error during assertion:\n%s\n\n%s' % (e, header)
+        import traceback
+        efmt = traceback.format_exc()
+        header = 'error during assertion:\n\n%s\n\n%s' % (efmt, header)
+
         msg = build_err_msg([x, y], err_msg, verbose=verbose, header=header,
                             names=('x', 'y'))
         raise ValueError(msg)
@@ -647,7 +692,9 @@ def assert_array_equal(x, y, err_msg='', verbose=True):
     elements of these objects are equal. An exception is raised at
     shape mismatch or conflicting values. In contrast to the standard usage
     in numpy, NaNs are compared like numbers, no assertion is raised if
-    both objects have NaNs in the same positions.
+    both objects have NaNs in the same positions. Similarly, NAs are compared
+    like numbers, no assertion is raised if both objects have NAs in the
+    same positions.
 
     The usual caution for verifying equality with floating point numbers is
     advised.
author	Charles Harris <charlesr.harris@gmail.com>	2011-08-27 21:46:08 -0600
committer	Charles Harris <charlesr.harris@gmail.com>	2011-08-27 21:46:08 -0600
commit	9ecd91b7bf8c77d696ec9856ba10896d8f60309a (patch)
tree	9884131ece5eada06212538c591965bf5928afa2 /numpy
parent	aa55ba7437fbe6b8772a360a641b5aa7d3e669e0 (diff)
parent	10fac981763e87f949bed15c66127fc380fa9b27 (diff)
download	numpy-9ecd91b7bf8c77d696ec9856ba10896d8f60309a.tar.gz