summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
authorMark Wiebe <mwwiebe@gmail.com>2011-01-28 16:27:56 -0800
committerMark Wiebe <mwwiebe@gmail.com>2011-01-28 16:27:56 -0800
commit67e5476a4178de55451501cfb01794c22d340b7a (patch)
tree2a24b021001658deb92230692f8fad62e9355791 /numpy
parentcdac1209a517bf0808f12340d21ac9d334f69485 (diff)
parentaedce0eb9fa63e7dec3c865374a64e11374c284c (diff)
downloadnumpy-67e5476a4178de55451501cfb01794c22d340b7a.tar.gz
Merge branch 'new_iterator' - new iterator, ufunc update, restore 1.5 ABI
New Iterator - Read doc/neps/new-iterator-ufunc.rst. UFunc Update - Change all ufunc functions to use the new iterator. This replaces the inline buffering with iterator buffering, except for the reductions and generalized ufunc which use updateifcopy at the moment. Also adds out= and order= parameters to all ufuncs. Restore 1.5 ABI - This was done by moving the new type numbers to the end of the type enumeration, and replacing all type promotion code with a table-based approach. The ArrFuncs was restored by putting the new type cast functions into the cast dictionary, originally designed just for custom types. Conflicts: numpy/core/src/multiarray/ctors.c numpy/core/tests/test_regression.py
Diffstat (limited to 'numpy')
-rw-r--r--numpy/add_newdocs.py447
-rw-r--r--numpy/core/code_generators/genapi.py5
-rw-r--r--numpy/core/code_generators/numpy_api.py500
-rw-r--r--numpy/core/fromnumeric.py40
-rw-r--r--numpy/core/include/numpy/ndarraytypes.h152
-rw-r--r--numpy/core/include/numpy/ufuncobject.h183
-rw-r--r--numpy/core/numeric.py105
-rw-r--r--numpy/core/setup.py15
-rw-r--r--numpy/core/setup_common.py2
-rw-r--r--numpy/core/src/multiarray/arrayobject.c46
-rw-r--r--numpy/core/src/multiarray/arraytypes.c.src208
-rw-r--r--numpy/core/src/multiarray/buffer.c2
-rw-r--r--numpy/core/src/multiarray/calculation.c2
-rw-r--r--numpy/core/src/multiarray/common.c87
-rw-r--r--numpy/core/src/multiarray/common.h3
-rw-r--r--numpy/core/src/multiarray/conversion_utils.c24
-rw-r--r--numpy/core/src/multiarray/convert.c117
-rw-r--r--numpy/core/src/multiarray/convert_datatype.c1387
-rw-r--r--numpy/core/src/multiarray/convert_datatype.h12
-rw-r--r--numpy/core/src/multiarray/ctors.c1138
-rw-r--r--numpy/core/src/multiarray/ctors.h6
-rw-r--r--numpy/core/src/multiarray/datetime.c2
-rw-r--r--numpy/core/src/multiarray/dtype_transfer.c2946
-rw-r--r--numpy/core/src/multiarray/einsum.c.src2402
-rw-r--r--numpy/core/src/multiarray/flagsobject.c12
-rw-r--r--numpy/core/src/multiarray/getset.c19
-rw-r--r--numpy/core/src/multiarray/item_selection.c250
-rw-r--r--numpy/core/src/multiarray/iterators.c73
-rw-r--r--numpy/core/src/multiarray/lowlevel_strided_loops.c.src1207
-rw-r--r--numpy/core/src/multiarray/lowlevel_strided_loops.h397
-rw-r--r--numpy/core/src/multiarray/methods.c269
-rw-r--r--numpy/core/src/multiarray/multiarraymodule.c517
-rw-r--r--numpy/core/src/multiarray/multiarraymodule_onefile.c6
-rw-r--r--numpy/core/src/multiarray/new_iterator.c.src5990
-rw-r--r--numpy/core/src/multiarray/new_iterator_pywrap.c2253
-rw-r--r--numpy/core/src/multiarray/new_iterator_pywrap.h8
-rw-r--r--numpy/core/src/multiarray/scalartypes.c.src144
-rw-r--r--numpy/core/src/multiarray/scalartypes.h12
-rw-r--r--numpy/core/src/multiarray/shape.c159
-rw-r--r--numpy/core/src/multiarray/shape.h14
-rw-r--r--numpy/core/src/multiarray/usertypes.c6
-rw-r--r--numpy/core/src/npymath/halffloat.c36
-rw-r--r--numpy/core/src/scalarmathmodule.c.src10
-rw-r--r--numpy/core/src/umath/loops.c.src25
-rw-r--r--numpy/core/src/umath/ufunc_object.c4968
-rw-r--r--numpy/core/tests/test_multiarray.py82
-rw-r--r--numpy/core/tests/test_new_iterator.py2174
-rw-r--r--numpy/core/tests/test_numeric.py591
-rw-r--r--numpy/core/tests/test_regression.py144
-rw-r--r--numpy/core/tests/test_scalarmath.py24
-rw-r--r--numpy/core/tests/test_ufunc.py37
-rw-r--r--numpy/lib/npyio.py60
-rw-r--r--numpy/lib/tests/test_io.py10
-rw-r--r--numpy/lib/tests/test_recfunctions.py23
-rw-r--r--numpy/linalg/linalg.py2
-rw-r--r--numpy/ma/core.py10
-rw-r--r--numpy/ma/tests/test_core.py10
-rw-r--r--numpy/ma/testutils.py3
-rw-r--r--numpy/matrixlib/tests/test_defmatrix.py2
-rwxr-xr-xnumpy/testing/print_coercion_tables.py19
-rw-r--r--numpy/testing/tests/test_utils.py2
-rw-r--r--numpy/testing/utils.py7
62 files changed, 24973 insertions, 4433 deletions
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index d7a8569fa..51826c5ff 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -413,6 +413,58 @@ add_newdoc('numpy.core.multiarray', 'empty',
""")
+add_newdoc('numpy.core.multiarray', 'empty_like',
+ """
+ empty_like(a, dtype=None, order='K')
+
+ Return a new array with the same shape and type as a given array.
+
+ Parameters
+ ----------
+ a : array_like
+ The shape and data-type of `a` define these same attributes of the
+ returned array.
+ dtype : data-type, optional
+ Overrides the data type of the result.
+ order : {'C', 'F', 'A', or 'K'}, optional
+ Overrides the memory layout of the result. 'C' means C-order,
+ 'F' means F-order, 'A' means 'F' if ``a`` is Fortran contiguous,
+ 'C' otherwise. 'K' means match the layout of ``a`` as closely
+ as possible.
+
+ Returns
+ -------
+ out : ndarray
+ Array of uninitialized (arbitrary) data with the same
+ shape and type as `a`.
+
+ See Also
+ --------
+ ones_like : Return an array of ones with shape and type of input.
+ zeros_like : Return an array of zeros with shape and type of input.
+ empty : Return a new uninitialized array.
+ ones : Return a new array setting values to one.
+ zeros : Return a new array setting values to zero.
+
+ Notes
+ -----
+ This function does *not* initialize the returned array; to do that use
+ `zeros_like` or `ones_like` instead. It may be marginally faster than
+ the functions that do set the array values.
+
+ Examples
+ --------
+ >>> a = ([1,2,3], [4,5,6]) # a is array-like
+ >>> np.empty_like(a)
+ array([[-1073741821, -1073741821, 3], #random
+ [ 0, 0, -1073741821]])
+ >>> a = np.array([[1., 2., 3.],[4.,5.,6.]])
+ >>> np.empty_like(a)
+ array([[ -2.00000715e+000, 1.48219694e-323, -2.00000572e+000],#random
+ [ 4.38791518e-305, -2.00000715e+000, 4.17269252e-309]])
+
+ """)
+
add_newdoc('numpy.core.multiarray', 'scalar',
"""
@@ -481,6 +533,35 @@ add_newdoc('numpy.core.multiarray', 'zeros',
""")
+add_newdoc('numpy.core.multiarray', 'count_nonzero',
+ """
+ count_nonzero(a)
+
+ Counts the number of non-zero values in the array ``a``.
+
+ Parameters
+ ----------
+ a : array_like
+ The array for which to count non-zeros.
+
+ Returns
+ -------
+ count : int
+ Number of non-zero values in the array.
+
+ See Also
+ --------
+ nonzero : Return the coordinates of all the non-zero values.
+
+ Examples
+ --------
+ >>> np.count_nonzero(np.eye(4))
+ 4
+
+ >>> np.count_nonzero([[0,1,7,0,0],[3,0,0,2,19]])
+ 5
+ """)
+
add_newdoc('numpy.core.multiarray','set_typeDict',
"""set_typeDict(dict)
@@ -797,6 +878,7 @@ add_newdoc('numpy.core', 'inner',
--------
tensordot : Sum products over arbitrary axes.
dot : Generalised matrix product, using second last dimension of `b`.
+ einsum : Einstein summation convention.
Notes
-----
@@ -1115,9 +1197,12 @@ add_newdoc('numpy.core.multiarray', 'lexsort',
add_newdoc('numpy.core.multiarray', 'can_cast',
"""
- can_cast(fromtype, totype)
+ can_cast(from, totype, casting = 'safe')
- Returns True if cast between data types can occur without losing precision.
+ Returns True if cast between data types can occur according to the
+ casting rule. If from is a scalar or array scalar, also returns
+ True if the scalar value can be cast without overflow or truncation
+ to an integer.
Parameters
----------
@@ -1125,14 +1210,19 @@ add_newdoc('numpy.core.multiarray', 'can_cast',
Data type to cast from.
totype : dtype or dtype specifier
Data type to cast to.
+ casting : casting rule
+ May be any of 'no', 'equiv', 'safe', 'same_kind', or 'unsafe'.
Returns
-------
out : bool
- True if cast can occur without losing precision.
+ True if cast can occur according to the casting rule.
Examples
--------
+
+ Basic examples
+
>>> np.can_cast(np.int32, np.int64)
True
>>> np.can_cast(np.float64, np.complex)
@@ -1147,6 +1237,171 @@ add_newdoc('numpy.core.multiarray', 'can_cast',
>>> np.can_cast('i4', 'S4')
True
+ Casting scalars
+
+ >>> np.can_cast(100, 'i1')
+ True
+ >>> np.can_cast(150, 'i1')
+ False
+ >>> np.can_cast(150, 'u1')
+ True
+
+ >>> np.can_cast(3.5e100, np.float32)
+ False
+ >>> np.can_cast(1000.0, np.float32)
+ True
+
+ Array scalar checks the value, array does not
+
+ >>> np.can_cast(np.array(1000.0), np.float32)
+ True
+ >>> np.can_cast(np.array([1000.0]), np.float32)
+ False
+
+ Using the casting rules
+
+ >>> np.can_cast('i8', 'i8', 'no')
+ True
+ >>> np.can_cast('<i8', '>i8', 'no')
+ False
+
+ >>> np.can_cast('<i8', '>i8', 'equiv')
+ True
+ >>> np.can_cast('<i4', '>i8', 'equiv')
+ False
+
+ >>> np.can_cast('<i4', '>i8', 'safe')
+ True
+ >>> np.can_cast('<i8', '>i4', 'safe')
+ False
+
+ >>> np.can_cast('<i8', '>i4', 'same_kind')
+ True
+ >>> np.can_cast('<i8', '>u4', 'same_kind')
+ False
+
+ >>> np.can_cast('<i8', '>u4', 'unsafe')
+ True
+
+ """)
+
+add_newdoc('numpy.core.multiarray', 'promote_types',
+ """
+ promote_types(type1, type2)
+
+ Returns the data type with the smallest size and smallest scalar
+ kind to which both ``type1`` and ``type2`` may be safely cast.
+ The returned data type is always in native byte order.
+
+ Parameters
+ ----------
+ type1 : dtype or dtype specifier
+ First data type.
+ type2 : dtype or dtype specifier
+ Second data type.
+
+ Returns
+ -------
+ out : dtype
+ The promoted data type.
+
+ Examples
+ --------
+ >>> np.promote_types('f4', 'f8')
+ dtype('float64')
+
+ >>> np.promote_types('i8', 'f4')
+ dtype('float64')
+
+ >>> np.promote_types('>i8', '<c8')
+ dtype('complex128')
+
+ >>> np.promote_types('i1', 'S8')
+ Traceback (most recent call last):
+ File "<stdin>", line 1, in <module>
+ TypeError: invalid type promotion
+ """)
+
+add_newdoc('numpy.core.multiarray', 'min_scalar_type',
+ """
+ min_scalar_type(a)
+
+ For scalar ``a``, returns the data type with the smallest size
+ and smallest scalar kind which can hold its value. For non-scalar
+ array ``a``, returns the vector's dtype unmodified.
+
+ As a special case, floating point values are not demoted to integers,
+ and complex values are not demoted to floats.
+
+ Parameters
+ ----------
+ a : scalar or array_like
+ The value whose minimal data type is to be found.
+
+ Returns
+ -------
+ out : dtype
+ The minimal data type.
+
+ Examples
+ --------
+ >>> np.min_scalar_type(10)
+ dtype('uint8')
+
+ >>> np.min_scalar_type(-260)
+ dtype('int16')
+
+ >>> np.min_scalar_type(3.1)
+ dtype('float16')
+
+ >>> np.min_scalar_type(1e50)
+ dtype('float64')
+
+ >>> np.min_scalar_type(np.arange(4,dtype='f8'))
+ dtype('float64')
+
+ """)
+
+add_newdoc('numpy.core.multiarray', 'result_type',
+ """
+ result_type(*arrays_and_dtypes)
+
+ Returns the type that results from applying the NumPy
+ type promotion rules to the arguments.
+
+ Type promotion in NumPy works similarly to the rules in languages
+ like C++, with some slight differences. When both scalars and
+ arrays are used, the array's type takes precedence and the actual value
+ of the scalar is taken into account.
+
+ For example, calculating 3*a, where a is an array of 32-bit floats,
+ intuitively should result in a 32-bit float output. If the 3 is a
+ 32-bit integer, the NumPy rules indicate it can't convert losslessly
+ into a 32-bit float, so a 64-bit float should be the result type.
+ By examining the value of the constant, '3', we see that it fits in
+ an 8-bit integer, which can be cast losslessly into the 32-bit float.
+
+ Parameters
+ ----------
+ arrays_and_dtypes : list of arrays and dtypes
+ The operands of some operation whose result type is needed.
+
+ Returns
+ -------
+ out : dtype
+ The result type.
+
+ Examples
+ --------
+ >>> np.result_type(3, np.arange(7, dtype='i1'))
+ dtype('int8')
+
+ >>> np.result_type('i4', 'c8')
+ dtype('complex128')
+
+ >>> np.result_type(3.0, -2)
+ dtype('float64')
+
""")
add_newdoc('numpy.core.multiarray','newbuffer',
@@ -1227,6 +1482,7 @@ add_newdoc('numpy.core', 'dot',
--------
vdot : Complex-conjugating dot product.
tensordot : Sum products over arbitrary axes.
+ einsum : Einstein summation convention.
Examples
--------
@@ -1255,6 +1511,160 @@ add_newdoc('numpy.core', 'dot',
""")
+add_newdoc('numpy.core', 'einsum',
+ """
+ einsum(subscripts, *operands, out=None, dtype=None, order='K', casting='safe')
+
+ Evaluates the Einstein summation convention on the operands.
+
+ Using the Einstein summation convention, many common multi-dimensional
+ array operations can be represented in a simple fashion. This function
+ provides a way compute such summations.
+
+ The best way to understand this function is to try the examples below,
+ which show how many common NumPy functions can be implemented as
+ calls to einsum.
+
+ The subscripts string is a comma-separated list of subscript labels,
+ where each label refers to a dimension of the corresponding operand.
+ Repeated subscripts labels in one operand take the diagonal. For example,
+ ``np.einsum('ii', a)`` is equivalent to ``np.trace(a)``.
+
+ Whenever a label is repeated, it is summed, so ``np.einsum('i,i', a, b)``
+ is equivalent to ``np.inner(a,b)``. If a label appears only once,
+ it is not summed, so ``np.einsum('i', a)`` produces a view of ``a``
+ with no changes.
+
+ The order of labels in the output is by default alphabetical. This
+ means that ``np.einsum('ij', a)`` doesn't affect a 2D array, while
+ ``np.einsum('ji', a)`` takes its transpose.
+
+ The output can be controlled by specifying output subscript labels
+ as well. This specifies the label order, and allows summing to be
+ disallowed or forced when desired. The call ``np.einsum('i->', a)``
+ is equivalent to ``np.sum(a, axis=-1)``, and
+ ``np.einsum('ii->i', a)`` is equivalent to ``np.diag(a)``.
+
+ It is also possible to control how broadcasting occurs using
+ an ellipsis. To take the trace along the first and last axes,
+ you can do ``np.einsum('i...i', a)``, or to do a matrix-matrix
+ product with the left-most indices instead of rightmost, you can do
+ ``np.einsum('ij...,jk...->ik...', a, b)``.
+
+ When there is only one operand, no axes are summed, and no output
+ parameter is provided, a view into the operand is returned instead
+ of a new array. Thus, taking the diagonal as ``np.einsum('ii->i', a)``
+ produces a view.
+
+ Parameters
+ ----------
+ subscripts : string
+ Specifies the subscripts for summation.
+ operands : list of array_like
+ These are the arrays for the operation.
+ out : None or array
+ If provided, the calculation is done into this array.
+ dtype : None or data type
+ If provided, forces the calculation to use the data type specified.
+ Note that you may have to also give a more liberal ``casting``
+ parameter to allow the conversions.
+ order : 'C', 'F', 'A', or 'K'
+ Controls the memory layout of the output. 'C' means it should
+ be C contiguous. 'F' means it should be Fortran contiguous,
+ 'A' means it should be 'F' if the inputs are all 'F', 'C' otherwise.
+ 'K' means it should be as close to the layout as the inputs as
+ is possible, including arbitrarily permuted axes.
+ casting : 'no', 'equiv', 'safe', 'same_kind', 'unsafe'
+ Controls what kind of data casting may occur. Setting this to
+ 'unsafe' is not recommended, as it can adversely affect accumulations.
+ 'no' means the data types should not be cast at all. 'equiv' means
+ only byte-order changes are allowed. 'safe' means only casts
+ which can preserve values are allowed. 'same_kind' means only
+ safe casts or casts within a kind, like float64 to float32, are
+ allowed. 'unsafe' means any data conversions may be done.
+
+ Returns
+ -------
+ output : ndarray
+ The calculation based on the Einstein summation convention.
+
+ See Also
+ --------
+ dot, inner, outer, tensordot
+
+
+ Examples
+ --------
+
+ >>> a = np.arange(25).reshape(5,5)
+ >>> b = np.arange(5)
+ >>> c = np.arange(6).reshape(2,3)
+
+ >>> np.einsum('ii', a)
+ 60
+ >>> np.trace(a)
+ 60
+
+ >>> np.einsum('ii->i', a)
+ array([ 0, 6, 12, 18, 24])
+ >>> np.diag(a)
+ array([ 0, 6, 12, 18, 24])
+
+ >>> np.einsum('ij,j', a, b)
+ array([ 30, 80, 130, 180, 230])
+ >>> np.dot(a, b)
+ array([ 30, 80, 130, 180, 230])
+
+ >>> np.einsum('ji', c)
+ array([[0, 3],
+ [1, 4],
+ [2, 5]])
+ >>> c.T
+ array([[0, 3],
+ [1, 4],
+ [2, 5]])
+
+ >>> np.einsum(',', 3, c)
+ array([[ 0, 3, 6],
+ [ 9, 12, 15]])
+ >>> np.multiply(3, c)
+ array([[ 0, 3, 6],
+ [ 9, 12, 15]])
+
+ >>> np.einsum('i,i', b, b)
+ 30
+ >>> np.inner(b,b)
+ 30
+
+ >>> np.einsum('i,j', np.arange(2)+1, b)
+ array([[0, 1, 2, 3, 4],
+ [0, 2, 4, 6, 8]])
+ >>> np.outer(np.arange(2)+1, b)
+ array([[0, 1, 2, 3, 4],
+ [0, 2, 4, 6, 8]])
+
+ >>> np.einsum('i...->', a)
+ array([50, 55, 60, 65, 70])
+ >>> np.sum(a, axis=0)
+ array([50, 55, 60, 65, 70])
+
+ >>> a = np.arange(60.).reshape(3,4,5)
+ >>> b = np.arange(24.).reshape(4,3,2)
+ >>> np.einsum('ijk,jil->kl', a, b)
+ array([[ 4400., 4730.],
+ [ 4532., 4874.],
+ [ 4664., 5018.],
+ [ 4796., 5162.],
+ [ 4928., 5306.]])
+ >>> np.tensordot(a,b, axes=([1,0],[0,1]))
+ array([[ 4400., 4730.],
+ [ 4532., 4874.],
+ [ 4664., 5018.],
+ [ 4796., 5162.],
+ [ 4928., 5306.]])
+
+ """)
+
add_newdoc('numpy.core', 'alterdot',
"""
Change `dot`, `vdot`, and `innerproduct` to use accelerated BLAS functions.
@@ -2613,6 +3023,37 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('itemset',
"""))
+add_newdoc('numpy.core.multiarray', 'ndarray', ('setasflat',
+ """
+ a.setasflat(arr)
+
+ Equivalent to a.flat = arr.flat, but is generally more efficient.
+ This function does not check for overlap, so if ``arr`` and ``a``
+ are viewing the same data with different strides, the results will
+ be unpredictable.
+
+ Parameters
+ ----------
+ arr : array_like
+ The array to copy into a.
+
+ Examples
+ --------
+ >>> a = np.arange(2*4).reshape(2,4)[:,:-1]; a
+ array([[0, 1, 2],
+ [4, 5, 6]])
+ >>> b = np.arange(3*3, dtype='f4').reshape(3,3).T[::-1,:-1]; b
+ array([[ 2., 5.],
+ [ 1., 4.],
+ [ 0., 3.]], dtype=float32)
+ >>> a.setasflat(b)
+ >>> a
+ array([[2, 5, 1],
+ [4, 0, 3]])
+
+ """))
+
+
add_newdoc('numpy.core.multiarray', 'ndarray', ('max',
"""
a.max(axis=None, out=None)
diff --git a/numpy/core/code_generators/genapi.py b/numpy/core/code_generators/genapi.py
index 5cac71a72..7c6b29e73 100644
--- a/numpy/core/code_generators/genapi.py
+++ b/numpy/core/code_generators/genapi.py
@@ -44,6 +44,9 @@ API_FILES = [join('multiarray', 'methods.c'),
join('multiarray', 'conversion_utils.c'),
join('multiarray', 'buffer.c'),
join('multiarray', 'datetime.c'),
+ join('multiarray', 'new_iterator.c.src'),
+ join('multiarray', 'new_iterator_pywrap.c'),
+ join('multiarray', 'einsum.c.src'),
join('umath', 'ufunc_object.c'),
join('umath', 'loops.c.src'),
]
@@ -57,7 +60,7 @@ def remove_whitespace(s):
return ''.join(s.split())
def _repl(str):
- return str.replace('intp', 'npy_intp').replace('Bool','npy_bool')
+ return str.replace('Bool','npy_bool')
class Function(object):
def __init__(self, name, return_type, args, doc=''):
diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py
index 9474a131a..47c292ac8 100644
--- a/numpy/core/code_generators/numpy_api.py
+++ b/numpy/core/code_generators/numpy_api.py
@@ -13,7 +13,7 @@ exception, so it should hopefully not get unnoticed).
"""
multiarray_global_vars = {
- 'NPY_NUMUSERTYPES': 6,
+ 'NPY_NUMUSERTYPES': 7,
}
multiarray_global_vars_types = {
@@ -21,50 +21,53 @@ multiarray_global_vars_types = {
}
multiarray_scalar_bool_values = {
- '_PyArrayScalar_BoolValues': 8
+ '_PyArrayScalar_BoolValues': 9
}
multiarray_types_api = {
- 'PyArray_Type': 1,
- 'PyArrayDescr_Type': 2,
- 'PyArrayFlags_Type': 3,
- 'PyArrayIter_Type': 4,
- 'PyArrayMultiIter_Type': 5,
- 'PyBoolArrType_Type': 7,
- 'PyGenericArrType_Type': 9,
- 'PyNumberArrType_Type': 10,
- 'PyIntegerArrType_Type': 11,
- 'PySignedIntegerArrType_Type': 12,
- 'PyUnsignedIntegerArrType_Type': 13,
- 'PyInexactArrType_Type': 14,
- 'PyFloatingArrType_Type': 15,
- 'PyComplexFloatingArrType_Type': 16,
- 'PyFlexibleArrType_Type': 17,
- 'PyCharacterArrType_Type': 18,
- 'PyByteArrType_Type': 19,
- 'PyShortArrType_Type': 20,
- 'PyIntArrType_Type': 21,
- 'PyLongArrType_Type': 22,
- 'PyLongLongArrType_Type': 23,
- 'PyUByteArrType_Type': 24,
- 'PyUShortArrType_Type': 25,
- 'PyUIntArrType_Type': 26,
- 'PyULongArrType_Type': 27,
- 'PyULongLongArrType_Type': 28,
- 'PyFloatArrType_Type': 29,
- 'PyDoubleArrType_Type': 30,
- 'PyLongDoubleArrType_Type': 31,
- 'PyCFloatArrType_Type': 32,
- 'PyCDoubleArrType_Type': 33,
- 'PyCLongDoubleArrType_Type': 34,
- 'PyObjectArrType_Type': 35,
- 'PyStringArrType_Type': 36,
- 'PyUnicodeArrType_Type': 37,
- 'PyVoidArrType_Type': 38,
- 'PyTimeIntegerArrType_Type': 39,
- 'PyDatetimeArrType_Type': 40,
- 'PyTimedeltaArrType_Type': 41,
- 'PyHalfArrType_Type': 221,
+ 'PyBigArray_Type': 1,
+ 'PyArray_Type': 2,
+ 'PyArrayDescr_Type': 3,
+ 'PyArrayFlags_Type': 4,
+ 'PyArrayIter_Type': 5,
+ 'PyArrayMultiIter_Type': 6,
+ 'PyBoolArrType_Type': 8,
+ 'PyGenericArrType_Type': 10,
+ 'PyNumberArrType_Type': 11,
+ 'PyIntegerArrType_Type': 12,
+ 'PySignedIntegerArrType_Type': 13,
+ 'PyUnsignedIntegerArrType_Type': 14,
+ 'PyInexactArrType_Type': 15,
+ 'PyFloatingArrType_Type': 16,
+ 'PyComplexFloatingArrType_Type': 17,
+ 'PyFlexibleArrType_Type': 18,
+ 'PyCharacterArrType_Type': 19,
+ 'PyByteArrType_Type': 20,
+ 'PyShortArrType_Type': 21,
+ 'PyIntArrType_Type': 22,
+ 'PyLongArrType_Type': 23,
+ 'PyLongLongArrType_Type': 24,
+ 'PyUByteArrType_Type': 25,
+ 'PyUShortArrType_Type': 26,
+ 'PyUIntArrType_Type': 27,
+ 'PyULongArrType_Type': 28,
+ 'PyULongLongArrType_Type': 29,
+ 'PyFloatArrType_Type': 30,
+ 'PyDoubleArrType_Type': 31,
+ 'PyLongDoubleArrType_Type': 32,
+ 'PyCFloatArrType_Type': 33,
+ 'PyCDoubleArrType_Type': 34,
+ 'PyCLongDoubleArrType_Type': 35,
+ 'PyObjectArrType_Type': 36,
+ 'PyStringArrType_Type': 37,
+ 'PyUnicodeArrType_Type': 38,
+ 'PyVoidArrType_Type': 39,
+ # End 1.5 API
+ 'PyTimeIntegerArrType_Type': 214,
+ 'PyDatetimeArrType_Type': 215,
+ 'PyTimedeltaArrType_Type': 216,
+ 'PyHalfArrType_Type': 217,
+ 'NpyIter_Type': 218,
}
#define NPY_NUMUSERTYPES (*(int *)PyArray_API[6])
@@ -73,185 +76,239 @@ multiarray_types_api = {
multiarray_funcs_api = {
'PyArray_GetNDArrayCVersion': 0,
- 'PyArray_SetNumericOps': 42,
- 'PyArray_GetNumericOps': 43,
- 'PyArray_INCREF': 44,
- 'PyArray_XDECREF': 45,
- 'PyArray_SetStringFunction': 46,
- 'PyArray_DescrFromType': 47,
- 'PyArray_TypeObjectFromType': 48,
- 'PyArray_Zero': 49,
- 'PyArray_One': 50,
- 'PyArray_CastToType': 51,
- 'PyArray_CastTo': 52,
- 'PyArray_CastAnyTo': 53,
- 'PyArray_CanCastSafely': 54,
- 'PyArray_CanCastTo': 55,
- 'PyArray_ObjectType': 56,
- 'PyArray_DescrFromObject': 57,
- 'PyArray_ConvertToCommonType': 58,
- 'PyArray_DescrFromScalar': 59,
- 'PyArray_DescrFromTypeObject': 60,
- 'PyArray_Size': 61,
- 'PyArray_Scalar': 62,
- 'PyArray_FromScalar': 63,
- 'PyArray_ScalarAsCtype': 64,
- 'PyArray_CastScalarToCtype': 65,
- 'PyArray_CastScalarDirect': 66,
- 'PyArray_ScalarFromObject': 67,
- 'PyArray_GetCastFunc': 68,
- 'PyArray_FromDims': 69,
- 'PyArray_FromDimsAndDataAndDescr': 70,
- 'PyArray_FromAny': 71,
- 'PyArray_EnsureArray': 72,
- 'PyArray_EnsureAnyArray': 73,
- 'PyArray_FromFile': 74,
- 'PyArray_FromString': 75,
- 'PyArray_FromBuffer': 76,
- 'PyArray_FromIter': 77,
- 'PyArray_Return': 78,
- 'PyArray_GetField': 79,
- 'PyArray_SetField': 80,
- 'PyArray_Byteswap': 81,
- 'PyArray_Resize': 82,
- 'PyArray_MoveInto': 83,
- 'PyArray_CopyInto': 84,
- 'PyArray_CopyAnyInto': 85,
- 'PyArray_CopyObject': 86,
- 'PyArray_NewCopy': 87,
- 'PyArray_ToList': 88,
- 'PyArray_ToString': 89,
- 'PyArray_ToFile': 90,
- 'PyArray_Dump': 91,
- 'PyArray_Dumps': 92,
- 'PyArray_ValidType': 93,
- 'PyArray_UpdateFlags': 94,
- 'PyArray_New': 95,
- 'PyArray_NewFromDescr': 96,
- 'PyArray_DescrNew': 97,
- 'PyArray_DescrNewFromType': 98,
- 'PyArray_GetPriority': 99,
- 'PyArray_IterNew': 100,
- 'PyArray_MultiIterNew': 101,
- 'PyArray_PyIntAsInt': 102,
- 'PyArray_PyIntAsIntp': 103,
- 'PyArray_Broadcast': 104,
- 'PyArray_FillObjectArray': 105,
- 'PyArray_FillWithScalar': 106,
- 'PyArray_CheckStrides': 107,
- 'PyArray_DescrNewByteorder': 108,
- 'PyArray_IterAllButAxis': 109,
- 'PyArray_CheckFromAny': 110,
- 'PyArray_FromArray': 111,
- 'PyArray_FromInterface': 112,
- 'PyArray_FromStructInterface': 113,
- 'PyArray_FromArrayAttr': 114,
- 'PyArray_ScalarKind': 115,
- 'PyArray_CanCoerceScalar': 116,
- 'PyArray_NewFlagsObject': 117,
- 'PyArray_CanCastScalar': 118,
- 'PyArray_CompareUCS4': 119,
- 'PyArray_RemoveSmallest': 120,
- 'PyArray_ElementStrides': 121,
- 'PyArray_Item_INCREF': 122,
- 'PyArray_Item_XDECREF': 123,
- 'PyArray_FieldNames': 124,
- 'PyArray_Transpose': 125,
- 'PyArray_TakeFrom': 126,
- 'PyArray_PutTo': 127,
- 'PyArray_PutMask': 128,
- 'PyArray_Repeat': 129,
- 'PyArray_Choose': 130,
- 'PyArray_Sort': 131,
- 'PyArray_ArgSort': 132,
- 'PyArray_SearchSorted': 133,
- 'PyArray_ArgMax': 134,
- 'PyArray_ArgMin': 135,
- 'PyArray_Reshape': 136,
- 'PyArray_Newshape': 137,
- 'PyArray_Squeeze': 138,
- 'PyArray_View': 139,
- 'PyArray_SwapAxes': 140,
- 'PyArray_Max': 141,
- 'PyArray_Min': 142,
- 'PyArray_Ptp': 143,
- 'PyArray_Mean': 144,
- 'PyArray_Trace': 145,
- 'PyArray_Diagonal': 146,
- 'PyArray_Clip': 147,
- 'PyArray_Conjugate': 148,
- 'PyArray_Nonzero': 149,
- 'PyArray_Std': 150,
- 'PyArray_Sum': 151,
- 'PyArray_CumSum': 152,
- 'PyArray_Prod': 153,
- 'PyArray_CumProd': 154,
- 'PyArray_All': 155,
- 'PyArray_Any': 156,
- 'PyArray_Compress': 157,
- 'PyArray_Flatten': 158,
- 'PyArray_Ravel': 159,
- 'PyArray_MultiplyList': 160,
- 'PyArray_MultiplyIntList': 161,
- 'PyArray_GetPtr': 162,
- 'PyArray_CompareLists': 163,
- 'PyArray_AsCArray': 164,
- 'PyArray_As1D': 165,
- 'PyArray_As2D': 166,
- 'PyArray_Free': 167,
- 'PyArray_Converter': 168,
- 'PyArray_IntpFromSequence': 169,
- 'PyArray_Concatenate': 170,
- 'PyArray_InnerProduct': 171,
- 'PyArray_MatrixProduct': 172,
- 'PyArray_CopyAndTranspose': 173,
- 'PyArray_Correlate': 174,
- 'PyArray_TypestrConvert': 175,
- 'PyArray_DescrConverter': 176,
- 'PyArray_DescrConverter2': 177,
- 'PyArray_IntpConverter': 178,
- 'PyArray_BufferConverter': 179,
- 'PyArray_AxisConverter': 180,
- 'PyArray_BoolConverter': 181,
- 'PyArray_ByteorderConverter': 182,
- 'PyArray_OrderConverter': 183,
- 'PyArray_EquivTypes': 184,
- 'PyArray_Zeros': 185,
- 'PyArray_Empty': 186,
- 'PyArray_Where': 187,
- 'PyArray_Arange': 188,
- 'PyArray_ArangeObj': 189,
- 'PyArray_SortkindConverter': 190,
- 'PyArray_LexSort': 191,
- 'PyArray_Round': 192,
- 'PyArray_EquivTypenums': 193,
- 'PyArray_RegisterDataType': 194,
- 'PyArray_RegisterCastFunc': 195,
- 'PyArray_RegisterCanCast': 196,
- 'PyArray_InitArrFuncs': 197,
- 'PyArray_IntTupleFromIntp': 198,
- 'PyArray_TypeNumFromName': 199,
- 'PyArray_ClipmodeConverter': 200,
- 'PyArray_OutputConverter': 201,
- 'PyArray_BroadcastToShape': 202,
- '_PyArray_SigintHandler': 203,
- '_PyArray_GetSigintBuf': 204,
- 'PyArray_DescrAlignConverter': 205,
- 'PyArray_DescrAlignConverter2': 206,
- 'PyArray_SearchsideConverter': 207,
- 'PyArray_CheckAxis': 208,
- 'PyArray_OverflowMultiplyList': 209,
- 'PyArray_CompareString': 210,
- 'PyArray_MultiIterFromObjects': 211,
- 'PyArray_GetEndianness': 212,
- 'PyArray_GetNDArrayCFeatureVersion': 213,
- 'PyArray_Correlate2': 214,
- 'PyArray_NeighborhoodIterNew': 215,
- 'PyArray_SetDatetimeParseFunction': 216,
- 'PyArray_DatetimeToDatetimeStruct': 217,
- 'PyArray_TimedeltaToTimedeltaStruct': 218,
- 'PyArray_DatetimeStructToDatetime': 219,
- 'PyArray_TimedeltaStructToTimedelta': 220,
+ 'PyArray_SetNumericOps': 40,
+ 'PyArray_GetNumericOps': 41,
+ 'PyArray_INCREF': 42,
+ 'PyArray_XDECREF': 43,
+ 'PyArray_SetStringFunction': 44,
+ 'PyArray_DescrFromType': 45,
+ 'PyArray_TypeObjectFromType': 46,
+ 'PyArray_Zero': 47,
+ 'PyArray_One': 48,
+ 'PyArray_CastToType': 49,
+ 'PyArray_CastTo': 50,
+ 'PyArray_CastAnyTo': 51,
+ 'PyArray_CanCastSafely': 52,
+ 'PyArray_CanCastTo': 53,
+ 'PyArray_ObjectType': 54,
+ 'PyArray_DescrFromObject': 55,
+ 'PyArray_ConvertToCommonType': 56,
+ 'PyArray_DescrFromScalar': 57,
+ 'PyArray_DescrFromTypeObject': 58,
+ 'PyArray_Size': 59,
+ 'PyArray_Scalar': 60,
+ 'PyArray_FromScalar': 61,
+ 'PyArray_ScalarAsCtype': 62,
+ 'PyArray_CastScalarToCtype': 63,
+ 'PyArray_CastScalarDirect': 64,
+ 'PyArray_ScalarFromObject': 65,
+ 'PyArray_GetCastFunc': 66,
+ 'PyArray_FromDims': 67,
+ 'PyArray_FromDimsAndDataAndDescr': 68,
+ 'PyArray_FromAny': 69,
+ 'PyArray_EnsureArray': 70,
+ 'PyArray_EnsureAnyArray': 71,
+ 'PyArray_FromFile': 72,
+ 'PyArray_FromString': 73,
+ 'PyArray_FromBuffer': 74,
+ 'PyArray_FromIter': 75,
+ 'PyArray_Return': 76,
+ 'PyArray_GetField': 77,
+ 'PyArray_SetField': 78,
+ 'PyArray_Byteswap': 79,
+ 'PyArray_Resize': 80,
+ 'PyArray_MoveInto': 81,
+ 'PyArray_CopyInto': 82,
+ 'PyArray_CopyAnyInto': 83,
+ 'PyArray_CopyObject': 84,
+ 'PyArray_NewCopy': 85,
+ 'PyArray_ToList': 86,
+ 'PyArray_ToString': 87,
+ 'PyArray_ToFile': 88,
+ 'PyArray_Dump': 89,
+ 'PyArray_Dumps': 90,
+ 'PyArray_ValidType': 91,
+ 'PyArray_UpdateFlags': 92,
+ 'PyArray_New': 93,
+ 'PyArray_NewFromDescr': 94,
+ 'PyArray_DescrNew': 95,
+ 'PyArray_DescrNewFromType': 96,
+ 'PyArray_GetPriority': 97,
+ 'PyArray_IterNew': 98,
+ 'PyArray_MultiIterNew': 99,
+ 'PyArray_PyIntAsInt': 100,
+ 'PyArray_PyIntAsIntp': 101,
+ 'PyArray_Broadcast': 102,
+ 'PyArray_FillObjectArray': 103,
+ 'PyArray_FillWithScalar': 104,
+ 'PyArray_CheckStrides': 105,
+ 'PyArray_DescrNewByteorder': 106,
+ 'PyArray_IterAllButAxis': 107,
+ 'PyArray_CheckFromAny': 108,
+ 'PyArray_FromArray': 109,
+ 'PyArray_FromInterface': 110,
+ 'PyArray_FromStructInterface': 111,
+ 'PyArray_FromArrayAttr': 112,
+ 'PyArray_ScalarKind': 113,
+ 'PyArray_CanCoerceScalar': 114,
+ 'PyArray_NewFlagsObject': 115,
+ 'PyArray_CanCastScalar': 116,
+ 'PyArray_CompareUCS4': 117,
+ 'PyArray_RemoveSmallest': 118,
+ 'PyArray_ElementStrides': 119,
+ 'PyArray_Item_INCREF': 120,
+ 'PyArray_Item_XDECREF': 121,
+ 'PyArray_FieldNames': 122,
+ 'PyArray_Transpose': 123,
+ 'PyArray_TakeFrom': 124,
+ 'PyArray_PutTo': 125,
+ 'PyArray_PutMask': 126,
+ 'PyArray_Repeat': 127,
+ 'PyArray_Choose': 128,
+ 'PyArray_Sort': 129,
+ 'PyArray_ArgSort': 130,
+ 'PyArray_SearchSorted': 131,
+ 'PyArray_ArgMax': 132,
+ 'PyArray_ArgMin': 133,
+ 'PyArray_Reshape': 134,
+ 'PyArray_Newshape': 135,
+ 'PyArray_Squeeze': 136,
+ 'PyArray_View': 137,
+ 'PyArray_SwapAxes': 138,
+ 'PyArray_Max': 139,
+ 'PyArray_Min': 140,
+ 'PyArray_Ptp': 141,
+ 'PyArray_Mean': 142,
+ 'PyArray_Trace': 143,
+ 'PyArray_Diagonal': 144,
+ 'PyArray_Clip': 145,
+ 'PyArray_Conjugate': 146,
+ 'PyArray_Nonzero': 147,
+ 'PyArray_Std': 148,
+ 'PyArray_Sum': 149,
+ 'PyArray_CumSum': 150,
+ 'PyArray_Prod': 151,
+ 'PyArray_CumProd': 152,
+ 'PyArray_All': 153,
+ 'PyArray_Any': 154,
+ 'PyArray_Compress': 155,
+ 'PyArray_Flatten': 156,
+ 'PyArray_Ravel': 157,
+ 'PyArray_MultiplyList': 158,
+ 'PyArray_MultiplyIntList': 159,
+ 'PyArray_GetPtr': 160,
+ 'PyArray_CompareLists': 161,
+ 'PyArray_AsCArray': 162,
+ 'PyArray_As1D': 163,
+ 'PyArray_As2D': 164,
+ 'PyArray_Free': 165,
+ 'PyArray_Converter': 166,
+ 'PyArray_IntpFromSequence': 167,
+ 'PyArray_Concatenate': 168,
+ 'PyArray_InnerProduct': 169,
+ 'PyArray_MatrixProduct': 170,
+ 'PyArray_CopyAndTranspose': 171,
+ 'PyArray_Correlate': 172,
+ 'PyArray_TypestrConvert': 173,
+ 'PyArray_DescrConverter': 174,
+ 'PyArray_DescrConverter2': 175,
+ 'PyArray_IntpConverter': 176,
+ 'PyArray_BufferConverter': 177,
+ 'PyArray_AxisConverter': 178,
+ 'PyArray_BoolConverter': 179,
+ 'PyArray_ByteorderConverter': 180,
+ 'PyArray_OrderConverter': 181,
+ 'PyArray_EquivTypes': 182,
+ 'PyArray_Zeros': 183,
+ 'PyArray_Empty': 184,
+ 'PyArray_Where': 185,
+ 'PyArray_Arange': 186,
+ 'PyArray_ArangeObj': 187,
+ 'PyArray_SortkindConverter': 188,
+ 'PyArray_LexSort': 189,
+ 'PyArray_Round': 190,
+ 'PyArray_EquivTypenums': 191,
+ 'PyArray_RegisterDataType': 192,
+ 'PyArray_RegisterCastFunc': 193,
+ 'PyArray_RegisterCanCast': 194,
+ 'PyArray_InitArrFuncs': 195,
+ 'PyArray_IntTupleFromIntp': 196,
+ 'PyArray_TypeNumFromName': 197,
+ 'PyArray_ClipmodeConverter': 198,
+ 'PyArray_OutputConverter': 199,
+ 'PyArray_BroadcastToShape': 200,
+ '_PyArray_SigintHandler': 201,
+ '_PyArray_GetSigintBuf': 202,
+ 'PyArray_DescrAlignConverter': 203,
+ 'PyArray_DescrAlignConverter2': 204,
+ 'PyArray_SearchsideConverter': 205,
+ 'PyArray_CheckAxis': 206,
+ 'PyArray_OverflowMultiplyList': 207,
+ 'PyArray_CompareString': 208,
+ 'PyArray_MultiIterFromObjects': 209,
+ 'PyArray_GetEndianness': 210,
+ 'PyArray_GetNDArrayCFeatureVersion': 211,
+ 'PyArray_Correlate2': 212,
+ 'PyArray_NeighborhoodIterNew': 213,
+ # End 1.5 API
+ 'PyArray_SetDatetimeParseFunction': 219,
+ 'PyArray_DatetimeToDatetimeStruct': 220,
+ 'PyArray_TimedeltaToTimedeltaStruct': 221,
+ 'PyArray_DatetimeStructToDatetime': 222,
+ 'PyArray_TimedeltaStructToTimedelta': 223,
+ # New Iterator API
+ 'NpyIter_New': 224,
+ 'NpyIter_MultiNew': 225,
+ 'NpyIter_Copy': 226,
+ 'NpyIter_Deallocate': 227,
+ 'NpyIter_HasDelayedBufAlloc': 228,
+ 'NpyIter_HasInnerLoop': 229,
+ 'NpyIter_RemoveInnerLoop': 230,
+ 'NpyIter_GetInnerStrideArray': 231,
+ 'NpyIter_GetInnerLoopSizePtr': 232,
+ 'NpyIter_Reset': 233,
+ 'NpyIter_ResetBasePointers': 234,
+ 'NpyIter_ResetToIterIndexRange': 235,
+ 'NpyIter_GetNDim': 236,
+ 'NpyIter_GetNIter': 237,
+ 'NpyIter_GetIterNext': 238,
+ 'NpyIter_GetIterSize': 239,
+ 'NpyIter_GetIterIndexRange': 240,
+ 'NpyIter_GetIterIndex': 241,
+ 'NpyIter_GotoIterIndex': 242,
+ 'NpyIter_HasCoords': 243,
+ 'NpyIter_GetShape': 244,
+ 'NpyIter_GetGetCoords': 245,
+ 'NpyIter_GotoCoords': 246,
+ 'NpyIter_RemoveCoords': 247,
+ 'NpyIter_HasIndex': 248,
+ 'NpyIter_IsBuffered': 249,
+ 'NpyIter_IsGrowInner': 250,
+ 'NpyIter_GetBufferSize': 251,
+ 'NpyIter_GetIndexPtr': 252,
+ 'NpyIter_GotoIndex': 253,
+ 'NpyIter_GetDataPtrArray': 254,
+ 'NpyIter_GetDescrArray': 255,
+ 'NpyIter_GetOperandArray': 256,
+ 'NpyIter_GetIterView': 257,
+ 'NpyIter_GetReadFlags': 258,
+ 'NpyIter_GetWriteFlags': 259,
+ 'NpyIter_DebugPrint': 260,
+ 'NpyIter_IterationNeedsAPI': 261,
+ 'NpyIter_GetInnerFixedStrideArray': 262,
+ 'NpyIter_RemoveAxis': 263,
+ 'NpyIter_GetAxisStrideArray': 264,
+ #
+ 'PyArray_CastingConverter': 265,
+ 'PyArray_CountNonzero': 266,
+ 'PyArray_PromoteTypes': 267,
+ 'PyArray_MinScalarType': 268,
+ 'PyArray_ResultType': 269,
+ 'PyArray_CanCastArrayTo': 270,
+ 'PyArray_CanCastTypeTo': 271,
+ 'PyArray_EinsteinSum': 272,
+ 'PyArray_FillWithZero': 273,
+ 'PyArray_NewLikeArray': 274,
}
ufunc_types_api = {
@@ -291,6 +348,7 @@ ufunc_funcs_api = {
'PyUFunc_ReplaceLoopBySignature': 30,
'PyUFunc_FromFuncAndDataAndSignature': 31,
'PyUFunc_SetUsesArraysAsData': 32,
+ # End 1.5 API
'PyUFunc_e_e': 33,
'PyUFunc_e_e_As_f_f': 34,
'PyUFunc_e_e_As_d_d': 35,
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index e97e50c87..d66e5cb68 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -1051,13 +1051,16 @@ def ravel(a, order='C'):
Parameters
----------
a : array_like
- Input array. The elements in `a` are read in the order specified by
+ Input array. The elements in ``a`` are read in the order specified by
`order`, and packed as a 1-D array.
- order : {'C','F', 'A'}, optional
- The elements of `a` are read in this order. It can be
- 'C' for row-major order, `F` for column-major order, or
- 'A' to preserve the order of `a` when possible.
- By default, row-major order is used.
+ order : {'C','F', 'A', 'K'}, optional
+ The elements of ``a`` are read in this order. 'C' means to view
+ the elements in C (row-major) order. 'F' means to view the elements
+ in Fortran (column-major) order. 'A' means to view the elements
+        in 'F' order if ``a`` is Fortran contiguous, 'C' order otherwise.
+ 'K' means to view the elements in the order they occur in memory,
+ except for reversing the data when strides are negative.
+ By default, 'C' order is used.
Returns
-------
@@ -1092,12 +1095,33 @@ def ravel(a, order='C'):
>>> print np.ravel(x, order='F')
[1 4 2 5 3 6]
- When `order` is 'A', it will preserve the array's 'C' or 'F' ordering:
+ When ``order`` is 'A', it will preserve the array's 'C' or 'F' ordering:
>>> print np.ravel(x.T)
[1 4 2 5 3 6]
>>> print np.ravel(x.T, order='A')
[1 2 3 4 5 6]
+
+ When ``order`` is 'K', it will preserve orderings that are neither 'C'
+ nor 'F', but won't reverse axes:
+
+ >>> a = np.arange(3)[::-1]; a
+ array([2, 1, 0])
+ >>> a.ravel(order='C')
+ array([2, 1, 0])
+ >>> a.ravel(order='K')
+ array([2, 1, 0])
+
+ >>> a = np.arange(12).reshape(2,3,2).swapaxes(1,2); a
+ array([[[ 0, 2, 4],
+ [ 1, 3, 5]],
+ [[ 6, 8, 10],
+ [ 7, 9, 11]]])
+ >>> a.ravel(order='C')
+ array([ 0, 2, 4, 1, 3, 5, 6, 8, 10, 7, 9, 11])
+ >>> a.ravel(order='K')
+ array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
+
"""
return asarray(a).ravel(order)
@@ -1136,6 +1160,8 @@ def nonzero(a):
array.
ndarray.nonzero :
Equivalent ndarray method.
+ count_nonzero :
+ Counts the number of non-zero elements in the input array.
Examples
--------
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index 98d02287a..4ca1b2cae 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -65,16 +65,24 @@ enum NPY_TYPES { NPY_BOOL=0,
NPY_INT, NPY_UINT,
NPY_LONG, NPY_ULONG,
NPY_LONGLONG, NPY_ULONGLONG,
- NPY_HALF, NPY_FLOAT, NPY_DOUBLE, NPY_LONGDOUBLE,
+ NPY_FLOAT, NPY_DOUBLE, NPY_LONGDOUBLE,
NPY_CFLOAT, NPY_CDOUBLE, NPY_CLONGDOUBLE,
- NPY_DATETIME, NPY_TIMEDELTA,
- NPY_OBJECT=20,
+ NPY_OBJECT=17,
NPY_STRING, NPY_UNICODE,
NPY_VOID,
+ /*
+ * New 1.6 types appended, may be integrated
+ * into the above in 2.0.
+ */
+ NPY_DATETIME, NPY_TIMEDELTA, NPY_HALF,
+
NPY_NTYPES,
NPY_NOTYPE,
NPY_CHAR, /* special flag */
- NPY_USERDEF=256 /* leave room for characters */
+ NPY_USERDEF=256, /* leave room for characters */
+
+ /* The number of types not including the new 1.6 types */
+ NPY_NTYPES_ABI_COMPATIBLE=21
};
#define NPY_METADATA_DTSTR "__frequency__"
@@ -88,8 +96,7 @@ enum NPY_TYPES { NPY_BOOL=0,
/* default scalar priority */
#define NPY_SCALAR_PRIORITY -1000000.0
-/*TODO HALF - This is used as how many complex floating point types in the code */
-/* How many floating point types are there */
+/* How many floating point types are there (excluding half) */
#define NPY_NUM_FLOATTYPE 3
/*
@@ -174,12 +181,31 @@ typedef enum {
} NPY_SCALARKIND;
#define NPY_NSCALARKINDS (NPY_OBJECT_SCALAR + 1)
+/* For specifying array memory layout or iteration order */
typedef enum {
+ /* Fortran order if inputs are all Fortran, C otherwise */
NPY_ANYORDER=-1,
+ /* C order */
NPY_CORDER=0,
- NPY_FORTRANORDER=1
+ /* Fortran order */
+ NPY_FORTRANORDER=1,
+ /* An order as close to the inputs as possible */
+ NPY_KEEPORDER=2
} NPY_ORDER;
+/* For specifying allowed casting in operations which support it */
+typedef enum {
+ /* Only allow identical types */
+ NPY_NO_CASTING=0,
+ /* Allow identical and byte swapped types */
+ NPY_EQUIV_CASTING=1,
+ /* Only allow safe casts */
+ NPY_SAFE_CASTING=2,
+ /* Allow safe casts or casts within the same kind */
+ NPY_SAME_KIND_CASTING=3,
+ /* Allow any casts */
+ NPY_UNSAFE_CASTING=4
+} NPY_CASTING;
typedef enum {
NPY_CLIP=0,
@@ -394,6 +420,14 @@ typedef struct {
} PyArray_Dims;
typedef struct {
+ /*
+ * Functions to cast to most other standard types
+ * Can have some NULL entries. The types
+ * DATETIME, TIMEDELTA, and HALF go into the castdict
+ * even though they are built-in.
+ */
+ PyArray_VectorUnaryFunc *cast[NPY_NTYPES_ABI_COMPATIBLE];
+
/* The next four functions *cannot* be NULL */
/*
@@ -487,22 +521,6 @@ typedef struct {
PyArray_FastClipFunc *fastclip;
PyArray_FastPutmaskFunc *fastputmask;
PyArray_FastTakeFunc *fasttake;
-
- /*
- * A little room to grow --- should use generic function
- * interface for most additions
- */
- void *pad1;
- void *pad2;
- void *pad3;
- void *pad4;
-
- /*
- * Functions to cast to all other standard types
- * Can have some NULL entries
- */
- PyArray_VectorUnaryFunc *cast[NPY_NTYPES];
-
} PyArray_ArrFuncs;
/* The item must be reference counted when it is inserted or extracted. */
@@ -554,8 +572,7 @@ typedef struct _PyArray_Descr {
* '>' (big), '<' (little), '|'
* (not-applicable), or '=' (native).
*/
- char unused;
- int flags; /* flag describing data type */
+ char flags; /* flags describing data type */
int type_num; /* number representing this type */
int elsize; /* element size for this type */
int alignment; /* alignment needed for this type */
@@ -778,8 +795,9 @@ typedef int (PyArray_FinalizeFunc)(PyArrayObject *, PyObject *);
*/
#define NPY_MIN_BUFSIZE ((int)sizeof(cdouble))
#define NPY_MAX_BUFSIZE (((int)sizeof(cdouble))*1000000)
-#define NPY_BUFSIZE 10000
-/* #define NPY_BUFSIZE 80*/
+#define NPY_BUFSIZE 8192
+/* buffer stress test size: */
+/*#define NPY_BUFSIZE 17*/
#define PyArray_MAX(a,b) (((a)>(b))?(a):(b))
#define PyArray_MIN(a,b) (((a)<(b))?(a):(b))
@@ -807,6 +825,8 @@ typedef int (PyArray_FinalizeFunc)(PyArrayObject *, PyObject *);
#define PyArray_ISWRITEABLE(m) PyArray_CHKFLAGS(m, NPY_WRITEABLE)
#define PyArray_ISALIGNED(m) PyArray_CHKFLAGS(m, NPY_ALIGNED)
+#define PyArray_IS_C_CONTIGUOUS(m) PyArray_CHKFLAGS(m, NPY_C_CONTIGUOUS)
+#define PyArray_IS_F_CONTIGUOUS(m) PyArray_CHKFLAGS(m, NPY_F_CONTIGUOUS)
#if NPY_ALLOW_THREADS
#define NPY_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
@@ -840,6 +860,76 @@ typedef int (PyArray_FinalizeFunc)(PyArrayObject *, PyObject *);
#endif
/*****************************
+ * New iterator object
+ *****************************/
+
+/* The actual structure of the iterator is an internal detail */
+typedef struct NpyIter_InternalOnly NpyIter;
+
+/* Iterator function pointers that may be specialized */
+typedef int (*NpyIter_IterNext_Fn )(NpyIter *iter);
+typedef void (*NpyIter_GetCoords_Fn )(NpyIter *iter,
+ npy_intp *outcoords);
+
+/*** Global flags that may be passed to the iterator constructors ***/
+
+/* Track an index representing C order */
+#define NPY_ITER_C_INDEX 0x00000001
+/* Track an index representing Fortran order */
+#define NPY_ITER_F_INDEX 0x00000002
+/* Track coordinates */
+#define NPY_ITER_COORDS 0x00000004
+/* Let the caller handle the inner loop of iteration */
+#define NPY_ITER_NO_INNER_ITERATION 0x00000008
+/* Convert all the operands to a common data type */
+#define NPY_ITER_COMMON_DTYPE 0x00000010
+/* Operands may hold references, requiring API access during iteration */
+#define NPY_ITER_REFS_OK 0x00000020
+/* Zero-sized operands should be permitted, iteration checks IterSize for 0 */
+#define NPY_ITER_ZEROSIZE_OK 0x00000040
+/* Permits reductions (size-0 stride with dimension size > 1) */
+#define NPY_ITER_REDUCE_OK 0x00000080
+/* Enables sub-range iteration */
+#define NPY_ITER_RANGED 0x00000100
+/* Enables buffering */
+#define NPY_ITER_BUFFERED 0x00000200
+/* When buffering is enabled, grows the inner loop if possible */
+#define NPY_ITER_GROWINNER 0x00000400
+/* Delay allocation of buffers until first Reset* call */
+#define NPY_ITER_DELAY_BUFALLOC 0x00000800
+/* When NPY_KEEPORDER is specified, disable reversing negative-stride axes */
+#define NPY_ITER_DONT_REVERSE_AXES 0x00001000
+
+/*** Per-operand flags that may be passed to the iterator constructors ***/
+
+/* The operand will be read from and written to */
+#define NPY_ITER_READWRITE 0x00010000
+/* The operand will only be read from */
+#define NPY_ITER_READONLY 0x00020000
+/* The operand will only be written to */
+#define NPY_ITER_WRITEONLY 0x00040000
+/* The operand's data must be in native byte order */
+#define NPY_ITER_NBO 0x00080000
+/* The operand's data must be aligned */
+#define NPY_ITER_ALIGNED 0x00100000
+/* The operand's data must be contiguous (within the inner loop) */
+#define NPY_ITER_CONTIG 0x00200000
+/* The operand may be copied to satisfy requirements */
+#define NPY_ITER_COPY 0x00400000
+/* The operand may be copied with UPDATEIFCOPY to satisfy requirements */
+#define NPY_ITER_UPDATEIFCOPY 0x00800000
+/* Allocate the operand if it is NULL */
+#define NPY_ITER_ALLOCATE 0x01000000
+/* If an operand is allocated, don't use any subtype */
+#define NPY_ITER_NO_SUBTYPE 0x02000000
+/* Require that the dimension match the iterator dimensions exactly */
+#define NPY_ITER_NO_BROADCAST 0x08000000
+
+#define NPY_ITER_GLOBAL_FLAGS 0x0000ffff
+#define NPY_ITER_PER_OP_FLAGS 0xffff0000
+
+
+/*****************************
* Basic iterator object
*****************************/
@@ -1232,10 +1322,12 @@ PyArrayNeighborhoodIter_Next2D(PyArrayNeighborhoodIterObject* iter);
#define PyTypeNum_ISINTEGER(type) (((type) >= NPY_BYTE) && \
((type) <= NPY_ULONGLONG))
-#define PyTypeNum_ISFLOAT(type) (((type) >= NPY_HALF) && \
- ((type) <= NPY_LONGDOUBLE))
+#define PyTypeNum_ISFLOAT(type) ((((type) >= NPY_FLOAT) && \
+ ((type) <= NPY_LONGDOUBLE)) || \
+ ((type) == NPY_HALF))
-#define PyTypeNum_ISNUMBER(type) ((type) <= NPY_CLONGDOUBLE)
+#define PyTypeNum_ISNUMBER(type) (((type) <= NPY_CLONGDOUBLE) || \
+ ((type) == NPY_HALF))
#define PyTypeNum_ISSTRING(type) (((type) == NPY_STRING) || \
((type) == NPY_UNICODE))
diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h
index 0fec32183..b6f534425 100644
--- a/numpy/core/include/numpy/ufuncobject.h
+++ b/numpy/core/include/numpy/ufuncobject.h
@@ -11,31 +11,65 @@ typedef void (*PyUFuncGenericFunction) (char **, npy_intp *, npy_intp *, void *)
typedef struct {
PyObject_HEAD
+ /*
+ * nin: Number of inputs
+ * nout: Number of outputs
+ * nargs: Always nin + nout (Why is it stored?)
+ */
int nin, nout, nargs;
+
+ /* Identity for reduction, either PyUFunc_One or PyUFunc_Zero */
int identity;
+
+ /* Array of one-dimensional core loops */
PyUFuncGenericFunction *functions;
+ /* Array of funcdata that gets passed into the functions */
void **data;
+ /* The number of elements in 'functions' and 'data' */
int ntypes;
+
+ /* Does not appear to be used */
int check_return;
- char *name, *types;
+
+ /* The name of the ufunc */
+ char *name;
+
+ /* Array of type numbers, of size ('nargs' * 'ntypes') */
+ char *types;
+
+ /* Documentation string */
char *doc;
+
void *ptr;
PyObject *obj;
PyObject *userloops;
- /* generalized ufunc */
- int core_enabled; /* 0 for scalar ufunc; 1 for generalized ufunc */
- int core_num_dim_ix; /* number of distinct dimension names in
- signature */
+ /* generalized ufunc parameters */
+
+ /* 0 for scalar ufunc; 1 for generalized ufunc */
+ int core_enabled;
+ /* number of distinct dimension names in signature */
+ int core_num_dim_ix;
- /* dimension indices of input/output argument k are stored in
- core_dim_ixs[core_offsets[k]..core_offsets[k]+core_num_dims[k]-1] */
- int *core_num_dims; /* numbers of core dimensions of each argument */
- int *core_dim_ixs; /* dimension indices in a flatted form; indices
- are in the range of [0,core_num_dim_ix) */
- int *core_offsets; /* positions of 1st core dimensions of each
- argument in core_dim_ixs */
- char *core_signature; /* signature string for printing purpose */
+ /*
+ * dimension indices of input/output argument k are stored in
+ * core_dim_ixs[core_offsets[k]..core_offsets[k]+core_num_dims[k]-1]
+ */
+
+ /* numbers of core dimensions of each argument */
+ int *core_num_dims;
+ /*
+     * dimension indices in a flattened form; indices
+ * are in the range of [0,core_num_dim_ix)
+ */
+ int *core_dim_ixs;
+ /*
+ * positions of 1st core dimensions of each
+ * argument in core_dim_ixs
+ */
+ int *core_offsets;
+ /* signature string for printing purpose */
+ char *core_signature;
} PyUFuncObject;
#include "arrayobject.h"
@@ -68,7 +102,8 @@ typedef struct {
#define UFUNC_FPE_UNDERFLOW 4
#define UFUNC_FPE_INVALID 8
-#define UFUNC_ERR_DEFAULT 0 /* Error mode that avoids look-up (no checking) */
+/* Error mode that avoids look-up (no checking) */
+#define UFUNC_ERR_DEFAULT 0
#define UFUNC_OBJ_ISOBJECT 1
#define UFUNC_OBJ_NEEDS_API 2
@@ -79,126 +114,6 @@ typedef struct {
(UFUNC_ERR_PRINT << UFUNC_SHIFT_OVERFLOW) + \
(UFUNC_ERR_PRINT << UFUNC_SHIFT_INVALID)
- /* Only internal -- not exported, yet*/
-typedef struct {
- /* Multi-iterator portion --- needs to be present in this order
- to work with PyArray_Broadcast */
- PyObject_HEAD
- int numiter;
- npy_intp size;
- npy_intp index;
- int nd;
- npy_intp dimensions[NPY_MAXDIMS];
- PyArrayIterObject *iters[NPY_MAXARGS];
- /* End of Multi-iterator portion */
-
- /* The ufunc */
- PyUFuncObject *ufunc;
-
- /* The error handling */
- int errormask; /* Integer showing desired error handling */
- PyObject *errobj; /* currently a tuple with
- (string, func or obj with write method or None)
- */
- int first;
-
- /* Specific function and data to use */
- PyUFuncGenericFunction function;
- void *funcdata;
-
- /* Loop method */
- int meth;
-
- /* Whether we need to copy to a buffer or not.*/
- int needbuffer[NPY_MAXARGS];
- int leftover;
- int ninnerloops;
- int lastdim;
-
- /* Whether or not to swap */
- int swap[NPY_MAXARGS];
-
- /* Buffers for the loop */
- char *buffer[NPY_MAXARGS];
- int bufsize;
- npy_intp bufcnt;
- char *dptr[NPY_MAXARGS];
-
- /* For casting */
- char *castbuf[NPY_MAXARGS];
- PyArray_VectorUnaryFunc *cast[NPY_MAXARGS];
-
- /* usually points to buffer but when a cast is to be
- done it switches for that argument to castbuf.
- */
- char *bufptr[NPY_MAXARGS];
-
- /* Steps filled in from iters or sizeof(item)
- depending on loop method.
- */
- npy_intp steps[NPY_MAXARGS];
-
- int obj; /* This loop uses object arrays or needs the Python API */
- /* Flags: UFUNC_OBJ_ISOBJECT, UFUNC_OBJ_NEEDS_API */
- int notimplemented; /* The loop caused notimplemented */
- int objfunc; /* This loop calls object functions
- (an inner-loop function with argument types */
-
- /* generalized ufunc */
- npy_intp *core_dim_sizes; /* stores sizes of core dimensions;
- contains 1 + core_num_dim_ix elements */
- npy_intp *core_strides; /* strides of loop and core dimensions */
-} PyUFuncLoopObject;
-
-/* Could make this more clever someday */
-#define UFUNC_MAXIDENTITY 32
-
-typedef struct {
- PyObject_HEAD
- PyArrayIterObject *it;
- PyArrayObject *ret;
- PyArrayIterObject *rit; /* Needed for Accumulate */
- int outsize;
- npy_intp index;
- npy_intp size;
- char idptr[UFUNC_MAXIDENTITY];
-
- /* The ufunc */
- PyUFuncObject *ufunc;
-
- /* The error handling */
- int errormask;
- PyObject *errobj;
- int first;
-
- PyUFuncGenericFunction function;
- void *funcdata;
- int meth;
- int swap;
-
- char *buffer;
- int bufsize;
-
- char *castbuf;
- PyArray_VectorUnaryFunc *cast;
-
- char *bufptr[3];
- npy_intp steps[3];
-
- npy_intp N;
- int instrides;
- int insize;
- char *inptr;
-
- /* For copying small arrays */
- PyObject *decref;
-
- int obj;
- int retbase;
-
-} PyUFuncReduceObject;
-
-
#if NPY_ALLOW_THREADS
#define NPY_LOOP_BEGIN_THREADS do {if (!(loop->obj & UFUNC_OBJ_NEEDS_API)) _save = PyEval_SaveThread();} while (0)
#define NPY_LOOP_END_THREADS do {if (!(loop->obj & UFUNC_OBJ_NEEDS_API)) PyEval_RestoreThread(_save);} while (0)
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index dbad73cfe..fc935b4cb 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -1,12 +1,12 @@
-__all__ = ['newaxis', 'ndarray', 'flatiter', 'ufunc',
- 'arange', 'array', 'zeros', 'empty', 'broadcast', 'dtype',
- 'fromstring', 'fromfile', 'frombuffer',
+__all__ = ['newaxis', 'ndarray', 'flatiter', 'newiter', 'nested_iters', 'ufunc',
+ 'arange', 'array', 'zeros', 'count_nonzero', 'empty', 'broadcast',
+ 'dtype', 'fromstring', 'fromfile', 'frombuffer',
'int_asbuffer', 'where', 'argwhere',
- 'concatenate', 'fastCopyAndTranspose', 'lexsort',
- 'set_numeric_ops', 'can_cast',
+ 'concatenate', 'fastCopyAndTranspose', 'lexsort', 'set_numeric_ops',
+ 'can_cast', 'promote_types', 'min_scalar_type', 'result_type',
'asarray', 'asanyarray', 'ascontiguousarray', 'asfortranarray',
'isfortran', 'empty_like', 'zeros_like',
- 'correlate', 'convolve', 'inner', 'dot', 'outer', 'vdot',
+ 'correlate', 'convolve', 'inner', 'dot', 'einsum', 'outer', 'vdot',
'alterdot', 'restoredot', 'roll', 'rollaxis', 'cross', 'tensordot',
'array2string', 'get_printoptions', 'set_printoptions',
'array_repr', 'array_str', 'set_string_function',
@@ -54,23 +54,32 @@ BUFSIZE = multiarray.BUFSIZE
ndarray = multiarray.ndarray
flatiter = multiarray.flatiter
+newiter = multiarray.newiter
+nested_iters = multiarray.nested_iters
broadcast = multiarray.broadcast
dtype = multiarray.dtype
ufunc = type(sin)
# originally from Fernando Perez's IPython
-def zeros_like(a):
+def zeros_like(a, dtype=None, order='K'):
"""
Return an array of zeros with the same shape and type as a given array.
- Equivalent to ``a.copy().fill(0)``.
+    With default parameters, this is equivalent to ``a.copy().fill(0)``.
Parameters
----------
a : array_like
The shape and data-type of `a` define these same attributes of
the returned array.
+ dtype : data-type, optional
+ Overrides the data type of the result.
+ order : {'C', 'F', 'A', or 'K'}, optional
+ Overrides the memory layout of the result. 'C' means C-order,
+ 'F' means F-order, 'A' means 'F' if ``a`` is Fortran contiguous,
+ 'C' otherwise. 'K' means match the layout of ``a`` as closely
+ as possible.
Returns
-------
@@ -103,72 +112,8 @@ def zeros_like(a):
array([ 0., 0., 0.])
"""
- if isinstance(a, ndarray):
- res = ndarray.__new__(type(a), a.shape, a.dtype, order=a.flags.fnc)
- res.fill(0)
- return res
- try:
- wrap = a.__array_wrap__
- except AttributeError:
- wrap = None
- a = asarray(a)
- res = zeros(a.shape, a.dtype)
- if wrap:
- res = wrap(res)
- return res
-
-def empty_like(a):
- """
- Return a new array with the same shape and type as a given array.
-
- Parameters
- ----------
- a : array_like
- The shape and data-type of `a` define these same attributes of the
- returned array.
-
- Returns
- -------
- out : ndarray
- Array of random data with the same shape and type as `a`.
-
- See Also
- --------
- ones_like : Return an array of ones with shape and type of input.
- zeros_like : Return an array of zeros with shape and type of input.
- empty : Return a new uninitialized array.
- ones : Return a new array setting values to one.
- zeros : Return a new array setting values to zero.
-
- Notes
- -----
- This function does *not* initialize the returned array; to do that use
- `zeros_like` or `ones_like` instead. It may be marginally faster than
- the functions that do set the array values.
-
- Examples
- --------
- >>> a = ([1,2,3], [4,5,6]) # a is array-like
- >>> np.empty_like(a)
- array([[-1073741821, -1073741821, 3], #random
- [ 0, 0, -1073741821]])
- >>> a = np.array([[1., 2., 3.],[4.,5.,6.]])
- >>> np.empty_like(a)
- array([[ -2.00000715e+000, 1.48219694e-323, -2.00000572e+000],#random
- [ 4.38791518e-305, -2.00000715e+000, 4.17269252e-309]])
-
- """
- if isinstance(a, ndarray):
- res = ndarray.__new__(type(a), a.shape, a.dtype, order=a.flags.fnc)
- return res
- try:
- wrap = a.__array_wrap__
- except AttributeError:
- wrap = None
- a = asarray(a)
- res = empty(a.shape, a.dtype)
- if wrap:
- res = wrap(res)
+ res = empty_like(a, dtype=dtype, order=order)
+ res.fill(0)
return res
# end Fernando's utilities
@@ -195,7 +140,9 @@ newaxis = None
arange = multiarray.arange
array = multiarray.array
zeros = multiarray.zeros
+count_nonzero = multiarray.count_nonzero
empty = multiarray.empty
+empty_like = multiarray.empty_like
fromstring = multiarray.fromstring
fromiter = multiarray.fromiter
fromfile = multiarray.fromfile
@@ -209,9 +156,13 @@ concatenate = multiarray.concatenate
fastCopyAndTranspose = multiarray._fastCopyAndTranspose
set_numeric_ops = multiarray.set_numeric_ops
can_cast = multiarray.can_cast
+promote_types = multiarray.promote_types
+min_scalar_type = multiarray.min_scalar_type
+result_type = multiarray.result_type
lexsort = multiarray.lexsort
compare_chararrays = multiarray.compare_chararrays
putmask = multiarray.putmask
+einsum = multiarray.einsum
def asarray(a, dtype=None, order=None):
"""
@@ -810,6 +761,10 @@ def outer(a,b):
out : ndarray, shape (M, N)
``out[i, j] = a[i] * b[j]``
+    See Also
+ --------
+ numpy.inner, numpy.einsum
+
References
----------
.. [1] : G. H. Golub and C. F. van Loan, *Matrix Computations*, 3rd
@@ -900,7 +855,7 @@ def tensordot(a, b, axes=2):
See Also
--------
- numpy.dot
+ numpy.dot, numpy.einsum
Notes
-----
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index c6ba3880a..cd726d09c 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -600,7 +600,10 @@ def configuration(parent_package='',top_path=None):
subpath = join('src', 'multiarray')
sources = [join(local_dir, subpath, 'scalartypes.c.src'),
- join(local_dir, subpath, 'arraytypes.c.src')]
+ join(local_dir, subpath, 'arraytypes.c.src'),
+ join(local_dir, subpath, 'new_iterator.c.src'),
+ join(local_dir, subpath, 'lowlevel_strided_loops.c.src'),
+ join(local_dir, subpath, 'einsum.c.src')]
# numpy.distutils generate .c from .c.src in weird directories, we have
# to add them there as they depend on the build_dir
@@ -735,7 +738,8 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'sequence.h'),
join('src', 'multiarray', 'shape.h'),
join('src', 'multiarray', 'ucsnarrow.h'),
- join('src', 'multiarray', 'usertypes.h')]
+ join('src', 'multiarray', 'usertypes.h'),
+ join('src', 'multiarray', 'lowlevel_strided_loops.h')]
multiarray_src = [join('src', 'multiarray', 'multiarraymodule.c'),
join('src', 'multiarray', 'hashdescr.c'),
@@ -764,7 +768,12 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'scalarapi.c'),
join('src', 'multiarray', 'refcount.c'),
join('src', 'multiarray', 'arraytypes.c.src'),
- join('src', 'multiarray', 'scalartypes.c.src')]
+ join('src', 'multiarray', 'scalartypes.c.src'),
+ join('src', 'multiarray', 'new_iterator.c.src'),
+ join('src', 'multiarray', 'lowlevel_strided_loops.c.src'),
+ join('src', 'multiarray', 'dtype_transfer.c'),
+ join('src', 'multiarray', 'new_iterator_pywrap.c'),
+ join('src', 'multiarray', 'einsum.c.src')]
if PYTHON_HAS_UNICODE_WIDE:
multiarray_src.append(join('src', 'multiarray', 'ucsnarrow.c'))
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 2423ab0ef..0a4f070f3 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -21,7 +21,7 @@ from distutils.ccompiler import CompileError
# Binary compatibility version number. This number is increased whenever the
# C-API is changed such that binary compatibility is broken, i.e. whenever a
# recompile of extension modules is needed.
-C_ABI_VERSION = 0x02000000
+C_ABI_VERSION = 0x01000009
# Minor API version. This number is increased whenever a change is made to the
# C-API -- whether it breaks binary compatibility or not. Some changes, such
diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c
index 03760d3fa..e72ce534a 100644
--- a/numpy/core/src/multiarray/arrayobject.c
+++ b/numpy/core/src/multiarray/arrayobject.c
@@ -53,7 +53,7 @@ maintainer email: oliphant.travis@ieee.org
/*NUMPY_API
Compute the size of an array (in number of items)
*/
-NPY_NO_EXPORT intp
+NPY_NO_EXPORT npy_intp
PyArray_Size(PyObject *op)
{
if (PyArray_Check(op)) {
@@ -78,7 +78,7 @@ PyArray_CopyObject(PyArrayObject *dest, PyObject *src_object)
*/
if (dest->descr->type == PyArray_CHARLTR && dest->nd > 0 \
&& PyString_Check(src_object)) {
- intp n_new, n_old;
+ npy_intp n_new, n_old;
char *new_string;
PyObject *tmp;
@@ -216,12 +216,12 @@ array_dealloc(PyArrayObject *self) {
static int
dump_data(char **string, int *n, int *max_n, char *data, int nd,
- intp *dimensions, intp *strides, PyArrayObject* self)
+ npy_intp *dimensions, npy_intp *strides, PyArrayObject* self)
{
PyArray_Descr *descr=self->descr;
PyObject *op, *sp;
char *ostring;
- intp i, N;
+ npy_intp i, N;
#define CHECK_MEMORY do { if (*n >= *max_n-16) { \
*max_n *= 2; \
@@ -444,15 +444,15 @@ _myunincmp(PyArray_UCS4 *s1, PyArray_UCS4 *s2, int len1, int len2)
PyArray_UCS4 *sptr;
PyArray_UCS4 *s1t=s1, *s2t=s2;
int val;
- intp size;
+ npy_intp size;
int diff;
- if ((intp)s1 % sizeof(PyArray_UCS4) != 0) {
+ if ((npy_intp)s1 % sizeof(PyArray_UCS4) != 0) {
size = len1*sizeof(PyArray_UCS4);
s1t = malloc(size);
memcpy(s1t, s1, size);
}
- if ((intp)s2 % sizeof(PyArray_UCS4) != 0) {
+ if ((npy_intp)s2 % sizeof(PyArray_UCS4) != 0) {
size = len2*sizeof(PyArray_UCS4);
s2t = malloc(size);
memcpy(s2t, s2, size);
@@ -658,7 +658,7 @@ _compare_strings(PyObject *result, PyArrayMultiIterObject *multi,
{
PyArrayIterObject *iself, *iother;
Bool *dptr;
- intp size;
+ npy_intp size;
int val;
int N1, N2;
int (*cmpfunc)(void *, void *, int, int);
@@ -827,7 +827,7 @@ _void_compare(PyArrayObject *self, PyArrayObject *other, int cmp_op)
PyObject *key, *value, *temp2;
PyObject *op;
Py_ssize_t pos = 0;
- intp result_ndim = PyArray_NDIM(self) > PyArray_NDIM(other) ?
+ npy_intp result_ndim = PyArray_NDIM(self) > PyArray_NDIM(other) ?
PyArray_NDIM(self) : PyArray_NDIM(other);
op = (cmp_op == Py_EQ ? n_ops.logical_and : n_ops.logical_or);
@@ -863,7 +863,7 @@ _void_compare(PyArrayObject *self, PyArrayObject *other, int cmp_op)
/* If the type was multidimensional, collapse that part to 1-D
*/
if (PyArray_NDIM(temp) != result_ndim+1) {
- intp dimensions[NPY_MAXDIMS];
+ npy_intp dimensions[NPY_MAXDIMS];
PyArray_Dims newdims;
newdims.ptr = dimensions;
@@ -871,7 +871,8 @@ _void_compare(PyArrayObject *self, PyArrayObject *other, int cmp_op)
memcpy(dimensions, PyArray_DIMS(temp),
sizeof(intp)*result_ndim);
dimensions[result_ndim] = -1;
- temp2 = PyArray_Newshape(temp, &newdims, PyArray_ANYORDER);
+ temp2 = PyArray_Newshape((PyArrayObject *)temp,
+ &newdims, PyArray_ANYORDER);
if (temp2 == NULL) {
Py_DECREF(temp);
Py_XDECREF(res);
@@ -881,7 +882,8 @@ _void_compare(PyArrayObject *self, PyArrayObject *other, int cmp_op)
temp = temp2;
}
/* Reduce the extra dimension of `temp` using `op` */
- temp2 = PyArray_GenericReduceFunction(temp, op, result_ndim,
+ temp2 = PyArray_GenericReduceFunction((PyArrayObject *)temp,
+ op, result_ndim,
PyArray_BOOL, NULL);
if (temp2 == NULL) {
Py_DECREF(temp);
@@ -1099,7 +1101,7 @@ PyArray_ElementStrides(PyObject *arr)
{
int itemsize = PyArray_ITEMSIZE(arr);
int i, N = PyArray_NDIM(arr);
- intp *strides = PyArray_STRIDES(arr);
+ npy_intp *strides = PyArray_STRIDES(arr);
for (i = 0; i < N; i++) {
if ((strides[i] % itemsize) != 0) {
@@ -1128,13 +1130,13 @@ PyArray_ElementStrides(PyObject *arr)
/*NUMPY_API*/
NPY_NO_EXPORT Bool
-PyArray_CheckStrides(int elsize, int nd, intp numbytes, intp offset,
- intp *dims, intp *newstrides)
+PyArray_CheckStrides(int elsize, int nd, npy_intp numbytes, npy_intp offset,
+ npy_intp *dims, npy_intp *newstrides)
{
int i;
- intp byte_begin;
- intp begin;
- intp end;
+ npy_intp byte_begin;
+ npy_intp begin;
+ npy_intp end;
if (numbytes == 0) {
numbytes = PyArray_MultiplyList(dims, nd) * elsize;
@@ -1201,7 +1203,7 @@ array_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds)
}
if (strides.ptr != NULL) {
- intp nb, off;
+ npy_intp nb, off;
if (strides.len != dims.len) {
PyErr_SetString(PyExc_ValueError,
"strides, if given, must be " \
@@ -1215,7 +1217,7 @@ array_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds)
}
else {
nb = buffer.len;
- off = (intp) offset;
+ off = (npy_intp) offset;
}
@@ -1252,10 +1254,10 @@ array_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds)
else {
/* buffer given -- use it */
if (dims.len == 1 && dims.ptr[0] == -1) {
- dims.ptr[0] = (buffer.len-(intp)offset) / itemsize;
+ dims.ptr[0] = (buffer.len-(npy_intp)offset) / itemsize;
}
else if ((strides.ptr == NULL) &&
- (buffer.len < (offset + (((intp)itemsize)*
+ (buffer.len < (offset + (((npy_intp)itemsize)*
PyArray_MultiplyList(dims.ptr,
dims.len))))) {
PyErr_SetString(PyExc_TypeError,
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index de84911c1..fef53e19d 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -439,15 +439,11 @@ STRING_setitem(PyObject *op, char *ov, PyArrayObject *ap)
PyObject *temp = NULL;
if (!PyBytes_Check(op) && !PyUnicode_Check(op)
- && PySequence_Check(op) && PySequence_Size(op) > 0) {
+ && PySequence_Check(op) && PySequence_Size(op) != 0) {
PyErr_SetString(PyExc_ValueError,
- "setting an array element with a sequence");
+ "cannot set an array element with a sequence");
return -1;
}
- /* Sequence_Size might have returned an error */
- if (PyErr_Occurred()) {
- PyErr_Clear();
- }
#if defined(NPY_PY3K)
if (PyUnicode_Check(op)) {
/* Assume ASCII codec -- function similarly as Python 2 */
@@ -501,28 +497,15 @@ STRING_setitem(PyObject *op, char *ov, PyArrayObject *ap)
static PyObject *
OBJECT_getitem(char *ip, PyArrayObject *ap)
{
- /* TODO: We might be able to get away with just the "else" clause now */
- if (!ap || PyArray_ISALIGNED(ap)) {
- if (*(PyObject **)ip == NULL) {
- Py_INCREF(Py_None);
- return Py_None;
- }
- else {
- Py_INCREF(*(PyObject **)ip);
- return *(PyObject **)ip;
- }
+ PyObject *obj;
+ NPY_COPY_PYOBJECT_PTR(&obj, ip);
+ if (obj == NULL) {
+ Py_INCREF(Py_None);
+ return Py_None;
}
else {
- PyObject *obj;
- NPY_COPY_PYOBJECT_PTR(&obj, ip);
- if (obj == NULL) {
- Py_INCREF(Py_None);
- return Py_None;
- }
- else {
- Py_INCREF(obj);
- return obj;
- }
+ Py_INCREF(obj);
+ return obj;
}
}
@@ -530,18 +513,14 @@ OBJECT_getitem(char *ip, PyArrayObject *ap)
static int
OBJECT_setitem(PyObject *op, char *ov, PyArrayObject *ap)
{
+ PyObject *obj;
+
+ NPY_COPY_PYOBJECT_PTR(&obj, ov);
+ Py_XDECREF(obj);
+
Py_INCREF(op);
- /* TODO: We might be able to get away with just the "else" clause now */
- if (!ap || PyArray_ISALIGNED(ap)) {
- Py_XDECREF(*(PyObject **)ov);
- *(PyObject **)ov = op;
- }
- else {
- PyObject *obj;
- NPY_COPY_PYOBJECT_PTR(&obj, ov);
- Py_XDECREF(obj);
- NPY_COPY_PYOBJECT_PTR(ov, &op);
- }
+ NPY_COPY_PYOBJECT_PTR(ov, &op);
+
return PyErr_Occurred() ? -1 : 0;
}
@@ -3622,6 +3601,29 @@ static int
* #endian = |, |, =#
*/
static PyArray_ArrFuncs _Py@NAME@_ArrFuncs = {
+ {
+ (PyArray_VectorUnaryFunc*)@from@_to_BOOL,
+ (PyArray_VectorUnaryFunc*)@from@_to_BYTE,
+ (PyArray_VectorUnaryFunc*)@from@_to_UBYTE,
+ (PyArray_VectorUnaryFunc*)@from@_to_SHORT,
+ (PyArray_VectorUnaryFunc*)@from@_to_USHORT,
+ (PyArray_VectorUnaryFunc*)@from@_to_INT,
+ (PyArray_VectorUnaryFunc*)@from@_to_UINT,
+ (PyArray_VectorUnaryFunc*)@from@_to_LONG,
+ (PyArray_VectorUnaryFunc*)@from@_to_ULONG,
+ (PyArray_VectorUnaryFunc*)@from@_to_LONGLONG,
+ (PyArray_VectorUnaryFunc*)@from@_to_ULONGLONG,
+ (PyArray_VectorUnaryFunc*)@from@_to_FLOAT,
+ (PyArray_VectorUnaryFunc*)@from@_to_DOUBLE,
+ (PyArray_VectorUnaryFunc*)@from@_to_LONGDOUBLE,
+ (PyArray_VectorUnaryFunc*)@from@_to_CFLOAT,
+ (PyArray_VectorUnaryFunc*)@from@_to_CDOUBLE,
+ (PyArray_VectorUnaryFunc*)@from@_to_CLONGDOUBLE,
+ (PyArray_VectorUnaryFunc*)@from@_to_OBJECT,
+ (PyArray_VectorUnaryFunc*)@from@_to_STRING,
+ (PyArray_VectorUnaryFunc*)@from@_to_UNICODE,
+ (PyArray_VectorUnaryFunc*)@from@_to_VOID
+ },
(PyArray_GetItemFunc*)@from@_getitem,
(PyArray_SetItemFunc*)@from@_setitem,
(PyArray_CopySwapNFunc*)@from@_copyswapn,
@@ -3646,34 +3648,7 @@ static PyArray_ArrFuncs _Py@NAME@_ArrFuncs = {
NULL,
(PyArray_FastClipFunc *)NULL,
(PyArray_FastPutmaskFunc *)NULL,
- (PyArray_FastTakeFunc *)NULL,
- NULL, NULL, NULL, NULL,
- {
- (PyArray_VectorUnaryFunc*)@from@_to_BOOL,
- (PyArray_VectorUnaryFunc*)@from@_to_BYTE,
- (PyArray_VectorUnaryFunc*)@from@_to_UBYTE,
- (PyArray_VectorUnaryFunc*)@from@_to_SHORT,
- (PyArray_VectorUnaryFunc*)@from@_to_USHORT,
- (PyArray_VectorUnaryFunc*)@from@_to_INT,
- (PyArray_VectorUnaryFunc*)@from@_to_UINT,
- (PyArray_VectorUnaryFunc*)@from@_to_LONG,
- (PyArray_VectorUnaryFunc*)@from@_to_ULONG,
- (PyArray_VectorUnaryFunc*)@from@_to_LONGLONG,
- (PyArray_VectorUnaryFunc*)@from@_to_ULONGLONG,
- (PyArray_VectorUnaryFunc*)@from@_to_HALF,
- (PyArray_VectorUnaryFunc*)@from@_to_FLOAT,
- (PyArray_VectorUnaryFunc*)@from@_to_DOUBLE,
- (PyArray_VectorUnaryFunc*)@from@_to_LONGDOUBLE,
- (PyArray_VectorUnaryFunc*)@from@_to_CFLOAT,
- (PyArray_VectorUnaryFunc*)@from@_to_CDOUBLE,
- (PyArray_VectorUnaryFunc*)@from@_to_CLONGDOUBLE,
- (PyArray_VectorUnaryFunc*)@from@_to_DATETIME,
- (PyArray_VectorUnaryFunc*)@from@_to_TIMEDELTA,
- (PyArray_VectorUnaryFunc*)@from@_to_OBJECT,
- (PyArray_VectorUnaryFunc*)@from@_to_STRING,
- (PyArray_VectorUnaryFunc*)@from@_to_UNICODE,
- (PyArray_VectorUnaryFunc*)@from@_to_VOID
- }
+ (PyArray_FastTakeFunc *)NULL
};
/*
@@ -3686,7 +3661,6 @@ static PyArray_Descr @from@_Descr = {
PyArray_@from@LTR,
'@endian@',
0,
- 0,
PyArray_@from@,
0,
_ALIGN(@align@),
@@ -3718,6 +3692,29 @@ static PyArray_Descr @from@_Descr = {
* #isobject= 0*18,NPY_OBJECT_DTYPE_FLAGS,0*2#
*/
static PyArray_ArrFuncs _Py@NAME@_ArrFuncs = {
+ {
+ (PyArray_VectorUnaryFunc*)@from@_to_BOOL,
+ (PyArray_VectorUnaryFunc*)@from@_to_BYTE,
+ (PyArray_VectorUnaryFunc*)@from@_to_UBYTE,
+ (PyArray_VectorUnaryFunc*)@from@_to_SHORT,
+ (PyArray_VectorUnaryFunc*)@from@_to_USHORT,
+ (PyArray_VectorUnaryFunc*)@from@_to_INT,
+ (PyArray_VectorUnaryFunc*)@from@_to_UINT,
+ (PyArray_VectorUnaryFunc*)@from@_to_LONG,
+ (PyArray_VectorUnaryFunc*)@from@_to_ULONG,
+ (PyArray_VectorUnaryFunc*)@from@_to_LONGLONG,
+ (PyArray_VectorUnaryFunc*)@from@_to_ULONGLONG,
+ (PyArray_VectorUnaryFunc*)@from@_to_FLOAT,
+ (PyArray_VectorUnaryFunc*)@from@_to_DOUBLE,
+ (PyArray_VectorUnaryFunc*)@from@_to_LONGDOUBLE,
+ (PyArray_VectorUnaryFunc*)@from@_to_CFLOAT,
+ (PyArray_VectorUnaryFunc*)@from@_to_CDOUBLE,
+ (PyArray_VectorUnaryFunc*)@from@_to_CLONGDOUBLE,
+ (PyArray_VectorUnaryFunc*)@from@_to_OBJECT,
+ (PyArray_VectorUnaryFunc*)@from@_to_STRING,
+ (PyArray_VectorUnaryFunc*)@from@_to_UNICODE,
+ (PyArray_VectorUnaryFunc*)@from@_to_VOID
+ },
(PyArray_GetItemFunc*)@from@_getitem,
(PyArray_SetItemFunc*)@from@_setitem,
(PyArray_CopySwapNFunc*)@from@_copyswapn,
@@ -3742,34 +3739,7 @@ static PyArray_ArrFuncs _Py@NAME@_ArrFuncs = {
NULL,
(PyArray_FastClipFunc*)@from@_fastclip,
(PyArray_FastPutmaskFunc*)@from@_fastputmask,
- (PyArray_FastTakeFunc*)@from@_fasttake,
- NULL, NULL, NULL, NULL,
- {
- (PyArray_VectorUnaryFunc*)@from@_to_BOOL,
- (PyArray_VectorUnaryFunc*)@from@_to_BYTE,
- (PyArray_VectorUnaryFunc*)@from@_to_UBYTE,
- (PyArray_VectorUnaryFunc*)@from@_to_SHORT,
- (PyArray_VectorUnaryFunc*)@from@_to_USHORT,
- (PyArray_VectorUnaryFunc*)@from@_to_INT,
- (PyArray_VectorUnaryFunc*)@from@_to_UINT,
- (PyArray_VectorUnaryFunc*)@from@_to_LONG,
- (PyArray_VectorUnaryFunc*)@from@_to_ULONG,
- (PyArray_VectorUnaryFunc*)@from@_to_LONGLONG,
- (PyArray_VectorUnaryFunc*)@from@_to_ULONGLONG,
- (PyArray_VectorUnaryFunc*)@from@_to_HALF,
- (PyArray_VectorUnaryFunc*)@from@_to_FLOAT,
- (PyArray_VectorUnaryFunc*)@from@_to_DOUBLE,
- (PyArray_VectorUnaryFunc*)@from@_to_LONGDOUBLE,
- (PyArray_VectorUnaryFunc*)@from@_to_CFLOAT,
- (PyArray_VectorUnaryFunc*)@from@_to_CDOUBLE,
- (PyArray_VectorUnaryFunc*)@from@_to_CLONGDOUBLE,
- (PyArray_VectorUnaryFunc*)@from@_to_DATETIME,
- (PyArray_VectorUnaryFunc*)@from@_to_TIMEDELTA,
- (PyArray_VectorUnaryFunc*)@from@_to_OBJECT,
- (PyArray_VectorUnaryFunc*)@from@_to_STRING,
- (PyArray_VectorUnaryFunc*)@from@_to_UNICODE,
- (PyArray_VectorUnaryFunc*)@from@_to_VOID
- }
+ (PyArray_FastTakeFunc*)@from@_fasttake
};
/*
@@ -3781,7 +3751,6 @@ NPY_NO_EXPORT PyArray_Descr @from@_Descr = {
PyArray_@kind@LTR,
PyArray_@from@LTR,
'@endian@',
- 0,
@isobject@,
PyArray_@from@,
@num@*sizeof(@fromtyp@),
@@ -3833,19 +3802,19 @@ static PyArray_Descr *_builtin_descrs[] = {
&ULONG_Descr,
&LONGLONG_Descr,
&ULONGLONG_Descr,
- &HALF_Descr,
&FLOAT_Descr,
&DOUBLE_Descr,
&LONGDOUBLE_Descr,
&CFLOAT_Descr,
&CDOUBLE_Descr,
&CLONGDOUBLE_Descr,
- &DATETIME_Descr,
- &TIMEDELTA_Descr,
&OBJECT_Descr,
&STRING_Descr,
&UNICODE_Descr,
&VOID_Descr,
+ &DATETIME_Descr,
+ &TIMEDELTA_Descr,
+ &HALF_Descr
};
/*NUMPY_API
@@ -3923,8 +3892,49 @@ set_typeinfo(PyObject *dict)
PyObject *infodict, *s;
int i;
+ PyArray_Descr *dtype;
+ PyObject *cobj, *key;
+
+ /* Add cast functions for the new types */
+/**begin repeat
+ *
+ * #name1 = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE, OBJECT, STRING, UNICODE, VOID,
+ * DATETIME,TIMEDELTA#
+ */
+/**begin repeat1
+ *
+ * #name2 = HALF, DATETIME, TIMEDELTA#
+ */
+ dtype = _builtin_descrs[NPY_@name1@];
+ if (dtype->f->castdict == NULL) {
+ dtype->f->castdict = PyDict_New();
+ if (dtype->f->castdict == NULL) {
+ return -1;
+ }
+ }
+ key = PyInt_FromLong(NPY_@name2@);
+ if (key == NULL) {
+ return -1;
+ }
+ cobj = NpyCapsule_FromVoidPtr((void *)@name1@_to_@name2@, NULL);
+ if (cobj == NULL) {
+ Py_DECREF(key);
+ return -1;
+ }
+ if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+ Py_DECREF(key);
+ Py_DECREF(cobj);
+ return -1;
+ }
+ Py_DECREF(key);
+ Py_DECREF(cobj);
+/**end repeat1**/
+/**end repeat**/
+
for (i = 0; i < _MAX_LETTER; i++) {
- _letter_to_num[i] = PyArray_NTYPES;
+ _letter_to_num[i] = NPY_NTYPES;
}
/**begin repeat
diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c
index 08e2687a0..16164011f 100644
--- a/numpy/core/src/multiarray/buffer.c
+++ b/numpy/core/src/multiarray/buffer.c
@@ -768,7 +768,7 @@ _descriptor_from_pep3118_format(char *s)
Py_DECREF(str);
return NULL;
}
- descr = (PyArray_Descr*)PyObject_CallMethod(
+ descr = PyObject_CallMethod(
_numpy_internal, "_dtype_from_pep3118", "O", str);
Py_DECREF(str);
Py_DECREF(_numpy_internal);
diff --git a/numpy/core/src/multiarray/calculation.c b/numpy/core/src/multiarray/calculation.c
index bc078f097..e99a86266 100644
--- a/numpy/core/src/multiarray/calculation.c
+++ b/numpy/core/src/multiarray/calculation.c
@@ -825,7 +825,7 @@ PyArray_Clip(PyArrayObject *self, PyObject *min, PyObject *max, PyArrayObject *o
*/
if (PyArray_ScalarKind(newdescr->type_num, NULL) >
PyArray_ScalarKind(self->descr->type_num, NULL)) {
- indescr = _array_small_type(newdescr, self->descr);
+ indescr = PyArray_PromoteTypes(newdescr, self->descr);
func = indescr->f->fastclip;
if (func == NULL) {
return _slow_array_clip(self, min, max, out);
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index 33d7f719c..28846462d 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -13,79 +13,6 @@
#include "common.h"
#include "buffer.h"
-/*
- * new reference
- * doesn't alter refcount of chktype or mintype ---
- * unless one of them is returned
- */
-NPY_NO_EXPORT PyArray_Descr *
-_array_small_type(PyArray_Descr *chktype, PyArray_Descr* mintype)
-{
- PyArray_Descr *outtype;
- int outtype_num, save_num;
-
- if (PyArray_EquivTypes(chktype, mintype)) {
- Py_INCREF(mintype);
- return mintype;
- }
-
-
- if (chktype->type_num > mintype->type_num) {
- outtype_num = chktype->type_num;
- }
- else {
- if (PyDataType_ISOBJECT(chktype) &&
- PyDataType_ISSTRING(mintype)) {
- return PyArray_DescrFromType(NPY_OBJECT);
- }
- else {
- outtype_num = mintype->type_num;
- }
- }
-
- save_num = outtype_num;
- while (outtype_num < PyArray_NTYPES &&
- !(PyArray_CanCastSafely(chktype->type_num, outtype_num)
- && PyArray_CanCastSafely(mintype->type_num, outtype_num))) {
- outtype_num++;
- }
- if (outtype_num == PyArray_NTYPES) {
- outtype = PyArray_DescrFromType(save_num);
- }
- else {
- outtype = PyArray_DescrFromType(outtype_num);
- }
- if (PyTypeNum_ISEXTENDED(outtype->type_num)) {
- int testsize = outtype->elsize;
- int chksize, minsize;
- chksize = chktype->elsize;
- minsize = mintype->elsize;
- /*
- * Handle string->unicode case separately
- * because string itemsize is 4* as large
- */
- if (outtype->type_num == PyArray_UNICODE &&
- mintype->type_num == PyArray_STRING) {
- testsize = MAX(chksize, 4*minsize);
- }
- else if (chktype->type_num == PyArray_STRING &&
- mintype->type_num == PyArray_UNICODE) {
- testsize = MAX(chksize*4, minsize);
- }
- else {
- testsize = MAX(chksize, minsize);
- }
- if (testsize != outtype->elsize) {
- PyArray_DESCR_REPLACE(outtype);
- outtype->elsize = testsize;
- Py_XDECREF(outtype->fields);
- outtype->fields = NULL;
- Py_XDECREF(outtype->names);
- outtype->names = NULL;
- }
- }
- return outtype;
-}
NPY_NO_EXPORT PyArray_Descr *
_array_find_python_scalar_type(PyObject *op)
@@ -312,6 +239,9 @@ _array_find_type(PyObject *op, PyArray_Descr *minitype, int max)
if (l == 0 && minitype->type_num == PyArray_BOOL) {
Py_DECREF(minitype);
minitype = PyArray_DescrFromType(PyArray_DEFAULT);
+ if (minitype == NULL) {
+ return NULL;
+ }
}
while (--l >= 0) {
PyArray_Descr *newtype;
@@ -321,7 +251,11 @@ _array_find_type(PyObject *op, PyArray_Descr *minitype, int max)
goto deflt;
}
chktype = _array_find_type(ip, minitype, max-1);
- newtype = _array_small_type(chktype, minitype);
+ if (chktype == NULL) {
+ Py_DECREF(minitype);
+ return NULL;
+ }
+ newtype = PyArray_PromoteTypes(chktype, minitype);
Py_DECREF(minitype);
minitype = newtype;
Py_DECREF(chktype);
@@ -337,9 +271,12 @@ _array_find_type(PyObject *op, PyArray_Descr *minitype, int max)
chktype = _use_default_type(op);
finish:
- outtype = _array_small_type(chktype, minitype);
+ outtype = PyArray_PromoteTypes(chktype, minitype);
Py_DECREF(chktype);
Py_DECREF(minitype);
+ if (outtype == NULL) {
+ return NULL;
+ }
/*
* VOID Arrays should not occur by "default"
* unless input was already a VOID
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 67ee9b66f..8242a0d18 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -7,9 +7,6 @@ NPY_NO_EXPORT PyArray_Descr *
_array_find_type(PyObject *op, PyArray_Descr *minitype, int max);
NPY_NO_EXPORT PyArray_Descr *
-_array_small_type(PyArray_Descr *chktype, PyArray_Descr* mintype);
-
-NPY_NO_EXPORT PyArray_Descr *
_array_find_python_scalar_type(PyObject *op);
NPY_NO_EXPORT PyArray_Descr *
diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c
index 4ad2e9f51..b6214f38e 100644
--- a/numpy/core/src/multiarray/conversion_utils.c
+++ b/numpy/core/src/multiarray/conversion_utils.c
@@ -115,7 +115,7 @@ PyArray_IntpConverter(PyObject *obj, PyArray_Dims *seq)
}
}
seq->len = len;
- nd = PyArray_IntpFromSequence(obj, (intp *)seq->ptr, len);
+ nd = PyArray_IntpFromSequence(obj, (npy_intp *)seq->ptr, len);
if (nd == -1 || nd != len) {
PyDimMem_FREE(seq->ptr);
seq->ptr = NULL;
@@ -155,7 +155,7 @@ PyArray_BufferConverter(PyObject *obj, PyArray_Chunk *buf)
return PY_FAIL;
}
}
- buf->len = (intp) buflen;
+ buf->len = (npy_intp) buflen;
/* Point to the base of the buffer object if present */
#if defined(NPY_PY3K)
@@ -441,7 +441,7 @@ PyArray_PyIntAsInt(PyObject *o)
}
/*NUMPY_API*/
-NPY_NO_EXPORT intp
+NPY_NO_EXPORT npy_intp
PyArray_PyIntAsIntp(PyObject *o)
{
longlong long_value = -1;
@@ -449,7 +449,7 @@ PyArray_PyIntAsIntp(PyObject *o)
static char *msg = "an integer is required";
PyObject *arr;
PyArray_Descr *descr;
- intp ret;
+ npy_intp ret;
if (!o) {
PyErr_SetString(PyExc_TypeError, msg);
@@ -485,7 +485,7 @@ PyArray_PyIntAsIntp(PyObject *o)
arr = PyArray_FromScalar(o, descr);
}
if (arr != NULL) {
- ret = *((intp *)PyArray_DATA(arr));
+ ret = *((npy_intp *)PyArray_DATA(arr));
Py_DECREF(arr);
return ret;
}
@@ -536,7 +536,7 @@ PyArray_PyIntAsIntp(PyObject *o)
return -1;
}
#endif
- return (intp) long_value;
+ return (npy_intp) long_value;
}
/*NUMPY_API
@@ -545,7 +545,7 @@ PyArray_PyIntAsIntp(PyObject *o)
* vals must be large enough to hold maxvals
*/
NPY_NO_EXPORT int
-PyArray_IntpFromSequence(PyObject *seq, intp *vals, int maxvals)
+PyArray_IntpFromSequence(PyObject *seq, npy_intp *vals, int maxvals)
{
int nd, i;
PyObject *op, *err;
@@ -567,9 +567,9 @@ PyArray_IntpFromSequence(PyObject *seq, intp *vals, int maxvals)
#endif
nd = 1;
#if SIZEOF_LONG >= SIZEOF_INTP
- vals[0] = (intp ) PyInt_AsLong(op);
+ vals[0] = (npy_intp ) PyInt_AsLong(op);
#else
- vals[0] = (intp ) PyLong_AsLongLong(op);
+ vals[0] = (npy_intp ) PyLong_AsLongLong(op);
#endif
Py_DECREF(op);
@@ -596,9 +596,9 @@ PyArray_IntpFromSequence(PyObject *seq, intp *vals, int maxvals)
return -1;
}
#if SIZEOF_LONG >= SIZEOF_INTP
- vals[i]=(intp )PyInt_AsLong(op);
+ vals[i]=(npy_intp )PyInt_AsLong(op);
#else
- vals[i]=(intp )PyLong_AsLongLong(op);
+ vals[i]=(npy_intp )PyLong_AsLongLong(op);
#endif
Py_DECREF(op);
@@ -751,7 +751,7 @@ PyArray_TypestrConvert(int itemsize, int gentype)
PyArray_IntTupleFromIntp
*/
NPY_NO_EXPORT PyObject *
-PyArray_IntTupleFromIntp(int len, intp *vals)
+PyArray_IntTupleFromIntp(int len, npy_intp *vals)
{
int i;
PyObject *intTuple = PyTuple_New(len);
diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c
index 005bb6d05..0a7d4afe0 100644
--- a/numpy/core/src/multiarray/convert.c
+++ b/numpy/core/src/multiarray/convert.c
@@ -13,6 +13,7 @@
#include "arrayobject.h"
#include "mapping.h"
+#include "lowlevel_strided_loops.h"
#include "convert.h"
@@ -353,26 +354,112 @@ PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj)
}
/*NUMPY_API
- Copy an array.
-*/
+ * Fills an array with zeros.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_FillWithZero(PyArrayObject *a)
+{
+ PyArray_StridedTransferFn *stransfer = NULL;
+ void *transferdata = NULL;
+ PyArray_Descr *dtype = PyArray_DESCR(a);
+ NpyIter *iter;
+
+ NpyIter_IterNext_Fn iternext;
+ char **dataptr;
+ npy_intp stride, *countptr;
+ int needs_api;
+
+ NPY_BEGIN_THREADS_DEF;
+
+ if (!PyArray_ISWRITEABLE(a)) {
+ PyErr_SetString(PyExc_RuntimeError, "cannot write to array");
+ return -1;
+ }
+
+ /* A zero-sized array needs no zeroing */
+ if (PyArray_SIZE(a) == 0) {
+ return 0;
+ }
+
+ /* If it's possible to do a simple memset, do so */
+ if (!PyDataType_REFCHK(dtype) && (PyArray_ISCONTIGUOUS(a) ||
+ PyArray_ISFORTRAN(a))) {
+ memset(PyArray_DATA(a), 0, PyArray_NBYTES(a));
+ return 0;
+ }
+
+ /* Use an iterator to go through all the data */
+ iter = NpyIter_New(a, NPY_ITER_WRITEONLY|NPY_ITER_NO_INNER_ITERATION,
+ NPY_KEEPORDER, NPY_NO_CASTING,
+ NULL, 0, NULL, 0);
+
+ if (iter == NULL) {
+ return -1;
+ }
+
+ iternext = NpyIter_GetIterNext(iter, NULL);
+ if (iternext == NULL) {
+ NpyIter_Deallocate(iter);
+ return -1;
+ }
+ dataptr = NpyIter_GetDataPtrArray(iter);
+ stride = NpyIter_GetInnerStrideArray(iter)[0];
+ countptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+ needs_api = NpyIter_IterationNeedsAPI(iter);
+
+ /*
+ * Because buffering is disabled in the iterator, the inner loop
+ * strides will be the same throughout the iteration loop. Thus,
+ * we can pass them to this function to take advantage of
+ * contiguous strides, etc.
+ *
+ * By setting the src_dtype to NULL, we get a function which sets
+ * the destination to zeros.
+ */
+ if (PyArray_GetDTypeTransferFunction(
+ PyArray_ISALIGNED(a),
+ 0, stride,
+ NULL, PyArray_DESCR(a),
+ 0,
+ &stransfer, &transferdata,
+ &needs_api) != NPY_SUCCEED) {
+ NpyIter_Deallocate(iter);
+ return -1;
+ }
+
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
+ }
+
+ do {
+ stransfer(NULL, 0, *dataptr, stride,
+ *countptr, 0, transferdata);
+ } while(iternext(iter));
+
+ if (!needs_api) {
+ NPY_END_THREADS;
+ }
+
+ PyArray_FreeStridedTransferData(transferdata);
+ NpyIter_Deallocate(iter);
+
+ return 0;
+}
+
+/*NUMPY_API
+ * Copy an array.
+ */
NPY_NO_EXPORT PyObject *
-PyArray_NewCopy(PyArrayObject *m1, NPY_ORDER fortran)
+PyArray_NewCopy(PyArrayObject *m1, NPY_ORDER order)
{
- PyArrayObject *ret;
- if (fortran == PyArray_ANYORDER)
- fortran = PyArray_ISFORTRAN(m1);
-
- Py_INCREF(m1->descr);
- ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(m1),
- m1->descr,
- m1->nd,
- m1->dimensions,
- NULL, NULL,
- fortran,
- (PyObject *)m1);
+ PyArrayObject *ret = (PyArrayObject *)PyArray_NewLikeArray(m1, order, NULL);
if (ret == NULL) {
return NULL;
}
+
if (PyArray_CopyInto(ret, m1) == -1) {
Py_DECREF(ret);
return NULL;
diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c
index 0040e8ad5..dd1918bf0 100644
--- a/numpy/core/src/multiarray/convert_datatype.c
+++ b/numpy/core/src/multiarray/convert_datatype.c
@@ -22,6 +22,9 @@
*
* Cast an array using typecode structure.
* steals reference to at --- cannot be NULL
+ *
+ * This function always makes a copy of mp, even if the dtype
+ * doesn't change.
*/
NPY_NO_EXPORT PyObject *
PyArray_CastToType(PyArrayObject *mp, PyArray_Descr *at, int fortran)
@@ -32,16 +35,6 @@ PyArray_CastToType(PyArrayObject *mp, PyArray_Descr *at, int fortran)
mpd = mp->descr;
- if (((mpd == at) ||
- ((mpd->type_num == at->type_num) &&
- PyArray_EquivByteorders(mpd->byteorder, at->byteorder) &&
- ((mpd->elsize == at->elsize) || (at->elsize==0)))) &&
- PyArray_ISBEHAVED_RO(mp)) {
- Py_DECREF(at);
- Py_INCREF(mp);
- return (PyObject *)mp;
- }
-
if (at->elsize == 0) {
PyArray_DESCR_REPLACE(at);
if (at == NULL) {
@@ -70,7 +63,7 @@ PyArray_CastToType(PyArrayObject *mp, PyArray_Descr *at, int fortran)
if (out == NULL) {
return NULL;
}
- ret = PyArray_CastTo((PyArrayObject *)out, mp);
+ ret = PyArray_CopyInto((PyArrayObject *)out, mp);
if (ret != -1) {
return out;
}
@@ -90,10 +83,10 @@ PyArray_GetCastFunc(PyArray_Descr *descr, int type_num)
{
PyArray_VectorUnaryFunc *castfunc = NULL;
- if (type_num < PyArray_NTYPES) {
+ if (type_num < NPY_NTYPES_ABI_COMPATIBLE) {
castfunc = descr->f->cast[type_num];
}
- if (castfunc == NULL) {
+ else {
PyObject *obj = descr->f->castdict;
if (obj && PyDict_Check(obj)) {
PyObject *key;
@@ -120,17 +113,17 @@ PyArray_GetCastFunc(PyArray_Descr *descr, int type_num)
}
#if PY_VERSION_HEX >= 0x02050000
ret = PyErr_WarnEx(cls,
- "Casting complex values to real discards the imaginary "
- "part", 1);
+ "Casting complex values to real discards "
+ "the imaginary part", 1);
#else
ret = PyErr_Warn(cls,
- "Casting complex values to real discards the imaginary "
- "part");
+ "Casting complex values to real discards "
+ "the imaginary part");
#endif
Py_XDECREF(cls);
if (ret < 0) {
return NULL;
- }
+ }
}
if (castfunc) {
return castfunc;
@@ -141,161 +134,6 @@ PyArray_GetCastFunc(PyArray_Descr *descr, int type_num)
}
/*
- * Reference counts:
- * copyswapn is used which increases and decreases reference counts for OBJECT arrays.
- * All that needs to happen is for any reference counts in the buffers to be
- * decreased when completely finished with the buffers.
- *
- * buffers[0] is the destination
- * buffers[1] is the source
- */
-static void
-_strided_buffered_cast(char *dptr, intp dstride, int delsize, int dswap,
- PyArray_CopySwapNFunc *dcopyfunc,
- char *sptr, intp sstride, int selsize, int sswap,
- PyArray_CopySwapNFunc *scopyfunc,
- intp N, char **buffers, int bufsize,
- PyArray_VectorUnaryFunc *castfunc,
- PyArrayObject *dest, PyArrayObject *src)
-{
- int i;
- if (N <= bufsize) {
- /*
- * 1. copy input to buffer and swap
- * 2. cast input to output
- * 3. swap output if necessary and copy from output buffer
- */
- scopyfunc(buffers[1], selsize, sptr, sstride, N, sswap, src);
- castfunc(buffers[1], buffers[0], N, src, dest);
- dcopyfunc(dptr, dstride, buffers[0], delsize, N, dswap, dest);
- return;
- }
-
- /* otherwise we need to divide up into bufsize pieces */
- i = 0;
- while (N > 0) {
- int newN = MIN(N, bufsize);
-
- _strided_buffered_cast(dptr+i*dstride, dstride, delsize,
- dswap, dcopyfunc,
- sptr+i*sstride, sstride, selsize,
- sswap, scopyfunc,
- newN, buffers, bufsize, castfunc, dest, src);
- i += newN;
- N -= bufsize;
- }
- return;
-}
-
-static int
-_broadcast_cast(PyArrayObject *out, PyArrayObject *in,
- PyArray_VectorUnaryFunc *castfunc, int iswap, int oswap)
-{
- int delsize, selsize, maxaxis, i, N;
- PyArrayMultiIterObject *multi;
- intp maxdim, ostrides, istrides;
- char *buffers[2];
- PyArray_CopySwapNFunc *ocopyfunc, *icopyfunc;
- char *obptr;
- NPY_BEGIN_THREADS_DEF;
-
- delsize = PyArray_ITEMSIZE(out);
- selsize = PyArray_ITEMSIZE(in);
- multi = (PyArrayMultiIterObject *)PyArray_MultiIterNew(2, out, in);
- if (multi == NULL) {
- return -1;
- }
-
- if (multi->size != PyArray_SIZE(out)) {
- PyErr_SetString(PyExc_ValueError,
- "array dimensions are not "\
- "compatible for copy");
- Py_DECREF(multi);
- return -1;
- }
-
- icopyfunc = in->descr->f->copyswapn;
- ocopyfunc = out->descr->f->copyswapn;
- maxaxis = PyArray_RemoveSmallest(multi);
- if (maxaxis < 0) {
- /* cast 1 0-d array to another */
- N = 1;
- maxdim = 1;
- ostrides = delsize;
- istrides = selsize;
- }
- else {
- maxdim = multi->dimensions[maxaxis];
- N = (int) (MIN(maxdim, PyArray_BUFSIZE));
- ostrides = multi->iters[0]->strides[maxaxis];
- istrides = multi->iters[1]->strides[maxaxis];
-
- }
- buffers[0] = _pya_malloc(N*delsize);
- if (buffers[0] == NULL) {
- PyErr_NoMemory();
- return -1;
- }
- buffers[1] = _pya_malloc(N*selsize);
- if (buffers[1] == NULL) {
- _pya_free(buffers[0]);
- PyErr_NoMemory();
- return -1;
- }
- if (PyDataType_FLAGCHK(out->descr, NPY_NEEDS_INIT)) {
- memset(buffers[0], 0, N*delsize);
- }
- if (PyDataType_FLAGCHK(in->descr, NPY_NEEDS_INIT)) {
- memset(buffers[1], 0, N*selsize);
- }
-
-#if NPY_ALLOW_THREADS
- if (PyArray_ISNUMBER(in) && PyArray_ISNUMBER(out)) {
- NPY_BEGIN_THREADS;
- }
-#endif
-
- while (multi->index < multi->size) {
- _strided_buffered_cast(multi->iters[0]->dataptr,
- ostrides,
- delsize, oswap, ocopyfunc,
- multi->iters[1]->dataptr,
- istrides,
- selsize, iswap, icopyfunc,
- maxdim, buffers, N,
- castfunc, out, in);
- PyArray_MultiIter_NEXT(multi);
- }
-#if NPY_ALLOW_THREADS
- if (PyArray_ISNUMBER(in) && PyArray_ISNUMBER(out)) {
- NPY_END_THREADS;
- }
-#endif
- Py_DECREF(multi);
- if (PyDataType_REFCHK(in->descr)) {
- obptr = buffers[1];
- for (i = 0; i < N; i++, obptr+=selsize) {
- PyArray_Item_XDECREF(obptr, in->descr);
- }
- }
- if (PyDataType_REFCHK(out->descr)) {
- obptr = buffers[0];
- for (i = 0; i < N; i++, obptr+=delsize) {
- PyArray_Item_XDECREF(obptr, out->descr);
- }
- }
- _pya_free(buffers[0]);
- _pya_free(buffers[1]);
- if (PyErr_Occurred()) {
- return -1;
- }
-
- return 0;
-}
-
-
-
-/*
* Must be broadcastable.
* This code is very similar to PyArray_CopyInto/PyArray_MoveInto
* except casting is done --- PyArray_BUFSIZE is used
@@ -308,167 +146,8 @@ _broadcast_cast(PyArrayObject *out, PyArrayObject *in,
NPY_NO_EXPORT int
PyArray_CastTo(PyArrayObject *out, PyArrayObject *mp)
{
- int simple;
- int same;
- PyArray_VectorUnaryFunc *castfunc = NULL;
- intp mpsize = PyArray_SIZE(mp);
- int iswap, oswap;
- NPY_BEGIN_THREADS_DEF;
-
- if (mpsize == 0) {
- return 0;
- }
- if (!PyArray_ISWRITEABLE(out)) {
- PyErr_SetString(PyExc_ValueError, "output array is not writeable");
- return -1;
- }
-
- castfunc = PyArray_GetCastFunc(mp->descr, out->descr->type_num);
- if (castfunc == NULL) {
- return -1;
- }
-
- same = PyArray_SAMESHAPE(out, mp);
- simple = same && ((PyArray_ISCARRAY_RO(mp) && PyArray_ISCARRAY(out)) ||
- (PyArray_ISFARRAY_RO(mp) && PyArray_ISFARRAY(out)));
- if (simple) {
-#if NPY_ALLOW_THREADS
- if (PyArray_ISNUMBER(mp) && PyArray_ISNUMBER(out)) {
- NPY_BEGIN_THREADS;
- }
-#endif
- castfunc(mp->data, out->data, mpsize, mp, out);
-
-#if NPY_ALLOW_THREADS
- if (PyArray_ISNUMBER(mp) && PyArray_ISNUMBER(out)) {
- NPY_END_THREADS;
- }
-#endif
- if (PyErr_Occurred()) {
- return -1;
- }
- return 0;
- }
-
- /*
- * If the input or output is OBJECT, STRING, UNICODE, or VOID
- * then getitem and setitem are used for the cast
- * and byteswapping is handled by those methods
- */
- if (PyArray_ISFLEXIBLE(mp) || PyArray_ISOBJECT(mp) || PyArray_ISOBJECT(out) ||
- PyArray_ISFLEXIBLE(out)) {
- iswap = oswap = 0;
- }
- else {
- iswap = PyArray_ISBYTESWAPPED(mp);
- oswap = PyArray_ISBYTESWAPPED(out);
- }
-
- return _broadcast_cast(out, mp, castfunc, iswap, oswap);
-}
-
-
-static int
-_bufferedcast(PyArrayObject *out, PyArrayObject *in,
- PyArray_VectorUnaryFunc *castfunc)
-{
- char *inbuffer, *bptr, *optr;
- char *outbuffer=NULL;
- PyArrayIterObject *it_in = NULL, *it_out = NULL;
- intp i, index;
- intp ncopies = PyArray_SIZE(out) / PyArray_SIZE(in);
- int elsize=in->descr->elsize;
- int nels = PyArray_BUFSIZE;
- int el;
- int inswap, outswap = 0;
- int obuf=!PyArray_ISCARRAY(out);
- int oelsize = out->descr->elsize;
- PyArray_CopySwapFunc *in_csn;
- PyArray_CopySwapFunc *out_csn;
- int retval = -1;
-
- in_csn = in->descr->f->copyswap;
- out_csn = out->descr->f->copyswap;
-
- /*
- * If the input or output is STRING, UNICODE, or VOID
- * then getitem and setitem are used for the cast
- * and byteswapping is handled by those methods
- */
-
- inswap = !(PyArray_ISFLEXIBLE(in) || PyArray_ISNOTSWAPPED(in));
-
- inbuffer = PyDataMem_NEW(PyArray_BUFSIZE*elsize);
- if (inbuffer == NULL) {
- return -1;
- }
- if (PyArray_ISOBJECT(in)) {
- memset(inbuffer, 0, PyArray_BUFSIZE*elsize);
- }
- it_in = (PyArrayIterObject *)PyArray_IterNew((PyObject *)in);
- if (it_in == NULL) {
- goto exit;
- }
- if (obuf) {
- outswap = !(PyArray_ISFLEXIBLE(out) ||
- PyArray_ISNOTSWAPPED(out));
- outbuffer = PyDataMem_NEW(PyArray_BUFSIZE*oelsize);
- if (outbuffer == NULL) {
- goto exit;
- }
- if (PyArray_ISOBJECT(out)) {
- memset(outbuffer, 0, PyArray_BUFSIZE*oelsize);
- }
- it_out = (PyArrayIterObject *)PyArray_IterNew((PyObject *)out);
- if (it_out == NULL) {
- goto exit;
- }
- nels = MIN(nels, PyArray_BUFSIZE);
- }
-
- optr = (obuf) ? outbuffer: out->data;
- bptr = inbuffer;
- el = 0;
- while (ncopies--) {
- index = it_in->size;
- PyArray_ITER_RESET(it_in);
- while (index--) {
- in_csn(bptr, it_in->dataptr, inswap, in);
- bptr += elsize;
- PyArray_ITER_NEXT(it_in);
- el += 1;
- if ((el == nels) || (index == 0)) {
- /* buffer filled, do cast */
- castfunc(inbuffer, optr, el, in, out);
- if (obuf) {
- /* Copy from outbuffer to array */
- for (i = 0; i < el; i++) {
- out_csn(it_out->dataptr,
- optr, outswap,
- out);
- optr += oelsize;
- PyArray_ITER_NEXT(it_out);
- }
- optr = outbuffer;
- }
- else {
- optr += out->descr->elsize * nels;
- }
- el = 0;
- bptr = inbuffer;
- }
- }
- }
- retval = 0;
-
- exit:
- Py_XDECREF(it_in);
- PyDataMem_FREE(inbuffer);
- PyDataMem_FREE(outbuffer);
- if (obuf) {
- Py_XDECREF(it_out);
- }
- return retval;
+ /* CopyInto handles the casting now */
+ return PyArray_CopyInto(out, mp);
}
/*NUMPY_API
@@ -478,42 +157,8 @@ _bufferedcast(PyArrayObject *out, PyArrayObject *in,
NPY_NO_EXPORT int
PyArray_CastAnyTo(PyArrayObject *out, PyArrayObject *mp)
{
- int simple;
- PyArray_VectorUnaryFunc *castfunc = NULL;
- npy_intp mpsize = PyArray_SIZE(mp);
-
- if (mpsize == 0) {
- return 0;
- }
- if (!PyArray_ISWRITEABLE(out)) {
- PyErr_SetString(PyExc_ValueError, "output array is not writeable");
- return -1;
- }
-
- if (!(mpsize == PyArray_SIZE(out))) {
- PyErr_SetString(PyExc_ValueError,
- "arrays must have the same number of"
- " elements for the cast.");
- return -1;
- }
-
- castfunc = PyArray_GetCastFunc(mp->descr, out->descr->type_num);
- if (castfunc == NULL) {
- return -1;
- }
- simple = ((PyArray_ISCARRAY_RO(mp) && PyArray_ISCARRAY(out)) ||
- (PyArray_ISFARRAY_RO(mp) && PyArray_ISFARRAY(out)));
- if (simple) {
- castfunc(mp->data, out->data, mpsize, mp, out);
- return 0;
- }
- if (PyArray_SAMESHAPE(out, mp)) {
- int iswap, oswap;
- iswap = PyArray_ISBYTESWAPPED(mp) && !PyArray_ISFLEXIBLE(mp);
- oswap = PyArray_ISBYTESWAPPED(out) && !PyArray_ISFLEXIBLE(out);
- return _broadcast_cast(out, mp, castfunc, iswap, oswap);
- }
- return _bufferedcast(out, mp, castfunc);
+ /* CopyAnyInto handles the casting now */
+ return PyArray_CopyAnyInto(out, mp);
}
/*NUMPY_API
@@ -525,7 +170,8 @@ PyArray_CanCastSafely(int fromtype, int totype)
PyArray_Descr *from;
/* Fast table lookup for small type numbers */
- if ((unsigned int)fromtype < NPY_NTYPES && (unsigned int)totype < NPY_NTYPES) {
+ if ((unsigned int)fromtype < NPY_NTYPES &&
+ (unsigned int)totype < NPY_NTYPES) {
return _npy_can_cast_safely_table[fromtype][totype];
}
@@ -572,15 +218,18 @@ PyArray_CanCastSafely(int fromtype, int totype)
/*NUMPY_API
* leaves reference count alone --- cannot be NULL
+ *
+ * PyArray_CanCastTypeTo is equivalent to this, but adds a 'casting'
+ * parameter.
*/
-NPY_NO_EXPORT Bool
+NPY_NO_EXPORT npy_bool
PyArray_CanCastTo(PyArray_Descr *from, PyArray_Descr *to)
{
int fromtype=from->type_num;
int totype=to->type_num;
- Bool ret;
+ npy_bool ret;
- ret = (Bool) PyArray_CanCastSafely(fromtype, totype);
+ ret = (npy_bool) PyArray_CanCastSafely(fromtype, totype);
if (ret) {
/* Check String and Unicode more closely */
if (fromtype == PyArray_STRING) {
@@ -605,10 +254,216 @@ PyArray_CanCastTo(PyArray_Descr *from, PyArray_Descr *to)
return ret;
}
+/* Provides an ordering for the dtype 'kind' character codes */
+static int
+dtype_kind_to_ordering(char kind)
+{
+ switch (kind) {
+ /* Boolean kind */
+ case 'b':
+ return 0;
+ /* Unsigned int kind */
+ case 'u':
+ return 1;
+ /* Signed int kind */
+ case 'i':
+ return 2;
+ /* Float kind */
+ case 'f':
+ return 4;
+ /* Complex kind */
+ case 'c':
+ return 5;
+ /* String kind */
+ case 'S':
+ case 'a':
+ return 6;
+ /* Unicode kind */
+ case 'U':
+ return 7;
+ /* Void kind */
+ case 'V':
+ return 8;
+ /* Object kind */
+ case 'O':
+ return 9;
+ /* Anything else - ideally shouldn't happen... */
+ default:
+ return 10;
+ }
+}
+
+/* Converts a type number from unsigned to signed */
+static int
+type_num_unsigned_to_signed(int type_num)
+{
+ switch (type_num) {
+ case NPY_UBYTE:
+ return NPY_BYTE;
+ case NPY_USHORT:
+ return NPY_SHORT;
+ case NPY_UINT:
+ return NPY_INT;
+ case NPY_ULONG:
+ return NPY_LONG;
+ case NPY_ULONGLONG:
+ return NPY_LONGLONG;
+ default:
+ return type_num;
+ }
+}
+
+/*NUMPY_API
+ * Returns true if data of type 'from' may be cast to data of type
+ * 'to' according to the rule 'casting'.
+ */
+NPY_NO_EXPORT npy_bool
+PyArray_CanCastTypeTo(PyArray_Descr *from, PyArray_Descr *to,
+ NPY_CASTING casting)
+{
+ /* If unsafe casts are allowed */
+ if (casting == NPY_UNSAFE_CASTING) {
+ return 1;
+ }
+ /* Equivalent types can be cast with any value of 'casting' */
+ else if (PyArray_EquivTypenums(from->type_num, to->type_num)) {
+ /* For complicated case, use EquivTypes (for now) */
+ if (PyTypeNum_ISUSERDEF(from->type_num) ||
+ PyDataType_HASFIELDS(from) ||
+ from->subarray != NULL) {
+ int ret;
+
+ /* Only NPY_NO_CASTING prevents byte order conversion */
+ if ((casting != NPY_NO_CASTING) &&
+ (!PyArray_ISNBO(from->byteorder) ||
+ !PyArray_ISNBO(to->byteorder))) {
+ PyArray_Descr *nbo_from, *nbo_to;
+
+ nbo_from = PyArray_DescrNewByteorder(from, NPY_NATIVE);
+ nbo_to = PyArray_DescrNewByteorder(to, NPY_NATIVE);
+ if (nbo_from == NULL || nbo_to == NULL) {
+ Py_XDECREF(nbo_from);
+ Py_XDECREF(nbo_to);
+ PyErr_Clear();
+ return 0;
+ }
+ ret = PyArray_EquivTypes(nbo_from, nbo_to);
+ Py_DECREF(nbo_from);
+ Py_DECREF(nbo_to);
+ }
+ else {
+ ret = PyArray_EquivTypes(from, to);
+ }
+ return ret;
+ }
+
+ switch (casting) {
+ case NPY_NO_CASTING:
+ return (from->elsize == to->elsize) &&
+ PyArray_ISNBO(from->byteorder) ==
+ PyArray_ISNBO(to->byteorder);
+ case NPY_EQUIV_CASTING:
+ return (from->elsize == to->elsize);
+ case NPY_SAFE_CASTING:
+ return (from->elsize <= to->elsize);
+ default:
+ return 1;
+ }
+ }
+ /* If safe or same-kind casts are allowed */
+ else if (casting == NPY_SAFE_CASTING || casting == NPY_SAME_KIND_CASTING) {
+ if (PyArray_CanCastTo(from, to)) {
+ return 1;
+ }
+ else if(casting == NPY_SAME_KIND_CASTING) {
+ /*
+ * Also allow casting from lower to higher kinds, according
+ * to the ordering provided by dtype_kind_to_ordering.
+ */
+ return dtype_kind_to_ordering(from->kind) <=
+ dtype_kind_to_ordering(to->kind);
+ }
+ else {
+ return 0;
+ }
+ }
+ /* NPY_NO_CASTING or NPY_EQUIV_CASTING was specified */
+ else {
+ return 0;
+ }
+}
+
+/* CanCastArrayTo needs this function */
+static int min_scalar_type_num(char *valueptr, int type_num,
+ int *is_small_unsigned);
+
+/*NUMPY_API
+ * Returns 1 if the array object may be cast to the given data type using
+ * the casting rule, 0 otherwise. This differs from PyArray_CanCastTo in
+ * that it handles scalar arrays (0 dimensions) specially, by checking
+ * their value.
+ */
+NPY_NO_EXPORT npy_bool
+PyArray_CanCastArrayTo(PyArrayObject *arr, PyArray_Descr *to,
+ NPY_CASTING casting)
+{
+ PyArray_Descr *from = PyArray_DESCR(arr);
+
+ /* If it's not a scalar, use the standard rules */
+ if (PyArray_NDIM(arr) > 0 || !PyTypeNum_ISNUMBER(from->type_num)) {
+ return PyArray_CanCastTypeTo(from, to, casting);
+ }
+ /* Otherwise, check the value */
+ else {
+ char *data = PyArray_BYTES(arr);
+ int swap = !PyArray_ISNBO(from->byteorder);
+ int is_small_unsigned = 0, type_num;
+ npy_bool ret;
+ PyArray_Descr *dtype;
+
+ /* An aligned memory buffer large enough to hold any type */
+#if NPY_SIZEOF_LONGLONG >= NPY_SIZEOF_CLONGDOUBLE
+ npy_longlong value;
+#else
+ npy_clongdouble value;
+#endif
+ from->f->copyswap(&value, data, swap, NULL);
+
+ type_num = min_scalar_type_num((char *)&value, from->type_num,
+ &is_small_unsigned);
+
+ /*
+ * If we've got a small unsigned scalar, and the 'to' type
+ * is not unsigned, then make it signed to allow the value
+ * to be cast more appropriately.
+ */
+ if (is_small_unsigned && !(PyTypeNum_ISUNSIGNED(to->type_num))) {
+ type_num = type_num_unsigned_to_signed(type_num);
+ }
+
+ dtype = PyArray_DescrFromType(type_num);
+ if (dtype == NULL) {
+ return 0;
+ }
+#if 0
+ printf("min scalar cast ");
+ PyObject_Print(dtype, stdout, 0);
+ printf(" to ");
+ PyObject_Print(to, stdout, 0);
+ printf("\n");
+#endif
+ ret = PyArray_CanCastTypeTo(dtype, to, casting);
+ Py_DECREF(dtype);
+ return ret;
+ }
+}
+
/*NUMPY_API
* See if array scalars can be cast.
+ *
+ * TODO: For NumPy 2.0, add a NPY_CASTING parameter.
*/
-NPY_NO_EXPORT Bool
+NPY_NO_EXPORT npy_bool
PyArray_CanCastScalar(PyTypeObject *from, PyTypeObject *to)
{
int fromtype;
@@ -619,7 +474,777 @@ PyArray_CanCastScalar(PyTypeObject *from, PyTypeObject *to)
if (fromtype == PyArray_NOTYPE || totype == PyArray_NOTYPE) {
return FALSE;
}
- return (Bool) PyArray_CanCastSafely(fromtype, totype);
+ return (npy_bool) PyArray_CanCastSafely(fromtype, totype);
+}
+
+/*
+ * Internal promote types function which handles unsigned integers which
+ * fit in same-sized signed integers specially.
+ */
+static PyArray_Descr *
+promote_types(PyArray_Descr *type1, PyArray_Descr *type2,
+ int is_small_unsigned1, int is_small_unsigned2)
+{
+ if (is_small_unsigned1) {
+ int type_num1 = type1->type_num, type_num2 = type2->type_num, ret_type_num;
+
+ if (type_num2 < NPY_NTYPES && !(PyTypeNum_ISBOOL(type_num2) ||
+ PyTypeNum_ISUNSIGNED(type_num2))) {
+ /* Convert to the equivalent-sized signed integer */
+ type_num1 = type_num_unsigned_to_signed(type_num1);
+
+ ret_type_num = _npy_type_promotion_table[type_num1][type_num2];
+ /* The table doesn't handle string/unicode/void, check the result */
+ if (ret_type_num >= 0) {
+ return PyArray_DescrFromType(ret_type_num);
+ }
+ }
+
+ return PyArray_PromoteTypes(type1, type2);
+ }
+ else if (is_small_unsigned2) {
+ int type_num1 = type1->type_num,
+ type_num2 = type2->type_num,
+ ret_type_num;
+
+ if (type_num1 < NPY_NTYPES && !(PyTypeNum_ISBOOL(type_num1) ||
+ PyTypeNum_ISUNSIGNED(type_num1))) {
+ /* Convert to the equivalent-sized signed integer */
+ type_num2 = type_num_unsigned_to_signed(type_num2);
+
+ ret_type_num = _npy_type_promotion_table[type_num1][type_num2];
+ /* The table doesn't handle string/unicode/void, check the result */
+ if (ret_type_num >= 0) {
+ return PyArray_DescrFromType(ret_type_num);
+ }
+ }
+
+ return PyArray_PromoteTypes(type1, type2);
+ }
+ else {
+ return PyArray_PromoteTypes(type1, type2);
+ }
+
+}
+
+/*NUMPY_API
+ * Produces the smallest size and lowest kind type to which both
+ * input types can be cast.
+ */
+NPY_NO_EXPORT PyArray_Descr *
+PyArray_PromoteTypes(PyArray_Descr *type1, PyArray_Descr *type2)
+{
+ int type_num1, type_num2, ret_type_num;
+
+ /* If one of the arguments is NULL, return the non-NULL one */
+ if (type1 == NULL || type2 == NULL) {
+ if (type1 == NULL) {
+ if (type2 == NULL) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "PromoteTypes received two NULL arguments");
+ return NULL;
+ }
+ else {
+ Py_INCREF(type2);
+ return type2;
+ }
+ }
+ else {
+ Py_INCREF(type1);
+ return type1;
+ }
+ }
+
+ type_num1 = type1->type_num;
+ type_num2 = type2->type_num;
+
+ /* If they're built-in types, use the promotion table */
+ if (type_num1 < NPY_NTYPES && type_num2 < NPY_NTYPES) {
+ ret_type_num = _npy_type_promotion_table[type_num1][type_num2];
+ /* The table doesn't handle string/unicode/void, check the result */
+ if (ret_type_num >= 0) {
+ return PyArray_DescrFromType(ret_type_num);
+ }
+ }
+ /* If one or both are user defined, calculate it */
+ else {
+ int skind1 = NPY_NOSCALAR, skind2 = NPY_NOSCALAR, skind;
+
+ if (PyArray_CanCastTo(type2, type1)) {
+ /* Promoted types are always native byte order */
+ if (PyArray_ISNBO(type1->byteorder)) {
+ Py_INCREF(type1);
+ return type1;
+ }
+ else {
+ return PyArray_DescrNewByteorder(type1, NPY_NATIVE);
+ }
+ }
+ else if (PyArray_CanCastTo(type1, type2)) {
+ /* Promoted types are always native byte order */
+ if (PyArray_ISNBO(type2->byteorder)) {
+ Py_INCREF(type2);
+ return type2;
+ }
+ else {
+ return PyArray_DescrNewByteorder(type2, NPY_NATIVE);
+ }
+ }
+
+ /* Convert the 'kind' char into a scalar kind */
+ switch (type1->kind) {
+ case 'b':
+ skind1 = NPY_BOOL_SCALAR;
+ break;
+ case 'u':
+ skind1 = NPY_INTPOS_SCALAR;
+ break;
+ case 'i':
+ skind1 = NPY_INTNEG_SCALAR;
+ break;
+ case 'f':
+ skind1 = NPY_FLOAT_SCALAR;
+ break;
+ case 'c':
+ skind1 = NPY_COMPLEX_SCALAR;
+ break;
+ }
+ switch (type2->kind) {
+ case 'b':
+ skind2 = NPY_BOOL_SCALAR;
+ break;
+ case 'u':
+ skind2 = NPY_INTPOS_SCALAR;
+ break;
+ case 'i':
+ skind2 = NPY_INTNEG_SCALAR;
+ break;
+ case 'f':
+ skind2 = NPY_FLOAT_SCALAR;
+ break;
+ case 'c':
+ skind2 = NPY_COMPLEX_SCALAR;
+ break;
+ }
+
+ /* If both are scalars, there may be a promotion possible */
+ if (skind1 != NPY_NOSCALAR && skind2 != NPY_NOSCALAR) {
+
+ /* Start with the larger scalar kind */
+ skind = (skind1 > skind2) ? skind1 : skind2;
+ ret_type_num = _npy_smallest_type_of_kind_table[skind];
+
+ for (;;) {
+
+ /* If there is no larger type of this kind, try a larger kind */
+ if (ret_type_num < 0) {
+ ++skind;
+ /* Use -1 to signal no promoted type found */
+ if (skind < NPY_NSCALARKINDS) {
+ ret_type_num = _npy_smallest_type_of_kind_table[skind];
+ }
+ else {
+ break;
+ }
+ }
+
+ /* If we found a type to which we can promote both, done! */
+ if (PyArray_CanCastSafely(type_num1, ret_type_num) &&
+ PyArray_CanCastSafely(type_num2, ret_type_num)) {
+ return PyArray_DescrFromType(ret_type_num);
+ }
+
+ /* Try the next larger type of this kind */
+ ret_type_num = _npy_next_larger_type_table[ret_type_num];
+ }
+
+ }
+
+ PyErr_SetString(PyExc_TypeError,
+ "invalid type promotion with custom data type");
+ return NULL;
+ }
+
+ switch (type_num1) {
+ /* BOOL can convert to anything */
+ case NPY_BOOL:
+ Py_INCREF(type2);
+ return type2;
+ /* For strings and unicodes, take the larger size */
+ case NPY_STRING:
+ if (type_num2 == NPY_STRING) {
+ if (type1->elsize > type2->elsize) {
+ Py_INCREF(type1);
+ return type1;
+ }
+ else {
+ Py_INCREF(type2);
+ return type2;
+ }
+ }
+ else if (type_num2 == NPY_UNICODE) {
+ if (type2->elsize >= type1->elsize * 4) {
+ Py_INCREF(type2);
+ return type2;
+ }
+ else {
+ PyArray_Descr *d = PyArray_DescrNewFromType(NPY_UNICODE);
+ if (d == NULL) {
+ return NULL;
+ }
+ d->elsize = type1->elsize * 4;
+ return d;
+ }
+ }
+ /* Allow NUMBER -> STRING */
+ else if (PyTypeNum_ISNUMBER(type_num2)) {
+ Py_INCREF(type1);
+ return type1;
+ }
+ case NPY_UNICODE:
+ if (type_num2 == NPY_UNICODE) {
+ if (type1->elsize > type2->elsize) {
+ Py_INCREF(type1);
+ return type1;
+ }
+ else {
+ Py_INCREF(type2);
+ return type2;
+ }
+ }
+ else if (type_num2 == NPY_STRING) {
+ if (type1->elsize >= type2->elsize * 4) {
+ Py_INCREF(type1);
+ return type1;
+ }
+ else {
+ PyArray_Descr *d = PyArray_DescrNewFromType(NPY_UNICODE);
+ if (d == NULL) {
+ return NULL;
+ }
+ d->elsize = type2->elsize * 4;
+ return d;
+ }
+ }
+ /* Allow NUMBER -> UNICODE */
+ else if (PyTypeNum_ISNUMBER(type_num2)) {
+ Py_INCREF(type1);
+ return type1;
+ }
+ break;
+ }
+
+ switch (type_num2) {
+ /* BOOL can convert to anything */
+ case NPY_BOOL:
+ Py_INCREF(type1);
+ return type1;
+ case NPY_STRING:
+ /* Allow NUMBER -> STRING */
+ if (PyTypeNum_ISNUMBER(type_num1)) {
+ Py_INCREF(type2);
+ return type2;
+ }
+ case NPY_UNICODE:
+ /* Allow NUMBER -> UNICODE */
+ if (PyTypeNum_ISNUMBER(type_num1)) {
+ Py_INCREF(type2);
+ return type2;
+ }
+ break;
+ }
+
+ /* For equivalent types we can return either */
+ if (PyArray_EquivTypes(type1, type2)) {
+ Py_INCREF(type1);
+ return type1;
+ }
+
+ /* TODO: Also combine fields, subarrays, strings, etc */
+
+ /*
+ printf("invalid type promotion: ");
+ PyObject_Print(type1, stdout, 0);
+ printf(" ");
+ PyObject_Print(type2, stdout, 0);
+ printf("\n");
+ */
+ PyErr_SetString(PyExc_TypeError, "invalid type promotion");
+ return NULL;
+}
+
+/*
+ * NOTE: While this is unlikely to be a performance problem, if
+ * it is it could be reverted to a simple positive/negative
+ * check as the previous system used.
+ *
+ * The is_small_unsigned output flag indicates whether it's an unsigned integer,
+ * and would fit in a signed integer of the same bit size.
+ */
+static int min_scalar_type_num(char *valueptr, int type_num,
+ int *is_small_unsigned)
+{
+ switch (type_num) {
+ case NPY_BOOL: {
+ return NPY_BOOL;
+ }
+ case NPY_UBYTE: {
+ char value = *valueptr;
+ if (value <= NPY_MAX_BYTE) {
+ *is_small_unsigned = 1;
+ }
+ return NPY_UBYTE;
+ }
+ case NPY_BYTE: {
+ char value = *valueptr;
+ if (value >= 0) {
+ *is_small_unsigned = 1;
+ return NPY_UBYTE;
+ }
+ break;
+ }
+ case NPY_USHORT: {
+ npy_ushort value = *(npy_ushort *)valueptr;
+ if (value <= NPY_MAX_UBYTE) {
+ if (value <= NPY_MAX_BYTE) {
+ *is_small_unsigned = 1;
+ }
+ return NPY_UBYTE;
+ }
+
+ if (value <= NPY_MAX_SHORT) {
+ *is_small_unsigned = 1;
+ }
+ break;
+ }
+ case NPY_SHORT: {
+ npy_short value = *(npy_short *)valueptr;
+ if (value >= 0) {
+ return min_scalar_type_num(valueptr, NPY_USHORT, is_small_unsigned);
+ }
+ else if (value >= NPY_MIN_BYTE) {
+ return NPY_BYTE;
+ }
+ break;
+ }
+#if NPY_SIZEOF_LONG == NPY_SIZEOF_INT
+ case NPY_ULONG:
+#endif
+ case NPY_UINT: {
+ npy_uint value = *(npy_uint *)valueptr;
+ if (value <= NPY_MAX_UBYTE) {
+ if (value < NPY_MAX_BYTE) {
+ *is_small_unsigned = 1;
+ }
+ return NPY_UBYTE;
+ }
+ else if (value <= NPY_MAX_USHORT) {
+ if (value <= NPY_MAX_SHORT) {
+ *is_small_unsigned = 1;
+ }
+ return NPY_USHORT;
+ }
+
+ if (value <= NPY_MAX_INT) {
+ *is_small_unsigned = 1;
+ }
+ break;
+ }
+#if NPY_SIZEOF_LONG == NPY_SIZEOF_INT
+ case NPY_LONG:
+#endif
+ case NPY_INT: {
+ npy_int value = *(npy_int *)valueptr;
+ if (value >= 0) {
+ return min_scalar_type_num(valueptr, NPY_UINT, is_small_unsigned);
+ }
+ else if (value >= NPY_MIN_BYTE) {
+ return NPY_BYTE;
+ }
+ else if (value >= NPY_MIN_SHORT) {
+ return NPY_SHORT;
+ }
+ break;
+ }
+#if NPY_SIZEOF_LONG != NPY_SIZEOF_INT && NPY_SIZEOF_LONG != NPY_SIZEOF_LONGLONG
+ case NPY_ULONG: {
+ npy_ulong value = *(npy_ulong *)valueptr;
+ if (value <= NPY_MAX_UBYTE) {
+ if (value <= NPY_MAX_BYTE) {
+ *is_small_unsigned = 1;
+ }
+ return NPY_UBYTE;
+ }
+ else if (value <= NPY_MAX_USHORT) {
+ if (value <= NPY_MAX_SHORT) {
+ *is_small_unsigned = 1;
+ }
+ return NPY_USHORT;
+ }
+ else if (value <= NPY_MAX_UINT) {
+ if (value <= NPY_MAX_INT) {
+ *is_small_unsigned = 1;
+ }
+ return NPY_UINT;
+ }
+
+ if (value <= NPY_MAX_LONG) {
+ *is_small_unsigned = 1;
+ }
+ break;
+ }
+ case NPY_LONG: {
+ npy_long value = *(npy_long *)valueptr;
+ if (value >= 0) {
+ return min_scalar_type_num(valueptr, NPY_ULONG, is_small_unsigned);
+ }
+ else if (value >= NPY_MIN_BYTE) {
+ return NPY_BYTE;
+ }
+ else if (value >= NPY_MIN_SHORT) {
+ return NPY_SHORT;
+ }
+ else if (value >= NPY_MIN_INT) {
+ return NPY_INT;
+ }
+ break;
+ }
+#endif
+#if NPY_SIZEOF_LONG == NPY_SIZEOF_LONGLONG
+ case NPY_ULONG:
+#endif
+ case NPY_ULONGLONG: {
+ npy_ulonglong value = *(npy_ulonglong *)valueptr;
+ if (value <= NPY_MAX_UBYTE) {
+ if (value <= NPY_MAX_BYTE) {
+ *is_small_unsigned = 1;
+ }
+ return NPY_UBYTE;
+ }
+ else if (value <= NPY_MAX_USHORT) {
+ if (value <= NPY_MAX_SHORT) {
+ *is_small_unsigned = 1;
+ }
+ return NPY_USHORT;
+ }
+ else if (value <= NPY_MAX_UINT) {
+ if (value <= NPY_MAX_INT) {
+ *is_small_unsigned = 1;
+ }
+ return NPY_UINT;
+ }
+#if NPY_SIZEOF_LONG != NPY_SIZEOF_INT && NPY_SIZEOF_LONG != NPY_SIZEOF_LONGLONG
+ else if (value <= NPY_MAX_ULONG) {
+ if (value <= NPY_MAX_LONG) {
+ *is_small_unsigned = 1;
+ }
+ return NPY_ULONG;
+ }
+#endif
+
+ if (value <= NPY_MAX_LONGLONG) {
+ *is_small_unsigned = 1;
+ }
+ break;
+ }
+#if NPY_SIZEOF_LONG == NPY_SIZEOF_LONGLONG
+ case NPY_LONG:
+#endif
+ case NPY_LONGLONG: {
+ npy_longlong value = *(npy_longlong *)valueptr;
+ if (value >= 0) {
+ return min_scalar_type_num(valueptr, NPY_ULONGLONG, is_small_unsigned);
+ }
+ else if (value >= NPY_MIN_BYTE) {
+ return NPY_BYTE;
+ }
+ else if (value >= NPY_MIN_SHORT) {
+ return NPY_SHORT;
+ }
+ else if (value >= NPY_MIN_INT) {
+ return NPY_INT;
+ }
+#if NPY_SIZEOF_LONG != NPY_SIZEOF_INT && NPY_SIZEOF_LONG != NPY_SIZEOF_LONGLONG
+ else if (value >= NPY_MIN_LONG) {
+ return NPY_LONG;
+ }
+#endif
+ break;
+ }
+ /*
+ * Float types aren't allowed to be demoted to integer types,
+ * but precision loss is allowed.
+ */
+ case NPY_HALF: {
+ return NPY_HALF;
+ }
+ case NPY_FLOAT: {
+ float value = *(float *)valueptr;
+ if (value > -65000 && value < 65000) {
+ return NPY_HALF;
+ }
+ break;
+ }
+ case NPY_DOUBLE: {
+ double value = *(double *)valueptr;
+ if (value > -65000 && value < 65000) {
+ return NPY_HALF;
+ }
+ else if (value > -3.4e38 && value < 3.4e38) {
+ return NPY_FLOAT;
+ }
+ break;
+ }
+ case NPY_LONGDOUBLE: {
+ npy_longdouble value = *(npy_longdouble *)valueptr;
+ if (value > -65000 && value < 65000) {
+ return NPY_HALF;
+ }
+ else if (value > -3.4e38 && value < 3.4e38) {
+ return NPY_FLOAT;
+ }
+ else if (value > -1.7e308 && value < 1.7e308) {
+ return NPY_DOUBLE;
+ }
+ break;
+ }
+ /*
+ * The code to demote complex to float is disabled for now,
+ * as forcing complex by adding 0j is probably desirable.
+ */
+ case NPY_CFLOAT: {
+ /*
+ npy_cfloat value = *(npy_cfloat *)valueptr;
+ if (value.imag == 0) {
+ return min_scalar_type_num((char *)&value.real,
+ NPY_FLOAT, is_small_unsigned);
+ }
+ */
+ break;
+ }
+ case NPY_CDOUBLE: {
+ npy_cdouble value = *(npy_cdouble *)valueptr;
+ /*
+ if (value.imag == 0) {
+ return min_scalar_type_num((char *)&value.real,
+ NPY_DOUBLE, is_small_unsigned);
+ }
+ */
+ if (value.real > -3.4e38 && value.real < 3.4e38 &&
+ value.imag > -3.4e38 && value.imag < 3.4e38) {
+ return NPY_CFLOAT;
+ }
+ break;
+ }
+ case NPY_CLONGDOUBLE: {
+ npy_cdouble value = *(npy_cdouble *)valueptr;
+ /*
+ if (value.imag == 0) {
+ return min_scalar_type_num((char *)&value.real,
+ NPY_LONGDOUBLE, is_small_unsigned);
+ }
+ */
+ if (value.real > -3.4e38 && value.real < 3.4e38 &&
+ value.imag > -3.4e38 && value.imag < 3.4e38) {
+ return NPY_CFLOAT;
+ }
+ else if (value.real > -1.7e308 && value.real < 1.7e308 &&
+ value.imag > -1.7e308 && value.imag < 1.7e308) {
+ return NPY_CDOUBLE;
+ }
+ break;
+ }
+ }
+
+ return type_num;
+}
+
+/*NUMPY_API
+ * If arr is a scalar (has 0 dimensions) with a built-in number data type,
+ * finds the smallest type size/kind which can still represent its data.
+ * Otherwise, returns the array's data type.
+ *
+ */
+NPY_NO_EXPORT PyArray_Descr *
+PyArray_MinScalarType(PyArrayObject *arr)
+{
+ PyArray_Descr *dtype = PyArray_DESCR(arr);
+ if (PyArray_NDIM(arr) > 0 || !PyTypeNum_ISNUMBER(dtype->type_num)) {
+ Py_INCREF(dtype);
+ return dtype;
+ }
+ else {
+ char *data = PyArray_BYTES(arr);
+ int swap = !PyArray_ISNBO(dtype->byteorder);
+ int is_small_unsigned = 0;
+ /* An aligned memory buffer large enough to hold any type */
+#if NPY_SIZEOF_LONGLONG >= NPY_SIZEOF_CLONGDOUBLE
+ npy_longlong value;
+#else
+ npy_clongdouble value;
+#endif
+ dtype->f->copyswap(&value, data, swap, NULL);
+
+ return PyArray_DescrFromType(
+ min_scalar_type_num((char *)&value, dtype->type_num, &is_small_unsigned));
+
+ }
+}
+
+/*NUMPY_API
+ * Produces the result type of a bunch of inputs, using the UFunc
+ * type promotion rules.
+ *
+ * If all the inputs are scalars (have 0 dimensions), does a regular
+ * type promotion. Otherwise, does a type promotion on the MinScalarType
+ * of all the inputs. Data types passed directly are treated as vector
+ * types.
+ *
+ */
+NPY_NO_EXPORT PyArray_Descr *
+PyArray_ResultType(npy_intp narrs, PyArrayObject **arr,
+ npy_intp ndtypes, PyArray_Descr **dtypes)
+{
+ npy_intp i;
+ int all_scalar;
+ PyArray_Descr *ret = NULL, *tmpret;
+ int ret_is_small_unsigned = 0;
+
+ /* If there's just one type, pass it through */
+ if (narrs + ndtypes == 1) {
+ if (narrs == 1) {
+ ret = PyArray_DESCR(arr[0]);
+ }
+ else {
+ ret = dtypes[0];
+ }
+ Py_INCREF(ret);
+ return ret;
+ }
+
+ /* Determine if there are any scalars */
+ if (ndtypes > 0) {
+ all_scalar = 0;
+ }
+ else {
+ all_scalar = 1;
+ for (i = 0; i < narrs; ++i) {
+ if (PyArray_NDIM(arr[i]) != 0) {
+ all_scalar = 0;
+ break;
+ }
+ }
+ }
+
+ /* Loop through all the types, promoting them */
+ if (all_scalar) {
+ for (i = 0; i < narrs; ++i) {
+ PyArray_Descr *tmp = PyArray_DESCR(arr[i]);
+ /* Combine it with the existing type */
+ if (ret == NULL) {
+ ret = tmp;
+ Py_INCREF(ret);
+ }
+ else {
+ tmpret = PyArray_PromoteTypes(tmp, ret);
+ Py_DECREF(ret);
+ ret = tmpret;
+ }
+ }
+ }
+ else {
+ for (i = 0; i < narrs; ++i) {
+ /* Get the min scalar type for the array */
+ PyArray_Descr *tmp = PyArray_DESCR(arr[i]);
+ int tmp_is_small_unsigned = 0;
+ /*
+ * If it's a scalar, find the min scalar type. The function
+ * is expanded here so that we can flag whether we've got an
+ * unsigned integer which would fit in a signed integer
+ * of the same size, something not exposed in the public API.
+ */
+ if (PyArray_NDIM(arr[i]) == 0 && PyTypeNum_ISNUMBER(tmp->type_num)) {
+ char *data = PyArray_BYTES(arr[i]);
+ int swap = !PyArray_ISNBO(tmp->byteorder);
+ int type_num;
+ /* An aligned memory buffer large enough to hold any type */
+#if NPY_SIZEOF_LONGLONG >= NPY_SIZEOF_CLONGDOUBLE
+ npy_longlong value;
+#else
+ npy_clongdouble value;
+#endif
+ tmp->f->copyswap(&value, data, swap, NULL);
+ type_num = min_scalar_type_num((char *)&value,
+ tmp->type_num, &tmp_is_small_unsigned);
+ tmp = PyArray_DescrFromType(type_num);
+ if (tmp == NULL) {
+ Py_XDECREF(ret);
+ return NULL;
+ }
+ }
+ else {
+ Py_INCREF(tmp);
+ }
+ /* Combine it with the existing type */
+ if (ret == NULL) {
+ ret = tmp;
+ ret_is_small_unsigned = tmp_is_small_unsigned;
+ }
+ else {
+#if 0
+ printf("promoting type ");
+ PyObject_Print(tmp, stdout, 0);
+ printf(" (%d) ", tmp_is_small_unsigned);
+ PyObject_Print(ret, stdout, 0);
+ printf(" (%d) ", ret_is_small_unsigned);
+ printf("\n");
+#endif
+ tmpret = promote_types(tmp, ret, tmp_is_small_unsigned,
+ ret_is_small_unsigned);
+ if (tmpret == NULL) {
+ Py_DECREF(tmp);
+ Py_DECREF(ret);
+ return NULL;
+ }
+ ret_is_small_unsigned = tmp_is_small_unsigned &&
+ ret_is_small_unsigned;
+ Py_DECREF(tmp);
+ Py_DECREF(ret);
+ ret = tmpret;
+ }
+ }
+
+ for (i = 0; i < ndtypes; ++i) {
+ PyArray_Descr *tmp = dtypes[i];
+ /* Combine it with the existing type */
+ if (ret == NULL) {
+ ret = tmp;
+ Py_INCREF(ret);
+ }
+ else {
+ if (ret_is_small_unsigned) {
+ tmpret = promote_types(tmp, ret, 0, ret_is_small_unsigned);
+ if (tmpret == NULL) {
+ Py_DECREF(tmp);
+ Py_DECREF(ret);
+ return NULL;
+ }
+ }
+ else {
+ tmpret = PyArray_PromoteTypes(tmp, ret);
+ }
+ Py_DECREF(ret);
+ ret = tmpret;
+ }
+ }
+ }
+
+ if (ret == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "no arrays or types available to calculate result type");
+ }
+
+ return ret;
}
/*NUMPY_API
@@ -804,12 +1429,18 @@ PyArray_ConvertToCommonType(PyObject *op, int *retn)
}
for (i = 0; i < n; i++) {
+ mps[i] = NULL;
+ }
+
+ for (i = 0; i < n; i++) {
otmp = PySequence_GetItem(op, i);
if (!PyArray_CheckAnyScalar(otmp)) {
newtype = PyArray_DescrFromObject(otmp, intype);
Py_XDECREF(intype);
+ if (newtype == NULL) {
+ goto fail;
+ }
intype = newtype;
- mps[i] = NULL;
intypekind = PyArray_ScalarKind(intype->type_num, NULL);
}
else {
@@ -841,7 +1472,7 @@ PyArray_ConvertToCommonType(PyObject *op, int *retn)
if (!PyArray_CanCoerceScalar(stype->type_num,
intype->type_num,
scalarkind)) {
- newtype = _array_small_type(intype, stype);
+ newtype = PyArray_PromoteTypes(intype, stype);
Py_XDECREF(intype);
intype = newtype;
}
diff --git a/numpy/core/src/multiarray/convert_datatype.h b/numpy/core/src/multiarray/convert_datatype.h
index 60bdd8274..844cce0c9 100644
--- a/numpy/core/src/multiarray/convert_datatype.h
+++ b/numpy/core/src/multiarray/convert_datatype.h
@@ -1,22 +1,10 @@
#ifndef _NPY_ARRAY_CONVERT_DATATYPE_H_
#define _NPY_ARRAY_CONVERT_DATATYPE_H_
-NPY_NO_EXPORT PyObject *
-PyArray_CastToType(PyArrayObject *mp, PyArray_Descr *at, int fortran);
-
-NPY_NO_EXPORT int
-PyArray_CastTo(PyArrayObject *out, PyArrayObject *mp);
-
NPY_NO_EXPORT PyArray_VectorUnaryFunc *
PyArray_GetCastFunc(PyArray_Descr *descr, int type_num);
NPY_NO_EXPORT int
-PyArray_CanCastSafely(int fromtype, int totype);
-
-NPY_NO_EXPORT Bool
-PyArray_CanCastTo(PyArray_Descr *from, PyArray_Descr *to);
-
-NPY_NO_EXPORT int
PyArray_ObjectType(PyObject *op, int minimum_type);
NPY_NO_EXPORT PyArrayObject **
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index 18c39dccd..5f1f7aa25 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -17,10 +17,14 @@
#include "ctors.h"
+#include "shape.h"
+
#include "buffer.h"
#include "numpymemoryview.h"
+#include "lowlevel_strided_loops.h"
+
/*
* Reading from a file or a string.
*
@@ -206,13 +210,13 @@ fromfile_skip_separator(FILE **fp, const char *sep, void *NPY_UNUSED(stream_data
* Strides are only added if given (because data is given).
*/
static int
-_update_descr_and_dimensions(PyArray_Descr **des, intp *newdims,
- intp *newstrides, int oldnd)
+_update_descr_and_dimensions(PyArray_Descr **des, npy_intp *newdims,
+ npy_intp *newstrides, int oldnd)
{
PyArray_Descr *old;
int newnd;
int numnew;
- intp *mydim;
+ npy_intp *mydim;
int i;
int tuple;
@@ -236,17 +240,17 @@ _update_descr_and_dimensions(PyArray_Descr **des, intp *newdims,
}
if (tuple) {
for (i = 0; i < numnew; i++) {
- mydim[i] = (intp) PyInt_AsLong(
+ mydim[i] = (npy_intp) PyInt_AsLong(
PyTuple_GET_ITEM(old->subarray->shape, i));
}
}
else {
- mydim[0] = (intp) PyInt_AsLong(old->subarray->shape);
+ mydim[0] = (npy_intp) PyInt_AsLong(old->subarray->shape);
}
if (newstrides) {
- intp tempsize;
- intp *mystrides;
+ npy_intp tempsize;
+ npy_intp *mystrides;
mystrides = newstrides + oldnd;
 /* Make new strides -- always C-contiguous */
@@ -272,12 +276,12 @@ _update_descr_and_dimensions(PyArray_Descr **des, intp *newdims,
* same for each element
*/
static int
-object_depth_and_dimension(PyObject *s, int max, intp *dims)
+object_depth_and_dimension(PyObject *s, int max, npy_intp *dims)
{
- intp *newdims, *test_dims;
+ npy_intp *newdims, *test_dims;
int nd, test_nd;
int i, islist, istuple;
- intp size;
+ npy_intp size;
PyObject *obj;
islist = PyList_Check(s);
@@ -332,92 +336,11 @@ object_depth_and_dimension(PyObject *s, int max, intp *dims)
return nd + 1;
}
-static void
-_strided_byte_copy(char *dst, intp outstrides, char *src, intp instrides,
- intp N, int elsize)
-{
- intp i, j;
- char *tout = dst;
- char *tin = src;
-
-#define _FAST_MOVE(_type_) \
- for(i=0; i<N; i++) { \
- ((_type_ *)tout)[0] = ((_type_ *)tin)[0]; \
- tin += instrides; \
- tout += outstrides; \
- } \
- return
-
- switch(elsize) {
- case 8:
- _FAST_MOVE(Int64);
- case 4:
- _FAST_MOVE(Int32);
- case 1:
- _FAST_MOVE(Int8);
- case 2:
- _FAST_MOVE(Int16);
- case 16:
- for (i = 0; i < N; i++) {
- ((Int64 *)tout)[0] = ((Int64 *)tin)[0];
- ((Int64 *)tout)[1] = ((Int64 *)tin)[1];
- tin += instrides;
- tout += outstrides;
- }
- return;
- default:
- for(i = 0; i < N; i++) {
- for(j=0; j<elsize; j++) {
- *tout++ = *tin++;
- }
- tin = tin + instrides - elsize;
- tout = tout + outstrides - elsize;
- }
- }
-#undef _FAST_MOVE
-
-}
-
-static void
-_unaligned_strided_byte_move(char *dst, intp outstrides, char *src,
- intp instrides, intp N, int elsize)
-{
- intp i;
- char *tout = dst;
- char *tin = src;
-
-
-#define _MOVE_N_SIZE(size) \
- for(i=0; i<N; i++) { \
- memmove(tout, tin, size); \
- tin += instrides; \
- tout += outstrides; \
- } \
- return
-
- switch(elsize) {
- case 8:
- _MOVE_N_SIZE(8);
- case 4:
- _MOVE_N_SIZE(4);
- case 1:
- _MOVE_N_SIZE(1);
- case 2:
- _MOVE_N_SIZE(2);
- case 16:
- _MOVE_N_SIZE(16);
- default:
- _MOVE_N_SIZE(elsize);
- }
-#undef _MOVE_N_SIZE
-
-}
-
NPY_NO_EXPORT void
-_unaligned_strided_byte_copy(char *dst, intp outstrides, char *src,
- intp instrides, intp N, int elsize)
+_unaligned_strided_byte_copy(char *dst, npy_intp outstrides, char *src,
+ npy_intp instrides, npy_intp N, int elsize)
{
- intp i;
+ npy_intp i;
char *tout = dst;
char *tin = src;
@@ -448,7 +371,7 @@ _unaligned_strided_byte_copy(char *dst, intp outstrides, char *src,
}
NPY_NO_EXPORT void
-_strided_byte_swap(void *p, intp stride, intp n, int size)
+_strided_byte_swap(void *p, npy_intp stride, npy_intp n, int size)
{
char *a, *b, c = 0;
int j, m;
@@ -491,18 +414,18 @@ _strided_byte_swap(void *p, intp stride, intp n, int size)
}
NPY_NO_EXPORT void
-byte_swap_vector(void *p, intp n, int size)
+byte_swap_vector(void *p, npy_intp n, int size)
{
- _strided_byte_swap(p, (intp) size, n, size);
+ _strided_byte_swap(p, (npy_intp) size, n, size);
return;
}
/* If numitems > 1, then dst must be contiguous */
NPY_NO_EXPORT void
-copy_and_swap(void *dst, void *src, int itemsize, intp numitems,
- intp srcstrides, int swap)
+copy_and_swap(void *dst, void *src, int itemsize, npy_intp numitems,
+ npy_intp srcstrides, int swap)
{
- intp i;
+ npy_intp i;
char *s1 = (char *)src;
char *d1 = (char *)dst;
@@ -523,398 +446,105 @@ copy_and_swap(void *dst, void *src, int itemsize, intp numitems,
}
}
-static int
-_copy_from0d(PyArrayObject *dest, PyArrayObject *src, int usecopy, int swap)
+/* Gets a half-open range [start, end) which contains the array data */
+void _get_memory_extents(PyArrayObject *arr,
+ npy_uintp *out_start, npy_uintp *out_end)
{
- char *aligned = NULL;
- char *sptr;
- intp numcopies, nbytes;
- void (*myfunc)(char *, intp, char *, intp, intp, int);
- int retval = -1;
- NPY_BEGIN_THREADS_DEF;
-
- numcopies = PyArray_SIZE(dest);
- if (numcopies < 1) {
- return 0;
- }
- nbytes = PyArray_ITEMSIZE(src);
-
- if (!PyArray_ISALIGNED(src)) {
- aligned = malloc((size_t)nbytes);
- if (aligned == NULL) {
- PyErr_NoMemory();
- return -1;
- }
- memcpy(aligned, src->data, (size_t) nbytes);
- usecopy = 1;
- sptr = aligned;
- }
- else {
- sptr = src->data;
- }
- if (PyArray_SAFEALIGNEDCOPY(dest)) {
- myfunc = _strided_byte_copy;
- }
- else if (usecopy) {
- myfunc = _unaligned_strided_byte_copy;
- }
- else {
- myfunc = _unaligned_strided_byte_move;
- }
-
- if ((dest->nd < 2) || PyArray_ISONESEGMENT(dest)) {
- char *dptr;
- intp dstride;
-
- dptr = dest->data;
- if (dest->nd == 1) {
- dstride = dest->strides[0];
+ npy_uintp start, end;
+ npy_intp idim, ndim = PyArray_NDIM(arr);
+ npy_intp *dimensions = PyArray_DIMS(arr),
+ *strides = PyArray_STRIDES(arr);
+
+ /* Calculate with a closed range [start, end] */
+ start = end = (npy_uintp)PyArray_DATA(arr);
+ for (idim = 0; idim < ndim; ++idim) {
+ npy_intp stride = strides[idim], dim = dimensions[idim];
+ /* If the array size is zero, return an empty range */
+ if (dim == 0) {
+ *out_start = *out_end = (npy_uintp)PyArray_DATA(arr);
+ return;
}
+ /* Expand either upwards or downwards depending on stride */
else {
- dstride = nbytes;
- }
-
- /* Refcount note: src and dest may have different sizes */
- PyArray_INCREF(src);
- PyArray_XDECREF(dest);
- NPY_BEGIN_THREADS;
- myfunc(dptr, dstride, sptr, 0, numcopies, (int) nbytes);
- if (swap) {
- _strided_byte_swap(dptr, dstride, numcopies, (int) nbytes);
- }
- NPY_END_THREADS;
- PyArray_INCREF(dest);
- PyArray_XDECREF(src);
- }
- else {
- PyArrayIterObject *dit;
- int axis = -1;
-
- dit = (PyArrayIterObject *)
- PyArray_IterAllButAxis((PyObject *)dest, &axis);
- if (dit == NULL) {
- goto finish;
- }
- /* Refcount note: src and dest may have different sizes */
- PyArray_INCREF(src);
- PyArray_XDECREF(dest);
- NPY_BEGIN_THREADS;
- while(dit->index < dit->size) {
- myfunc(dit->dataptr, PyArray_STRIDE(dest, axis), sptr, 0,
- PyArray_DIM(dest, axis), nbytes);
- if (swap) {
- _strided_byte_swap(dit->dataptr, PyArray_STRIDE(dest, axis),
- PyArray_DIM(dest, axis), nbytes);
+ if (stride > 0) {
+ end += stride*(dim-1);
+ }
+ else if (stride < 0) {
+ start += stride*(dim-1);
}
- PyArray_ITER_NEXT(dit);
- }
- NPY_END_THREADS;
- PyArray_INCREF(dest);
- PyArray_XDECREF(src);
- Py_DECREF(dit);
- }
- retval = 0;
-
-finish:
- if (aligned != NULL) {
- free(aligned);
- }
- return retval;
-}
-
-/*
- * Special-case of PyArray_CopyInto when dst is 1-d
- * and contiguous (and aligned).
- * PyArray_CopyInto requires broadcastable arrays while
- * this one is a flattening operation...
- */
-NPY_NO_EXPORT int
-_flat_copyinto(PyObject *dst, PyObject *src, NPY_ORDER order)
-{
- PyArrayIterObject *it;
- PyObject *orig_src;
- void (*myfunc)(char *, intp, char *, intp, intp, int);
- char *dptr;
- int axis;
- int elsize;
- intp nbytes;
- NPY_BEGIN_THREADS_DEF;
-
-
- orig_src = src;
- if (PyArray_NDIM(src) == 0) {
- /* Refcount note: src and dst have the same size */
- PyArray_INCREF((PyArrayObject *)src);
- PyArray_XDECREF((PyArrayObject *)dst);
- NPY_BEGIN_THREADS;
- memcpy(PyArray_BYTES(dst), PyArray_BYTES(src),
- PyArray_ITEMSIZE(src));
- NPY_END_THREADS;
- return 0;
- }
-
- axis = PyArray_NDIM(src)-1;
-
- if (order == PyArray_FORTRANORDER) {
- if (PyArray_NDIM(src) <= 2) {
- axis = 0;
- }
- /* fall back to a more general method */
- else {
- src = PyArray_Transpose((PyArrayObject *)orig_src, NULL);
- }
- }
-
- it = (PyArrayIterObject *)PyArray_IterAllButAxis(src, &axis);
- if (it == NULL) {
- if (src != orig_src) {
- Py_DECREF(src);
}
- return -1;
- }
-
- if (PyArray_SAFEALIGNEDCOPY(src)) {
- myfunc = _strided_byte_copy;
}
- else {
- myfunc = _unaligned_strided_byte_copy;
- }
-
- dptr = PyArray_BYTES(dst);
- elsize = PyArray_ITEMSIZE(dst);
- nbytes = elsize * PyArray_DIM(src, axis);
- /* Refcount note: src and dst have the same size */
- PyArray_INCREF((PyArrayObject *)src);
- PyArray_XDECREF((PyArrayObject *)dst);
- NPY_BEGIN_THREADS;
- while(it->index < it->size) {
- myfunc(dptr, elsize, it->dataptr, PyArray_STRIDE(src,axis),
- PyArray_DIM(src,axis), elsize);
- dptr += nbytes;
- PyArray_ITER_NEXT(it);
- }
- NPY_END_THREADS;
-
- if (src != orig_src) {
- Py_DECREF(src);
- }
- Py_DECREF(it);
- return 0;
+ /* Return a half-open range */
+ *out_start = start;
+ *out_end = end + arr->descr->elsize;
}
-
-static int
-_copy_from_same_shape(PyArrayObject *dest, PyArrayObject *src,
- void (*myfunc)(char *, intp, char *, intp, intp, int),
- int swap)
+/* Returns 1 if the arrays have overlapping data, 0 otherwise */
+int _arrays_overlap(PyArrayObject *arr1, PyArrayObject *arr2)
{
- int maxaxis = -1, elsize;
- intp maxdim;
- PyArrayIterObject *dit, *sit;
- NPY_BEGIN_THREADS_DEF;
-
- dit = (PyArrayIterObject *)
- PyArray_IterAllButAxis((PyObject *)dest, &maxaxis);
- sit = (PyArrayIterObject *)
- PyArray_IterAllButAxis((PyObject *)src, &maxaxis);
+ npy_uintp start1 = 0, start2 = 0, end1 = 0, end2 = 0;
- maxdim = dest->dimensions[maxaxis];
-
- if ((dit == NULL) || (sit == NULL)) {
- Py_XDECREF(dit);
- Py_XDECREF(sit);
- return -1;
- }
- elsize = PyArray_ITEMSIZE(dest);
+ _get_memory_extents(arr1, &start1, &end1);
+ _get_memory_extents(arr2, &start2, &end2);
- /* Refcount note: src and dst have the same size */
- PyArray_INCREF(src);
- PyArray_XDECREF(dest);
-
- NPY_BEGIN_THREADS;
- while(dit->index < dit->size) {
- /* strided copy of elsize bytes */
- myfunc(dit->dataptr, dest->strides[maxaxis],
- sit->dataptr, src->strides[maxaxis],
- maxdim, elsize);
- if (swap) {
- _strided_byte_swap(dit->dataptr,
- dest->strides[maxaxis],
- dest->dimensions[maxaxis],
- elsize);
- }
- PyArray_ITER_NEXT(dit);
- PyArray_ITER_NEXT(sit);
- }
- NPY_END_THREADS;
-
- Py_DECREF(sit);
- Py_DECREF(dit);
- return 0;
+ return (start1 < end2) && (start2 < end1);
}
-static int
-_broadcast_copy(PyArrayObject *dest, PyArrayObject *src,
- void (*myfunc)(char *, intp, char *, intp, intp, int),
- int swap)
+/*NUMPY_API
+ * Move the memory of one array into another, allowing for overlapping data.
+ *
+ * This is in general a difficult problem to solve efficiently, because
+ * strides can be negative. Consider "a = np.arange(3); a[::-1] = a", which
+ * previously produced the incorrect [0, 1, 0].
+ *
+ * Instead of trying to be fancy, we simply check for overlap and make
+ * a temporary copy when one exists.
+ *
+ * Returns 0 on success, negative on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_MoveInto(PyArrayObject *dst, PyArrayObject *src)
{
- int elsize;
- PyArrayMultiIterObject *multi;
- int maxaxis; intp maxdim;
- NPY_BEGIN_THREADS_DEF;
-
- elsize = PyArray_ITEMSIZE(dest);
- multi = (PyArrayMultiIterObject *)PyArray_MultiIterNew(2, dest, src);
- if (multi == NULL) {
- return -1;
- }
-
- if (multi->size != PyArray_SIZE(dest)) {
- PyErr_SetString(PyExc_ValueError,
- "array dimensions are not "\
- "compatible for copy");
- Py_DECREF(multi);
- return -1;
- }
-
- maxaxis = PyArray_RemoveSmallest(multi);
- if (maxaxis < 0) {
- /*
- * copy 1 0-d array to another
- * Refcount note: src and dst have the same size
- */
- PyArray_INCREF(src);
- PyArray_XDECREF(dest);
- memcpy(dest->data, src->data, elsize);
- if (swap) {
- byte_swap_vector(dest->data, 1, elsize);
- }
- return 0;
- }
- maxdim = multi->dimensions[maxaxis];
-
/*
- * Increment the source and decrement the destination
- * reference counts
- *
- * Refcount note: src and dest may have different sizes
+ * A special case is when there is just one dimension with positive
+ * strides, and we pass that to CopyInto, which correctly handles
+ * it for most cases. It may still incorrectly handle copying of
+ * partially-overlapping data elements, where the data pointer was offset
+ * by a fraction of the element size.
*/
- PyArray_INCREF(src);
- PyArray_XDECREF(dest);
-
- NPY_BEGIN_THREADS;
- while(multi->index < multi->size) {
- myfunc(multi->iters[0]->dataptr,
- multi->iters[0]->strides[maxaxis],
- multi->iters[1]->dataptr,
- multi->iters[1]->strides[maxaxis],
- maxdim, elsize);
- if (swap) {
- _strided_byte_swap(multi->iters[0]->dataptr,
- multi->iters[0]->strides[maxaxis],
- maxdim, elsize);
- }
- PyArray_MultiIter_NEXT(multi);
- }
- NPY_END_THREADS;
-
- PyArray_INCREF(dest);
- PyArray_XDECREF(src);
-
- Py_DECREF(multi);
- return 0;
-}
-
-/* If destination is not the right type, then src
- will be cast to destination -- this requires
- src and dest to have the same shape
-*/
-
-/* Requires arrays to have broadcastable shapes
-
- The arrays are assumed to have the same number of elements
- They can be different sizes and have different types however.
-*/
-
-static int
-_array_copy_into(PyArrayObject *dest, PyArrayObject *src, int usecopy)
-{
- int swap;
- void (*myfunc)(char *, intp, char *, intp, intp, int);
- int simple;
- int same;
- NPY_BEGIN_THREADS_DEF;
-
-
- if (!PyArray_EquivArrTypes(dest, src)) {
- return PyArray_CastTo(dest, src);
- }
- if (!PyArray_ISWRITEABLE(dest)) {
- PyErr_SetString(PyExc_RuntimeError,
- "cannot write to array");
- return -1;
+ if ((PyArray_NDIM(dst) == 1 &&
+ PyArray_NDIM(src) == 1 &&
+ PyArray_STRIDE(dst, 0) > 0 &&
+ PyArray_STRIDE(src, 0) > 0) ||
+ !_arrays_overlap(dst, src)) {
+ return PyArray_CopyInto(dst, src);
}
- same = PyArray_SAMESHAPE(dest, src);
- simple = same && ((PyArray_ISCARRAY_RO(src) && PyArray_ISCARRAY(dest)) ||
- (PyArray_ISFARRAY_RO(src) && PyArray_ISFARRAY(dest)));
+ else {
+ PyArrayObject *tmp;
+ int ret;
- if (simple) {
- /* Refcount note: src and dest have the same size */
- PyArray_INCREF(src);
- PyArray_XDECREF(dest);
- NPY_BEGIN_THREADS;
- if (usecopy) {
- memcpy(dest->data, src->data, PyArray_NBYTES(dest));
+ /*
+ * Allocate a temporary copy array.
+ */
+ tmp = (PyArrayObject *)PyArray_NewLikeArray(dst, NPY_KEEPORDER, NULL);
+ if (tmp == NULL) {
+ return -1;
}
- else {
- memmove(dest->data, src->data, PyArray_NBYTES(dest));
+ ret = PyArray_CopyInto(tmp, src);
+ if (ret == 0) {
+ ret = PyArray_CopyInto(dst, tmp);
}
- NPY_END_THREADS;
- return 0;
- }
-
- swap = PyArray_ISNOTSWAPPED(dest) != PyArray_ISNOTSWAPPED(src);
-
- if (src->nd == 0) {
- return _copy_from0d(dest, src, usecopy, swap);
- }
-
- if (PyArray_SAFEALIGNEDCOPY(dest) && PyArray_SAFEALIGNEDCOPY(src)) {
- myfunc = _strided_byte_copy;
- }
- else if (usecopy) {
- myfunc = _unaligned_strided_byte_copy;
- }
- else {
- myfunc = _unaligned_strided_byte_move;
- }
- /*
- * Could combine these because _broadcasted_copy would work as well.
- * But, same-shape copying is so common we want to speed it up.
- */
- if (same) {
- return _copy_from_same_shape(dest, src, myfunc, swap);
- }
- else {
- return _broadcast_copy(dest, src, myfunc, swap);
+ Py_DECREF(tmp);
+ return ret;
}
}
-/*NUMPY_API
- * Move the memory of one array into another.
- */
-NPY_NO_EXPORT int
-PyArray_MoveInto(PyArrayObject *dest, PyArrayObject *src)
-{
- return _array_copy_into(dest, src, 0);
-}
-
/* adapted from Numarray */
static int
-setArrayFromSequence(PyArrayObject *a, PyObject *s, int dim, intp offset)
+setArrayFromSequence(PyArrayObject *a, PyObject *s, int dim, npy_intp offset)
{
Py_ssize_t i, slen;
int res = -1;
@@ -1042,7 +672,7 @@ static PyObject *
ObjectArray_FromNestedList(PyObject *s, PyArray_Descr *typecode, int fortran)
{
int nd;
- intp d[MAX_DIMS];
+ npy_intp d[MAX_DIMS];
PyArrayObject *r;
/* Get the depth and the number of dimensions */
@@ -1181,14 +811,12 @@ static int
discover_itemsize(PyObject *s, int nd, int *itemsize)
{
int n, r, i;
- PyObject *e;
if (PyArray_Check(s)) {
*itemsize = MAX(*itemsize, PyArray_ITEMSIZE(s));
return 0;
}
- n = PyObject_Length(s);
if ((nd == 0) || PyString_Check(s) ||
#if defined(NPY_PY3K)
PyMemoryView_Check(s) ||
@@ -1197,19 +825,32 @@ discover_itemsize(PyObject *s, int nd, int *itemsize)
#endif
PyUnicode_Check(s)) {
- *itemsize = MAX(*itemsize, n);
+ /* If an object has no length, leave it be */
+ n = PyObject_Length(s);
+ if (n == -1) {
+ PyErr_Clear();
+ }
+ else {
+ *itemsize = MAX(*itemsize, n);
+ }
return 0;
}
+
+ n = PySequence_Length(s);
for (i = 0; i < n; i++) {
- if ((e = PySequence_GetItem(s,i))==NULL) {
+ PyObject *e = PySequence_GetItem(s,i);
+
+ if (e == NULL) {
return -1;
}
+
r = discover_itemsize(e,nd-1,itemsize);
Py_DECREF(e);
if (r == -1) {
return -1;
}
}
+
return 0;
}
@@ -1218,7 +859,7 @@ discover_itemsize(PyObject *s, int nd, int *itemsize)
* an array of ndim nd, and determine the size in each dimension
*/
static int
-discover_dimensions(PyObject *s, int nd, intp *d, int check_it)
+discover_dimensions(PyObject *s, int nd, npy_intp *d, int check_it)
{
PyObject *e;
int r, n, i, n_lower;
@@ -1285,7 +926,7 @@ Array_FromSequence(PyObject *s, PyArray_Descr *typecode, int fortran,
PyArrayObject *r;
int nd;
int err;
- intp d[MAX_DIMS];
+ npy_intp d[MAX_DIMS];
int stop_at_string;
int stop_at_tuple;
int check_it;
@@ -1333,6 +974,7 @@ Array_FromSequence(PyObject *s, PyArray_Descr *typecode, int fortran,
itemsize *= 4;
}
}
+
if (itemsize != typecode->elsize) {
PyArray_DESCR_REPLACE(typecode);
typecode->elsize = itemsize;
@@ -1367,23 +1009,23 @@ Array_FromSequence(PyObject *s, PyArray_Descr *typecode, int fortran,
*/
NPY_NO_EXPORT PyObject *
PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd,
- intp *dims, intp *strides, void *data,
+ npy_intp *dims, npy_intp *strides, void *data,
int flags, PyObject *obj)
{
PyArrayObject *self;
int i;
size_t sd;
- intp largest;
- intp size;
+ npy_intp largest;
+ npy_intp size;
if (descr->subarray) {
PyObject *ret;
- intp newdims[2*MAX_DIMS];
- intp *newstrides = NULL;
- memcpy(newdims, dims, nd*sizeof(intp));
+ npy_intp newdims[2*MAX_DIMS];
+ npy_intp *newstrides = NULL;
+ memcpy(newdims, dims, nd*sizeof(npy_intp));
if (strides) {
newstrides = newdims + MAX_DIMS;
- memcpy(newstrides, strides, nd*sizeof(intp));
+ memcpy(newstrides, strides, nd*sizeof(npy_intp));
}
nd =_update_descr_and_dimensions(&descr, newdims,
newstrides, nd);
@@ -1426,7 +1068,7 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd,
largest = NPY_MAX_INTP / sd;
for (i = 0; i < nd; i++) {
- intp dim = dims[i];
+ npy_intp dim = dims[i];
if (dim == 0) {
/*
@@ -1484,7 +1126,7 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd,
goto fail;
}
self->strides = self->dimensions + nd;
- memcpy(self->dimensions, dims, sizeof(intp)*nd);
+ memcpy(self->dimensions, dims, sizeof(npy_intp)*nd);
if (strides == NULL) { /* fill it in */
sd = _array_fill_strides(self->strides, dims, nd, sd,
flags, &(self->flags));
@@ -1494,12 +1136,13 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd,
* we allow strides even when we create
* the memory, but be careful with this...
*/
- memcpy(self->strides, strides, sizeof(intp)*nd);
+ memcpy(self->strides, strides, sizeof(npy_intp)*nd);
sd *= size;
}
}
else {
self->dimensions = self->strides = NULL;
+ self->flags |= FORTRAN;
}
if (data == NULL) {
@@ -1589,11 +1232,99 @@ PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int nd,
}
/*NUMPY_API
+ * Creates a new array with the same shape as the provided one,
+ * with possible memory layout order and data type changes.
+ *
+ * prototype - The array the new one should be like.
+ * order - NPY_CORDER - C-contiguous result.
+ * NPY_FORTRANORDER - Fortran-contiguous result.
+ * NPY_ANYORDER - Fortran if prototype is Fortran, C otherwise.
+ * NPY_KEEPORDER - Keeps the axis ordering of prototype.
+ * dtype - If not NULL, overrides the data type of the result.
+ *
+ * NOTE: If dtype is not NULL, steals the dtype reference.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_NewLikeArray(PyArrayObject *prototype, NPY_ORDER order,
+ PyArray_Descr *dtype)
+{
+ PyObject *ret = NULL;
+ int ndim = PyArray_NDIM(prototype);
+
+ /* If no override data type, use the one from the prototype */
+ if (dtype == NULL) {
+ dtype = PyArray_DESCR(prototype);
+ Py_INCREF(dtype);
+ }
+
+ /* Handle ANYORDER and simple KEEPORDER cases */
+ switch (order) {
+ case NPY_ANYORDER:
+ order = PyArray_ISFORTRAN(prototype) ?
+ NPY_FORTRANORDER : NPY_CORDER;
+ break;
+ case NPY_KEEPORDER:
+ if (PyArray_IS_C_CONTIGUOUS(prototype) || ndim <= 1) {
+ order = NPY_CORDER;
+ break;
+ }
+ else if (PyArray_IS_F_CONTIGUOUS(prototype)) {
+ order = NPY_FORTRANORDER;
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+
+ /* If it's not KEEPORDER, this is simple */
+ if (order != NPY_KEEPORDER) {
+ ret = PyArray_NewFromDescr(Py_TYPE(prototype),
+ dtype,
+ ndim,
+ PyArray_DIMS(prototype),
+ NULL,
+ NULL,
+ order,
+ (PyObject *)prototype);
+ }
+ /* KEEPORDER needs some analysis of the strides */
+ else {
+ npy_intp strides[NPY_MAXDIMS], stride;
+ npy_intp *shape = PyArray_DIMS(prototype);
+ _npy_stride_sort_item strideperm[NPY_MAXDIMS];
+ int i;
+
+ PyArray_CreateSortedStridePerm(prototype, strideperm);
+
+ /* Build the new strides */
+ stride = dtype->elsize;
+ for (i = ndim-1; i >= 0; --i) {
+ npy_intp i_perm = strideperm[i].perm;
+ strides[i_perm] = stride;
+ stride *= shape[i_perm];
+ }
+
+ /* Finally, allocate the array */
+ ret = PyArray_NewFromDescr(Py_TYPE(prototype),
+ dtype,
+ ndim,
+ shape,
+ strides,
+ NULL,
+ 0,
+ (PyObject *)prototype);
+ }
+
+ return ret;
+}
+
+/*NUMPY_API
* Generic new array creation routine.
*/
NPY_NO_EXPORT PyObject *
-PyArray_New(PyTypeObject *subtype, int nd, intp *dims, int type_num,
- intp *strides, void *data, int itemsize, int flags,
+PyArray_New(PyTypeObject *subtype, int nd, npy_intp *dims, int type_num,
+ npy_intp *strides, void *data, int itemsize, int flags,
PyObject *obj)
{
PyArray_Descr *descr;
@@ -1640,7 +1371,7 @@ _array_from_buffer_3118(PyObject *obj, PyObject **out)
view = PyMemoryView_GET_BUFFER(memoryview);
if (view->format != NULL) {
- descr = (PyObject*)_descriptor_from_pep3118_format(view->format);
+ descr = _descriptor_from_pep3118_format(view->format);
if (descr == NULL) {
PyObject *msg;
msg = PyBytes_FromFormat("Invalid PEP 3118 format string: '%s'",
@@ -1782,6 +1513,9 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
}
if (newtype == NULL) {
newtype = _array_find_type(op, NULL, MAX_DIMS);
+ if (newtype == NULL) {
+ return NULL;
+ }
}
else if (newtype->type_num == PyArray_OBJECT) {
isobject = 1;
@@ -2322,7 +2056,7 @@ PyArray_FromInterface(PyObject *input)
if (PyErr_Occurred()) {
PyErr_Clear();
}
- memcpy(ret->strides, strides, n*sizeof(intp));
+ memcpy(ret->strides, strides, n*sizeof(npy_intp));
}
else PyErr_Clear();
PyArray_UpdateFlags(ret, UPDATE_ALL);
@@ -2409,7 +2143,7 @@ PyArray_FromDimsAndDataAndDescr(int nd, int *d,
{
PyObject *ret;
int i;
- intp newd[MAX_DIMS];
+ npy_intp newd[MAX_DIMS];
char msg[] = "PyArray_FromDimsAndDataAndDescr: use PyArray_NewFromDescr.";
if (DEPRECATE(msg) < 0) {
@@ -2418,7 +2152,7 @@ PyArray_FromDimsAndDataAndDescr(int nd, int *d,
if (!PyArray_ISNBO(descr->byteorder))
descr->byteorder = '=';
for (i = 0; i < nd; i++) {
- newd[i] = (intp) d[i];
+ newd[i] = (npy_intp) d[i];
}
ret = PyArray_NewFromDescr(&PyArray_Type, descr,
nd, newd,
@@ -2495,92 +2229,352 @@ PyArray_EnsureAnyArray(PyObject *op)
return PyArray_EnsureArray(op);
}
-/*NUMPY_API
- * Copy an Array into another array -- memory must not overlap
- * Does not require src and dest to have "broadcastable" shapes
- * (only the same number of elements).
- */
+/* TODO: Put the order parameter in PyArray_CopyAnyInto and remove this */
NPY_NO_EXPORT int
-PyArray_CopyAnyInto(PyArrayObject *dest, PyArrayObject *src)
+PyArray_CopyAnyIntoOrdered(PyArrayObject *dst, PyArrayObject *src,
+ NPY_ORDER order)
{
- int elsize, simple;
- PyArrayIterObject *idest, *isrc;
- void (*myfunc)(char *, intp, char *, intp, intp, int);
+ PyArray_StridedTransferFn *stransfer = NULL;
+ void *transferdata = NULL;
+ NpyIter *dst_iter, *src_iter;
NPY_BEGIN_THREADS_DEF;
- if (!PyArray_EquivArrTypes(dest, src)) {
- return PyArray_CastAnyTo(dest, src);
- }
- if (!PyArray_ISWRITEABLE(dest)) {
+ NpyIter_IterNext_Fn dst_iternext, src_iternext;
+ char **dst_dataptr, **src_dataptr;
+ npy_intp dst_stride, src_stride;
+ npy_intp *dst_countptr, *src_countptr;
+
+ char *dst_data, *src_data;
+ npy_intp dst_count, src_count, count;
+ npy_intp src_itemsize;
+ npy_intp dst_size, src_size;
+ int needs_api;
+
+ if (!PyArray_ISWRITEABLE(dst)) {
PyErr_SetString(PyExc_RuntimeError,
"cannot write to array");
return -1;
}
- if (PyArray_SIZE(dest) != PyArray_SIZE(src)) {
+
+ /*
+ * If the shapes match and a particular order is forced
+ * for both, use the more efficient CopyInto
+ */
+ if (order != NPY_ANYORDER && order != NPY_KEEPORDER &&
+ PyArray_NDIM(dst) == PyArray_NDIM(src) &&
+ PyArray_CompareLists(PyArray_DIMS(dst), PyArray_DIMS(src),
+ PyArray_NDIM(dst))) {
+ return PyArray_CopyInto(dst, src);
+ }
+
+ dst_size = PyArray_SIZE(dst);
+ src_size = PyArray_SIZE(src);
+ if (dst_size != src_size) {
PyErr_SetString(PyExc_ValueError,
"arrays must have the same number of elements"
" for copy");
return -1;
}
- simple = ((PyArray_ISCARRAY_RO(src) && PyArray_ISCARRAY(dest)) ||
- (PyArray_ISFARRAY_RO(src) && PyArray_ISFARRAY(dest)));
- if (simple) {
- /* Refcount note: src and dest have the same size */
- PyArray_INCREF(src);
- PyArray_XDECREF(dest);
- NPY_BEGIN_THREADS;
- memcpy(dest->data, src->data, PyArray_NBYTES(dest));
- NPY_END_THREADS;
+ /* Zero-sized arrays require nothing be done */
+ if (dst_size == 0) {
return 0;
}
- if (PyArray_SAMESHAPE(dest, src)) {
- int swap;
- if (PyArray_SAFEALIGNEDCOPY(dest) && PyArray_SAFEALIGNEDCOPY(src)) {
- myfunc = _strided_byte_copy;
- }
- else {
- myfunc = _unaligned_strided_byte_copy;
- }
- swap = PyArray_ISNOTSWAPPED(dest) != PyArray_ISNOTSWAPPED(src);
- return _copy_from_same_shape(dest, src, myfunc, swap);
+ /*
+ * This copy is based on matching C-order traversals of src and dst.
+ * By using two iterators, we can find maximal sub-chunks that
+ * can be processed at once.
+ */
+ dst_iter = NpyIter_New(dst, NPY_ITER_WRITEONLY|
+ NPY_ITER_NO_INNER_ITERATION|
+ NPY_ITER_DONT_REVERSE_AXES|
+ NPY_ITER_REFS_OK,
+ order,
+ NPY_NO_CASTING,
+ NULL, 0, NULL, 0);
+ if (dst_iter == NULL) {
+ return -1;
+ }
+ src_iter = NpyIter_New(src, NPY_ITER_READONLY|
+ NPY_ITER_NO_INNER_ITERATION|
+ NPY_ITER_DONT_REVERSE_AXES|
+ NPY_ITER_REFS_OK,
+ order,
+ NPY_NO_CASTING,
+ NULL, 0, NULL, 0);
+ if (src_iter == NULL) {
+ NpyIter_Deallocate(dst_iter);
+ return -1;
}
- /* Otherwise we have to do an iterator-based copy */
- idest = (PyArrayIterObject *)PyArray_IterNew((PyObject *)dest);
- if (idest == NULL) {
+ /* Get all the values needed for the inner loop */
+ dst_iternext = NpyIter_GetIterNext(dst_iter, NULL);
+ dst_dataptr = NpyIter_GetDataPtrArray(dst_iter);
+ /* Since buffering is disabled, we can cache the stride */
+ dst_stride = *NpyIter_GetInnerStrideArray(dst_iter);
+ dst_countptr = NpyIter_GetInnerLoopSizePtr(dst_iter);
+
+ src_iternext = NpyIter_GetIterNext(src_iter, NULL);
+ src_dataptr = NpyIter_GetDataPtrArray(src_iter);
+ /* Since buffering is disabled, we can cache the stride */
+ src_stride = *NpyIter_GetInnerStrideArray(src_iter);
+ src_countptr = NpyIter_GetInnerLoopSizePtr(src_iter);
+
+ if (dst_iternext == NULL || src_iternext == NULL) {
+ NpyIter_Deallocate(dst_iter);
+ NpyIter_Deallocate(src_iter);
return -1;
}
- isrc = (PyArrayIterObject *)PyArray_IterNew((PyObject *)src);
- if (isrc == NULL) {
- Py_DECREF(idest);
+
+ src_itemsize = PyArray_DESCR(src)->elsize;
+
+ needs_api = NpyIter_IterationNeedsAPI(dst_iter) ||
+ NpyIter_IterationNeedsAPI(src_iter);
+
+ /*
+ * Because buffering is disabled in the iterator, the inner loop
+ * strides will be the same throughout the iteration loop. Thus,
+ * we can pass them to this function to take advantage of
+ * contiguous strides, etc.
+ */
+ if (PyArray_GetDTypeTransferFunction(
+ PyArray_ISALIGNED(src) && PyArray_ISALIGNED(dst),
+ src_stride, dst_stride,
+ PyArray_DESCR(src), PyArray_DESCR(dst),
+ 0,
+ &stransfer, &transferdata,
+ &needs_api) != NPY_SUCCEED) {
+ NpyIter_Deallocate(dst_iter);
+ NpyIter_Deallocate(src_iter);
return -1;
}
- elsize = dest->descr->elsize;
- /* Refcount note: src and dest have the same size */
- PyArray_INCREF(src);
- PyArray_XDECREF(dest);
- NPY_BEGIN_THREADS;
- while(idest->index < idest->size) {
- memcpy(idest->dataptr, isrc->dataptr, elsize);
- PyArray_ITER_NEXT(idest);
- PyArray_ITER_NEXT(isrc);
- }
- NPY_END_THREADS;
- Py_DECREF(idest);
- Py_DECREF(isrc);
- return 0;
+
+
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
+ }
+
+ dst_count = *dst_countptr;
+ src_count = *src_countptr;
+ dst_data = *dst_dataptr;
+ src_data = *src_dataptr;
+ /*
+ * The tests did not trigger this code, so added a new function
+ * ndarray.setasflat to the Python exposure in order to test it.
+ */
+ for(;;) {
+ /* Transfer the biggest amount that fits both */
+ count = (src_count < dst_count) ? src_count : dst_count;
+ stransfer(dst_data, dst_stride,
+ src_data, src_stride,
+ count, src_itemsize, transferdata);
+
+ /* If we exhausted the dst block, refresh it */
+ if (dst_count == count) {
+ if (!dst_iternext(dst_iter)) {
+ break;
+ }
+ dst_count = *dst_countptr;
+ dst_data = *dst_dataptr;
+ }
+ else {
+ dst_count -= count;
+ dst_data += count*dst_stride;
+ }
+
+ /* If we exhausted the src block, refresh it */
+ if (src_count == count) {
+ if (!src_iternext(src_iter)) {
+ break;
+ }
+ src_count = *src_countptr;
+ src_data = *src_dataptr;
+ }
+ else {
+ src_count -= count;
+ src_data += count*src_stride;
+ }
+ }
+
+ if (!needs_api) {
+ NPY_END_THREADS;
+ }
+
+ PyArray_FreeStridedTransferData(transferdata);
+ NpyIter_Deallocate(dst_iter);
+ NpyIter_Deallocate(src_iter);
+
+ return PyErr_Occurred() ? -1 : 0;
+}
+
+/*NUMPY_API
+ * Copy an Array into another array -- memory must not overlap
+ * Does not require src and dest to have "broadcastable" shapes
+ * (only the same number of elements).
+ *
+ * TODO: For NumPy 2.0, this could accept an order parameter which
+ *       only allows NPY_CORDER and NPY_FORTRANORDER.  Could also rename
+ * this to CopyAsFlat to make the name more intuitive.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+NPY_NO_EXPORT int
+PyArray_CopyAnyInto(PyArrayObject *dst, PyArrayObject *src)
+{
+ return PyArray_CopyAnyIntoOrdered(dst, src, NPY_CORDER);
}
/*NUMPY_API
* Copy an Array into another array -- memory must not overlap.
+ * Broadcast to the destination shape if necessary.
+ *
+ * Returns 0 on success, -1 on failure.
*/
NPY_NO_EXPORT int
-PyArray_CopyInto(PyArrayObject *dest, PyArrayObject *src)
+PyArray_CopyInto(PyArrayObject *dst, PyArrayObject *src)
{
- return _array_copy_into(dest, src, 1);
+ PyArray_StridedTransferFn *stransfer = NULL;
+ void *transferdata = NULL;
+ NPY_BEGIN_THREADS_DEF;
+
+ if (!PyArray_ISWRITEABLE(dst)) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "cannot write to array");
+ return -1;
+ }
+
+ if (PyArray_NDIM(dst) >= PyArray_NDIM(src) &&
+ PyArray_TRIVIALLY_ITERABLE_PAIR(dst, src)) {
+ char *dst_data, *src_data;
+ npy_intp count, dst_stride, src_stride, src_itemsize;
+
+ int needs_api = 0;
+
+ PyArray_PREPARE_TRIVIAL_PAIR_ITERATION(dst, src, count,
+ dst_data, src_data, dst_stride, src_stride);
+
+ /*
+ * Check for overlap with positive strides, and if found,
+ * possibly reverse the order
+ */
+ if (dst_data > src_data && src_stride > 0 && dst_stride > 0 &&
+ (dst_data < src_data+src_stride*count) &&
+ (src_data < dst_data+dst_stride*count)) {
+ dst_data += dst_stride*(count-1);
+ src_data += src_stride*(count-1);
+ dst_stride = -dst_stride;
+ src_stride = -src_stride;
+ }
+
+ if (PyArray_GetDTypeTransferFunction(
+ PyArray_ISALIGNED(src) && PyArray_ISALIGNED(dst),
+ src_stride, dst_stride,
+ PyArray_DESCR(src), PyArray_DESCR(dst),
+ 0,
+ &stransfer, &transferdata,
+ &needs_api) != NPY_SUCCEED) {
+ return -1;
+ }
+
+ src_itemsize = PyArray_DESCR(src)->elsize;
+
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
+ }
+
+ stransfer(dst_data, dst_stride, src_data, src_stride,
+ count, src_itemsize, transferdata);
+
+ if (!needs_api) {
+ NPY_END_THREADS;
+ }
+
+ PyArray_FreeStridedTransferData(transferdata);
+
+ return PyErr_Occurred() ? -1 : 0;
+ }
+ else {
+ PyArrayObject *op[2];
+ npy_uint32 op_flags[2];
+ NpyIter *iter;
+
+ NpyIter_IterNext_Fn iternext;
+ char **dataptr;
+ npy_intp *stride;
+ npy_intp *countptr;
+ npy_intp src_itemsize;
+ int needs_api;
+
+ op[0] = dst;
+ op[1] = src;
+ op_flags[0] = NPY_ITER_WRITEONLY|NPY_ITER_NO_BROADCAST;
+ op_flags[1] = NPY_ITER_READONLY;
+
+ iter = NpyIter_MultiNew(2, op,
+ NPY_ITER_NO_INNER_ITERATION|
+ NPY_ITER_REFS_OK|
+ NPY_ITER_ZEROSIZE_OK,
+ NPY_KEEPORDER,
+ NPY_NO_CASTING,
+ op_flags,
+ NULL, 0, NULL, 0);
+ if (iter == NULL) {
+ return -1;
+ }
+
+ iternext = NpyIter_GetIterNext(iter, NULL);
+ if (iternext == NULL) {
+ NpyIter_Deallocate(iter);
+ return -1;
+ }
+ dataptr = NpyIter_GetDataPtrArray(iter);
+ stride = NpyIter_GetInnerStrideArray(iter);
+ countptr = NpyIter_GetInnerLoopSizePtr(iter);
+ src_itemsize = PyArray_DESCR(src)->elsize;
+
+ needs_api = NpyIter_IterationNeedsAPI(iter);
+
+ /*
+ * Because buffering is disabled in the iterator, the inner loop
+ * strides will be the same throughout the iteration loop. Thus,
+ * we can pass them to this function to take advantage of
+ * contiguous strides, etc.
+ */
+ if (PyArray_GetDTypeTransferFunction(
+ PyArray_ISALIGNED(src) && PyArray_ISALIGNED(dst),
+ stride[1], stride[0],
+ PyArray_DESCR(src), PyArray_DESCR(dst),
+ 0,
+ &stransfer, &transferdata,
+ &needs_api) != NPY_SUCCEED) {
+ NpyIter_Deallocate(iter);
+ return -1;
+ }
+
+
+ if (NpyIter_GetIterSize(iter) != 0) {
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
+ }
+
+ do {
+ stransfer(dataptr[0], stride[0],
+ dataptr[1], stride[1],
+ *countptr, src_itemsize, transferdata);
+ } while(iternext(iter));
+
+ if (!needs_api) {
+ NPY_END_THREADS;
+ }
+ }
+
+ PyArray_FreeStridedTransferData(transferdata);
+ NpyIter_Deallocate(iter);
+
+ return PyErr_Occurred() ? -1 : 0;
+ }
}
@@ -2651,7 +2645,7 @@ PyArray_CheckAxis(PyArrayObject *arr, int *axis, int flags)
* accepts NULL type
*/
NPY_NO_EXPORT PyObject *
-PyArray_Zeros(int nd, intp *dims, PyArray_Descr *type, int fortran)
+PyArray_Zeros(int nd, npy_intp *dims, PyArray_Descr *type, int fortran)
{
PyArrayObject *ret;
@@ -2680,7 +2674,7 @@ PyArray_Zeros(int nd, intp *dims, PyArray_Descr *type, int fortran)
* steals referenct to type
*/
NPY_NO_EXPORT PyObject *
-PyArray_Empty(int nd, intp *dims, PyArray_Descr *type, int fortran)
+PyArray_Empty(int nd, npy_intp *dims, PyArray_Descr *type, int fortran)
{
PyArrayObject *ret;
@@ -2708,7 +2702,7 @@ PyArray_Empty(int nd, intp *dims, PyArray_Descr *type, int fortran)
* Return 0 on success, -1 on failure. In case of failure, set a PyExc_Overflow
* exception
*/
-static int _safe_ceil_to_intp(double value, intp* ret)
+static int _safe_ceil_to_intp(double value, npy_intp* ret)
{
double ivalue;
@@ -2717,7 +2711,7 @@ static int _safe_ceil_to_intp(double value, intp* ret)
return -1;
}
- *ret = (intp)ivalue;
+ *ret = (npy_intp)ivalue;
return 0;
}
@@ -2728,7 +2722,7 @@ static int _safe_ceil_to_intp(double value, intp* ret)
NPY_NO_EXPORT PyObject *
PyArray_Arange(double start, double stop, double step, int type_num)
{
- intp length;
+ npy_intp length;
PyObject *range;
PyArray_ArrFuncs *funcs;
PyObject *obj;
@@ -2793,10 +2787,10 @@ PyArray_Arange(double start, double stop, double step, int type_num)
/*
* the formula is len = (intp) ceil((start - stop) / step);
*/
-static intp
+static npy_intp
_calc_length(PyObject *start, PyObject *stop, PyObject *step, PyObject **next, int cmplx)
{
- intp len, tmp;
+ npy_intp len, tmp;
PyObject *val;
double value;
@@ -2873,7 +2867,7 @@ PyArray_ArangeObj(PyObject *start, PyObject *stop, PyObject *step, PyArray_Descr
PyObject *range;
PyArray_ArrFuncs *funcs;
PyObject *next, *err;
- intp length;
+ npy_intp length;
PyArray_Descr *native = NULL;
int swap;
@@ -2999,7 +2993,7 @@ PyArray_ArangeObj(PyObject *start, PyObject *stop, PyObject *step, PyArray_Descr
}
static PyArrayObject *
-array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, intp num, size_t *nread)
+array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, npy_intp num, size_t *nread)
{
PyArrayObject *r;
npy_intp start, numbytes;
@@ -3009,14 +3003,14 @@ array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, intp num, size_t *nread)
#if defined(_MSC_VER) && defined(_WIN64) && (_MSC_VER > 1400)
/* Workaround Win64 fwrite() bug. Ticket #1660 */
- start = (intp )_ftelli64(fp);
+ start = (npy_intp )_ftelli64(fp);
if (start < 0) {
fail = 1;
}
if (_fseeki64(fp, 0, SEEK_END) < 0) {
fail = 1;
}
- numbytes = (intp) _ftelli64(fp);
+ numbytes = (npy_intp) _ftelli64(fp);
if (numbytes < 0) {
fail = 1;
}
@@ -3025,14 +3019,14 @@ array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, intp num, size_t *nread)
fail = 1;
}
#else
- start = (intp )ftell(fp);
+ start = (npy_intp)ftell(fp);
if (start < 0) {
fail = 1;
}
if (fseek(fp, 0, SEEK_END) < 0) {
fail = 1;
}
- numbytes = (intp) ftell(fp);
+ numbytes = (npy_intp) ftell(fp);
if (numbytes < 0) {
fail = 1;
}
@@ -3069,17 +3063,17 @@ array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, intp num, size_t *nread)
*/
#define FROM_BUFFER_SIZE 4096
static PyArrayObject *
-array_from_text(PyArray_Descr *dtype, intp num, char *sep, size_t *nread,
+array_from_text(PyArray_Descr *dtype, npy_intp num, char *sep, size_t *nread,
void *stream, next_element next, skip_separator skip_sep,
void *stream_data)
{
PyArrayObject *r;
- intp i;
+ npy_intp i;
char *dptr, *clean_sep, *tmp;
int err = 0;
- intp thisbuf = 0;
- intp size;
- intp bytes, totalbytes;
+ npy_intp thisbuf = 0;
+ npy_intp size;
+ npy_intp bytes, totalbytes;
size = (num >= 0) ? num : FROM_BUFFER_SIZE;
r = (PyArrayObject *)
@@ -3160,7 +3154,7 @@ array_from_text(PyArray_Descr *dtype, intp num, char *sep, size_t *nread,
* necessary is read by this routine.
*/
NPY_NO_EXPORT PyObject *
-PyArray_FromFile(FILE *fp, PyArray_Descr *dtype, intp num, char *sep)
+PyArray_FromFile(FILE *fp, PyArray_Descr *dtype, npy_intp num, char *sep)
{
PyArrayObject *ret;
size_t nread = 0;
@@ -3195,7 +3189,7 @@ PyArray_FromFile(FILE *fp, PyArray_Descr *dtype, intp num, char *sep)
Py_DECREF(dtype);
return NULL;
}
- if (((intp) nread) < num) {
+ if (((npy_intp) nread) < num) {
/* Realloc memory for smaller number of elements */
const size_t nsize = NPY_MAX(nread,1)*ret->descr->elsize;
char *tmp;
@@ -3213,12 +3207,12 @@ PyArray_FromFile(FILE *fp, PyArray_Descr *dtype, intp num, char *sep)
/*NUMPY_API*/
NPY_NO_EXPORT PyObject *
PyArray_FromBuffer(PyObject *buf, PyArray_Descr *type,
- intp count, intp offset)
+ npy_intp count, npy_intp offset)
{
PyArrayObject *ret;
char *data;
Py_ssize_t ts;
- intp s, n;
+ npy_intp s, n;
int itemsize;
int write = 1;
@@ -3269,15 +3263,15 @@ PyArray_FromBuffer(PyObject *buf, PyArray_Descr *type,
if ((offset < 0) || (offset >= ts)) {
PyErr_Format(PyExc_ValueError,
"offset must be non-negative and smaller than buffer "\
- "lenth (%" INTP_FMT ")", (intp)ts);
+                     "length (%" INTP_FMT ")", (npy_intp)ts);
Py_DECREF(buf);
Py_DECREF(type);
return NULL;
}
data += offset;
- s = (intp)ts - offset;
- n = (intp)count;
+ s = (npy_intp)ts - offset;
+ n = (npy_intp)count;
itemsize = type->elsize;
if (n < 0 ) {
if (s % itemsize != 0) {
@@ -3342,8 +3336,8 @@ PyArray_FromBuffer(PyObject *buf, PyArray_Descr *type,
* for whitespace around the separator is added.
*/
NPY_NO_EXPORT PyObject *
-PyArray_FromString(char *data, intp slen, PyArray_Descr *dtype,
- intp num, char *sep)
+PyArray_FromString(char *data, npy_intp slen, PyArray_Descr *dtype,
+ npy_intp num, char *sep)
{
int itemsize;
PyArrayObject *ret;
@@ -3429,12 +3423,12 @@ PyArray_FromString(char *data, intp slen, PyArray_Descr *dtype,
* steals a reference to dtype (which cannot be NULL)
*/
NPY_NO_EXPORT PyObject *
-PyArray_FromIter(PyObject *obj, PyArray_Descr *dtype, intp count)
+PyArray_FromIter(PyObject *obj, PyArray_Descr *dtype, npy_intp count)
{
PyObject *value;
PyObject *iter = PyObject_GetIter(obj);
PyArrayObject *ret = NULL;
- intp i, elsize, elcount;
+ npy_intp i, elsize, elcount;
char *item, *new_data;
if (iter == NULL) {
@@ -3543,7 +3537,7 @@ PyArray_FromIter(PyObject *obj, PyArray_Descr *dtype, intp count)
*/
NPY_NO_EXPORT size_t
-_array_fill_strides(intp *strides, intp *dims, int nd, size_t itemsize,
+_array_fill_strides(npy_intp *strides, npy_intp *dims, int nd, size_t itemsize,
int inflag, int *objflags)
{
int i;
diff --git a/numpy/core/src/multiarray/ctors.h b/numpy/core/src/multiarray/ctors.h
index 5fd1d8e58..eb1586f03 100644
--- a/numpy/core/src/multiarray/ctors.h
+++ b/numpy/core/src/multiarray/ctors.h
@@ -45,10 +45,12 @@ PyArray_CopyAnyInto(PyArrayObject *dest, PyArrayObject *src);
NPY_NO_EXPORT PyObject *
PyArray_CheckAxis(PyArrayObject *arr, int *axis, int flags);
-/* FIXME: remove those from here */
+/* TODO: Put the order parameter in PyArray_CopyAnyInto and remove this */
NPY_NO_EXPORT int
-_flat_copyinto(PyObject *dst, PyObject *src, NPY_ORDER order);
+PyArray_CopyAnyIntoOrdered(PyArrayObject *dst, PyArrayObject *src,
+ NPY_ORDER order);
+/* FIXME: remove those from here */
NPY_NO_EXPORT size_t
_array_fill_strides(intp *strides, intp *dims, int nd, size_t itemsize,
int inflag, int *objflags);
diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c
index 9de35de88..ad1d98270 100644
--- a/numpy/core/src/multiarray/datetime.c
+++ b/numpy/core/src/multiarray/datetime.c
@@ -267,7 +267,7 @@ NPY_NO_EXPORT npy_datetime
PyArray_DatetimeStructToDatetime(NPY_DATETIMEUNIT fr, npy_datetimestruct *d)
{
npy_datetime ret;
- npy_longlong days; /* The absolute number of days since Jan 1, 1970 */
+ npy_longlong days = 0; /* The absolute number of days since Jan 1, 1970 */
if (fr > NPY_FR_M) {
days = days_from_ymd(d->year, d->month, d->day);
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
new file mode 100644
index 000000000..9074a2160
--- /dev/null
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -0,0 +1,2946 @@
+/*
+ * This file contains low-level loops for data type transfers.
+ * In particular the function PyArray_GetDTypeTransferFunction is
+ * implemented here.
+ *
+ * Copyright (c) 2010 by Mark Wiebe (mwwiebe@gmail.com)
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#include "structmember.h"
+
+#define _MULTIARRAYMODULE
+#include <numpy/ndarrayobject.h>
+#include <numpy/ufuncobject.h>
+#include <numpy/npy_cpu.h>
+
+#include "lowlevel_strided_loops.h"
+
+#define NPY_LOWLEVEL_BUFFER_BLOCKSIZE 128
+
+/********** PRINTF DEBUG TRACING **************/
+#define NPY_DT_DBG_TRACING 0
+/* Tracing incref/decref can be very noisy */
+#define NPY_DT_REF_DBG_TRACING 0
+
+#if NPY_DT_REF_DBG_TRACING
+#define NPY_DT_DBG_REFTRACE(msg, ref) \
+ printf("%-12s %20p %s%d%s\n", msg, ref, \
+ ref ? "(refcnt " : "", \
+ ref ? (int)ref->ob_refcnt : 0, \
+ ref ? ((ref->ob_refcnt <= 0) ? \
+ ") <- BIG PROBLEM!!!!" : ")") : ""); \
+ fflush(stdout);
+#else
+#define NPY_DT_DBG_REFTRACE(...)
+#endif
+/**********************************************/
+
+/*
+ * Returns a transfer function which DECREFs any references in src_type.
+ *
+ * Returns NPY_SUCCEED or NPY_FAIL.
+ */
+static int
+get_decsrcref_transfer_function(int aligned,
+ npy_intp src_stride,
+ PyArray_Descr *src_dtype,
+ PyArray_StridedTransferFn **out_stransfer,
+ void **out_transferdata,
+ int *out_needs_api);
+
+/*
+ * Returns a transfer function which zeros out the dest values.
+ *
+ * Returns NPY_SUCCEED or NPY_FAIL.
+ */
+static int
+get_setdstzero_transfer_function(int aligned,
+ npy_intp dst_stride,
+ PyArray_Descr *dst_dtype,
+ PyArray_StridedTransferFn **out_stransfer,
+ void **out_transferdata,
+ int *out_needs_api);
+
+/*
+ * Returns a transfer function which sets a boolean type to ones.
+ *
+ * Returns NPY_SUCCEED or NPY_FAIL.
+ */
+NPY_NO_EXPORT int
+get_bool_setdstone_transfer_function(npy_intp dst_stride,
+ PyArray_StridedTransferFn **out_stransfer,
+ void **out_transferdata,
+ int *NPY_UNUSED(out_needs_api));
+
+/*************************** COPY REFERENCES *******************************/
+
+/* Moves references from src to dst */
+static void
+_strided_to_strided_move_references(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ void *data)
+{
+ PyObject *src_ref = NULL, *dst_ref = NULL;
+ while (N > 0) {
+ NPY_COPY_PYOBJECT_PTR(&src_ref, src);
+ NPY_COPY_PYOBJECT_PTR(&dst_ref, dst);
+
+ /* Release the reference in dst */
+ NPY_DT_DBG_REFTRACE("dec dst ref", dst_ref);
+ Py_XDECREF(dst_ref);
+ /* Move the reference */
+ NPY_DT_DBG_REFTRACE("move src ref", src_ref);
+ NPY_COPY_PYOBJECT_PTR(dst, &src_ref);
+ /* Set the source reference to NULL */
+ src_ref = NULL;
+ NPY_COPY_PYOBJECT_PTR(src, &src_ref);
+
+ src += src_stride;
+ dst += dst_stride;
+ --N;
+ }
+}
+
+/* Copies references from src to dst */
+static void
+_strided_to_strided_copy_references(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ void *data)
+{
+ PyObject *src_ref = NULL, *dst_ref = NULL;
+ while (N > 0) {
+ NPY_COPY_PYOBJECT_PTR(&src_ref, src);
+ NPY_COPY_PYOBJECT_PTR(&dst_ref, dst);
+
+ /* Release the reference in dst */
+ NPY_DT_DBG_REFTRACE("dec dst ref", dst_ref);
+ Py_XDECREF(dst_ref);
+ /* Copy the reference */
+ NPY_DT_DBG_REFTRACE("copy src ref", src_ref);
+ NPY_COPY_PYOBJECT_PTR(dst, &src_ref);
+ /* Claim the reference */
+ Py_XINCREF(src_ref);
+
+ src += src_stride;
+ dst += dst_stride;
+ --N;
+ }
+}
+
+/************************** ZERO-PADDED COPY ******************************/
+
+/* Does a zero-padded copy */
+typedef struct {
+ void *freefunc, *copyfunc;
+ npy_intp dst_itemsize;
+} _strided_zero_pad_data;
+
+/* zero-padded data copy function */
+_strided_zero_pad_data *_strided_zero_pad_data_copy(
+ _strided_zero_pad_data *data)
+{
+ _strided_zero_pad_data *newdata =
+ (_strided_zero_pad_data *)PyArray_malloc(
+ sizeof(_strided_zero_pad_data));
+ if (newdata == NULL) {
+ return NULL;
+ }
+
+ memcpy(newdata, data, sizeof(_strided_zero_pad_data));
+
+ return newdata;
+}
+
+/*
+ * Does a strided to strided zero-padded copy for the case where
+ * dst_itemsize > src_itemsize
+ */
+static void
+_strided_to_strided_zero_pad_copy(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ void *data)
+{
+ _strided_zero_pad_data *d = (_strided_zero_pad_data *)data;
+ npy_intp dst_itemsize = d->dst_itemsize;
+ npy_intp zero_size = dst_itemsize-src_itemsize;
+
+ while (N > 0) {
+ memcpy(dst, src, src_itemsize);
+ memset(dst + src_itemsize, 0, zero_size);
+ src += src_stride;
+ dst += dst_stride;
+ --N;
+ }
+}
+
+NPY_NO_EXPORT int
+PyArray_GetStridedZeroPadCopyFn(int aligned,
+ npy_intp src_stride, npy_intp dst_stride,
+ npy_intp src_itemsize, npy_intp dst_itemsize,
+ PyArray_StridedTransferFn **out_stransfer,
+ void **out_transferdata)
+{
+ if (src_itemsize >= dst_itemsize) {
+ /* If the sizes are different, the alignment flag isn't trustworthy */
+ if (src_itemsize != dst_itemsize) {
+ aligned = 0;
+ }
+ *out_stransfer = PyArray_GetStridedCopyFn(aligned, src_stride,
+ dst_stride, dst_itemsize);
+ *out_transferdata = NULL;
+ return (*out_stransfer == NULL) ? NPY_FAIL : NPY_SUCCEED;
+ }
+ else {
+ _strided_zero_pad_data *d = PyArray_malloc(
+ sizeof(_strided_zero_pad_data));
+ if (d == NULL) {
+ PyErr_NoMemory();
+ return NPY_FAIL;
+ }
+ d->dst_itemsize = dst_itemsize;
+ d->freefunc = &PyArray_free;
+ d->copyfunc = &_strided_zero_pad_data_copy;
+
+ *out_stransfer = &_strided_to_strided_zero_pad_copy;
+ *out_transferdata = d;
+ return NPY_SUCCEED;
+ }
+}
+
+/***************** WRAP ALIGNED CONTIGUOUS TRANSFER FUNCTION **************/
+
+/* Wraps a transfer function + data in alignment code */
+typedef struct {
+ void *freefunc, *copyfunc;
+ PyArray_StridedTransferFn *wrapped,
+ *tobuffer, *frombuffer;
+ void *wrappeddata, *todata, *fromdata;
+ npy_intp src_itemsize, dst_itemsize;
+ char *bufferin, *bufferout;
+} _align_wrap_data;
+
+/* transfer data free function */
+void _align_wrap_data_free(_align_wrap_data *data)
+{
+ PyArray_FreeStridedTransferData(data->wrappeddata);
+ PyArray_FreeStridedTransferData(data->todata);
+ PyArray_FreeStridedTransferData(data->fromdata);
+ PyArray_free(data);
+}
+
+/* transfer data copy function */
+_align_wrap_data *_align_wrap_data_copy(_align_wrap_data *data)
+{
+ _align_wrap_data *newdata;
+ npy_intp basedatasize, datasize;
+
+ /* Round up the structure size to 16-byte boundary */
+ basedatasize = (sizeof(_align_wrap_data)+15)&(-0x10);
+ /* Add space for two low level buffers */
+ datasize = basedatasize +
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE*data->src_itemsize +
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE*data->dst_itemsize;
+
+ /* Allocate the data, and populate it */
+ newdata = (_align_wrap_data *)PyArray_malloc(datasize);
+ if (newdata == NULL) {
+ return NULL;
+ }
+ memcpy(newdata, data, basedatasize);
+ newdata->bufferin = (char *)newdata + basedatasize;
+ newdata->bufferout = newdata->bufferin +
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE*newdata->src_itemsize;
+ if (newdata->wrappeddata != NULL) {
+ newdata->wrappeddata =
+ PyArray_CopyStridedTransferData(data->wrappeddata);
+ if (newdata->wrappeddata == NULL) {
+ PyArray_free(newdata);
+ return NULL;
+ }
+ }
+ if (newdata->todata != NULL) {
+ newdata->todata = PyArray_CopyStridedTransferData(data->todata);
+ if (newdata->todata == NULL) {
+ PyArray_FreeStridedTransferData(newdata->wrappeddata);
+ PyArray_free(newdata);
+ return NULL;
+ }
+ }
+ if (newdata->fromdata != NULL) {
+ newdata->fromdata = PyArray_CopyStridedTransferData(data->fromdata);
+ if (newdata->fromdata == NULL) {
+ PyArray_FreeStridedTransferData(newdata->wrappeddata);
+ PyArray_FreeStridedTransferData(newdata->todata);
+ PyArray_free(newdata);
+ return NULL;
+ }
+ }
+
+ return newdata;
+}
+
+static void
+_strided_to_strided_contig_align_wrap(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ void *data)
+{
+ _align_wrap_data *d = (_align_wrap_data *)data;
+ PyArray_StridedTransferFn *wrapped = d->wrapped,
+ *tobuffer = d->tobuffer,
+ *frombuffer = d->frombuffer;
+ npy_intp dst_itemsize = d->dst_itemsize;
+ void *wrappeddata = d->wrappeddata,
+ *todata = d->todata,
+ *fromdata = d->fromdata;
+ char *bufferin = d->bufferin, *bufferout = d->bufferout;
+
+ for(;;) {
+ if (N > NPY_LOWLEVEL_BUFFER_BLOCKSIZE) {
+ tobuffer(bufferin, src_itemsize, src, src_stride,
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
+ src_itemsize, todata);
+ wrapped(bufferout, dst_itemsize, bufferin, src_itemsize,
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
+ src_itemsize, wrappeddata);
+ frombuffer(dst, dst_stride, bufferout, dst_itemsize,
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
+ dst_itemsize, fromdata);
+ N -= NPY_LOWLEVEL_BUFFER_BLOCKSIZE;
+ src += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*src_stride;
+ dst += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*dst_stride;
+ }
+ else {
+ tobuffer(bufferin, src_itemsize, src, src_stride, N,
+ src_itemsize, todata);
+ wrapped(bufferout, dst_itemsize, bufferin, src_itemsize, N,
+ src_itemsize, wrappeddata);
+ frombuffer(dst, dst_stride, bufferout, dst_itemsize, N,
+ dst_itemsize, fromdata);
+ return;
+ }
+ }
+}
+
+static void
+_strided_to_strided_contig_align_wrap_init_dest(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ void *data)
+{
+ _align_wrap_data *d = (_align_wrap_data *)data;
+ PyArray_StridedTransferFn *wrapped = d->wrapped,
+ *tobuffer = d->tobuffer,
+ *frombuffer = d->frombuffer;
+ npy_intp dst_itemsize = d->dst_itemsize;
+ void *wrappeddata = d->wrappeddata,
+ *todata = d->todata,
+ *fromdata = d->fromdata;
+ char *bufferin = d->bufferin, *bufferout = d->bufferout;
+
+ for(;;) {
+ if (N > NPY_LOWLEVEL_BUFFER_BLOCKSIZE) {
+ tobuffer(bufferin, src_itemsize, src, src_stride,
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
+ src_itemsize, todata);
+ memset(bufferout, 0, dst_itemsize*NPY_LOWLEVEL_BUFFER_BLOCKSIZE);
+ wrapped(bufferout, dst_itemsize, bufferin, src_itemsize,
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
+ src_itemsize, wrappeddata);
+ frombuffer(dst, dst_stride, bufferout, dst_itemsize,
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
+ dst_itemsize, fromdata);
+ N -= NPY_LOWLEVEL_BUFFER_BLOCKSIZE;
+ src += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*src_stride;
+ dst += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*dst_stride;
+ }
+ else {
+ tobuffer(bufferin, src_itemsize, src, src_stride, N,
+ src_itemsize, todata);
+ memset(bufferout, 0, dst_itemsize*N);
+ wrapped(bufferout, dst_itemsize, bufferin, src_itemsize, N,
+ src_itemsize, wrappeddata);
+ frombuffer(dst, dst_stride, bufferout, dst_itemsize, N,
+ dst_itemsize, fromdata);
+ return;
+ }
+ }
+}
+
+/*
+ * Wraps an aligned contig to contig transfer function between either
+ * copies or byte swaps to temporary buffers.
+ *
+ * src_itemsize/dst_itemsize - The sizes of the src and dst datatypes.
+ * tobuffer - copy/swap function from src to an aligned contiguous buffer.
+ * todata - data for tobuffer
+ * frombuffer - copy/swap function from an aligned contiguous buffer to dst.
+ * fromdata - data for frombuffer
+ * wrapped - contig to contig transfer function being wrapped
+ * wrappeddata - data for wrapped
+ * init_dest - 1 means to memset the dest buffer to 0 before calling wrapped.
+ *
+ * Returns NPY_SUCCEED or NPY_FAIL.
+ */
+NPY_NO_EXPORT int
+wrap_aligned_contig_transfer_function(
+ npy_intp src_itemsize, npy_intp dst_itemsize,
+ PyArray_StridedTransferFn *tobuffer, void *todata,
+ PyArray_StridedTransferFn *frombuffer, void *fromdata,
+ PyArray_StridedTransferFn *wrapped, void *wrappeddata,
+ int init_dest,
+ PyArray_StridedTransferFn **out_stransfer,
+ void **out_transferdata)
+{
+ _align_wrap_data *data;
+ npy_intp basedatasize, datasize;
+
+ /* Round up the structure size to 16-byte boundary */
+ basedatasize = (sizeof(_align_wrap_data)+15)&(-0x10);
+ /* Add space for two low level buffers */
+ datasize = basedatasize +
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE*src_itemsize +
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE*dst_itemsize;
+
+ /* Allocate the data, and populate it */
+ data = (_align_wrap_data *)PyArray_malloc(datasize);
+ if (data == NULL) {
+ PyErr_NoMemory();
+ return NPY_FAIL;
+ }
+ data->freefunc = (void *)&_align_wrap_data_free;
+ data->copyfunc = (void *)&_align_wrap_data_copy;
+ data->tobuffer = tobuffer;
+ data->todata = todata;
+ data->frombuffer = frombuffer;
+ data->fromdata = fromdata;
+ data->wrapped = wrapped;
+ data->wrappeddata = wrappeddata;
+ data->src_itemsize = src_itemsize;
+ data->dst_itemsize = dst_itemsize;
+ data->bufferin = (char *)data + basedatasize;
+ data->bufferout = data->bufferin +
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE*src_itemsize;
+
+ /* Set the function and data */
+ if (init_dest) {
+ *out_stransfer = &_strided_to_strided_contig_align_wrap_init_dest;
+ }
+ else {
+ *out_stransfer = &_strided_to_strided_contig_align_wrap;
+ }
+ *out_transferdata = data;
+
+ return NPY_SUCCEED;
+}
+
+/*************************** WRAP DTYPE COPY/SWAP *************************/
+/* Wraps the dtype copy swap function */
+typedef struct {
+ void *freefunc, *copyfunc;
+ PyArray_CopySwapNFunc *copyswapn;
+ int swap;
+ PyArrayObject *arr;
+} _wrap_copy_swap_data;
+
+/* wrap copy swap data free function */
+void _wrap_copy_swap_data_free(_wrap_copy_swap_data *data)
+{
+ Py_DECREF(data->arr);
+ PyArray_free(data);
+}
+
+/* wrap copy swap data copy function */
+_wrap_copy_swap_data *_wrap_copy_swap_data_copy(_wrap_copy_swap_data *data)
+{
+ _wrap_copy_swap_data *newdata =
+ (_wrap_copy_swap_data *)PyArray_malloc(sizeof(_wrap_copy_swap_data));
+ if (newdata == NULL) {
+ return NULL;
+ }
+
+ memcpy(newdata, data, sizeof(_wrap_copy_swap_data));
+ Py_INCREF(newdata->arr);
+
+ return newdata;
+}
+
+static void
+_strided_to_strided_wrap_copy_swap(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+ void *data)
+{
+ _wrap_copy_swap_data *d = (_wrap_copy_swap_data *)data;
+
+ d->copyswapn(dst, dst_stride, src, src_stride, N, d->swap, d->arr);
+}
+
+/* This only gets used for custom data types */
+static int
+wrap_copy_swap_function(int aligned,
+ npy_intp src_stride, npy_intp dst_stride,
+ PyArray_Descr *dtype,
+ int should_swap,
+ PyArray_StridedTransferFn **out_stransfer,
+ void **out_transferdata)
+{
+ _wrap_copy_swap_data *data;
+ npy_intp shape = 1;
+
+ /* Allocate the data for the copy swap */
+ data = (_wrap_copy_swap_data *)PyArray_malloc(sizeof(_wrap_copy_swap_data));
+ if (data == NULL) {
+ PyErr_NoMemory();
+ *out_stransfer = NULL;
+ *out_transferdata = NULL;
+ return NPY_FAIL;
+ }
+
+ data->freefunc = &_wrap_copy_swap_data_free;
+ data->copyfunc = &_wrap_copy_swap_data_copy;
+ data->copyswapn = dtype->f->copyswapn;
+ data->swap = should_swap;
+
+ /*
+ * TODO: This is a hack so the copyswap functions have an array.
+ * The copyswap functions shouldn't need that.
+ */
+ Py_INCREF(dtype);
+ data->arr = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, dtype,
+ 1, &shape, NULL, NULL, 0, NULL);
+ if (data->arr == NULL) {
+ PyArray_free(data);
+ return NPY_FAIL;
+ }
+
+ *out_stransfer = &_strided_to_strided_wrap_copy_swap;
+ *out_transferdata = data;
+
+ return NPY_SUCCEED;
+}
+
+/*************************** DTYPE CAST FUNCTIONS *************************/
+
+/* Does a simple aligned cast */
+typedef struct {
+ void *freefunc, *copyfunc;
+ PyArray_VectorUnaryFunc *castfunc;
+ PyArrayObject *aip, *aop;
+} _strided_cast_data;
+
+/* strided cast data free function */
+void _strided_cast_data_free(_strided_cast_data *data)
+{
+ Py_DECREF(data->aip);
+ Py_DECREF(data->aop);
+ PyArray_free(data);
+}
+
+/* strided cast data copy function */
+_strided_cast_data *_strided_cast_data_copy(_strided_cast_data *data)
+{
+ _strided_cast_data *newdata =
+ (_strided_cast_data *)PyArray_malloc(sizeof(_strided_cast_data));
+ if (newdata == NULL) {
+ return NULL;
+ }
+
+ memcpy(newdata, data, sizeof(_strided_cast_data));
+ Py_INCREF(newdata->aip);
+ Py_INCREF(newdata->aop);
+
+ return newdata;
+}
+
+static void
+_aligned_strided_to_strided_cast(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ void *data)
+{
+ _strided_cast_data *d = (_strided_cast_data *)data;
+ PyArray_VectorUnaryFunc *castfunc = d->castfunc;
+ PyArrayObject *aip = d->aip, *aop = d->aop;
+
+ while (N > 0) {
+ castfunc(src, dst, 1, aip, aop);
+ dst += dst_stride;
+ src += src_stride;
+ --N;
+ }
+}
+
+/* This one requires src be of type NPY_OBJECT */
+static void
+_aligned_strided_to_strided_cast_decref_src(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ void *data)
+{
+ _strided_cast_data *d = (_strided_cast_data *)data;
+ PyArray_VectorUnaryFunc *castfunc = d->castfunc;
+ PyArrayObject *aip = d->aip, *aop = d->aop;
+ PyObject *src_ref;
+
+ while (N > 0) {
+ castfunc(src, dst, 1, aip, aop);
+
+ /* After casting, decrement the source ref */
+ NPY_COPY_PYOBJECT_PTR(&src_ref, src);
+ NPY_DT_DBG_REFTRACE("dec src ref (cast object -> not object)", src_ref);
+ Py_XDECREF(src_ref);
+
+ dst += dst_stride;
+ src += src_stride;
+ --N;
+ }
+}
+
+static void
+_aligned_contig_to_contig_cast(char *dst, npy_intp NPY_UNUSED(dst_stride),
+ char *src, npy_intp NPY_UNUSED(src_stride),
+ npy_intp N, npy_intp NPY_UNUSED(itemsize),
+ void *data)
+{
+ _strided_cast_data *d = (_strided_cast_data *)data;
+
+ d->castfunc(src, dst, N, d->aip, d->aop);
+}
+
+static int
+get_nbo_cast_numeric_transfer_function(int aligned,
+ npy_intp src_stride, npy_intp dst_stride,
+ int src_type_num, int dst_type_num,
+ PyArray_StridedTransferFn **out_stransfer,
+ void **out_transferdata)
+{
+ /* Emit a warning if complex imaginary is being cast away */
+ if (PyTypeNum_ISCOMPLEX(src_type_num) &&
+ !PyTypeNum_ISCOMPLEX(dst_type_num) &&
+ !PyTypeNum_ISBOOL(dst_type_num)) {
+ PyObject *cls = NULL, *obj = NULL;
+ int ret;
+ obj = PyImport_ImportModule("numpy.core");
+ if (obj) {
+ cls = PyObject_GetAttrString(obj, "ComplexWarning");
+ Py_DECREF(obj);
+ }
+#if PY_VERSION_HEX >= 0x02050000
+ ret = PyErr_WarnEx(cls,
+ "Casting complex values to real discards "
+ "the imaginary part", 1);
+#else
+ ret = PyErr_Warn(cls,
+ "Casting complex values to real discards "
+ "the imaginary part");
+#endif
+ Py_XDECREF(cls);
+ if (ret < 0) {
+ return NPY_FAIL;
+ }
+ }
+
+ *out_stransfer = PyArray_GetStridedNumericCastFn(aligned,
+ src_stride, dst_stride,
+ src_type_num, dst_type_num);
+ *out_transferdata = NULL;
+ if (*out_stransfer == NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "unexpected error in GetStridedNumericCastFn");
+ return NPY_FAIL;
+ }
+
+ return NPY_SUCCEED;
+}
+
+static int
+get_nbo_cast_transfer_function(int aligned,
+ npy_intp src_stride, npy_intp dst_stride,
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ int move_references,
+ PyArray_StridedTransferFn **out_stransfer,
+ void **out_transferdata,
+ int *out_needs_api,
+ int *out_needs_wrap)
+{
+ _strided_cast_data *data;
+ PyArray_VectorUnaryFunc *castfunc;
+ PyArray_Descr *tmp_dtype;
+ npy_intp shape = 1, src_itemsize = src_dtype->elsize,
+ dst_itemsize = dst_dtype->elsize;
+
+ if (PyTypeNum_ISNUMBER(src_dtype->type_num) &&
+ PyTypeNum_ISNUMBER(dst_dtype->type_num)) {
+ *out_needs_wrap = !PyArray_ISNBO(src_dtype->byteorder) ||
+ !PyArray_ISNBO(dst_dtype->byteorder);
+ return get_nbo_cast_numeric_transfer_function(aligned,
+ src_stride, dst_stride,
+ src_dtype->type_num, dst_dtype->type_num,
+ out_stransfer, out_transferdata);
+ }
+
+ *out_needs_wrap = !aligned ||
+ !PyArray_ISNBO(src_dtype->byteorder) ||
+ !PyArray_ISNBO(dst_dtype->byteorder);
+
+ /* Check the data types whose casting functions use API calls */
+ switch (src_dtype->type_num) {
+ case NPY_OBJECT:
+ case NPY_STRING:
+ case NPY_UNICODE:
+ case NPY_VOID:
+ if (out_needs_api) {
+ *out_needs_api = 1;
+ }
+ break;
+ }
+ switch (dst_dtype->type_num) {
+ case NPY_OBJECT:
+ case NPY_STRING:
+ case NPY_UNICODE:
+ case NPY_VOID:
+ if (out_needs_api) {
+ *out_needs_api = 1;
+ }
+ break;
+ }
+
+ /* Get the cast function */
+ castfunc = PyArray_GetCastFunc(src_dtype, dst_dtype->type_num);
+ if (!castfunc) {
+ *out_stransfer = NULL;
+ *out_transferdata = NULL;
+ return NPY_FAIL;
+ }
+
+ /* Allocate the data for the casting */
+ data = (_strided_cast_data *)PyArray_malloc(sizeof(_strided_cast_data));
+ if (data == NULL) {
+ PyErr_NoMemory();
+ *out_stransfer = NULL;
+ *out_transferdata = NULL;
+ return NPY_FAIL;
+ }
+ data->freefunc = (void*)&_strided_cast_data_free;
+ data->copyfunc = (void*)&_strided_cast_data_copy;
+ data->castfunc = castfunc;
+ /*
+ * TODO: This is a hack so the cast functions have an array.
+ * The cast functions shouldn't need that. Also, since we
+ * always handle byte order conversions, this array should
+ * have native byte order.
+ */
+ if (PyArray_ISNBO(src_dtype->byteorder)) {
+ tmp_dtype = src_dtype;
+ Py_INCREF(tmp_dtype);
+ }
+ else {
+ tmp_dtype = PyArray_DescrNewByteorder(src_dtype, NPY_NATIVE);
+ if (tmp_dtype == NULL) {
+ PyArray_free(data);
+ return NPY_FAIL;
+ }
+ }
+ data->aip = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, tmp_dtype,
+ 1, &shape, NULL, NULL, 0, NULL);
+ if (data->aip == NULL) {
+ PyArray_free(data);
+ return NPY_FAIL;
+ }
+ /*
+ * TODO: This is a hack so the cast functions have an array.
+ * The cast functions shouldn't need that. Also, since we
+ * always handle byte order conversions, this array should
+ * have native byte order.
+ */
+ if (PyArray_ISNBO(dst_dtype->byteorder)) {
+ tmp_dtype = dst_dtype;
+ Py_INCREF(tmp_dtype);
+ }
+ else {
+ tmp_dtype = PyArray_DescrNewByteorder(dst_dtype, NPY_NATIVE);
+ if (tmp_dtype == NULL) {
+ Py_DECREF(data->aip);
+ PyArray_free(data);
+ return NPY_FAIL;
+ }
+ }
+ data->aop = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, tmp_dtype,
+ 1, &shape, NULL, NULL, 0, NULL);
+ if (data->aop == NULL) {
+ Py_DECREF(data->aip);
+ PyArray_free(data);
+ return NPY_FAIL;
+ }
+
+ /* If it's aligned and all native byte order, we're all done */
+ if (move_references && src_dtype->type_num == NPY_OBJECT) {
+ *out_stransfer = _aligned_strided_to_strided_cast_decref_src;
+ }
+ else {
+ /*
+ * Use the contig version if the strides are contiguous or
+ * we're telling the caller to wrap the return, because
+ * the wrapping uses a contiguous buffer.
+ */
+ if ((src_stride == src_itemsize && dst_stride == dst_itemsize) ||
+ *out_needs_wrap) {
+ *out_stransfer = _aligned_contig_to_contig_cast;
+ }
+ else {
+ *out_stransfer = _aligned_strided_to_strided_cast;
+ }
+ }
+ *out_transferdata = data;
+
+ return NPY_SUCCEED;
+}
+
+static int
+get_cast_transfer_function(int aligned,
+ npy_intp src_stride, npy_intp dst_stride,
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ int move_references,
+ PyArray_StridedTransferFn **out_stransfer,
+ void **out_transferdata,
+ int *out_needs_api)
+{
+ PyArray_StridedTransferFn *caststransfer;
+ void *castdata, *todata = NULL, *fromdata = NULL;
+ int needs_wrap = 0;
+ npy_intp src_itemsize = src_dtype->elsize,
+ dst_itemsize = dst_dtype->elsize;
+
+ if (src_dtype->type_num == dst_dtype->type_num) {
+ PyErr_SetString(PyExc_ValueError,
+ "low level cast function is for unequal type numbers");
+ return NPY_FAIL;
+ }
+
+ if (get_nbo_cast_transfer_function(aligned,
+ src_stride, dst_stride,
+ src_dtype, dst_dtype,
+ move_references,
+ &caststransfer,
+ &castdata,
+ out_needs_api,
+ &needs_wrap) != NPY_SUCCEED) {
+ return NPY_FAIL;
+ }
+
+ /*
+ * If all native byte order and doesn't need alignment wrapping,
+ * return the function
+ */
+ if (!needs_wrap) {
+ *out_stransfer = caststransfer;
+ *out_transferdata = castdata;
+
+ return NPY_SUCCEED;
+ }
+ /* Otherwise, we have to copy and/or swap to aligned temporaries */
+ else {
+ PyArray_StridedTransferFn *tobuffer, *frombuffer;
+
+ /* Get the copy/swap operation from src */
+
+ /* If it's a custom data type, wrap its copy swap function */
+ if (src_dtype->type_num >= NPY_NTYPES) {
+ tobuffer = NULL;
+ wrap_copy_swap_function(aligned,
+ src_stride, src_itemsize,
+ src_dtype,
+ !PyArray_ISNBO(src_dtype->byteorder),
+ &tobuffer, &todata);
+ }
+ /* A straight copy */
+ else if (src_itemsize == 1 || PyArray_ISNBO(src_dtype->byteorder)) {
+ tobuffer = PyArray_GetStridedCopyFn(aligned,
+ src_stride, src_itemsize,
+ src_itemsize);
+ }
+ /* If it's not complex, one swap */
+ else if(src_dtype->kind != 'c') {
+ tobuffer = PyArray_GetStridedCopySwapFn(aligned,
+ src_stride, src_itemsize,
+ src_itemsize);
+ }
+ /* If complex, a paired swap */
+ else {
+ tobuffer = PyArray_GetStridedCopySwapPairFn(aligned,
+ src_stride, src_itemsize,
+ src_itemsize);
+ }
+
+ /* Get the copy/swap operation to dst */
+
+ /* If it's a custom data type, wrap its copy swap function */
+ if (dst_dtype->type_num >= NPY_NTYPES) {
+ frombuffer = NULL;
+ wrap_copy_swap_function(aligned,
+ dst_itemsize, dst_stride,
+ dst_dtype,
+ !PyArray_ISNBO(dst_dtype->byteorder),
+ &frombuffer, &fromdata);
+ }
+ /* A straight copy */
+ else if (dst_itemsize == 1 || PyArray_ISNBO(dst_dtype->byteorder)) {
+ if (dst_dtype->type_num == NPY_OBJECT) {
+ frombuffer = &_strided_to_strided_move_references;
+ }
+ else {
+ frombuffer = PyArray_GetStridedCopyFn(aligned,
+ dst_itemsize, dst_stride,
+ dst_itemsize);
+ }
+ }
+ /* If it's not complex, one swap */
+ else if(dst_dtype->kind != 'c') {
+ frombuffer = PyArray_GetStridedCopySwapFn(aligned,
+ dst_itemsize, dst_stride,
+ dst_itemsize);
+ }
+ /* If complex, a paired swap */
+ else {
+ frombuffer = PyArray_GetStridedCopySwapPairFn(aligned,
+ dst_itemsize, dst_stride,
+ dst_itemsize);
+ }
+
+ if (frombuffer == NULL || tobuffer == NULL) {
+ PyArray_FreeStridedTransferData(castdata);
+ PyArray_FreeStridedTransferData(todata);
+ PyArray_FreeStridedTransferData(fromdata);
+ return NPY_FAIL;
+ }
+
+ *out_stransfer = caststransfer;
+
+ /* Wrap it all up in a new transfer function + data */
+ if (wrap_aligned_contig_transfer_function(
+ src_itemsize, dst_itemsize,
+ tobuffer, todata,
+ frombuffer, fromdata,
+ caststransfer, castdata,
+ PyDataType_FLAGCHK(dst_dtype, NPY_NEEDS_INIT),
+ out_stransfer, out_transferdata) != NPY_SUCCEED) {
+ PyArray_FreeStridedTransferData(castdata);
+ PyArray_FreeStridedTransferData(todata);
+ PyArray_FreeStridedTransferData(fromdata);
+ return NPY_FAIL;
+ }
+
+ return NPY_SUCCEED;
+ }
+}
+
+/**************************** COPY 1 TO N CONTIGUOUS ************************/
+
+/* Copies 1 element to N contiguous elements */
+typedef struct {
+ void *freefunc, *copyfunc;
+ PyArray_StridedTransferFn *stransfer;
+ void *data;
+ npy_intp N, dst_itemsize;
+ /* If this is non-NULL the source type has references needing a decref */
+ PyArray_StridedTransferFn *stransfer_finish_src;
+ void *data_finish_src;
+} _one_to_n_data;
+
+/* transfer data free function */
+void _one_to_n_data_free(_one_to_n_data *data)
+{
+ PyArray_FreeStridedTransferData(data->data);
+ PyArray_FreeStridedTransferData(data->data_finish_src);
+ PyArray_free(data);
+}
+
+/* transfer data copy function */
+_one_to_n_data *_one_to_n_data_copy(_one_to_n_data *data)
+{
+ _one_to_n_data *newdata;
+
+ /* Allocate the data, and populate it */
+ newdata = (_one_to_n_data *)PyArray_malloc(sizeof(_one_to_n_data));
+ if (newdata == NULL) {
+ return NULL;
+ }
+ memcpy(newdata, data, sizeof(_one_to_n_data));
+ if (data->data != NULL) {
+ newdata->data = PyArray_CopyStridedTransferData(data->data);
+ if (newdata->data == NULL) {
+ PyArray_free(newdata);
+ return NULL;
+ }
+ }
+ if (data->data_finish_src != NULL) {
+ newdata->data_finish_src =
+ PyArray_CopyStridedTransferData(data->data_finish_src);
+ if (newdata->data_finish_src == NULL) {
+ PyArray_FreeStridedTransferData(newdata->data);
+ PyArray_free(newdata);
+ return NULL;
+ }
+ }
+
+ return newdata;
+}
+
+static void
+_strided_to_strided_one_to_n(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ void *data)
+{
+ _one_to_n_data *d = (_one_to_n_data *)data;
+ PyArray_StridedTransferFn *subtransfer = d->stransfer;
+ void *subdata = d->data;
+ npy_intp subN = d->N, dst_itemsize = d->dst_itemsize;
+
+ while (N > 0) {
+ subtransfer(dst, dst_itemsize,
+ src, 0,
+ subN, src_itemsize,
+ subdata);
+
+ src += src_stride;
+ dst += dst_stride;
+ --N;
+ }
+}
+
+static void
+_strided_to_strided_one_to_n_with_finish(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ void *data)
+{
+ _one_to_n_data *d = (_one_to_n_data *)data;
+ PyArray_StridedTransferFn *subtransfer = d->stransfer,
+ *stransfer_finish_src = d->stransfer_finish_src;
+ void *subdata = d->data, *data_finish_src = data_finish_src;
+ npy_intp subN = d->N, dst_itemsize = d->dst_itemsize;
+
+ while (N > 0) {
+ subtransfer(dst, dst_itemsize,
+ src, 0,
+ subN, src_itemsize,
+ subdata);
+
+
+ stransfer_finish_src(NULL, 0,
+ src, 0,
+ 1, src_itemsize,
+ data_finish_src);
+
+ src += src_stride;
+ dst += dst_stride;
+ --N;
+ }
+}
+
+/*
+ * Wraps a transfer function to produce one that copies one element
+ * of src to N contiguous elements of dst. If stransfer_finish_src is
+ * not NULL, it should be a transfer function which just affects
+ * src, for example to do a final DECREF operation for references.
+ */
+static int
+wrap_transfer_function_one_to_n(
+ PyArray_StridedTransferFn *stransfer_inner,
+ void *data_inner,
+ PyArray_StridedTransferFn *stransfer_finish_src,
+ void *data_finish_src,
+ npy_intp dst_itemsize,
+ npy_intp N,
+ PyArray_StridedTransferFn **out_stransfer,
+ void **out_transferdata)
+{
+ _one_to_n_data *data;
+
+
+ data = PyArray_malloc(sizeof(_one_to_n_data));
+ if (data == NULL) {
+ PyErr_NoMemory();
+ return NPY_FAIL;
+ }
+
+ data->freefunc = &_one_to_n_data_free;
+ data->copyfunc = &_one_to_n_data_copy;
+ data->stransfer = stransfer_inner;
+ data->data = data_inner;
+ data->stransfer_finish_src = stransfer_finish_src;
+ data->data_finish_src = data_finish_src;
+ data->N = N;
+ data->dst_itemsize = dst_itemsize;
+
+ if (stransfer_finish_src == NULL) {
+ *out_stransfer = &_strided_to_strided_one_to_n;
+ }
+ else {
+ *out_stransfer = &_strided_to_strided_one_to_n_with_finish;
+ }
+ *out_transferdata = data;
+
+ return NPY_SUCCEED;
+}
+
+static int
+get_one_to_n_transfer_function(int aligned,
+ npy_intp src_stride, npy_intp dst_stride,
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ int move_references,
+ npy_intp N,
+ PyArray_StridedTransferFn **out_stransfer,
+ void **out_transferdata,
+ int *out_needs_api)
+{
+ PyArray_StridedTransferFn *stransfer, *stransfer_finish_src = NULL;
+ void *data, *data_finish_src = NULL;
+
+ /*
+ * move_references is set to 0, handled in the wrapping transfer fn,
+ * src_stride is set to zero, because it's 1-to-N copying,
+ * and dst_stride is set to contiguous, because subarrays are always
+ * contiguous.
+ */
+ if (PyArray_GetDTypeTransferFunction(aligned,
+ 0, dst_dtype->elsize,
+ src_dtype, dst_dtype,
+ 0,
+ &stransfer, &data,
+ out_needs_api) != NPY_SUCCEED) {
+ return NPY_FAIL;
+ }
+
+ /* If the src object will need a DECREF, set src_dtype */
+ if (move_references && PyDataType_REFCHK(src_dtype)) {
+ if (get_decsrcref_transfer_function(aligned,
+ src_stride,
+ src_dtype,
+ &stransfer_finish_src,
+ &data_finish_src,
+ out_needs_api) != NPY_SUCCEED) {
+ PyArray_FreeStridedTransferData(data);
+ return NPY_FAIL;
+ }
+ }
+
+ if (wrap_transfer_function_one_to_n(stransfer, data,
+ stransfer_finish_src, data_finish_src,
+ dst_dtype->elsize,
+ N,
+ out_stransfer, out_transferdata) != NPY_SUCCEED) {
+ PyArray_FreeStridedTransferData(data);
+ PyArray_FreeStridedTransferData(data_finish_src);
+ return NPY_FAIL;
+ }
+
+ return NPY_SUCCEED;
+}
+
+/**************************** COPY N TO N CONTIGUOUS ************************/
+
+/* Copies N contiguous elements to N contiguous elements */
+typedef struct {
+    /* Set to _n_to_n_data_free / _n_to_n_data_copy (stored as void *) */
+    void *freefunc, *copyfunc;
+    /* Transfer function + data used for each subarray element */
+    PyArray_StridedTransferFn *stransfer;
+    void *data;
+    /* Subarray element count and src/dst element itemsizes */
+    npy_intp N, src_itemsize, dst_itemsize;
+} _n_to_n_data;
+
+/* transfer data free function */
+void _n_to_n_data_free(_n_to_n_data *data)
+{
+    PyArray_FreeStridedTransferData(data->data);
+    PyArray_free(data);
+}
+
+/* transfer data copy function */
+_n_to_n_data *_n_to_n_data_copy(_n_to_n_data *data)
+{
+    _n_to_n_data *newdata;
+
+    /* Allocate the data, and populate it */
+    newdata = (_n_to_n_data *)PyArray_malloc(sizeof(_n_to_n_data));
+    if (newdata == NULL) {
+        return NULL;
+    }
+    memcpy(newdata, data, sizeof(_n_to_n_data));
+    /* Deep-copy the wrapped transfer data so ownership isn't shared */
+    if (newdata->data != NULL) {
+        newdata->data = PyArray_CopyStridedTransferData(data->data);
+        if (newdata->data == NULL) {
+            PyArray_free(newdata);
+            return NULL;
+        }
+    }
+
+    return newdata;
+}
+
+/*
+ * Strided loop: for each of the N outer elements, transfers the
+ * subN-element contiguous subarray via the wrapped transfer function.
+ */
+static void
+_strided_to_strided_n_to_n(char *dst, npy_intp dst_stride,
+                        char *src, npy_intp src_stride,
+                        npy_intp N, npy_intp src_itemsize,
+                        void *data)
+{
+    _n_to_n_data *d = (_n_to_n_data *)data;
+    PyArray_StridedTransferFn *subtransfer = d->stransfer;
+    void *subdata = d->data;
+    npy_intp subN = d->N, src_subitemsize = d->src_itemsize,
+            dst_subitemsize = d->dst_itemsize;
+
+    while (N > 0) {
+        subtransfer(dst, dst_subitemsize,
+                    src, src_subitemsize,
+                    subN, src_subitemsize,
+                    subdata);
+
+        src += src_stride;
+        dst += dst_stride;
+        --N;
+    }
+}
+
+/*
+ * Contiguous fast path: src/dst strides exactly equal the subarray sizes,
+ * so all subN*N elements can be handed to the wrapped function at once.
+ */
+static void
+_contig_to_contig_n_to_n(char *dst, npy_intp NPY_UNUSED(dst_stride),
+                        char *src, npy_intp NPY_UNUSED(src_stride),
+                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+                        void *data)
+{
+    _n_to_n_data *d = (_n_to_n_data *)data;
+    PyArray_StridedTransferFn *subtransfer = d->stransfer;
+    void *subdata = d->data;
+    npy_intp subN = d->N, src_subitemsize = d->src_itemsize,
+            dst_subitemsize = d->dst_itemsize;
+
+    subtransfer(dst, dst_subitemsize,
+                src, src_subitemsize,
+                subN*N, src_subitemsize,
+                subdata);
+}
+
+/*
+ * Wraps a transfer function to produce one that copies N contiguous elements
+ * of src to N contiguous elements of dst.
+ *
+ * On NPY_SUCCEED the returned transfer data owns data_inner; on NPY_FAIL
+ * (allocation failure) the caller keeps ownership of data_inner.
+ */
+static int
+wrap_transfer_function_n_to_n(
+                            PyArray_StridedTransferFn *stransfer_inner,
+                            void *data_inner,
+                            npy_intp src_stride, npy_intp dst_stride,
+                            npy_intp src_itemsize, npy_intp dst_itemsize,
+                            npy_intp N,
+                            PyArray_StridedTransferFn **out_stransfer,
+                            void **out_transferdata)
+{
+    _n_to_n_data *data;
+
+    data = PyArray_malloc(sizeof(_n_to_n_data));
+    if (data == NULL) {
+        PyErr_NoMemory();
+        return NPY_FAIL;
+    }
+
+    data->freefunc = &_n_to_n_data_free;
+    data->copyfunc = &_n_to_n_data_copy;
+    data->stransfer = stransfer_inner;
+    data->data = data_inner;
+    data->N = N;
+    data->src_itemsize = src_itemsize;
+    data->dst_itemsize = dst_itemsize;
+
+    /*
+     * If the N subarray elements exactly fit in the strides,
+     * then can do a faster contiguous transfer.
+     */
+    if (src_stride == N * src_itemsize &&
+                    dst_stride == N * dst_itemsize) {
+        *out_stransfer = &_contig_to_contig_n_to_n;
+    }
+    else {
+        *out_stransfer = &_strided_to_strided_n_to_n;
+    }
+    *out_transferdata = data;
+
+    return NPY_SUCCEED;
+}
+
+/*
+ * Builds a transfer function that copies N-element contiguous subarrays
+ * of src to matching N-element contiguous subarrays of dst.
+ * Returns NPY_SUCCEED/NPY_FAIL.
+ */
+static int
+get_n_to_n_transfer_function(int aligned,
+                            npy_intp src_stride, npy_intp dst_stride,
+                            PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+                            int move_references,
+                            npy_intp N,
+                            PyArray_StridedTransferFn **out_stransfer,
+                            void **out_transferdata,
+                            int *out_needs_api)
+{
+    PyArray_StridedTransferFn *stransfer;
+    void *data;
+
+    /*
+     * src_stride and dst_stride are set to contiguous, because
+     * subarrays are always contiguous.
+     */
+    if (PyArray_GetDTypeTransferFunction(aligned,
+                    src_dtype->elsize, dst_dtype->elsize,
+                    src_dtype, dst_dtype,
+                    move_references,
+                    &stransfer, &data,
+                    out_needs_api) != NPY_SUCCEED) {
+        return NPY_FAIL;
+    }
+
+    /* On success the wrapper takes ownership of 'data' */
+    if (wrap_transfer_function_n_to_n(stransfer, data,
+                            src_stride, dst_stride,
+                            src_dtype->elsize, dst_dtype->elsize,
+                            N,
+                            out_stransfer,
+                            out_transferdata) != NPY_SUCCEED) {
+        PyArray_FreeStridedTransferData(data);
+        return NPY_FAIL;
+    }
+
+    return NPY_SUCCEED;
+}
+
+/********************** COPY WITH SUBARRAY BROADCAST ************************/
+
+/*
+ * One run of the run-length-encoded broadcast: 'count' consecutive dst
+ * elements, read from src at 'offset' (in bytes), or zero-filled when
+ * offset is -1.
+ */
+typedef struct {
+    npy_intp offset, count;
+} _subarray_broadcast_offsetrun;
+
+/* Copies element with subarray broadcasting */
+typedef struct {
+    /* Set to _subarray_broadcast_data_free / _copy (stored as void *) */
+    void *freefunc, *copyfunc;
+    PyArray_StridedTransferFn *stransfer;
+    void *data;
+    npy_intp src_N, dst_N, src_itemsize, dst_itemsize;
+    /* Optional functions to clear references in src / dst (may be NULL) */
+    PyArray_StridedTransferFn *stransfer_decsrcref;
+    void *data_decsrcref;
+    PyArray_StridedTransferFn *stransfer_decdstref;
+    void *data_decdstref;
+    /* This gets a run-length encoded representation of the transfer */
+    npy_intp run_count;
+    /*
+     * First element of a variable-length run array; the struct is
+     * over-allocated so &offsetruns can be indexed as offsetruns[i].
+     */
+    _subarray_broadcast_offsetrun offsetruns;
+} _subarray_broadcast_data;
+
+/* transfer data free function */
+void _subarray_broadcast_data_free(_subarray_broadcast_data *data)
+{
+    PyArray_FreeStridedTransferData(data->data);
+    PyArray_FreeStridedTransferData(data->data_decsrcref);
+    PyArray_FreeStridedTransferData(data->data_decdstref);
+    PyArray_free(data);
+}
+
+/* transfer data copy function */
+_subarray_broadcast_data *_subarray_broadcast_data_copy(
+                                _subarray_broadcast_data *data)
+{
+    _subarray_broadcast_data *newdata;
+    npy_intp run_count = data->run_count, structsize;
+
+    structsize = sizeof(_subarray_broadcast_data) +
+                        run_count*sizeof(_subarray_broadcast_offsetrun);
+
+    /* Allocate the data and populate it */
+    newdata = (_subarray_broadcast_data *)PyArray_malloc(structsize);
+    if (newdata == NULL) {
+        return NULL;
+    }
+    memcpy(newdata, data, structsize);
+    /* Deep-copy each wrapped transfer data, unwinding on failure */
+    if (data->data != NULL) {
+        newdata->data = PyArray_CopyStridedTransferData(data->data);
+        if (newdata->data == NULL) {
+            PyArray_free(newdata);
+            return NULL;
+        }
+    }
+    if (data->data_decsrcref != NULL) {
+        newdata->data_decsrcref =
+                        PyArray_CopyStridedTransferData(data->data_decsrcref);
+        if (newdata->data_decsrcref == NULL) {
+            PyArray_FreeStridedTransferData(newdata->data);
+            PyArray_free(newdata);
+            return NULL;
+        }
+    }
+    if (data->data_decdstref != NULL) {
+        newdata->data_decdstref =
+                        PyArray_CopyStridedTransferData(data->data_decdstref);
+        if (newdata->data_decdstref == NULL) {
+            PyArray_FreeStridedTransferData(newdata->data);
+            PyArray_FreeStridedTransferData(newdata->data_decsrcref);
+            PyArray_free(newdata);
+            return NULL;
+        }
+    }
+
+    return newdata;
+}
+
+/*
+ * For each of the N outer elements, replays the run-length-encoded
+ * broadcast: runs with a src offset are copied via the wrapped transfer
+ * function, runs flagged -1 are zero-filled.
+ */
+static void
+_strided_to_strided_subarray_broadcast(char *dst, npy_intp dst_stride,
+                        char *src, npy_intp src_stride,
+                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+                        void *data)
+{
+    _subarray_broadcast_data *d = (_subarray_broadcast_data *)data;
+    PyArray_StridedTransferFn *subtransfer = d->stransfer;
+    void *subdata = d->data;
+    npy_intp run, run_count = d->run_count,
+            src_subitemsize = d->src_itemsize,
+            dst_subitemsize = d->dst_itemsize;
+    npy_intp index, offset, count;
+    char *dst_ptr;
+    _subarray_broadcast_offsetrun *offsetruns = &d->offsetruns;
+
+    while (N > 0) {
+        index = 0;
+        for (run = 0; run < run_count; ++run) {
+            offset = offsetruns[run].offset;
+            count = offsetruns[run].count;
+            dst_ptr = dst + index*dst_subitemsize;
+            if (offset != -1) {
+                subtransfer(dst_ptr, dst_subitemsize,
+                            src + offset, src_subitemsize,
+                            count, src_subitemsize,
+                            subdata);
+            }
+            else {
+                /* -1 marks dst elements with no matching src: zero-fill */
+                memset(dst_ptr, 0, count*dst_subitemsize);
+            }
+            index += count;
+        }
+
+        src += src_stride;
+        dst += dst_stride;
+        --N;
+    }
+}
+
+
+/*
+ * Same as above, but additionally clears references: dst references are
+ * released before zero-filling, and src references are released once
+ * per outer element after its subarray has been transferred.
+ */
+static void
+_strided_to_strided_subarray_broadcast_withrefs(char *dst, npy_intp dst_stride,
+                        char *src, npy_intp src_stride,
+                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+                        void *data)
+{
+    _subarray_broadcast_data *d = (_subarray_broadcast_data *)data;
+    PyArray_StridedTransferFn *subtransfer = d->stransfer;
+    void *subdata = d->data;
+    PyArray_StridedTransferFn *stransfer_decsrcref = d->stransfer_decsrcref;
+    void *data_decsrcref = d->data_decsrcref;
+    PyArray_StridedTransferFn *stransfer_decdstref = d->stransfer_decdstref;
+    void *data_decdstref = d->data_decdstref;
+    npy_intp run, run_count = d->run_count,
+            src_subitemsize = d->src_itemsize,
+            dst_subitemsize = d->dst_itemsize,
+            src_subN = d->src_N;
+    npy_intp index, offset, count;
+    char *dst_ptr;
+    _subarray_broadcast_offsetrun *offsetruns = &d->offsetruns;
+
+    while (N > 0) {
+        index = 0;
+        for (run = 0; run < run_count; ++run) {
+            offset = offsetruns[run].offset;
+            count = offsetruns[run].count;
+            dst_ptr = dst + index*dst_subitemsize;
+            if (offset != -1) {
+                subtransfer(dst_ptr, dst_subitemsize,
+                            src + offset, src_subitemsize,
+                            count, src_subitemsize,
+                            subdata);
+            }
+            else {
+                if (stransfer_decdstref != NULL) {
+                    stransfer_decdstref(NULL, 0, dst_ptr, dst_subitemsize,
+                                        count, dst_subitemsize,
+                                        data_decdstref);
+                }
+                memset(dst_ptr, 0, count*dst_subitemsize);
+            }
+            index += count;
+        }
+
+        if (stransfer_decsrcref != NULL) {
+            stransfer_decsrcref(NULL, 0, src, src_subitemsize,
+                                src_subN, src_subitemsize,
+                                data_decsrcref);
+        }
+
+        src += src_stride;
+        dst += dst_stride;
+        --N;
+    }
+}
+
+
+/*
+ * Builds a transfer function that copies src subarrays into dst subarrays
+ * of a different shape, NumPy-broadcasting where the src dimension is 1,
+ * zero-padding dst elements with no src counterpart, and truncating where
+ * src is larger.  The dst->src mapping is precomputed per dst element and
+ * then run-length encoded for the strided loops above.
+ * Returns NPY_SUCCEED/NPY_FAIL.
+ */
+static int
+get_subarray_broadcast_transfer_function(int aligned,
+                            npy_intp src_stride, npy_intp dst_stride,
+                            PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+                            npy_intp src_size, npy_intp dst_size,
+                            PyArray_Dims src_shape, PyArray_Dims dst_shape,
+                            int move_references,
+                            PyArray_StridedTransferFn **out_stransfer,
+                            void **out_transferdata,
+                            int *out_needs_api)
+{
+    _subarray_broadcast_data *data;
+    npy_intp structsize, index, run, run_size, src_index, dst_index, i, ndim;
+    _subarray_broadcast_offsetrun *offsetruns;
+
+    /* Over-allocate so data->offsetruns can hold up to dst_size runs */
+    structsize = sizeof(_subarray_broadcast_data) +
+                        dst_size*sizeof(_subarray_broadcast_offsetrun);
+
+    /* Allocate the data and populate it */
+    data = (_subarray_broadcast_data *)PyArray_malloc(structsize);
+    if (data == NULL) {
+        PyErr_NoMemory();
+        return NPY_FAIL;
+    }
+
+    /*
+     * move_references is set to 0, handled in the wrapping transfer fn,
+     * src_stride and dst_stride are set to contiguous, as N will always
+     * be 1 when it's called.
+     */
+    if (PyArray_GetDTypeTransferFunction(aligned,
+                    src_dtype->elsize, dst_dtype->elsize,
+                    src_dtype, dst_dtype,
+                    0,
+                    &data->stransfer, &data->data,
+                    out_needs_api) != NPY_SUCCEED) {
+        PyArray_free(data);
+        return NPY_FAIL;
+    }
+    data->freefunc = &_subarray_broadcast_data_free;
+    data->copyfunc = &_subarray_broadcast_data_copy;
+    data->src_N = src_size;
+    data->dst_N = dst_size;
+    data->src_itemsize = src_dtype->elsize;
+    data->dst_itemsize = dst_dtype->elsize;
+
+    /* If the src object will need a DECREF */
+    if (move_references && PyDataType_REFCHK(src_dtype)) {
+        if (PyArray_GetDTypeTransferFunction(aligned,
+                        src_dtype->elsize, 0,
+                        src_dtype, NULL,
+                        1,
+                        &data->stransfer_decsrcref,
+                        &data->data_decsrcref,
+                        out_needs_api) != NPY_SUCCEED) {
+            PyArray_FreeStridedTransferData(data->data);
+            PyArray_free(data);
+            return NPY_FAIL;
+        }
+    }
+    else {
+        data->stransfer_decsrcref = NULL;
+        data->data_decsrcref = NULL;
+    }
+
+    /* If the dst object needs a DECREF to set it to NULL */
+    if (PyDataType_REFCHK(dst_dtype)) {
+        if (PyArray_GetDTypeTransferFunction(aligned,
+                        dst_dtype->elsize, 0,
+                        dst_dtype, NULL,
+                        1,
+                        &data->stransfer_decdstref,
+                        &data->data_decdstref,
+                        out_needs_api) != NPY_SUCCEED) {
+            PyArray_FreeStridedTransferData(data->data);
+            PyArray_FreeStridedTransferData(data->data_decsrcref);
+            PyArray_free(data);
+            return NPY_FAIL;
+        }
+    }
+    else {
+        data->stransfer_decdstref = NULL;
+        data->data_decdstref = NULL;
+    }
+
+    /* Calculate the broadcasting and set the offsets */
+    offsetruns = &data->offsetruns;
+    ndim = (src_shape.len > dst_shape.len) ? src_shape.len : dst_shape.len;
+    for (index = 0; index < dst_size; ++index) {
+        npy_intp src_factor = 1;
+
+        dst_index = index;
+        src_index = 0;
+        /* Decompose the dst flat index into coords, innermost dim first */
+        for (i = ndim-1; i >= 0; --i) {
+            npy_intp coord = 0, shape;
+
+            /* Get the dst coord of this index for dimension i */
+            if (i >= ndim - dst_shape.len) {
+                shape = dst_shape.ptr[i-(ndim-dst_shape.len)];
+                coord = dst_index % shape;
+                dst_index /= shape;
+            }
+
+            /* Translate it into a src coord and update src_index */
+            if (i >= ndim - src_shape.len) {
+                shape = src_shape.ptr[i-(ndim-src_shape.len)];
+                if (shape == 1) {
+                    /* Broadcast the single src element along this dim */
+                    coord = 0;
+                }
+                else {
+                    if (coord < shape) {
+                        src_index += src_factor*coord;
+                        src_factor *= shape;
+                    }
+                    else {
+                        /* Out of bounds, flag with -1 */
+                        src_index = -1;
+                        break;
+                    }
+                }
+            }
+        }
+        /* Set the offset */
+        if (src_index == -1) {
+            offsetruns[index].offset = -1;
+        }
+        else {
+            offsetruns[index].offset = src_index;
+        }
+    }
+
+    /*
+     * Run-length encode the result: merge adjacent dst elements whose
+     * src offsets are consecutive (or which are all zero-fills).
+     */
+    run = 0;
+    run_size = 1;
+    for (index = 1; index < dst_size; ++index) {
+        if (offsetruns[run].offset == -1) {
+            /* Stop the run when there's a valid index again */
+            if (offsetruns[index].offset != -1) {
+                offsetruns[run].count = run_size;
+                run++;
+                run_size = 1;
+                offsetruns[run].offset = offsetruns[index].offset;
+            }
+            else {
+                run_size++;
+            }
+        }
+        else {
+            /* Stop the run when there's a valid index again */
+            if (offsetruns[index].offset != offsetruns[index-1].offset + 1) {
+                offsetruns[run].count = run_size;
+                run++;
+                run_size = 1;
+                offsetruns[run].offset = offsetruns[index].offset;
+            }
+            else {
+                run_size++;
+            }
+        }
+    }
+    offsetruns[run].count = run_size;
+    run++;
+    data->run_count = run;
+
+    /* Multiply all the offsets by the src item size */
+    while (run--) {
+        if (offsetruns[run].offset != -1) {
+            offsetruns[run].offset *= src_dtype->elsize;
+        }
+    }
+
+    if (data->stransfer_decsrcref == NULL &&
+                                data->stransfer_decdstref == NULL) {
+        *out_stransfer = &_strided_to_strided_subarray_broadcast;
+    }
+    else {
+        *out_stransfer = &_strided_to_strided_subarray_broadcast_withrefs;
+    }
+    *out_transferdata = data;
+
+    return NPY_SUCCEED;
+}
+
+/*
+ * Handles subarray transfer. To call this, at least one of the dtype's
+ * subarrays must be non-NULL.
+ *
+ * Dispatches to the plain copy, one-to-n, n-to-n, or broadcast transfer
+ * builders depending on the two subarray shapes.  The PyArray_Dims
+ * allocated here are always freed before returning.
+ */
+static int
+get_subarray_transfer_function(int aligned,
+                            npy_intp src_stride, npy_intp dst_stride,
+                            PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+                            int move_references,
+                            PyArray_StridedTransferFn **out_stransfer,
+                            void **out_transferdata,
+                            int *out_needs_api)
+{
+    PyArray_Dims src_shape = {NULL, -1}, dst_shape = {NULL, -1};
+    npy_intp src_size = 1, dst_size = 1;
+
+    /* Get the subarray shapes and sizes */
+    if (src_dtype->subarray != NULL) {
+       if (!(PyArray_IntpConverter(src_dtype->subarray->shape,
+                                            &src_shape))) {
+            PyErr_SetString(PyExc_ValueError,
+                    "invalid subarray shape");
+            return NPY_FAIL;
+        }
+        src_size = PyArray_MultiplyList(src_shape.ptr, src_shape.len);
+        /* Transfer in terms of the subarray's base dtype from here on */
+        src_dtype = src_dtype->subarray->base;
+    }
+    if (dst_dtype->subarray != NULL) {
+       if (!(PyArray_IntpConverter(dst_dtype->subarray->shape,
+                                            &dst_shape))) {
+            if (src_shape.ptr != NULL) {
+                PyDimMem_FREE(src_shape.ptr);
+            }
+            PyErr_SetString(PyExc_ValueError,
+                    "invalid subarray shape");
+            return NPY_FAIL;
+        }
+        dst_size = PyArray_MultiplyList(dst_shape.ptr, dst_shape.len);
+        dst_dtype = dst_dtype->subarray->base;
+    }
+
+    /*
+     * Just a straight one-element copy.
+     */
+    if (dst_size == 1 && src_size == 1) {
+        PyDimMem_FREE(src_shape.ptr);
+        PyDimMem_FREE(dst_shape.ptr);
+
+        return PyArray_GetDTypeTransferFunction(aligned,
+                src_stride, dst_stride,
+                src_dtype, dst_dtype,
+                move_references,
+                out_stransfer, out_transferdata,
+                out_needs_api);
+    }
+    /* Copy the src value to all the dst values */
+    else if (src_size == 1) {
+        PyDimMem_FREE(src_shape.ptr);
+        PyDimMem_FREE(dst_shape.ptr);
+
+        return get_one_to_n_transfer_function(aligned,
+                        src_stride, dst_stride,
+                        src_dtype, dst_dtype,
+                        move_references,
+                        dst_size,
+                        out_stransfer, out_transferdata,
+                        out_needs_api);
+    }
+    /* If the shapes match exactly, do an n to n copy */
+    else if (src_shape.len == dst_shape.len &&
+               PyArray_CompareLists(src_shape.ptr, dst_shape.ptr,
+                                                    src_shape.len)) {
+        PyDimMem_FREE(src_shape.ptr);
+        PyDimMem_FREE(dst_shape.ptr);
+
+        return get_n_to_n_transfer_function(aligned,
+                        src_stride, dst_stride,
+                        src_dtype, dst_dtype,
+                        move_references,
+                        src_size,
+                        out_stransfer, out_transferdata,
+                        out_needs_api);
+    }
+    /*
+     * Copy the subarray with broadcasting, truncating, and zero-padding
+     * as necessary.
+     */
+    else {
+        int ret = get_subarray_broadcast_transfer_function(aligned,
+                        src_stride, dst_stride,
+                        src_dtype, dst_dtype,
+                        src_size, dst_size,
+                        src_shape, dst_shape,
+                        move_references,
+                        out_stransfer, out_transferdata,
+                        out_needs_api);
+
+        PyDimMem_FREE(src_shape.ptr);
+        PyDimMem_FREE(dst_shape.ptr);
+        return ret;
+    }
+}
+
+/**************************** COPY FIELDS *******************************/
+/* Per-field transfer: where to read/write within an item, and how */
+typedef struct {
+    npy_intp src_offset, dst_offset, src_itemsize;
+    PyArray_StridedTransferFn *stransfer;
+    void *data;
+} _single_field_transfer;
+
+typedef struct {
+    /* Set to _field_transfer_data_free / _copy (stored as void *) */
+    void *freefunc, *copyfunc;
+    npy_intp field_count;
+
+    /*
+     * First element of a variable-length field array; the struct is
+     * over-allocated so &fields can be indexed as fields[i].
+     */
+    _single_field_transfer fields;
+} _field_transfer_data;
+
+/* transfer data free function */
+void _field_transfer_data_free(_field_transfer_data *data)
+{
+    npy_intp i, field_count = data->field_count;
+    _single_field_transfer *fields = &data->fields;
+
+    for (i = 0; i < field_count; ++i) {
+        PyArray_FreeStridedTransferData(fields[i].data);
+    }
+    PyArray_free(data);
+}
+
+/* transfer data copy function */
+_field_transfer_data *_field_transfer_data_copy(
+                                _field_transfer_data *data)
+{
+    _field_transfer_data *newdata;
+    npy_intp i, field_count = data->field_count, structsize;
+    _single_field_transfer *fields, *newfields;
+
+    structsize = sizeof(_field_transfer_data) +
+                    field_count * sizeof(_single_field_transfer);
+
+    /* Allocate the data and populate it */
+    newdata = (_field_transfer_data *)PyArray_malloc(structsize);
+    if (newdata == NULL) {
+        return NULL;
+    }
+    memcpy(newdata, data, structsize);
+    /* Copy all the fields transfer data */
+    fields = &data->fields;
+    newfields = &newdata->fields;
+    for (i = 0; i < field_count; ++i) {
+        if (fields[i].data != NULL) {
+            newfields[i].data =
+                            PyArray_CopyStridedTransferData(fields[i].data);
+            if (newfields[i].data == NULL) {
+                /* Unwind the copies made so far */
+                for (i = i-1; i >= 0; --i) {
+                    PyArray_FreeStridedTransferData(newfields[i].data);
+                }
+                PyArray_free(newdata);
+                return NULL;
+            }
+        }
+
+    }
+
+    return newdata;
+}
+
+/*
+ * Applies every per-field transfer function over the N items, processing
+ * at most NPY_LOWLEVEL_BUFFER_BLOCKSIZE items per pass so each field's
+ * data stays cache-warm across the field loop.
+ */
+static void
+_strided_to_strided_field_transfer(char *dst, npy_intp dst_stride,
+                        char *src, npy_intp src_stride,
+                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+                        void *data)
+{
+    _field_transfer_data *d = (_field_transfer_data *)data;
+    npy_intp i, field_count = d->field_count;
+    _single_field_transfer *field;
+
+    /* Do the transfer a block at a time */
+    for (;;) {
+        field = &d->fields;
+        if (N > NPY_LOWLEVEL_BUFFER_BLOCKSIZE) {
+            for (i = 0; i < field_count; ++i, ++field) {
+                field->stransfer(dst + field->dst_offset, dst_stride,
+                                 src + field->src_offset, src_stride,
+                                 NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
+                                 field->src_itemsize,
+                                 field->data);
+            }
+            N -= NPY_LOWLEVEL_BUFFER_BLOCKSIZE;
+            src += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*src_stride;
+            dst += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*dst_stride;
+        }
+        else {
+            /* Final (or only) partial block, then done */
+            for (i = 0; i < field_count; ++i, ++field) {
+                field->stransfer(dst + field->dst_offset, dst_stride,
+                                 src + field->src_offset, src_stride,
+                                 N,
+                                 field->src_itemsize,
+                                 field->data);
+            }
+            return;
+        }
+    }
+}
+
+/*
+ * Handles fields transfer. To call this, at least one of the dtypes
+ * must have fields.
+ *
+ * Three cases: (1) src has no fields -> copy src into every dst field;
+ * (2) dst has no fields -> copy src's first field into dst (bool dst is
+ * special-cased to "set one"); (3) both have fields -> match fields by
+ * name, zeroing unmatched dst fields.  When move_references is set,
+ * extra decref-only entries are appended for src reference fields.
+ */
+static int
+get_fields_transfer_function(int aligned,
+                            npy_intp src_stride, npy_intp dst_stride,
+                            PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+                            int move_references,
+                            PyArray_StridedTransferFn **out_stransfer,
+                            void **out_transferdata,
+                            int *out_needs_api)
+{
+    PyObject *names, *key, *tup, *title;
+    PyArray_Descr *src_fld_dtype, *dst_fld_dtype;
+    npy_int i, names_size, field_count, structsize;
+    int src_offset, dst_offset;
+    _field_transfer_data *data;
+    _single_field_transfer *fields;
+
+    /* Copy the src value to all the fields of dst */
+    if (!PyDescr_HASFIELDS(src_dtype)) {
+        names = dst_dtype->names;
+        names_size = PyTuple_GET_SIZE(dst_dtype->names);
+
+        field_count = names_size;
+        /* +1 reserves room for a possible decref-src entry below */
+        structsize = sizeof(_field_transfer_data) +
+                        (field_count + 1) * sizeof(_single_field_transfer);
+        /* Allocate the data and populate it */
+        data = (_field_transfer_data *)PyArray_malloc(structsize);
+        if (data == NULL) {
+            PyErr_NoMemory();
+            return NPY_FAIL;
+        }
+        data->freefunc = &_field_transfer_data_free;
+        data->copyfunc = &_field_transfer_data_copy;
+        fields = &data->fields;
+
+        for (i = 0; i < names_size; ++i) {
+            key = PyTuple_GET_ITEM(names, i);
+            /* fields dict values are (dtype, offset[, title]) tuples */
+            tup = PyDict_GetItem(dst_dtype->fields, key);
+            if (!PyArg_ParseTuple(tup, "Oi|O", &dst_fld_dtype,
+                                                &dst_offset, &title)) {
+                PyArray_free(data);
+                return NPY_FAIL;
+            }
+            if (PyArray_GetDTypeTransferFunction(0,
+                                    src_stride, dst_stride,
+                                    src_dtype, dst_fld_dtype,
+                                    0,
+                                    &fields[i].stransfer,
+                                    &fields[i].data,
+                                    out_needs_api) != NPY_SUCCEED) {
+                for (i = i-1; i >= 0; --i) {
+                    PyArray_FreeStridedTransferData(fields[i].data);
+                }
+                PyArray_free(data);
+                return NPY_FAIL;
+            }
+            fields[i].src_offset = 0;
+            fields[i].dst_offset = dst_offset;
+            fields[i].src_itemsize = src_dtype->elsize;
+        }
+
+        /*
+         * If the references should be removed from src, add
+         * another transfer function to do that.
+         */
+        if (move_references && PyDataType_REFCHK(src_dtype)) {
+            if (get_decsrcref_transfer_function(0,
+                                    src_stride,
+                                    src_dtype,
+                                    &fields[field_count].stransfer,
+                                    &fields[field_count].data,
+                                    out_needs_api) != NPY_SUCCEED) {
+                for (i = 0; i < field_count; ++i) {
+                    PyArray_FreeStridedTransferData(fields[i].data);
+                }
+                PyArray_free(data);
+                return NPY_FAIL;
+            }
+            fields[field_count].src_offset = 0;
+            fields[field_count].dst_offset = 0;
+            fields[field_count].src_itemsize = src_dtype->elsize;
+            field_count++;
+        }
+        data->field_count = field_count;
+
+        *out_stransfer = &_strided_to_strided_field_transfer;
+        *out_transferdata = data;
+
+        return NPY_SUCCEED;
+    }
+    /* Copy the value of the first field to dst */
+    else if (!PyDescr_HASFIELDS(dst_dtype)) {
+        names = src_dtype->names;
+        names_size = PyTuple_GET_SIZE(src_dtype->names);
+
+        /*
+         * If DECREF is needed on source fields, may need
+         * to process all the fields
+         */
+        if (move_references && PyDataType_REFCHK(src_dtype)) {
+            field_count = names_size + 1;
+        }
+        else {
+            field_count = 1;
+        }
+        structsize = sizeof(_field_transfer_data) +
+                        field_count * sizeof(_single_field_transfer);
+        /* Allocate the data and populate it */
+        data = (_field_transfer_data *)PyArray_malloc(structsize);
+        if (data == NULL) {
+            PyErr_NoMemory();
+            return NPY_FAIL;
+        }
+        data->freefunc = &_field_transfer_data_free;
+        data->copyfunc = &_field_transfer_data_copy;
+        fields = &data->fields;
+
+        key = PyTuple_GET_ITEM(names, 0);
+        tup = PyDict_GetItem(src_dtype->fields, key);
+        if (!PyArg_ParseTuple(tup, "Oi|O", &src_fld_dtype,
+                                            &src_offset, &title)) {
+            PyArray_free(data);
+            return NPY_FAIL;
+        }
+        /* From here field_count re-counts the entries actually filled in */
+        field_count = 0;
+        /*
+         * Special case bool type, the existence of fields implies True
+         *
+         * TODO: Perhaps a better behavior would be to combine all the
+         *       input fields with an OR?  The same would apply to subarrays.
+         */
+        if (dst_dtype->type_num == NPY_BOOL) {
+            if (get_bool_setdstone_transfer_function(dst_stride,
+                                    &fields[field_count].stransfer,
+                                    &fields[field_count].data,
+                                    out_needs_api) != NPY_SUCCEED) {
+                PyArray_free(data);
+                return NPY_FAIL;
+            }
+            fields[field_count].src_offset = 0;
+            fields[field_count].dst_offset = 0;
+            fields[field_count].src_itemsize = 0;
+            field_count++;
+
+            /* If the src field has references, may need to clear them */
+            if (move_references && PyDataType_REFCHK(src_fld_dtype)) {
+                if (get_decsrcref_transfer_function(0,
+                                    src_stride,
+                                    src_fld_dtype,
+                                    &fields[field_count].stransfer,
+                                    &fields[field_count].data,
+                                    out_needs_api) != NPY_SUCCEED) {
+                    PyArray_FreeStridedTransferData(fields[0].data);
+                    PyArray_free(data);
+                    return NPY_FAIL;
+                }
+                fields[field_count].src_offset = src_offset;
+                fields[field_count].dst_offset = 0;
+                fields[field_count].src_itemsize = src_fld_dtype->elsize;
+                field_count++;
+            }
+        }
+        /* Transfer the first field to the output */
+        else {
+            if (PyArray_GetDTypeTransferFunction(0,
+                                    src_stride, dst_stride,
+                                    src_fld_dtype, dst_dtype,
+                                    move_references,
+                                    &fields[field_count].stransfer,
+                                    &fields[field_count].data,
+                                    out_needs_api) != NPY_SUCCEED) {
+                PyArray_free(data);
+                return NPY_FAIL;
+            }
+            fields[field_count].src_offset = src_offset;
+            fields[field_count].dst_offset = 0;
+            fields[field_count].src_itemsize = src_fld_dtype->elsize;
+            field_count++;
+        }
+
+        /*
+         * If the references should be removed from src, add
+         * more transfer functions to decrement the references
+         * for all the other fields.
+         */
+        if (move_references && PyDataType_REFCHK(src_dtype)) {
+            for (i = 1; i < names_size; ++i) {
+                key = PyTuple_GET_ITEM(names, i);
+                tup = PyDict_GetItem(src_dtype->fields, key);
+                if (!PyArg_ParseTuple(tup, "Oi|O", &src_fld_dtype,
+                                                &src_offset, &title)) {
+                    /*
+                     * NOTE(review): this error path leaks 'data' and the
+                     * transfer data of the entries filled in so far --
+                     * every sibling failure branch frees them first.
+                     */
+                    return NPY_FAIL;
+                }
+                if (PyDataType_REFCHK(src_fld_dtype)) {
+                    if (get_decsrcref_transfer_function(0,
+                                    src_stride,
+                                    src_fld_dtype,
+                                    &fields[field_count].stransfer,
+                                    &fields[field_count].data,
+                                    out_needs_api) != NPY_SUCCEED) {
+                        for (i = field_count-1; i >= 0; --i) {
+                            PyArray_FreeStridedTransferData(fields[i].data);
+                        }
+                        PyArray_free(data);
+                        return NPY_FAIL;
+                    }
+                    fields[field_count].src_offset = src_offset;
+                    fields[field_count].dst_offset = 0;
+                    fields[field_count].src_itemsize = src_fld_dtype->elsize;
+                    field_count++;
+                }
+            }
+        }
+
+        data->field_count = field_count;
+
+        *out_stransfer = &_strided_to_strided_field_transfer;
+        *out_transferdata = data;
+
+        return NPY_SUCCEED;
+    }
+    /* Match up the fields to copy */
+    else {
+        /* Keeps track of the names we already used */
+        PyObject *used_names_dict = NULL;
+
+        names = dst_dtype->names;
+        names_size = PyTuple_GET_SIZE(dst_dtype->names);
+
+        /*
+         * If DECREF is needed on source fields, will need
+         * to also go through its fields.
+         */
+        if (move_references && PyDataType_REFCHK(src_dtype)) {
+            field_count = names_size + PyTuple_GET_SIZE(src_dtype->names);
+            used_names_dict = PyDict_New();
+            if (used_names_dict == NULL) {
+                return NPY_FAIL;
+            }
+        }
+        else {
+            field_count = names_size;
+        }
+        structsize = sizeof(_field_transfer_data) +
+                        field_count * sizeof(_single_field_transfer);
+        /* Allocate the data and populate it */
+        data = (_field_transfer_data *)PyArray_malloc(structsize);
+        if (data == NULL) {
+            PyErr_NoMemory();
+            Py_XDECREF(used_names_dict);
+            return NPY_FAIL;
+        }
+        data->freefunc = &_field_transfer_data_free;
+        data->copyfunc = &_field_transfer_data_copy;
+        fields = &data->fields;
+
+        for (i = 0; i < names_size; ++i) {
+            key = PyTuple_GET_ITEM(names, i);
+            tup = PyDict_GetItem(dst_dtype->fields, key);
+            if (!PyArg_ParseTuple(tup, "Oi|O", &dst_fld_dtype,
+                                                &dst_offset, &title)) {
+                for (i = i-1; i >= 0; --i) {
+                    PyArray_FreeStridedTransferData(fields[i].data);
+                }
+                PyArray_free(data);
+                Py_XDECREF(used_names_dict);
+                return NPY_FAIL;
+            }
+            /* NULL means dst field has no src counterpart: zero it */
+            tup = PyDict_GetItem(src_dtype->fields, key);
+            if (tup != NULL) {
+                if (!PyArg_ParseTuple(tup, "Oi|O", &src_fld_dtype,
+                                                    &src_offset, &title)) {
+                    for (i = i-1; i >= 0; --i) {
+                        PyArray_FreeStridedTransferData(fields[i].data);
+                    }
+                    PyArray_free(data);
+                    Py_XDECREF(used_names_dict);
+                    return NPY_FAIL;
+                }
+                if (PyArray_GetDTypeTransferFunction(0,
+                                        src_stride, dst_stride,
+                                        src_fld_dtype, dst_fld_dtype,
+                                        move_references,
+                                        &fields[i].stransfer,
+                                        &fields[i].data,
+                                        out_needs_api) != NPY_SUCCEED) {
+                    for (i = i-1; i >= 0; --i) {
+                        PyArray_FreeStridedTransferData(fields[i].data);
+                    }
+                    PyArray_free(data);
+                    Py_XDECREF(used_names_dict);
+                    return NPY_FAIL;
+                }
+                fields[i].src_offset = src_offset;
+                fields[i].dst_offset = dst_offset;
+                fields[i].src_itemsize = src_fld_dtype->elsize;
+
+                if (used_names_dict != NULL) {
+                    PyDict_SetItem(used_names_dict, key, Py_True);
+                }
+            }
+            else {
+                if (get_setdstzero_transfer_function(0,
+                                            dst_stride,
+                                            dst_fld_dtype,
+                                            &fields[i].stransfer,
+                                            &fields[i].data,
+                                            out_needs_api) != NPY_SUCCEED) {
+                    for (i = i-1; i >= 0; --i) {
+                        PyArray_FreeStridedTransferData(fields[i].data);
+                    }
+                    PyArray_free(data);
+                    Py_XDECREF(used_names_dict);
+                    return NPY_FAIL;
+                }
+                fields[i].src_offset = 0;
+                fields[i].dst_offset = dst_offset;
+                fields[i].src_itemsize = 0;
+            }
+        }
+
+        /* Append decref entries for src reference fields not copied above */
+        if (move_references && PyDataType_REFCHK(src_dtype)) {
+            /* Use field_count to track additional functions added */
+            field_count = names_size;
+
+            names = src_dtype->names;
+            names_size = PyTuple_GET_SIZE(src_dtype->names);
+            for (i = 0; i < names_size; ++i) {
+                key = PyTuple_GET_ITEM(names, i);
+                if (PyDict_GetItem(used_names_dict, key) == NULL) {
+                    tup = PyDict_GetItem(src_dtype->fields, key);
+                    if (!PyArg_ParseTuple(tup, "Oi|O", &src_fld_dtype,
+                                                    &src_offset, &title)) {
+                        for (i = field_count-1; i >= 0; --i) {
+                            PyArray_FreeStridedTransferData(fields[i].data);
+                        }
+                        PyArray_free(data);
+                        Py_XDECREF(used_names_dict);
+                        return NPY_FAIL;
+                    }
+                    if (PyDataType_REFCHK(src_fld_dtype)) {
+                        if (get_decsrcref_transfer_function(0,
+                                    src_stride,
+                                    src_fld_dtype,
+                                    &fields[field_count].stransfer,
+                                    &fields[field_count].data,
+                                    out_needs_api) != NPY_SUCCEED) {
+                            for (i = field_count-1; i >= 0; --i) {
+                                PyArray_FreeStridedTransferData(fields[i].data);
+                            }
+                            PyArray_free(data);
+                            /*
+                             * NOTE(review): leaks used_names_dict -- the
+                             * other failure branches Py_XDECREF it here.
+                             */
+                            return NPY_FAIL;
+                        }
+                        fields[field_count].src_offset = src_offset;
+                        fields[field_count].dst_offset = 0;
+                        fields[field_count].src_itemsize =
+                                                src_fld_dtype->elsize;
+                        field_count++;
+                    }
+                }
+            }
+        }
+
+        Py_XDECREF(used_names_dict);
+
+        data->field_count = field_count;
+
+        *out_stransfer = &_strided_to_strided_field_transfer;
+        *out_transferdata = data;
+
+        return NPY_SUCCEED;
+    }
+}
+
+/*
+ * Builds a transfer function that only clears (DECREFs) the reference
+ * fields of src; fields without references get no entry.  The 'aligned'
+ * parameter is accepted but not used (0 is passed down).
+ * Returns NPY_SUCCEED/NPY_FAIL.
+ */
+static int
+get_decsrcref_fields_transfer_function(int aligned,
+                            npy_intp src_stride,
+                            PyArray_Descr *src_dtype,
+                            PyArray_StridedTransferFn **out_stransfer,
+                            void **out_transferdata,
+                            int *out_needs_api)
+{
+    PyObject *names, *key, *tup, *title;
+    PyArray_Descr *src_fld_dtype;
+    npy_int i, names_size, field_count, structsize;
+    int src_offset;
+    _field_transfer_data *data;
+    _single_field_transfer *fields;
+
+    names = src_dtype->names;
+    names_size = PyTuple_GET_SIZE(src_dtype->names);
+
+    field_count = names_size;
+    structsize = sizeof(_field_transfer_data) +
+                    field_count * sizeof(_single_field_transfer);
+    /* Allocate the data and populate it */
+    data = (_field_transfer_data *)PyArray_malloc(structsize);
+    if (data == NULL) {
+        PyErr_NoMemory();
+        return NPY_FAIL;
+    }
+    data->freefunc = &_field_transfer_data_free;
+    data->copyfunc = &_field_transfer_data_copy;
+    fields = &data->fields;
+
+    /* Re-count: only reference-holding fields get an entry */
+    field_count = 0;
+    for (i = 0; i < names_size; ++i) {
+        key = PyTuple_GET_ITEM(names, i);
+        tup = PyDict_GetItem(src_dtype->fields, key);
+        if (!PyArg_ParseTuple(tup, "Oi|O", &src_fld_dtype,
+                                            &src_offset, &title)) {
+            PyArray_free(data);
+            return NPY_FAIL;
+        }
+        if (PyDataType_REFCHK(src_fld_dtype)) {
+            if (out_needs_api) {
+                *out_needs_api = 1;
+            }
+            if (get_decsrcref_transfer_function(0,
+                                    src_stride,
+                                    src_fld_dtype,
+                                    &fields[field_count].stransfer,
+                                    &fields[field_count].data,
+                                    out_needs_api) != NPY_SUCCEED) {
+                for (i = field_count-1; i >= 0; --i) {
+                    PyArray_FreeStridedTransferData(fields[i].data);
+                }
+                PyArray_free(data);
+                return NPY_FAIL;
+            }
+            fields[field_count].src_offset = src_offset;
+            fields[field_count].dst_offset = 0;
+            /*
+             * NOTE(review): every other field-entry setup in this file
+             * uses src_fld_dtype->elsize here; confirm src_dtype->elsize
+             * (the whole struct's itemsize) is intentional.
+             */
+            fields[field_count].src_itemsize = src_dtype->elsize;
+            field_count++;
+        }
+    }
+
+    data->field_count = field_count;
+
+    *out_stransfer = &_strided_to_strided_field_transfer;
+    *out_transferdata = data;
+
+    return NPY_SUCCEED;
+}
+
+/*
+ * Builds a transfer function that zeroes every field of dst (used when a
+ * dst field has no matching src data).  The 'aligned' parameter is
+ * accepted but not used (0 is passed down).
+ * Returns NPY_SUCCEED/NPY_FAIL.
+ */
+static int
+get_setdestzero_fields_transfer_function(int aligned,
+                            npy_intp dst_stride,
+                            PyArray_Descr *dst_dtype,
+                            PyArray_StridedTransferFn **out_stransfer,
+                            void **out_transferdata,
+                            int *out_needs_api)
+{
+    PyObject *names, *key, *tup, *title;
+    PyArray_Descr *dst_fld_dtype;
+    npy_int i, names_size, field_count, structsize;
+    int dst_offset;
+    _field_transfer_data *data;
+    _single_field_transfer *fields;
+
+    names = dst_dtype->names;
+    names_size = PyTuple_GET_SIZE(dst_dtype->names);
+
+    field_count = names_size;
+    structsize = sizeof(_field_transfer_data) +
+                    field_count * sizeof(_single_field_transfer);
+    /* Allocate the data and populate it */
+    data = (_field_transfer_data *)PyArray_malloc(structsize);
+    if (data == NULL) {
+        PyErr_NoMemory();
+        return NPY_FAIL;
+    }
+    data->freefunc = &_field_transfer_data_free;
+    data->copyfunc = &_field_transfer_data_copy;
+    fields = &data->fields;
+
+    for (i = 0; i < names_size; ++i) {
+        key = PyTuple_GET_ITEM(names, i);
+        tup = PyDict_GetItem(dst_dtype->fields, key);
+        if (!PyArg_ParseTuple(tup, "Oi|O", &dst_fld_dtype,
+                                            &dst_offset, &title)) {
+            PyArray_free(data);
+            return NPY_FAIL;
+        }
+        if (get_setdstzero_transfer_function(0,
+                                    dst_stride,
+                                    dst_fld_dtype,
+                                    &fields[i].stransfer,
+                                    &fields[i].data,
+                                    out_needs_api) != NPY_SUCCEED) {
+            for (i = i-1; i >= 0; --i) {
+                PyArray_FreeStridedTransferData(fields[i].data);
+            }
+            PyArray_free(data);
+            return NPY_FAIL;
+        }
+        fields[i].src_offset = 0;
+        fields[i].dst_offset = dst_offset;
+        fields[i].src_itemsize = 0;
+    }
+
+    data->field_count = field_count;
+
+    *out_stransfer = &_strided_to_strided_field_transfer;
+    *out_transferdata = data;
+
+    return NPY_SUCCEED;
+}
+
+/************************* DEST BOOL SETONE *******************************/
+
+/* Writes the value 1 into each of the N strided destination bools. */
+static void
+_null_to_strided_set_bool_one(char *dst,
+                        npy_intp dst_stride,
+                        char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
+                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+                        void *NPY_UNUSED(data))
+{
+    /* The bool type is one byte, so a plain char store suffices */
+    npy_intp i;
+
+    for (i = 0; i < N; ++i, dst += dst_stride) {
+        *dst = 1;
+    }
+}
+
+/* Contiguous variant of the bool set-to-one transfer: one memset call. */
+static void
+_null_to_contig_set_bool_one(char *dst,
+                        npy_intp NPY_UNUSED(dst_stride),
+                        char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
+                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+                        void *NPY_UNUSED(data))
+{
+    /* bool type is one byte, so can just use the char */
+
+    memset(dst, 1, N);
+}
+
+/*
+ * Only for the bool type: produces a transfer function which sets
+ * every destination element to 1.  No auxiliary data is needed.
+ */
+NPY_NO_EXPORT int
+get_bool_setdstone_transfer_function(npy_intp dst_stride,
+                            PyArray_StridedTransferFn **out_stransfer,
+                            void **out_transferdata,
+                            int *NPY_UNUSED(out_needs_api))
+{
+    /* A unit stride means the destination is contiguous -> use memset */
+    *out_stransfer = (dst_stride == 1) ? &_null_to_contig_set_bool_one
+                                       : &_null_to_strided_set_bool_one;
+    *out_transferdata = NULL;
+
+    return NPY_SUCCEED;
+}
+
+/*************************** DEST SETZERO *******************************/
+
+/* Auxiliary data for the transfer functions that memset dest to zero */
+typedef struct {
+    /* free/copy hooks, matching the common transfer-data header layout */
+    void *freefunc, *copyfunc;
+    /* size in bytes of one destination element */
+    npy_intp dst_itemsize;
+} _dst_memset_zero_data;
+
+/*
+ * memset-zero transfer data copy function (the old comment said
+ * "zero-padded", a copy-paste slip).  Returns a fresh heap copy of
+ * 'data', or NULL on allocation failure (no Python error is set).
+ */
+_dst_memset_zero_data *_dst_memset_zero_data_copy(
+                                        _dst_memset_zero_data *data)
+{
+    _dst_memset_zero_data *newdata =
+            (_dst_memset_zero_data *)PyArray_malloc(
+                                    sizeof(_dst_memset_zero_data));
+    if (newdata == NULL) {
+        return NULL;
+    }
+
+    memcpy(newdata, data, sizeof(_dst_memset_zero_data));
+
+    return newdata;
+}
+
+/* Zeroes each of the N strided destination items (itemsize from 'data'). */
+static void
+_null_to_strided_memset_zero(char *dst,
+                        npy_intp dst_stride,
+                        char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
+                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+                        void *data)
+{
+    _dst_memset_zero_data *d = (_dst_memset_zero_data *)data;
+    npy_intp dst_itemsize = d->dst_itemsize;
+
+    while (N > 0) {
+        memset(dst, 0, dst_itemsize);
+        dst += dst_stride;
+        --N;
+    }
+}
+
+/*
+ * Contiguous variant: zeroes all N items with a single memset.
+ * dst_stride is unused here (the caller only selects this function when
+ * dst_stride equals the itemsize), so mark it NPY_UNUSED for consistency
+ * with the other transfer functions in this file.
+ */
+static void
+_null_to_contig_memset_zero(char *dst,
+                        npy_intp NPY_UNUSED(dst_stride),
+                        char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
+                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+                        void *data)
+{
+    _dst_memset_zero_data *d = (_dst_memset_zero_data *)data;
+    npy_intp dst_itemsize = d->dst_itemsize;
+
+    memset(dst, 0, N*dst_itemsize);
+}
+
+/*
+ * For NPY_OBJECT destinations: releases the existing reference in each
+ * item and stores a NULL pointer in its place.  Requires the GIL.
+ */
+static void
+_null_to_strided_reference_setzero(char *dst,
+                        npy_intp dst_stride,
+                        char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
+                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+                        void *NPY_UNUSED(data))
+{
+    PyObject *dst_ref = NULL;
+
+    while (N > 0) {
+        NPY_COPY_PYOBJECT_PTR(&dst_ref, dst);
+
+        /* Release the reference in dst */
+        NPY_DT_DBG_REFTRACE("dec dest ref (to set zero)", dst_ref);
+        Py_XDECREF(dst_ref);
+
+        /* Set it to zero */
+        dst_ref = NULL;
+        NPY_COPY_PYOBJECT_PTR(dst, &dst_ref);
+
+        dst += dst_stride;
+        --N;
+    }
+}
+
+/*
+ * Builds a transfer function which sets a 'dst_dtype' destination to
+ * zero: plain memset for reference-free dtypes, decref+NULL for object
+ * dtypes, and recursive wrapping for subarrays and fields.
+ * Returns NPY_SUCCEED or NPY_FAIL.
+ */
+NPY_NO_EXPORT int
+get_setdstzero_transfer_function(int aligned,
+                            npy_intp dst_stride,
+                            PyArray_Descr *dst_dtype,
+                            PyArray_StridedTransferFn **out_stransfer,
+                            void **out_transferdata,
+                            int *out_needs_api)
+{
+    _dst_memset_zero_data *data;
+
+    /* If there are no references, just set the whole thing to zero */
+    if (!PyDataType_REFCHK(dst_dtype)) {
+        data = (_dst_memset_zero_data *)
+                        PyArray_malloc(sizeof(_dst_memset_zero_data));
+        if (data == NULL) {
+            PyErr_NoMemory();
+            return NPY_FAIL;
+        }
+
+        data->freefunc = &PyArray_free;
+        data->copyfunc = &_dst_memset_zero_data_copy;
+        data->dst_itemsize = dst_dtype->elsize;
+
+        /* Contiguous destinations can use the single-memset variant */
+        if (dst_stride == data->dst_itemsize) {
+            *out_stransfer = &_null_to_contig_memset_zero;
+        }
+        else {
+            *out_stransfer = &_null_to_strided_memset_zero;
+        }
+        *out_transferdata = data;
+    }
+    /* If it's exactly one reference, use the decref function */
+    else if (dst_dtype->type_num == NPY_OBJECT) {
+        if (out_needs_api) {
+            *out_needs_api = 1;
+        }
+
+        *out_stransfer = &_null_to_strided_reference_setzero;
+        *out_transferdata = NULL;
+    }
+    /* If there are subarrays, need to wrap it */
+    else if (dst_dtype->subarray != NULL) {
+        PyArray_Dims dst_shape = {NULL, -1};
+        npy_intp dst_size = 1;
+        PyArray_StridedTransferFn *stransfer;
+        void *data;    /* NOTE: intentionally shadows the outer 'data' */
+
+        if (out_needs_api) {
+            *out_needs_api = 1;
+        }
+
+        if (!(PyArray_IntpConverter(dst_dtype->subarray->shape,
+                                            &dst_shape))) {
+            PyErr_SetString(PyExc_ValueError,
+                    "invalid subarray shape");
+            return NPY_FAIL;
+        }
+        dst_size = PyArray_MultiplyList(dst_shape.ptr, dst_shape.len);
+        PyDimMem_FREE(dst_shape.ptr);
+
+        /* Get a function for contiguous dst of the subarray type */
+        if (get_setdstzero_transfer_function(aligned,
+                                dst_dtype->subarray->base->elsize,
+                                dst_dtype->subarray->base,
+                                &stransfer, &data,
+                                out_needs_api) != NPY_SUCCEED) {
+            return NPY_FAIL;
+        }
+
+        /* Repeat that function dst_size times per outer item */
+        if (wrap_transfer_function_n_to_n(stransfer, data,
+                                0, dst_stride,
+                                0, dst_dtype->subarray->base->elsize,
+                                dst_size,
+                                out_stransfer, out_transferdata) != NPY_SUCCEED) {
+            PyArray_FreeStridedTransferData(data);
+            return NPY_FAIL;
+        }
+    }
+    /* If there are fields, need to do each field */
+    else if (PyDataType_HASFIELDS(dst_dtype)) {
+        if (out_needs_api) {
+            *out_needs_api = 1;
+        }
+
+        return get_setdestzero_fields_transfer_function(aligned,
+                            dst_stride, dst_dtype,
+                            out_stransfer,
+                            out_transferdata,
+                            out_needs_api);
+    }
+
+    return NPY_SUCCEED;
+}
+
+/* Transfer function that does nothing (for dtypes holding no references). */
+static void
+_dec_src_ref_nop(char *NPY_UNUSED(dst),
+                        npy_intp NPY_UNUSED(dst_stride),
+                        char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
+                        npy_intp NPY_UNUSED(N),
+                        npy_intp NPY_UNUSED(src_itemsize),
+                        void *NPY_UNUSED(data))
+{
+    /* NOP */
+}
+
+/* Drops one reference per strided NPY_OBJECT source item; dst is unused. */
+static void
+_strided_to_null_dec_src_ref_reference(char *NPY_UNUSED(dst),
+                        npy_intp NPY_UNUSED(dst_stride),
+                        char *src, npy_intp src_stride,
+                        npy_intp N,
+                        npy_intp NPY_UNUSED(src_itemsize),
+                        void *NPY_UNUSED(data))
+{
+    npy_intp i;
+
+    for (i = 0; i < N; ++i, src += src_stride) {
+        PyObject *src_ref = NULL;
+
+        NPY_COPY_PYOBJECT_PTR(&src_ref, src);
+
+        /* Release the reference in src */
+        NPY_DT_DBG_REFTRACE("dec src ref (null dst)", src_ref);
+        Py_XDECREF(src_ref);
+    }
+}
+
+
+/*
+ * Builds a transfer function which releases the references held by
+ * 'src_dtype' data: a nop for reference-free dtypes, a plain decref
+ * for object dtypes, and recursive wrapping for subarrays/fields.
+ * Returns NPY_SUCCEED or NPY_FAIL.
+ */
+NPY_NO_EXPORT int
+get_decsrcref_transfer_function(int aligned,
+                            npy_intp src_stride,
+                            PyArray_Descr *src_dtype,
+                            PyArray_StridedTransferFn **out_stransfer,
+                            void **out_transferdata,
+                            int *out_needs_api)
+{
+    /* If there are no references, it's a nop */
+    if (!PyDataType_REFCHK(src_dtype)) {
+        *out_stransfer = &_dec_src_ref_nop;
+        *out_transferdata = NULL;
+
+        return NPY_SUCCEED;
+    }
+    /* If it's a single reference, it's one decref */
+    else if (src_dtype->type_num == NPY_OBJECT) {
+        if (out_needs_api) {
+            *out_needs_api = 1;
+        }
+
+        *out_stransfer = &_strided_to_null_dec_src_ref_reference;
+        *out_transferdata = NULL;
+
+        return NPY_SUCCEED;
+    }
+    /* If there are subarrays, need to wrap it */
+    else if (src_dtype->subarray != NULL) {
+        PyArray_Dims src_shape = {NULL, -1};
+        npy_intp src_size = 1;
+        PyArray_StridedTransferFn *stransfer;
+        void *data;
+
+        if (out_needs_api) {
+            *out_needs_api = 1;
+        }
+
+        if (!(PyArray_IntpConverter(src_dtype->subarray->shape,
+                                            &src_shape))) {
+            PyErr_SetString(PyExc_ValueError,
+                    "invalid subarray shape");
+            return NPY_FAIL;
+        }
+        src_size = PyArray_MultiplyList(src_shape.ptr, src_shape.len);
+        PyDimMem_FREE(src_shape.ptr);
+
+        /* Get a function for contiguous src of the subarray type */
+        if (get_decsrcref_transfer_function(aligned,
+                                src_dtype->subarray->base->elsize,
+                                src_dtype->subarray->base,
+                                &stransfer, &data,
+                                out_needs_api) != NPY_SUCCEED) {
+            return NPY_FAIL;
+        }
+
+        /* Repeat that function src_size times per outer item */
+        if (wrap_transfer_function_n_to_n(stransfer, data,
+                                src_stride, 0,
+                                src_dtype->subarray->base->elsize, 0,
+                                src_size,
+                                out_stransfer, out_transferdata) != NPY_SUCCEED) {
+            PyArray_FreeStridedTransferData(data);
+            return NPY_FAIL;
+        }
+
+        return NPY_SUCCEED;
+    }
+    /* If there are fields, need to do each field */
+    else {
+        if (out_needs_api) {
+            *out_needs_api = 1;
+        }
+
+        return get_decsrcref_fields_transfer_function(aligned,
+                            src_stride, src_dtype,
+                            out_stransfer,
+                            out_transferdata,
+                            out_needs_api);
+    }
+}
+
+/********************* MAIN DTYPE TRANSFER FUNCTION ***********************/
+
+/*
+ * Produces a strided transfer function (plus auxiliary data) that
+ * copies/converts from src_dtype to dst_dtype, decref-ing the source
+ * when move_references is set.  A NULL dst_dtype yields a src-decref
+ * (or nop) function; a NULL src_dtype yields a dst set-to-zero
+ * function.  Sets *out_needs_api when the function requires the GIL.
+ * Returns NPY_SUCCEED or NPY_FAIL.
+ */
+NPY_NO_EXPORT int
+PyArray_GetDTypeTransferFunction(int aligned,
+                            npy_intp src_stride, npy_intp dst_stride,
+                            PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+                            int move_references,
+                            PyArray_StridedTransferFn **out_stransfer,
+                            void **out_transferdata,
+                            int *out_needs_api)
+{
+    npy_intp src_itemsize, dst_itemsize;
+    int src_type_num, dst_type_num;
+
+#if NPY_DT_DBG_TRACING
+    printf("Calculating dtype transfer from ");
+    PyObject_Print((PyObject *)src_dtype, stdout, 0);
+    printf(" to ");
+    PyObject_Print((PyObject *)dst_dtype, stdout, 0);
+    printf("\n");
+#endif
+
+    /*
+     * If one of the dtypes is NULL, we give back either a src decref
+     * function or a dst setzero function
+     */
+    if (dst_dtype == NULL) {
+        if (move_references) {
+            return get_decsrcref_transfer_function(aligned,
+                                src_dtype->elsize,
+                                src_dtype,
+                                out_stransfer, out_transferdata,
+                                out_needs_api);
+        }
+        else {
+            *out_stransfer = &_dec_src_ref_nop;
+            *out_transferdata = NULL;
+            return NPY_SUCCEED;
+        }
+    }
+    else if (src_dtype == NULL) {
+        return get_setdstzero_transfer_function(aligned,
+                                dst_dtype->elsize,
+                                dst_dtype,
+                                out_stransfer, out_transferdata,
+                                out_needs_api);
+    }
+
+    src_itemsize = src_dtype->elsize;
+    dst_itemsize = dst_dtype->elsize;
+    src_type_num = src_dtype->type_num;
+    dst_type_num = dst_dtype->type_num;
+
+    /* Common special case - number -> number NBO cast */
+    if (PyTypeNum_ISNUMBER(src_type_num) &&
+                    PyTypeNum_ISNUMBER(dst_type_num) &&
+                    PyArray_ISNBO(src_dtype->byteorder) &&
+                    PyArray_ISNBO(dst_dtype->byteorder)) {
+        if (PyArray_EquivTypenums(src_type_num, dst_type_num)) {
+            *out_stransfer = PyArray_GetStridedCopyFn(aligned,
+                                        src_stride, dst_stride,
+                                        src_itemsize);
+            *out_transferdata = NULL;
+            return (*out_stransfer == NULL) ? NPY_FAIL : NPY_SUCCEED;
+        }
+        else {
+            return get_nbo_cast_numeric_transfer_function (aligned,
+                                    src_stride, dst_stride,
+                                    src_type_num, dst_type_num,
+                                    out_stransfer, out_transferdata);
+        }
+    }
+
+    /*
+     * If there are no references and the data types are equivalent,
+     * return a simple copy
+     */
+    if (!PyDataType_REFCHK(src_dtype) && !PyDataType_REFCHK(dst_dtype) &&
+                            PyArray_EquivTypes(src_dtype, dst_dtype)) {
+        /*
+         * We can't pass through the aligned flag because it's not
+         * appropriate. Consider a size-8 string, it will say it's
+         * aligned because strings only need alignment 1, but the
+         * copy function wants to know if it's alignment 8.
+         *
+         * TODO: Change align from a flag to a "best power of 2 alignment"
+         *       which holds the strongest alignment value for all
+         *       the data which will be used.
+         */
+        *out_stransfer = PyArray_GetStridedCopyFn(0,
+                                        src_stride, dst_stride,
+                                        src_dtype->elsize);
+        *out_transferdata = NULL;
+        return NPY_SUCCEED;
+    }
+
+    /* First look at the possibilities of just a copy or swap */
+    if (src_itemsize == dst_itemsize && src_dtype->kind == dst_dtype->kind &&
+                !PyDataType_HASFIELDS(src_dtype) &&
+                !PyDataType_HASFIELDS(dst_dtype) &&
+                src_dtype->subarray == NULL && dst_dtype->subarray == NULL) {
+        /* A custom data type requires that we use its copy/swap */
+        if (src_type_num >= NPY_NTYPES || dst_type_num >= NPY_NTYPES) {
+            /*
+             * If the sizes and kinds are identical, but they're different
+             * custom types, then get a cast function
+             */
+            if (src_type_num != dst_type_num) {
+                return get_cast_transfer_function(aligned,
+                                src_stride, dst_stride,
+                                src_dtype, dst_dtype,
+                                move_references,
+                                out_stransfer, out_transferdata,
+                                out_needs_api);
+            }
+            else {
+                return wrap_copy_swap_function(aligned,
+                                src_stride, dst_stride,
+                                src_dtype,
+                                PyArray_ISNBO(src_dtype->byteorder) !=
+                                PyArray_ISNBO(dst_dtype->byteorder),
+                                out_stransfer, out_transferdata);
+            }
+
+
+        }
+
+        /* The special types, which have no byte-order */
+        switch (src_type_num) {
+            case NPY_VOID:
+            case NPY_STRING:
+            case NPY_UNICODE:
+                *out_stransfer = PyArray_GetStridedCopyFn(0,
+                                    src_stride, dst_stride,
+                                    src_itemsize);
+                *out_transferdata = NULL;
+                return NPY_SUCCEED;
+            case NPY_OBJECT:
+                if (out_needs_api) {
+                    *out_needs_api = 1;
+                }
+                if (move_references) {
+                    *out_stransfer = &_strided_to_strided_move_references;
+                    *out_transferdata = NULL;
+                }
+                else {
+                    *out_stransfer = &_strided_to_strided_copy_references;
+                    *out_transferdata = NULL;
+                }
+                return NPY_SUCCEED;
+        }
+
+        /* This is a straight copy */
+        if (src_itemsize == 1 || PyArray_ISNBO(src_dtype->byteorder) ==
+                                 PyArray_ISNBO(dst_dtype->byteorder)) {
+            *out_stransfer = PyArray_GetStridedCopyFn(aligned,
+                                        src_stride, dst_stride,
+                                        src_itemsize);
+            *out_transferdata = NULL;
+            return (*out_stransfer == NULL) ? NPY_FAIL : NPY_SUCCEED;
+        }
+        /* This is a straight copy + byte swap */
+        else if (!PyTypeNum_ISCOMPLEX(src_type_num)) {
+            *out_stransfer = PyArray_GetStridedCopySwapFn(aligned,
+                                        src_stride, dst_stride,
+                                        src_itemsize);
+            *out_transferdata = NULL;
+            return (*out_stransfer == NULL) ? NPY_FAIL : NPY_SUCCEED;
+        }
+        /* This is a straight copy + element pair byte swap */
+        else {
+            *out_stransfer = PyArray_GetStridedCopySwapPairFn(aligned,
+                                        src_stride, dst_stride,
+                                        src_itemsize);
+            *out_transferdata = NULL;
+            return (*out_stransfer == NULL) ? NPY_FAIL : NPY_SUCCEED;
+        }
+    }
+
+    /* Handle subarrays */
+    if (src_dtype->subarray != NULL || dst_dtype->subarray != NULL) {
+        return get_subarray_transfer_function(aligned,
+                                src_stride, dst_stride,
+                                src_dtype, dst_dtype,
+                                move_references,
+                                out_stransfer, out_transferdata,
+                                out_needs_api);
+    }
+
+    /* Handle fields */
+    if (PyDataType_HASFIELDS(src_dtype) ||
+                    PyDataType_HASFIELDS(dst_dtype)) {
+        return get_fields_transfer_function(aligned,
+                                src_stride, dst_stride,
+                                src_dtype, dst_dtype,
+                                move_references,
+                                out_stransfer, out_transferdata,
+                                out_needs_api);
+    }
+
+    /* Check for different-sized strings, unicodes, or voids */
+    if (src_type_num == dst_type_num) switch (src_type_num) {
+        case NPY_STRING:
+        case NPY_UNICODE:
+        case NPY_VOID:
+            return PyArray_GetStridedZeroPadCopyFn(0,
+                                    src_stride, dst_stride,
+                                    src_dtype->elsize, dst_dtype->elsize,
+                                    out_stransfer, out_transferdata);
+    }
+
+    /* Otherwise a cast is necessary */
+    return get_cast_transfer_function(aligned,
+                    src_stride, dst_stride,
+                    src_dtype, dst_dtype,
+                    move_references,
+                    out_stransfer, out_transferdata,
+                    out_needs_api);
+}
+
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
new file mode 100644
index 000000000..f13ff6d21
--- /dev/null
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -0,0 +1,2402 @@
+/*
+ * This file contains the implementation of the 'einsum' function,
+ * which provides an einstein-summation operation.
+ *
+ * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com)
+ * The Univerity of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#include "structmember.h"
+
+#define _MULTIARRAYMODULE
+#include <numpy/ndarrayobject.h>
+#include <numpy/halffloat.h>
+
+#include <ctype.h>
+
+#ifdef __SSE__
+#define EINSUM_USE_SSE1 1
+#else
+#define EINSUM_USE_SSE1 0
+#endif
+
+/*
+ * TODO: Only SSE for float32 is implemented in the loops,
+ * no SSE2 for float64
+ */
+#ifdef __SSE2__
+#define EINSUM_USE_SSE2 0
+#else
+#define EINSUM_USE_SSE2 0
+#endif
+
+#if EINSUM_USE_SSE1
+#include <xmmintrin.h>
+#endif
+
+#if EINSUM_USE_SSE2
+#include <emmintrin.h>
+#endif
+
+#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0)
+
+/* Position of the broadcast dimension within an einsum operand's labels */
+typedef enum {
+    BROADCAST_LEFT,
+    BROADCAST_RIGHT,
+    BROADCAST_MIDDLE
+} EINSUM_BROADCAST;
+
+/**begin repeat
+ * #name = byte, short, int, long, longlong,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #temp = byte, short, int, long, longlong,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * float, float, double, longdouble,
+ * float, double, longdouble#
+ * #to = ,,,,,
+ * ,,,,,
+ * npy_float_to_half,,,,
+ * ,,#
+ * #from = ,,,,,
+ * ,,,,,
+ * npy_half_to_float,,,,
+ * ,,#
+ * #complex = 0*5,
+ * 0*5,
+ * 0*4,
+ * 1*3#
+ * #float32 = 0*5,
+ * 0*5,
+ * 0,1,0,0,
+ * 0*3#
+ */
+
+/**begin repeat1
+ * #nop = 1, 2, 3, 1000#
+ * #noplabel = one, two, three, any#
+ */
+/*
+ * Generic strided sum-of-products: out += prod(op_0..op_{nop-1}), for
+ * 'count' items.  dataptr/strides hold the nop operands followed by the
+ * output.  Fix: 'int i' in the complex branch was declared after the
+ * 're'/'im' assignments, which is invalid C89 (declaration after
+ * statement); the declarations are now grouped first.
+ */
+static void
+@name@_sum_of_products_@noplabel@(int nop, char **dataptr,
+                                npy_intp *strides, npy_intp count)
+{
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3) && !@complex@
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3) && !@complex@
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+    char *data_out = dataptr[@nop@];
+    npy_intp stride_out = strides[@nop@];
+#endif
+
+    while (count--) {
+#if !@complex@
+#  if @nop@ == 1
+        *(npy_@name@ *)data_out = @to@(@from@(*(npy_@name@ *)data0) +
+                                         @from@(*(npy_@name@ *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif @nop@ == 2
+        *(npy_@name@ *)data_out = @to@(@from@(*(npy_@name@ *)data0) *
+                                         @from@(*(npy_@name@ *)data1) +
+                                         @from@(*(npy_@name@ *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif @nop@ == 3
+        *(npy_@name@ *)data_out = @to@(@from@(*(npy_@name@ *)data0) *
+                                         @from@(*(npy_@name@ *)data1) *
+                                         @from@(*(npy_@name@ *)data2) +
+                                         @from@(*(npy_@name@ *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_@temp@ temp = @from@(*(npy_@name@ *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= @from@(*(npy_@name@ *)dataptr[i]);
+        }
+        /* after the loop i == nop, so dataptr[i] is the output operand */
+        *(npy_@name@ *)dataptr[nop] = @to@(temp +
+                                           @from@(*(npy_@name@ *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if @nop@ == 1
+        ((npy_@temp@ *)data_out)[0] = ((npy_@temp@ *)data0)[0] +
+                                        ((npy_@temp@ *)data_out)[0];
+        ((npy_@temp@ *)data_out)[1] = ((npy_@temp@ *)data0)[1] +
+                                        ((npy_@temp@ *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if @nop@ <= 3
+#define _SUMPROD_NOP @nop@
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        /* C89 FIX: all declarations before the first statement */
+        npy_@temp@ re, im, tmp;
+        int i;
+        re = ((npy_@temp@ *)dataptr[0])[0];
+        im = ((npy_@temp@ *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_@temp@ *)dataptr[i])[0] -
+                  im * ((npy_@temp@ *)dataptr[i])[1];
+            im = re * ((npy_@temp@ *)dataptr[i])[1] +
+                 im * ((npy_@temp@ *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_@temp@ *)dataptr[_SUMPROD_NOP])[0] = re +
+                                ((npy_@temp@ *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_@temp@ *)dataptr[_SUMPROD_NOP])[1] = im +
+                                ((npy_@temp@ *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if @nop@ == 1
+
+static void
+@name@_sum_of_products_contig_one(int nop, char **dataptr,
+ npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+ npy_@name@ *data0 = (npy_@name@ *)dataptr[0];
+ npy_@name@ *data_out = (npy_@name@ *)dataptr[1];
+
+ /* Unroll the loop by 16 */
+ while (count >= 16) {
+ count -= 16;
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+#if !@complex@
+ data_out[@i@] = @to@(@from@(data0[@i@]) +
+ @from@(data_out[@i@]));
+#else /* complex */
+ ((npy_@temp@ *)data_out + 2*@i@)[0] =
+ ((npy_@temp@ *)data0 + 2*@i@)[0] +
+ ((npy_@temp@ *)data_out + 2*@i@)[0];
+ ((npy_@temp@ *)data_out + 2*@i@)[1] =
+ ((npy_@temp@ *)data0 + 2*@i@)[1] +
+ ((npy_@temp@ *)data_out + 2*@i@)[1];
+#endif
+ data0 += 16;
+ data_out += 16;
+/**end repeat2**/
+ }
+
+ /* Finish off the loop */
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ if (count-- == 0) {
+ return;
+ }
+#if !@complex@
+ data_out[@i@] = @to@(@from@(data0[@i@]) +
+ @from@(data_out[@i@]));
+#else
+ ((npy_@temp@ *)data_out + 2*@i@)[0] =
+ ((npy_@temp@ *)data0 + 2*@i@)[0] +
+ ((npy_@temp@ *)data_out + 2*@i@)[0];
+ ((npy_@temp@ *)data_out + 2*@i@)[1] =
+ ((npy_@temp@ *)data0 + 2*@i@)[1] +
+ ((npy_@temp@ *)data_out + 2*@i@)[1];
+#endif
+/**end repeat2**/
+}
+
+#elif @nop@ == 2 && !@complex@
+
+/*
+ * Contiguous two-operand accumulation: out[i] += data0[i] * data1[i],
+ * with an SSE1 path for float32 and an unroll-by-16 scalar fallback.
+ */
+static void
+@name@_sum_of_products_contig_two(int nop, char **dataptr,
+                        npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_@name@ *data0 = (npy_@name@ *)dataptr[0];
+    npy_@name@ *data1 = (npy_@name@ *)dataptr[1];
+    npy_@name@ *data_out = (npy_@name@ *)dataptr[2];
+
+#if EINSUM_USE_SSE1 && @float32@
+    __m128 a, b;
+#endif
+
+#if EINSUM_USE_SSE1 && @float32@
+    /* Use aligned instructions if possible */
+    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
+        EINSUM_IS_SSE_ALIGNED(data_out)) {
+        /* Unroll the loop by 16 */
+        while (count >= 16) {
+            count -= 16;
+
+/**begin repeat2
+ * #i = 0, 4, 8, 12#
+ */
+            a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
+            b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
+            _mm_store_ps(data_out+@i@, b);
+/**end repeat2**/
+            data0 += 16;
+            data1 += 16;
+            data_out += 16;
+        }
+    }
+#endif
+
+    /* Unroll the loop by 16 */
+    while (count >= 16) {
+        count -= 16;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4, 8, 12#
+ */
+        a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
+        b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
+        _mm_storeu_ps(data_out+@i@, b);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+        data_out[@i@] = @to@(@from@(data0[@i@]) *
+                             @from@(data1[@i@]) +
+                             @from@(data_out[@i@]));
+/**end repeat2**/
+#endif
+        data0 += 16;
+        data1 += 16;
+        data_out += 16;
+    }
+
+    /* Finish off the loop */
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+    if (count-- == 0) {
+        return;
+    }
+    data_out[@i@] = @to@(@from@(data0[@i@]) *
+                         @from@(data1[@i@]) +
+                         @from@(data_out[@i@]));
+/**end repeat2**/
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand specialization where operand 0 is a scalar (stride 0):
+ * out[i] += value0 * data1[i], contiguous data1 and out.
+ */
+static void
+@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                        npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_@temp@ value0 = @from@(*(npy_@name@ *)dataptr[0]);
+    npy_@name@ *data1 = (npy_@name@ *)dataptr[1];
+    npy_@name@ *data_out = (npy_@name@ *)dataptr[2];
+
+#if EINSUM_USE_SSE1 && @float32@
+    __m128 a, b, value0_sse;
+
+    value0_sse = _mm_set_ps1(value0);
+#endif
+
+#if EINSUM_USE_SSE1 && @float32@
+    /* Use aligned instructions if possible */
+    if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+        /* Unroll the loop by 16 */
+        while (count >= 16) {
+            count -= 16;
+
+/**begin repeat2
+ * #i = 0, 4, 8, 12#
+ */
+            a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@));
+            b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
+            _mm_store_ps(data_out+@i@, b);
+/**end repeat2**/
+            data1 += 16;
+            data_out += 16;
+        }
+    }
+#endif
+
+    /* Unroll the loop by 16 */
+    while (count >= 16) {
+        count -= 16;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4, 8, 12#
+ */
+        a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@));
+        b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
+        _mm_storeu_ps(data_out+@i@, b);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+        data_out[@i@] = @to@(value0 *
+                             @from@(data1[@i@]) +
+                             @from@(data_out[@i@]));
+/**end repeat2**/
+#endif
+        data1 += 16;
+        data_out += 16;
+    }
+
+    /* Finish off the loop */
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+    if (count-- == 0) {
+        return;
+    }
+    data_out[@i@] = @to@(value0 *
+                         @from@(data1[@i@]) +
+                         @from@(data_out[@i@]));
+/**end repeat2**/
+}
+
+/*
+ * Two-operand specialization where operand 1 is a scalar (stride 0):
+ * out[i] += data0[i] * value1, contiguous data0 and out.
+ */
+static void
+@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                        npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_@name@ *data0 = (npy_@name@ *)dataptr[0];
+    npy_@temp@ value1 = @from@(*(npy_@name@ *)dataptr[1]);
+    npy_@name@ *data_out = (npy_@name@ *)dataptr[2];
+
+#if EINSUM_USE_SSE1 && @float32@
+    __m128 a, b, value1_sse;
+
+    value1_sse = _mm_set_ps1(value1);
+#endif
+
+#if EINSUM_USE_SSE1 && @float32@
+    /* Use aligned instructions if possible */
+    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+        /* Unroll the loop by 16 */
+        while (count >= 16) {
+            count -= 16;
+
+/**begin repeat2
+ * #i = 0, 4, 8, 12#
+ */
+            a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse);
+            b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
+            _mm_store_ps(data_out+@i@, b);
+/**end repeat2**/
+            data0 += 16;
+            data_out += 16;
+        }
+    }
+#endif
+
+    /* Unroll the loop by 16 */
+    while (count >= 16) {
+        count -= 16;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4, 8, 12#
+ */
+        a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse);
+        b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
+        _mm_storeu_ps(data_out+@i@, b);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+        data_out[@i@] = @to@(@from@(data0[@i@])*
+                             value1 +
+                             @from@(data_out[@i@]));
+/**end repeat2**/
+#endif
+        data0 += 16;
+        data_out += 16;
+    }
+
+    /* Finish off the loop */
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+    if (count-- == 0) {
+        return;
+    }
+    data_out[@i@] = @to@(@from@(data0[@i@])*
+                         value1 +
+                         @from@(data_out[@i@]));
+/**end repeat2**/
+}
+
+/*
+ * Two-operand dot-product specialization: accumulates
+ * sum(data0[i] * data1[i]) and adds it once into *out (stride-0 out).
+ */
+static void
+@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                        npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_@name@ *data0 = (npy_@name@ *)dataptr[0];
+    npy_@name@ *data1 = (npy_@name@ *)dataptr[1];
+    npy_@temp@ accum = 0;
+
+#if EINSUM_USE_SSE1 && @float32@
+    __m128 a, accum_sse = _mm_setzero_ps();
+#endif
+
+#if EINSUM_USE_SSE1 && @float32@
+    /* Use aligned instructions if possible */
+    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
+        /* Unroll the loop by 16 */
+        while (count >= 16) {
+            count -= 16;
+
+/**begin repeat2
+ * #i = 0, 4, 8, 12#
+ */
+            /*
+             * NOTE: This accumulation changes the order, so will likely
+             *       produce slightly different results.
+             */
+            a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
+            accum_sse = _mm_add_ps(accum_sse, a);
+/**end repeat2**/
+            data0 += 16;
+            data1 += 16;
+        }
+    }
+#endif
+
+    /* Unroll the loop by 16 */
+    while (count >= 16) {
+        count -= 16;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4, 8, 12#
+ */
+        /*
+         * NOTE: This accumulation changes the order, so will likely
+         *       produce slightly different results.
+         */
+        a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
+        accum_sse = _mm_add_ps(accum_sse, a);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+        accum += @from@(data0[@i@]) * @from@(data1[@i@]);
+/**end repeat2**/
+#endif
+        data0 += 16;
+        data1 += 16;
+    }
+
+#if EINSUM_USE_SSE1 && @float32@
+    /* Add the four SSE values and put in accum (horizontal reduction) */
+    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+    accum_sse = _mm_add_ps(a, accum_sse);
+    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+    accum_sse = _mm_add_ps(a, accum_sse);
+    _mm_store_ss(&accum, accum_sse);
+#endif
+    /* Finish off the loop */
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+    if (count-- == 0) {
+        *(npy_@name@ *)dataptr[2] += @to@(accum);
+        return;
+    }
+    accum += @from@(data0[@i@]) * @from@(data1[@i@]);
+/**end repeat2**/
+
+    *(npy_@name@ *)dataptr[2] += @to@(accum);
+}
+
+/*
+ * Scalar * sum reduction: accumulates sum(data1[i]) and adds
+ * value0 * sum into *out once (operand 0 and out both stride 0).
+ */
+static void
+@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                        npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_@temp@ value0 = @from@(*(npy_@name@ *)dataptr[0]);
+    npy_@name@ *data1 = (npy_@name@ *)dataptr[1];
+    npy_@temp@ accum = 0;
+
+#if EINSUM_USE_SSE1 && @float32@
+    __m128 a, accum_sse = _mm_setzero_ps();
+#endif
+
+#if EINSUM_USE_SSE1 && @float32@
+    /* Use aligned instructions if possible */
+    if (EINSUM_IS_SSE_ALIGNED(data1)) {
+        /* Unroll the loop by 16 */
+        while (count >= 16) {
+            count -= 16;
+
+/**begin repeat2
+ * #i = 0, 4, 8, 12#
+ */
+            /*
+             * NOTE: This accumulation changes the order, so will likely
+             *       produce slightly different results.
+             */
+            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@));
+/**end repeat2**/
+            data1 += 16;
+        }
+    }
+#endif
+
+    /* Unroll the loop by 16 */
+    while (count >= 16) {
+        count -= 16;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4, 8, 12#
+ */
+        /*
+         * NOTE: This accumulation changes the order, so will likely
+         *       produce slightly different results.
+         */
+        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@));
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+        accum += @from@(data1[@i@]);
+/**end repeat2**/
+#endif
+        data1 += 16;
+    }
+
+#if EINSUM_USE_SSE1 && @float32@
+    /* Add the four SSE values and put in accum (horizontal reduction) */
+    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+    accum_sse = _mm_add_ps(a, accum_sse);
+    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+    accum_sse = _mm_add_ps(a, accum_sse);
+    _mm_store_ss(&accum, accum_sse);
+#endif
+    /* Finish off the loop */
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+    if (count-- == 0) {
+        *(npy_@name@ *)dataptr[2] += @to@(value0 * accum);
+        return;
+    }
+    accum += @from@(data1[@i@]);
+/**end repeat2**/
+
+    *(npy_@name@ *)dataptr[2] += @to@(value0 * accum);
+}
+
+/*
+ * Sum * scalar reduction: accumulates sum(data0[i]) and adds
+ * sum * value1 into *out once (operand 1 and out both stride 0).
+ */
+static void
+@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                        npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_@name@ *data0 = (npy_@name@ *)dataptr[0];
+    npy_@temp@ value1 = @from@(*(npy_@name@ *)dataptr[1]);
+    npy_@temp@ accum = 0;
+
+#if EINSUM_USE_SSE1 && @float32@
+    __m128 a, accum_sse = _mm_setzero_ps();
+#endif
+
+#if EINSUM_USE_SSE1 && @float32@
+    /* Use aligned instructions if possible */
+    if (EINSUM_IS_SSE_ALIGNED(data0)) {
+        /* Unroll the loop by 16 */
+        while (count >= 16) {
+            count -= 16;
+
+/**begin repeat2
+ * #i = 0, 4, 8, 12#
+ */
+            /*
+             * NOTE: This accumulation changes the order, so will likely
+             *       produce slightly different results.
+             */
+            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
+/**end repeat2**/
+            data0 += 16;
+        }
+    }
+#endif
+
+    /* Unroll the loop by 16 */
+    while (count >= 16) {
+        count -= 16;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4, 8, 12#
+ */
+        /*
+         * NOTE: This accumulation changes the order, so will likely
+         *       produce slightly different results.
+         */
+        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+        accum += @from@(data0[@i@]);
+/**end repeat2**/
+#endif
+        data0 += 16;
+    }
+
+#if EINSUM_USE_SSE1 && @float32@
+    /* Add the four SSE values and put in accum (horizontal reduction) */
+    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+    accum_sse = _mm_add_ps(a, accum_sse);
+    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+    accum_sse = _mm_add_ps(a, accum_sse);
+    _mm_store_ss(&accum, accum_sse);
+#endif
+    /* Finish off the loop */
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+    if (count-- == 0) {
+        *(npy_@name@ *)dataptr[2] += @to@(accum * value1);
+        return;
+    }
+    accum += @from@(data0[@i@]);
+/**end repeat2**/
+
+    *(npy_@name@ *)dataptr[2] += @to@(accum * value1);
+}
+
+#elif @nop@ == 3 && !@complex@
+
+/*
+ * Three-operand all-contiguous inner loop:
+ *     data_out[i] += data0[i] * data1[i] * data2[i]  for i in [0, count)
+ */
+static void
+@name@_sum_of_products_contig_three(int nop, char **dataptr,
+                            npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_@name@ *data0 = (npy_@name@ *)dataptr[0];
+    npy_@name@ *data1 = (npy_@name@ *)dataptr[1];
+    npy_@name@ *data2 = (npy_@name@ *)dataptr[2];
+    npy_@name@ *data_out = (npy_@name@ *)dataptr[3];
+
+    /* Unroll the loop by 16 */
+    while (count >= 16) {
+        count -= 16;
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+        data_out[@i@] = @to@(@from@(data0[@i@]) *
+                             @from@(data1[@i@]) *
+                             @from@(data2[@i@]) +
+                             @from@(data_out[@i@]));
+/**end repeat2**/
+        data0 += 16;
+        data1 += 16;
+        /*
+         * BUG FIX: data2 was not being advanced here, so every unrolled
+         * pass after the first reused the first 16 elements of operand 2.
+         */
+        data2 += 16;
+        data_out += 16;
+    }
+
+    /* Finish off the loop: at most 15 elements remain */
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+    if (count-- == 0) {
+        return;
+    }
+    data_out[@i@] = @to@(@from@(data0[@i@]) *
+                         @from@(data1[@i@]) *
+                         @from@(data2[@i@]) +
+                         @from@(data_out[@i@]));
+/**end repeat2**/
+}
+
+#else
+
+/*
+ * Generic contiguous inner loop (@noplabel@ operands): for each element,
+ * multiply one value from every operand together, add the product into
+ * the output, then advance every pointer by one element.
+ */
+static void
+@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
+                            npy_intp *NPY_UNUSED(strides), npy_intp count)
+{
+    while (count--) {
+#if !@complex@
+        npy_@temp@ temp = @from@(*(npy_@name@ *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= @from@(*(npy_@name@ *)dataptr[i]);
+        }
+        /* here i == nop, so dataptr[i] is the output operand */
+        *(npy_@name@ *)dataptr[nop] = @to@(temp +
+                                        @from@(*(npy_@name@ *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_@name@);
+        }
+#else /* complex */
+# if @nop@ <= 3
+# define _SUMPROD_NOP @nop@
+# else
+# define _SUMPROD_NOP nop
+# endif
+        /*
+         * C89 FIX: 'i' must be declared before any statements in this
+         * block (the old code declared it after the 're'/'im' loads,
+         * which strict C89 compilers such as MSVC reject).
+         */
+        npy_@temp@ re, im, tmp;
+        int i;
+        re = ((npy_@temp@ *)dataptr[0])[0];
+        im = ((npy_@temp@ *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_@temp@ *)dataptr[i])[0] -
+                  im * ((npy_@temp@ *)dataptr[i])[1];
+            im = re * ((npy_@temp@ *)dataptr[i])[1] +
+                 im * ((npy_@temp@ *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_@temp@ *)dataptr[_SUMPROD_NOP])[0] = re +
+                ((npy_@temp@ *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_@temp@ *)dataptr[_SUMPROD_NOP])[1] = im +
+                ((npy_@temp@ *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_@name@);
+        }
+# undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif
+
+static void
+@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
+ npy_intp *strides, npy_intp count)
+{
+#if @complex@
+ npy_@temp@ accum_re = 0, accum_im = 0;
+#else
+ npy_@temp@ accum = 0;
+#endif
+
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3) && !@complex@
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3) && !@complex@
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+
+ while (count--) {
+#if !@complex@
+# if @nop@ == 1
+ accum += @from@(*(npy_@name@ *)data0);
+ data0 += stride0;
+# elif @nop@ == 2
+ accum += @from@(*(npy_@name@ *)data0) *
+ @from@(*(npy_@name@ *)data1);
+ data0 += stride0;
+ data1 += stride1;
+# elif @nop@ == 3
+ accum += @from@(*(npy_@name@ *)data0) *
+ @from@(*(npy_@name@ *)data1) *
+ @from@(*(npy_@name@ *)data2);
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+# else
+ npy_@temp@ temp = @from@(*(npy_@name@ *)dataptr[0]);
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp *= @from@(*(npy_@name@ *)dataptr[i]);
+ }
+ accum += temp;
+ for (i = 0; i < nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+# endif
+#else /* complex */
+# if @nop@ == 1
+ accum_re += ((npy_@temp@ *)data0)[0];
+ accum_im += ((npy_@temp@ *)data0)[1];
+ data0 += stride0;
+# else
+# if @nop@ <= 3
+#define _SUMPROD_NOP @nop@
+# else
+#define _SUMPROD_NOP nop
+# endif
+ npy_@temp@ re, im, tmp;
+ re = ((npy_@temp@ *)dataptr[0])[0];
+ im = ((npy_@temp@ *)dataptr[0])[1];
+ int i;
+ for (i = 1; i < _SUMPROD_NOP; ++i) {
+ tmp = re * ((npy_@temp@ *)dataptr[i])[0] -
+ im * ((npy_@temp@ *)dataptr[i])[1];
+ im = re * ((npy_@temp@ *)dataptr[i])[1] +
+ im * ((npy_@temp@ *)dataptr[i])[0];
+ re = tmp;
+ }
+ accum_re += re;
+ accum_im += im;
+ for (i = 0; i < _SUMPROD_NOP; ++i) {
+ dataptr[i] += strides[i];
+ }
+#undef _SUMPROD_NOP
+# endif
+#endif
+ }
+
+#if @complex@
+# if @nop@ <= 3
+ ((npy_@temp@ *)dataptr[@nop@])[0] += accum_re;
+ ((npy_@temp@ *)dataptr[@nop@])[1] += accum_im;
+# else
+ ((npy_@temp@ *)dataptr[nop])[0] += accum_re;
+ ((npy_@temp@ *)dataptr[nop])[1] += accum_im;
+# endif
+#else
+# if @nop@ <= 3
+ *((npy_@name@ *)dataptr[@nop@]) = @to@(accum +
+ @from@(*((npy_@name@ *)dataptr[@nop@])));
+# else
+ *((npy_@name@ *)dataptr[nop]) = @to@(accum +
+ @from@(*((npy_@name@ *)dataptr[nop])));
+# endif
+#endif
+
+}
+
+/**end repeat1**/
+
+/**end repeat**/
+
+
+/* Do OR of ANDs for the boolean type */
+
+/**begin repeat
+ * #nop = 1, 2, 3, 1000#
+ * #noplabel = one, two, three, any#
+ */
+
+/*
+ * Boolean strided inner loop (@noplabel@ operands):
+ *     out = (op0 && op1 && ...) || out, element by element.
+ */
+static void
+bool_sum_of_products_@noplabel@(int nop, char **dataptr,
+                                npy_intp *strides, npy_intp count)
+{
+#if (@nop@ <= 3)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3)
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3)
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (@nop@ <= 3)
+    char *data_out = dataptr[@nop@];
+    npy_intp stride_out = strides[@nop@];
+#endif
+
+    while (count--) {
+#if @nop@ == 1
+        *(npy_bool *)data_out = *(npy_bool *)data0 ||
+                                  *(npy_bool *)data_out;
+        data0 += stride0;
+        data_out += stride_out;
+#elif @nop@ == 2
+        *(npy_bool *)data_out = (*(npy_bool *)data0 &&
+                                 *(npy_bool *)data1) ||
+                                   *(npy_bool *)data_out;
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#elif @nop@ == 3
+        *(npy_bool *)data_out = (*(npy_bool *)data0 &&
+                                 *(npy_bool *)data1 &&
+                                 *(npy_bool *)data2) ||
+                                   *(npy_bool *)data_out;
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#else
+        npy_bool temp = *(npy_bool *)dataptr[0];
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp = temp && *(npy_bool *)dataptr[i];
+        }
+        /* here i == nop, so dataptr[i] is the output operand */
+        *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#endif
+    }
+}
+
+/*
+ * Boolean all-contiguous inner loop (@noplabel@ operands):
+ *     out[i] = (op0[i] && op1[i] && ...) || out[i].
+ * For the fixed-size cases (nop <= 3) the loop is unrolled by 16,
+ * with a finish-off section handling the last count % 16 elements.
+ */
+static void
+bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
+                                npy_intp *strides, npy_intp count)
+{
+#if (@nop@ <= 3)
+    char *data0 = dataptr[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3)
+    char *data1 = dataptr[1];
+#endif
+#if (@nop@ == 3)
+    char *data2 = dataptr[2];
+#endif
+#if (@nop@ <= 3)
+    char *data_out = dataptr[@nop@];
+#endif
+
+/* Unroll the loop by 16 for fixed-size nop */
+#if (@nop@ <= 3)
+    while (count >= 16) {
+        count -= 16;
+#else
+    while (count--) {
+#endif
+
+# if @nop@ == 1
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+        *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) ||
+                                        (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+        data0 += 16*sizeof(npy_bool);
+        data_out += 16*sizeof(npy_bool);
+# elif @nop@ == 2
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+        *((npy_bool *)data_out + @i@) =
+                        ((*((npy_bool *)data0 + @i@)) &&
+                         (*((npy_bool *)data1 + @i@))) ||
+                            (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+        data0 += 16*sizeof(npy_bool);
+        data1 += 16*sizeof(npy_bool);
+        data_out += 16*sizeof(npy_bool);
+# elif @nop@ == 3
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+        *((npy_bool *)data_out + @i@) =
+                       ((*((npy_bool *)data0 + @i@)) &&
+                        (*((npy_bool *)data1 + @i@)) &&
+                        (*((npy_bool *)data2 + @i@))) ||
+                            (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+        data0 += 16*sizeof(npy_bool);
+        data1 += 16*sizeof(npy_bool);
+        data2 += 16*sizeof(npy_bool);
+        data_out += 16*sizeof(npy_bool);
+# else
+        npy_bool temp = *(npy_bool *)dataptr[0];
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp = temp && *(npy_bool *)dataptr[i];
+        }
+        /* here i == nop, so dataptr[i] is the output operand */
+        *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_bool);
+        }
+# endif
+    }
+
+    /* If the loop was unrolled, we need to finish it off */
+#if (@nop@ <= 3)
+# if @nop@ == 1
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+    if (count-- == 0) {
+        return;
+    }
+    *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) ||
+                                    (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+    /*
+     * NOTE: count < 16 on entry to this section, so one of the 16
+     * 'return's above always fires; these increments are unreachable.
+     */
+    data0 += 16*sizeof(npy_bool);
+    data_out += 16*sizeof(npy_bool);
+# elif @nop@ == 2
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+    if (count-- == 0) {
+        return;
+    }
+    *((npy_bool *)data_out + @i@) =
+                    ((*((npy_bool *)data0 + @i@)) &&
+                     (*((npy_bool *)data1 + @i@))) ||
+                        (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+    /* unreachable, as above */
+    data0 += 16*sizeof(npy_bool);
+    data1 += 16*sizeof(npy_bool);
+    data_out += 16*sizeof(npy_bool);
+# elif @nop@ == 3
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+    if (count-- == 0) {
+        return;
+    }
+    *((npy_bool *)data_out + @i@) =
+                   ((*((npy_bool *)data0 + @i@)) &&
+                    (*((npy_bool *)data1 + @i@)) &&
+                    (*((npy_bool *)data2 + @i@))) ||
+                        (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+    /* unreachable, as above */
+    data0 += 16*sizeof(npy_bool);
+    data1 += 16*sizeof(npy_bool);
+    data2 += 16*sizeof(npy_bool);
+    data_out += 16*sizeof(npy_bool);
+# endif
+#endif
+}
+
+/*
+ * Boolean inner loop with output stride 0 (@noplabel@ operands):
+ * ORs together the ANDs of the operands into a local accumulator,
+ * then ORs the accumulator into the single output element once.
+ */
+static void
+bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
+                                npy_intp *strides, npy_intp count)
+{
+    npy_bool accum = 0;
+
+#if (@nop@ <= 3)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3)
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3)
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    while (count--) {
+#if @nop@ == 1
+        accum = *(npy_bool *)data0 || accum;
+        data0 += stride0;
+#elif @nop@ == 2
+        accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum;
+        data0 += stride0;
+        data1 += stride1;
+#elif @nop@ == 3
+        accum = (*(npy_bool *)data0 &&
+                 *(npy_bool *)data1 &&
+                 *(npy_bool *)data2) || accum;
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#else
+        npy_bool temp = *(npy_bool *)dataptr[0];
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp = temp && *(npy_bool *)dataptr[i];
+        }
+        accum = temp || accum;
+        /*
+         * CONSISTENCY FIX: advance only the input pointers (i < nop);
+         * the output stride is 0 for this specialization, and the
+         * non-bool outstride0 loop uses the same bound.  The old
+         * 'i <= nop' also stepped the output pointer by its (zero)
+         * stride, which was a harmless but misleading no-op.
+         */
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#endif
+    }
+
+# if @nop@ <= 3
+    *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]);
+# else
+    *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]);
+# endif
+}
+
+/**end repeat**/
+
+/* Signature shared by all the specialized sum-of-products inner loops */
+typedef void (*sum_of_products_fn)(int, char **, npy_intp *, npy_intp);
+
+/*
+ * Selects the best inner-loop specialization for the given number of
+ * operands, data type, and fixed strides reported by the iterator.
+ * Checks, in order: two-operand stride-pattern specializations, loops
+ * for an output with stride 0, all-contiguous loops, and finally the
+ * fully generic strided loop.  Returns NULL if the type number has no
+ * sum-of-products implementation (e.g. object or string dtypes).
+ */
+static sum_of_products_fn
+get_sum_of_products_function(int nop, int type_num,
+                        npy_intp itemsize, npy_intp *fixed_strides)
+{
+    int iop;
+
+    /* nop of 2 has more specializations */
+    if (nop == 2) {
+        if (fixed_strides[0] == itemsize) {
+            if (fixed_strides[1] == itemsize) {
+                if (fixed_strides[2] == itemsize) {
+                    /* contig, contig, contig */
+                    switch (type_num) {
+/**begin repeat
+ * #name = byte, short, int, long, longlong,
+ *         ubyte, ushort, uint, ulong, ulonglong,
+ *         half, float, double, longdouble#
+ * #NAME = BYTE, SHORT, INT, LONG, LONGLONG,
+ *         UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         HALF, FLOAT, DOUBLE, LONGDOUBLE#
+ */
+                        case NPY_@NAME@:
+                            return &@name@_sum_of_products_contig_two;
+/**end repeat**/
+                    }
+                }
+                else if (fixed_strides[2] == 0) {
+                    /* contig, contig, stride0 */
+                    switch (type_num) {
+/**begin repeat
+ * #name = byte, short, int, long, longlong,
+ *         ubyte, ushort, uint, ulong, ulonglong,
+ *         half, float, double, longdouble#
+ * #NAME = BYTE, SHORT, INT, LONG, LONGLONG,
+ *         UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         HALF, FLOAT, DOUBLE, LONGDOUBLE#
+ */
+                        case NPY_@NAME@:
+                            return &@name@_sum_of_products_contig_contig_outstride0_two;
+/**end repeat**/
+                    }
+                }
+            }
+            else if (fixed_strides[1] == 0) {
+                if (fixed_strides[2] == itemsize) {
+                    /* contig, stride0, contig */
+                    switch (type_num) {
+/**begin repeat
+ * #name = byte, short, int, long, longlong,
+ *         ubyte, ushort, uint, ulong, ulonglong,
+ *         half, float, double, longdouble#
+ * #NAME = BYTE, SHORT, INT, LONG, LONGLONG,
+ *         UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         HALF, FLOAT, DOUBLE, LONGDOUBLE#
+ */
+                        case NPY_@NAME@:
+                            return &@name@_sum_of_products_contig_stride0_outcontig_two;
+/**end repeat**/
+                    }
+                }
+                else if (fixed_strides[2] == 0) {
+                    /* contig, stride0, stride0 */
+                    switch (type_num) {
+/**begin repeat
+ * #name = byte, short, int, long, longlong,
+ *         ubyte, ushort, uint, ulong, ulonglong,
+ *         half, float, double, longdouble#
+ * #NAME = BYTE, SHORT, INT, LONG, LONGLONG,
+ *         UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         HALF, FLOAT, DOUBLE, LONGDOUBLE#
+ */
+                        case NPY_@NAME@:
+                            return &@name@_sum_of_products_contig_stride0_outstride0_two;
+/**end repeat**/
+                    }
+                }
+            }
+        }
+        else if (fixed_strides[0] == 0) {
+            if (fixed_strides[1] == itemsize) {
+                if (fixed_strides[2] == itemsize) {
+                    /* stride0, contig, contig */
+                    switch (type_num) {
+/**begin repeat
+ * #name = byte, short, int, long, longlong,
+ *         ubyte, ushort, uint, ulong, ulonglong,
+ *         half, float, double, longdouble#
+ * #NAME = BYTE, SHORT, INT, LONG, LONGLONG,
+ *         UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         HALF, FLOAT, DOUBLE, LONGDOUBLE#
+ */
+                        case NPY_@NAME@:
+                            return &@name@_sum_of_products_stride0_contig_outcontig_two;
+/**end repeat**/
+                    }
+                }
+                else if (fixed_strides[2] == 0) {
+                    /* stride0, contig, stride0 */
+                    switch (type_num) {
+/**begin repeat
+ * #name = byte, short, int, long, longlong,
+ *         ubyte, ushort, uint, ulong, ulonglong,
+ *         half, float, double, longdouble#
+ * #NAME = BYTE, SHORT, INT, LONG, LONGLONG,
+ *         UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         HALF, FLOAT, DOUBLE, LONGDOUBLE#
+ */
+                        case NPY_@NAME@:
+                            return &@name@_sum_of_products_stride0_contig_outstride0_two;
+/**end repeat**/
+                    }
+                }
+            }
+        }
+    }
+
+    /* Inner loop with an output stride of 0 */
+    if (fixed_strides[nop] == 0) {
+        switch (type_num) {
+/**begin repeat
+ * #name = bool,
+ *         byte, short, int, long, longlong,
+ *         ubyte, ushort, uint, ulong, ulonglong,
+ *         half, float, double, longdouble,
+ *         cfloat, cdouble, clongdouble#
+ * #NAME = BOOL,
+ *         BYTE, SHORT, INT, LONG, LONGLONG,
+ *         UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ *         CFLOAT, CDOUBLE, CLONGDOUBLE#
+ */
+        case NPY_@NAME@:
+            switch (nop) {
+/**begin repeat1
+ * #nop = 1, 2, 3, 1000#
+ * #noplabel = one, two, three, any#
+ */
+#if @nop@ <= 3
+                case @nop@:
+#else
+                default:
+#endif
+                    return &@name@_sum_of_products_outstride0_@noplabel@;
+/**end repeat1**/
+            }
+/**end repeat**/
+        }
+    }
+
+    /* Check for all contiguous */
+    for (iop = 0; iop < nop; ++iop) {
+        if (fixed_strides[iop] != itemsize) {
+            break;
+        }
+    }
+
+    /* Contiguous loop */
+    if (iop == nop) {
+        switch (type_num) {
+/**begin repeat
+ * #name = bool,
+ *         byte, short, int, long, longlong,
+ *         ubyte, ushort, uint, ulong, ulonglong,
+ *         half, float, double, longdouble,
+ *         cfloat, cdouble, clongdouble#
+ * #NAME = BOOL,
+ *         BYTE, SHORT, INT, LONG, LONGLONG,
+ *         UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ *         CFLOAT, CDOUBLE, CLONGDOUBLE#
+ */
+        case NPY_@NAME@:
+            switch (nop) {
+/**begin repeat1
+ * #nop = 1, 2, 3, 1000#
+ * #noplabel = one, two, three, any#
+ */
+#if @nop@ <= 3
+                case @nop@:
+#else
+                default:
+#endif
+                    return &@name@_sum_of_products_contig_@noplabel@;
+/**end repeat1**/
+            }
+/**end repeat**/
+        }
+    }
+
+    /* Regular inner loop */
+    switch (type_num) {
+/**begin repeat
+ * #name = bool,
+ *         byte, short, int, long, longlong,
+ *         ubyte, ushort, uint, ulong, ulonglong,
+ *         half, float, double, longdouble,
+ *         cfloat, cdouble, clongdouble#
+ * #NAME = BOOL,
+ *         BYTE, SHORT, INT, LONG, LONGLONG,
+ *         UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ *         CFLOAT, CDOUBLE, CLONGDOUBLE#
+ */
+        case NPY_@NAME@:
+            switch (nop) {
+/**begin repeat1
+ * #nop = 1, 2, 3, 1000#
+ * #noplabel = one, two, three, any#
+ */
+#if @nop@ <= 3
+                case @nop@:
+#else
+                default:
+#endif
+                    return &@name@_sum_of_products_@noplabel@;
+/**end repeat1**/
+            }
+/**end repeat**/
+    }
+
+    /* No specialized loop exists for this type */
+    return NULL;
+}
+
+/*
+ * Parses the subscripts for one operand into an output
+ * of 'ndim' labels.
+ *
+ * Writes one label per axis into out_labels: 0 for an unlabeled
+ * (ellipsis) broadcast axis, a negative value for an axis repeating an
+ * earlier label (the relative offset to the axis to merge with), and
+ * the subscript character otherwise.  Also updates the shared per-label
+ * count/min/max/num bookkeeping and classifies where the broadcast
+ * dimensions go relative to the labels.
+ *
+ * Returns 1 on success, 0 (with a Python exception set) on failure.
+ */
+static int
+parse_operand_subscripts(char *subscripts, int length,
+                        int ndim,
+                        int iop, char *out_labels,
+                        char *out_label_counts,
+                        int *out_min_label,
+                        int *out_max_label,
+                        int *out_num_labels,
+                        EINSUM_BROADCAST *out_broadcast)
+{
+    int i, idim, ndim_left, label;
+    int left_labels = 0, right_labels = 0;
+
+    /* Process the labels from the end until the ellipsis */
+    idim = ndim-1;
+    for (i = length-1; i >= 0; --i) {
+        label = subscripts[i];
+        /* A label for an axis */
+        if (label > 0 && isalpha(label)) {
+            if (idim >= 0) {
+                out_labels[idim--] = label;
+                /* Calculate the min and max labels */
+                if (label < *out_min_label) {
+                    *out_min_label = label;
+                }
+                if (label > *out_max_label) {
+                    *out_max_label = label;
+                }
+                /* If it's the first time we see this label, count it */
+                if (out_label_counts[label] == 0) {
+                    (*out_num_labels)++;
+                }
+                out_label_counts[label]++;
+                right_labels = 1;
+            }
+            else {
+                PyErr_Format(PyExc_ValueError,
+                        "einstein sum subscripts string contains "
+                        "too many subscripts for operand %d", iop);
+                return 0;
+            }
+        }
+        /* The end of the ellipsis */
+        else if (label == '.') {
+            /* A valid ellipsis */
+            if (i >= 2 && subscripts[i-1] == '.' && subscripts[i-2] == '.') {
+                length = i-2;
+                break;
+            }
+            else {
+                PyErr_SetString(PyExc_ValueError,
+                        "einstein sum subscripts string contains a "
+                        "'.' that is not part of an ellipsis ('...')");
+                return 0;
+
+            }
+        }
+        else {
+            PyErr_Format(PyExc_ValueError,
+                    "invalid subscript '%c' in einstein sum "
+                    "subscripts string, subscripts must "
+                    "be letters", (char)label);
+            return 0;
+        }
+    }
+    /* Reduce ndim to just the dimensions left to fill at the beginning */
+    ndim_left = idim+1;
+    idim = 0;
+
+    /*
+     * If we stopped because of an ellipsis, start again from the beginning.
+     * The length was truncated to end at the ellipsis in this case.
+     */
+    if (i > 0) {
+        for (i = 0; i < length; ++i) {
+            label = subscripts[i];
+            /*
+             * A label for an axis.
+             * BUG FIX: this pass used isalnum(), inconsistent with the
+             * isalpha() check of the first pass and with the
+             * "subscripts must be letters" error message below, so
+             * digits before an ellipsis were silently accepted.
+             */
+            if (label > 0 && isalpha(label)) {
+                if (idim < ndim_left) {
+                    out_labels[idim++] = label;
+                    /* Calculate the min and max labels */
+                    if (label < *out_min_label) {
+                        *out_min_label = label;
+                    }
+                    if (label > *out_max_label) {
+                        *out_max_label = label;
+                    }
+                    /* If it's the first time we see this label, count it */
+                    if (out_label_counts[label] == 0) {
+                        (*out_num_labels)++;
+                    }
+                    out_label_counts[label]++;
+                    left_labels = 1;
+                }
+                else {
+                    PyErr_Format(PyExc_ValueError,
+                            "einstein sum subscripts string contains "
+                            "too many subscripts for operand %d", iop);
+                    return 0;
+                }
+            }
+            else {
+                PyErr_Format(PyExc_ValueError,
+                        "invalid subscript '%c' in einstein sum "
+                        "subscripts string, subscripts must "
+                        "be letters", (char)label);
+                return 0;
+            }
+        }
+    }
+
+    /* Set the remaining labels to 0 */
+    while (idim < ndim_left) {
+        out_labels[idim++] = 0;
+    }
+
+    /*
+     * Find any labels duplicated for this operand, and turn them
+     * into negative offsets to the axis to merge with.
+     */
+    for (idim = 0; idim < ndim-1; ++idim) {
+        char *next;
+        /* If this is a proper label, find any duplicates of it */
+        label = out_labels[idim];
+        if (label > 0) {
+            /* Search for the next matching label */
+            next = (char *)memchr(out_labels+idim+1, label,
+                                    ndim-idim-1);
+            while (next != NULL) {
+                /* The offset from next to out_labels[idim] (negative) */
+                *next = (out_labels+idim)-next;
+                /* Search for the next matching label */
+                next = (char *)memchr(next+1, label,
+                                      out_labels+ndim-1-next);
+            }
+        }
+    }
+
+    if (left_labels && right_labels) {
+        *out_broadcast = BROADCAST_MIDDLE;
+    }
+    else if (!left_labels) {
+        *out_broadcast = BROADCAST_RIGHT;
+    }
+    else {
+        *out_broadcast = BROADCAST_LEFT;
+    }
+
+    return 1;
+}
+
+/*
+ * Parses the subscripts for the output operand into an output
+ * that requires 'ndim_broadcast' unlabeled dimensions, returning
+ * the number of output dimensions. Returns -1 if there is an error.
+ *
+ * Unlike the input operands, output labels must be unique, must have
+ * appeared in an input, and unlabeled slots are exactly the
+ * 'ndim_broadcast' ellipsis dimensions.  *out_broadcast records which
+ * side(s) of the ellipsis carried labels.
+ */
+static int
+parse_output_subscripts(char *subscripts, int length,
+                        int ndim_broadcast,
+                        const char *label_counts,
+                        char *out_labels,
+                        EINSUM_BROADCAST *out_broadcast)
+{
+    int i, nlabels, label, idim, ndim, ndim_left;
+    int left_labels = 0, right_labels = 0;
+
+    /* Count the labels, making sure they're all unique and valid */
+    nlabels = 0;
+    for (i = 0; i < length; ++i) {
+        label = subscripts[i];
+        if (label > 0 && isalpha(label)) {
+            /* Check if it occurs again */
+            if (memchr(subscripts+i+1, label, length-i-1) == NULL) {
+                /* Check that it was used in the inputs */
+                if (label_counts[label] == 0) {
+                    PyErr_Format(PyExc_ValueError,
+                            "einstein sum subscripts string included "
+                            "output subscript '%c' which never appeared "
+                            "in an input", (char)label);
+                    return -1;
+                }
+
+                nlabels++;
+            }
+            else {
+                PyErr_Format(PyExc_ValueError,
+                        "einstein sum subscripts string includes "
+                        "output subscript '%c' multiple times",
+                        (char)label);
+                return -1;
+            }
+        }
+        else if (label != '.') {
+            PyErr_Format(PyExc_ValueError,
+                    "invalid subscript '%c' in einstein sum "
+                    "subscripts string, subscripts must "
+                    "be letters", (char)label);
+            return -1;
+        }
+    }
+
+    /* The number of output dimensions */
+    ndim = ndim_broadcast + nlabels;
+
+    /* Process the labels from the end until the ellipsis */
+    idim = ndim-1;
+    for (i = length-1; i >= 0; --i) {
+        label = subscripts[i];
+        /* A label for an axis */
+        if (label != '.') {
+            if (idim >= 0) {
+                out_labels[idim--] = label;
+            }
+            else {
+                PyErr_Format(PyExc_ValueError,
+                        "einstein sum subscripts string contains "
+                        "too many output subscripts");
+                return -1;
+            }
+            right_labels = 1;
+        }
+        /* The end of the ellipsis */
+        else {
+            /* A valid ellipsis */
+            if (i >= 2 && subscripts[i-1] == '.' && subscripts[i-2] == '.') {
+                length = i-2;
+                break;
+            }
+            else {
+                PyErr_SetString(PyExc_ValueError,
+                        "einstein sum subscripts string contains a "
+                        "'.' that is not part of an ellipsis ('...')");
+                return -1;
+
+            }
+        }
+    }
+    /* Reduce ndim to just the dimensions left to fill at the beginning */
+    ndim_left = idim+1;
+    idim = 0;
+
+    /*
+     * If we stopped because of an ellipsis, start again from the beginning.
+     * The length was truncated to end at the ellipsis in this case.
+     */
+    if (i > 0) {
+        for (i = 0; i < length; ++i) {
+            label = subscripts[i];
+            /* A label for an axis */
+            if (label != '.') {
+                if (idim < ndim_left) {
+                    out_labels[idim++] = label;
+                }
+                else {
+                    PyErr_Format(PyExc_ValueError,
+                            "einstein sum subscripts string contains "
+                            "too many subscripts for the output");
+                    return -1;
+                }
+                left_labels = 1;
+            }
+            else {
+                PyErr_SetString(PyExc_ValueError,
+                        "einstein sum subscripts string contains a "
+                        "'.' that is not part of an ellipsis ('...')");
+                return -1;
+            }
+        }
+    }
+
+    /* Set the remaining output labels to 0 (broadcast dimensions) */
+    while (idim < ndim_left) {
+        out_labels[idim++] = 0;
+    }
+
+    if (left_labels && right_labels) {
+        *out_broadcast = BROADCAST_MIDDLE;
+    }
+    else if (!left_labels) {
+        *out_broadcast = BROADCAST_RIGHT;
+    }
+    else {
+        *out_broadcast = BROADCAST_LEFT;
+    }
+
+    return ndim;
+}
+
+
+/*
+ * When there's just one operand and no reduction, we
+ * can return a view into op. This calculates the view
+ * if possible.
+ *
+ * On success returns 1, with *ret set to the view, or to NULL when a
+ * view was not possible (some input axis is summed away).  Returns 0
+ * with a Python exception set on error.
+ */
+static int
+get_single_op_view(PyArrayObject *op, int iop, char *labels,
+                   int ndim_output, char *output_labels,
+                   PyArrayObject **ret)
+{
+    npy_intp new_strides[NPY_MAXDIMS];
+    npy_intp new_dims[NPY_MAXDIMS];
+    char *out_label;
+    int label, i, idim, ndim, ibroadcast = 0;
+
+    ndim = PyArray_NDIM(op);
+
+    /* Initialize the dimensions and strides to zero */
+    for (idim = 0; idim < ndim_output; ++idim) {
+        new_dims[idim] = 0;
+        new_strides[idim] = 0;
+    }
+
+    /* Match the labels in the operand with the output labels */
+    for (idim = 0; idim < ndim; ++idim) {
+        label = labels[idim];
+        /* If this label says to merge axes, get the actual label */
+        if (label < 0) {
+            label = labels[idim+label];
+        }
+        /* If the label is 0, it's an unlabeled broadcast dimension */
+        if (label == 0) {
+            /* The next output label that's a broadcast dimension */
+            for (; ibroadcast < ndim_output; ++ibroadcast) {
+                if (output_labels[ibroadcast] == 0) {
+                    break;
+                }
+            }
+            if (ibroadcast == ndim_output) {
+                PyErr_SetString(PyExc_ValueError,
+                        "output had too few broadcast dimensions");
+                return 0;
+            }
+            new_dims[ibroadcast] = PyArray_DIM(op, idim);
+            new_strides[ibroadcast] = PyArray_STRIDE(op, idim);
+            ++ibroadcast;
+        }
+        else {
+            /* Find the position for this dimension in the output */
+            out_label = (char *)memchr(output_labels, label,
+                                                    ndim_output);
+            /* If it's not found, reduction -> can't return a view */
+            if (out_label == NULL) {
+                break;
+            }
+            /* Update the dimensions and strides of the output */
+            i = out_label - output_labels;
+            if (new_dims[i] != 0 &&
+                    new_dims[i] != PyArray_DIM(op, idim)) {
+                PyErr_Format(PyExc_ValueError,
+                        "dimensions in operand %d for collapsing "
+                        "index '%c' don't match (%d != %d)",
+                        iop, label, (int)new_dims[i],
+                        (int)PyArray_DIM(op, idim));
+                return 0;
+            }
+            /* Summing strides makes the view walk a diagonal */
+            new_dims[i] = PyArray_DIM(op, idim);
+            new_strides[i] += PyArray_STRIDE(op, idim);
+        }
+    }
+    /* If we processed all the input axes, return a view */
+    if (idim == ndim) {
+        Py_INCREF(PyArray_DESCR(op));
+        /*
+         * NOTE(review): flags are passed as 0 here, so the returned
+         * view is not writeable even when 'op' is, whereas
+         * get_combined_dims_view forwards NPY_WRITEABLE -- confirm
+         * this asymmetry is intended.
+         */
+        *ret = (PyArrayObject *)PyArray_NewFromDescr(
+                                Py_TYPE(op),
+                                PyArray_DESCR(op),
+                                ndim_output, new_dims, new_strides,
+                                PyArray_DATA(op),
+                                0, (PyObject *)op);
+
+        if (*ret == NULL) {
+            return 0;
+        }
+        if (!PyArray_Check(*ret)) {
+            Py_DECREF(*ret);
+            *ret = NULL;
+            PyErr_SetString(PyExc_RuntimeError,
+                    "NewFromDescr failed to return an array");
+            return 0;
+        }
+        PyArray_UpdateFlags(*ret,
+                NPY_C_CONTIGUOUS|NPY_ALIGNED|NPY_F_CONTIGUOUS);
+        Py_INCREF(op);
+        PyArray_BASE(*ret) = (PyObject *)op;
+        return 1;
+    }
+
+    /* Return success, but that we couldn't make a view */
+    *ret = NULL;
+    return 1;
+}
+
+/*
+ * Builds a view of 'op' in which axes sharing a label are combined
+ * (their strides summed, so the view walks the diagonal), mutating
+ * 'labels' in place into the compressed label list.  Returns the new
+ * view, or NULL with an exception set on a dimension mismatch or
+ * array-creation failure.
+ */
+static PyArrayObject *
+get_combined_dims_view(PyArrayObject *op, int iop, char *labels)
+{
+    npy_intp new_strides[NPY_MAXDIMS];
+    npy_intp new_dims[NPY_MAXDIMS];
+    int i, idim, ndim, icombine, combineoffset, label;
+    int icombinemap[NPY_MAXDIMS];
+
+    PyArrayObject *ret = NULL;
+
+    ndim = PyArray_NDIM(op);
+
+    /* Initialize the dimensions and strides to zero */
+    for (idim = 0; idim < ndim; ++idim) {
+        new_dims[idim] = 0;
+        new_strides[idim] = 0;
+    }
+
+    /* Copy the dimensions and strides, except when collapsing */
+    icombine = 0;
+    for (idim = 0; idim < ndim; ++idim) {
+        label = labels[idim];
+        /* If this label says to merge axes, get the actual label */
+        if (label < 0) {
+            combineoffset = label;
+            label = labels[idim+label];
+        }
+        else {
+            combineoffset = 0;
+            if (icombine != idim) {
+                labels[icombine] = labels[idim];
+            }
+            /* Remember where this source axis landed in the view */
+            icombinemap[idim] = icombine;
+        }
+        /* If the label is 0, it's an unlabeled broadcast dimension */
+        if (label == 0) {
+            new_dims[icombine] = PyArray_DIM(op, idim);
+            new_strides[icombine] = PyArray_STRIDE(op, idim);
+        }
+        else {
+            /* Update the combined axis dimensions and strides */
+            i = idim + combineoffset;
+            if (combineoffset < 0 &&
+                        new_dims[i] != PyArray_DIM(op, idim)) {
+                PyErr_Format(PyExc_ValueError,
+                        "dimensions in operand %d for collapsing "
+                        "index '%c' don't match (%d != %d)",
+                        iop, label, (int)new_dims[i],
+                        (int)PyArray_DIM(op, idim));
+                return NULL;
+            }
+            i = icombinemap[i];
+            new_dims[i] = PyArray_DIM(op, idim);
+            /* Summed strides make the view walk the diagonal */
+            new_strides[i] += PyArray_STRIDE(op, idim);
+        }
+
+        /* If the label didn't say to combine axes, increment dest i */
+        if (combineoffset == 0) {
+            icombine++;
+        }
+    }
+
+    /* The compressed number of dimensions */
+    ndim = icombine;
+
+    Py_INCREF(PyArray_DESCR(op));
+    ret = (PyArrayObject *)PyArray_NewFromDescr(
+                            Py_TYPE(op),
+                            PyArray_DESCR(op),
+                            ndim, new_dims, new_strides,
+                            PyArray_DATA(op),
+                            PyArray_ISWRITEABLE(op) ? NPY_WRITEABLE : 0,
+                            (PyObject *)op);
+
+    if (ret == NULL) {
+        return NULL;
+    }
+    if (!PyArray_Check(ret)) {
+        Py_DECREF(ret);
+        PyErr_SetString(PyExc_RuntimeError,
+                "NewFromDescr failed to return an array");
+        return NULL;
+    }
+    PyArray_UpdateFlags(ret,
+                NPY_C_CONTIGUOUS|NPY_ALIGNED|NPY_F_CONTIGUOUS);
+    Py_INCREF(op);
+    PyArray_BASE(ret) = (PyObject *)op;
+
+    return ret;
+}
+
+/*
+ * Fills 'axes' with, for each iterator dimension, the operand axis it
+ * maps to, or -1 to insert a broadcast "newaxis".  Where the operand's
+ * unlabeled (ellipsis) dimensions are taken from depends on the
+ * operand's broadcast classification: from the right for
+ * BROADCAST_RIGHT, from the left for BROADCAST_LEFT, and for
+ * BROADCAST_MIDDLE running out of broadcast dimensions is an error
+ * (the operand cannot be extended when labels flank the ellipsis).
+ * Returns 1 on success, 0 with an exception set on error.
+ */
+static int
+prepare_op_axes(int ndim, int iop, char *labels, npy_intp *axes,
+            npy_intp ndim_iter, char *iter_labels, EINSUM_BROADCAST broadcast)
+{
+    int i, label, ibroadcast;
+
+    /* Regular broadcasting */
+    if (broadcast == BROADCAST_RIGHT) {
+        /* broadcast dimensions get placed in rightmost position */
+        ibroadcast = ndim-1;
+        for (i = ndim_iter-1; i >= 0; --i) {
+            label = iter_labels[i];
+            /*
+             * If it's an unlabeled broadcast dimension, choose
+             * the next broadcast dimension from the operand.
+             */
+            if (label == 0) {
+                while (ibroadcast >= 0 && labels[ibroadcast] != 0) {
+                    --ibroadcast;
+                }
+                /*
+                 * If we used up all the operand broadcast dimensions,
+                 * extend it with a "newaxis"
+                 */
+                if (ibroadcast < 0) {
+                    axes[i] = -1;
+                }
+                /* Otherwise map to the broadcast axis */
+                else {
+                    axes[i] = ibroadcast;
+                    --ibroadcast;
+                }
+            }
+            /* It's a labeled dimension, find the matching one */
+            else {
+                char *match = memchr(labels, label, ndim);
+                /* If the op doesn't have the label, broadcast it */
+                if (match == NULL) {
+                    axes[i] = -1;
+                }
+                /* Otherwise use it */
+                else {
+                    axes[i] = match - labels;
+                }
+            }
+        }
+    }
+    /* Reverse broadcasting */
+    else if (broadcast == BROADCAST_LEFT) {
+        /* broadcast dimensions get placed in leftmost position */
+        ibroadcast = 0;
+        for (i = 0; i < ndim_iter; ++i) {
+            label = iter_labels[i];
+            /*
+             * If it's an unlabeled broadcast dimension, choose
+             * the next broadcast dimension from the operand.
+             */
+            if (label == 0) {
+                while (ibroadcast < ndim && labels[ibroadcast] != 0) {
+                    ++ibroadcast;
+                }
+                /*
+                 * If we used up all the operand broadcast dimensions,
+                 * extend it with a "newaxis"
+                 */
+                if (ibroadcast >= ndim) {
+                    axes[i] = -1;
+                }
+                /* Otherwise map to the broadcast axis */
+                else {
+                    axes[i] = ibroadcast;
+                    ++ibroadcast;
+                }
+            }
+            /* It's a labeled dimension, find the matching one */
+            else {
+                char *match = memchr(labels, label, ndim);
+                /* If the op doesn't have the label, broadcast it */
+                if (match == NULL) {
+                    axes[i] = -1;
+                }
+                /* Otherwise use it */
+                else {
+                    axes[i] = match - labels;
+                }
+            }
+        }
+    }
+    /* Middle broadcasting */
+    else {
+        /* broadcast dimensions get placed in leftmost position */
+        ibroadcast = 0;
+        for (i = 0; i < ndim_iter; ++i) {
+            label = iter_labels[i];
+            /*
+             * If it's an unlabeled broadcast dimension, choose
+             * the next broadcast dimension from the operand.
+             */
+            if (label == 0) {
+                while (ibroadcast < ndim && labels[ibroadcast] != 0) {
+                    ++ibroadcast;
+                }
+                /*
+                 * If we used up all the operand broadcast dimensions,
+                 * it's an error
+                 */
+                if (ibroadcast >= ndim) {
+                    PyErr_Format(PyExc_ValueError,
+                            "operand %d did not have enough dimensions "
+                            "to match the broadcasting, and couldn't be "
+                            "extended because einstein sum subscripts "
+                            "were specified at both the start and end",
+                            iop);
+                    return 0;
+                }
+                /* Otherwise map to the broadcast axis */
+                else {
+                    axes[i] = ibroadcast;
+                    ++ibroadcast;
+                }
+            }
+            /* It's a labeled dimension, find the matching one */
+            else {
+                char *match = memchr(labels, label, ndim);
+                /* If the op doesn't have the label, broadcast it */
+                if (match == NULL) {
+                    axes[i] = -1;
+                }
+                /* Otherwise use it */
+                else {
+                    axes[i] = match - labels;
+                }
+            }
+        }
+    }
+
+    return 1;
+}
+
+
+/*NUMPY_API
+ * This function provides summation of array elements according to
+ * the Einstein summation convention. For example:
+ * - trace(a) -> einsum("ii", a)
+ * - transpose(a) -> einsum("ji", a)
+ * - multiply(a,b) -> einsum(",", a, b)
+ * - inner(a,b) -> einsum("i,i", a, b)
+ * - outer(a,b) -> einsum("i,j", a, b)
+ * - matvec(a,b) -> einsum("ij,j", a, b)
+ * - matmat(a,b) -> einsum("ij,jk", a, b)
+ *
+ * subscripts: The string of subscripts for einstein summation.
+ * nop: The number of operands
+ * op_in: The array of operands
+ * dtype: Either NULL, or the data type to force the calculation as.
+ * order: The order for the calculation/the output axes.
+ * casting: What kind of casts should be permitted.
+ * out: Either NULL, or an array into which the output should be placed.
+ *
+ * By default, the labels get placed in alphabetical order
+ * at the end of the output. So, if c = einsum("i,j", a, b)
+ * then c[i,j] == a[i]*b[j], but if c = einsum("j,i", a, b)
+ * then c[i,j] = a[j]*b[i].
+ *
+ * Alternatively, you can control the output order or prevent
+ * an axis from being summed/force an axis to be summed by providing
+ * indices for the output. This allows us to turn 'trace' into
+ * 'diag', for example.
+ * - diag(a) -> einsum("ii->i", a)
+ * - sum(a, axis=0) -> einsum("i...->", a)
+ *
+ * Subscripts at the beginning and end may be specified by
+ * putting an ellipsis "..." in the middle. For example,
+ * the function einsum("i...i", a) takes the diagonal of
+ * the first and last dimensions of the operand, and
+ * einsum("ij...,jk...->ik...", a, b) takes the matrix product using
+ * the first two indices of each operand instead of the last two.
+ *
+ * When there is only one operand, no axes being summed, and
+ * no output parameter, this function returns a view
+ * into the operand instead of making a copy.
+ */
+NPY_NO_EXPORT PyArrayObject *
+PyArray_EinsteinSum(char *subscripts, npy_intp nop,
+ PyArrayObject **op_in,
+ PyArray_Descr *dtype,
+ NPY_ORDER order, NPY_CASTING casting,
+ PyArrayObject *out)
+{
+ int iop, label, min_label = 127, max_label = 0, num_labels;
+ char label_counts[128];
+ char op_labels[NPY_MAXARGS][NPY_MAXDIMS];
+ char output_labels[NPY_MAXDIMS], *iter_labels;
+ int idim, ndim_output, ndim_broadcast, ndim_iter;
+
+ EINSUM_BROADCAST broadcast[NPY_MAXARGS];
+ PyArrayObject *op[NPY_MAXARGS], *ret = NULL;
+ PyArray_Descr *op_dtypes_array[NPY_MAXARGS], **op_dtypes;
+
+ npy_intp op_axes_arrays[NPY_MAXARGS][NPY_MAXDIMS];
+ npy_intp *op_axes[NPY_MAXARGS];
+ npy_uint32 op_flags[NPY_MAXARGS];
+
+ NpyIter *iter;
+ sum_of_products_fn sop;
+ npy_intp fixed_strides[NPY_MAXARGS];
+
+ /* nop+1 (+1 is for the output) must fit in NPY_MAXARGS */
+ if (nop >= NPY_MAXARGS) {
+ PyErr_SetString(PyExc_ValueError,
+ "too many operands provided to einstein sum function");
+ return NULL;
+ }
+ else if (nop < 1) {
+ PyErr_SetString(PyExc_ValueError,
+ "not enough operands provided to einstein sum function");
+ return NULL;
+ }
+
+ /* Parse the subscripts string into label_counts and op_labels */
+ memset(label_counts, 0, sizeof(label_counts));
+ num_labels = 0;
+ for (iop = 0; iop < nop; ++iop) {
+ int length = (int)strcspn(subscripts, ",-");
+
+ if (iop == nop-1 && subscripts[length] == ',') {
+ PyErr_SetString(PyExc_ValueError,
+ "more operands provided to einstein sum function "
+ "than specified in the subscripts string");
+ return NULL;
+ }
+ else if(iop < nop-1 && subscripts[length] != ',') {
+ PyErr_SetString(PyExc_ValueError,
+ "fewer operands provided to einstein sum function "
+ "than specified in the subscripts string");
+ return NULL;
+ }
+
+ if (!parse_operand_subscripts(subscripts, length,
+ PyArray_NDIM(op_in[iop]),
+ iop, op_labels[iop], label_counts,
+ &min_label, &max_label, &num_labels,
+ &broadcast[iop])) {
+ return NULL;
+ }
+
+ /* Move subscripts to the start of the labels for the next op */
+ subscripts += length;
+ if (iop < nop-1) {
+ subscripts++;
+ }
+ }
+
+ /*
+ * Find the number of broadcast dimensions, which is the maximum
+ * number of labels == 0 in an op_labels array.
+ */
+ ndim_broadcast = 0;
+ for (iop = 0; iop < nop; ++iop) {
+ npy_intp count_zeros = 0;
+ int ndim;
+ char *labels = op_labels[iop];
+
+ ndim = PyArray_NDIM(op_in[iop]);
+ for (idim = 0; idim < ndim; ++idim) {
+ if (labels[idim] == 0) {
+ ++count_zeros;
+ }
+ }
+
+ if (count_zeros > ndim_broadcast) {
+ ndim_broadcast = count_zeros;
+ }
+ }
+
+ /*
+ * If there is no output signature, create one using each label
+ * that appeared once, in alphabetical order
+ */
+ if (subscripts[0] == '\0') {
+ char outsubscripts[NPY_MAXDIMS];
+ int length = 0;
+ for (label = min_label; label <= max_label; ++label) {
+ if (label_counts[label] == 1) {
+ if (length < NPY_MAXDIMS-1) {
+ outsubscripts[length++] = label;
+ }
+ else {
+ PyErr_SetString(PyExc_ValueError,
+ "einstein sum subscript string has too many "
+ "distinct labels");
+ return NULL;
+ }
+ }
+ }
+ /* Parse the output subscript string */
+ ndim_output = parse_output_subscripts(outsubscripts, length,
+ ndim_broadcast, label_counts,
+ output_labels, &broadcast[nop]);
+ }
+ else {
+ if (subscripts[0] != '-' || subscripts[1] != '>') {
+ PyErr_SetString(PyExc_ValueError,
+ "einstein sum subscript string does not "
+ "contain proper '->' output specified");
+ return NULL;
+ }
+ subscripts += 2;
+
+ /* Parse the output subscript string */
+ ndim_output = parse_output_subscripts(subscripts, strlen(subscripts),
+ ndim_broadcast, label_counts,
+ output_labels, &broadcast[nop]);
+ }
+ if (ndim_output < 0) {
+ return NULL;
+ }
+
+ if (out != NULL && PyArray_NDIM(out) != ndim_output) {
+ PyErr_Format(PyExc_ValueError,
+ "out parameter does not have the correct number of "
+ "dimensions, has %d but should have %d",
+ (int)PyArray_NDIM(out), (int)ndim_output);
+ return NULL;
+ }
+
+ /* Set all the op references to NULL */
+ for (iop = 0; iop < nop; ++iop) {
+ op[iop] = NULL;
+ }
+
+ /*
+ * Process all the input ops, combining dimensions into their
+ * diagonal where specified.
+ */
+ for (iop = 0; iop < nop; ++iop) {
+ char *labels = op_labels[iop];
+ int combine, ndim;
+
+ ndim = PyArray_NDIM(op_in[iop]);
+
+ /*
+ * If there's just one operand and no output parameter,
+ * first try remapping the axes to the output to return
+ * a view instead of a copy.
+ */
+ if (iop == 0 && nop == 1 && out == NULL) {
+ PyArrayObject *ret = NULL;
+
+ if (!get_single_op_view(op_in[iop], iop, labels,
+ ndim_output, output_labels,
+ &ret)) {
+ return NULL;
+ }
+
+ if (ret != NULL) {
+ return ret;
+ }
+ }
+
+ /* Check whether any dimensions need to be combined */
+ combine = 0;
+ for (idim = 0; idim < ndim; ++idim) {
+ if (labels[idim] < 0) {
+ combine = 1;
+ }
+ }
+
+ /* If any dimensions are combined, create a view which combines them */
+ if (combine) {
+ op[iop] = get_combined_dims_view(op_in[iop], iop, labels);
+ if (op[iop] == NULL) {
+ goto fail;
+ }
+ }
+ /* No combining needed */
+ else {
+ Py_INCREF(op_in[iop]);
+ op[iop] = op_in[iop];
+ }
+ }
+
+ /* Set the output op */
+ op[nop] = out;
+
+ /*
+ * Set up the labels for the iterator (output + combined labels).
+ * Can just share the output_labels memory, because iter_labels
+ * is output_labels with some more labels appended.
+ */
+ iter_labels = output_labels;
+ ndim_iter = ndim_output;
+ for (label = min_label; label <= max_label; ++label) {
+ if (label_counts[label] > 0 &&
+ memchr(output_labels, label, ndim_output) == NULL) {
+ if (ndim_iter >= NPY_MAXDIMS) {
+ PyErr_SetString(PyExc_ValueError,
+ "too many subscripts in einsum");
+ goto fail;
+ }
+ iter_labels[ndim_iter++] = label;
+ }
+ }
+
+ /* Set up the op_axes for the iterator */
+ for (iop = 0; iop < nop; ++iop) {
+ op_axes[iop] = op_axes_arrays[iop];
+
+ if (!prepare_op_axes(PyArray_NDIM(op[iop]), iop, op_labels[iop],
+ op_axes[iop], ndim_iter, iter_labels, broadcast[iop])) {
+ goto fail;
+ }
+ }
+
+ /* Set up the op_dtypes if dtype was provided */
+ if (dtype == NULL) {
+ op_dtypes = NULL;
+ }
+ else {
+ op_dtypes = op_dtypes_array;
+ for (iop = 0; iop <= nop; ++iop) {
+ op_dtypes[iop] = dtype;
+ }
+ }
+
+ /* Set the op_axes for the output */
+ op_axes[nop] = op_axes_arrays[nop];
+ for (idim = 0; idim < ndim_output; ++idim) {
+ op_axes[nop][idim] = idim;
+ }
+ for (idim = ndim_output; idim < ndim_iter; ++idim) {
+ op_axes[nop][idim] = -1;
+ }
+
+ /* Set the iterator per-op flags */
+
+ for (iop = 0; iop < nop; ++iop) {
+ op_flags[iop] = NPY_ITER_READONLY|
+ NPY_ITER_NBO|
+ NPY_ITER_ALIGNED;
+ }
+ op_flags[nop] = NPY_ITER_READWRITE|
+ NPY_ITER_NBO|
+ NPY_ITER_ALIGNED|
+ NPY_ITER_ALLOCATE|
+ NPY_ITER_NO_BROADCAST;
+
+ /* Allocate the iterator */
+ iter = NpyIter_MultiNew(nop+1, op, NPY_ITER_NO_INNER_ITERATION|
+ ((dtype != NULL) ? 0 : NPY_ITER_COMMON_DTYPE)|
+ NPY_ITER_BUFFERED|
+ NPY_ITER_DELAY_BUFALLOC|
+ NPY_ITER_GROWINNER|
+ NPY_ITER_REDUCE_OK|
+ NPY_ITER_REFS_OK|
+ NPY_ITER_ZEROSIZE_OK,
+ order, casting,
+ op_flags, op_dtypes,
+ ndim_iter, op_axes, 0);
+
+ if (iter == NULL) {
+ goto fail;
+ }
+
+ /* Initialize the output to all zeros and reset the iterator */
+ ret = NpyIter_GetOperandArray(iter)[nop];
+ Py_INCREF(ret);
+ PyArray_FillWithZero(ret);
+ if (NpyIter_Reset(iter, NULL) != NPY_SUCCEED) {
+ Py_DECREF(ret);
+ goto fail;
+ }
+
+ /*
+ * Get an inner loop function, specializing it based on
+ * the strides that are fixed for the whole loop.
+ */
+ NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+ sop = get_sum_of_products_function(nop,
+ NpyIter_GetDescrArray(iter)[0]->type_num,
+ NpyIter_GetDescrArray(iter)[0]->elsize,
+ fixed_strides);
+
+ #if 0
+ NpyIter_DebugPrint(iter);
+ printf("fixed strides:\n");
+ for (iop = 0; iop <= nop; ++iop) {
+ printf("%ld ", fixed_strides[iop]);
+ }
+ printf("\n");
+ #endif
+
+ /* Finally, the main loop */
+ if (sop == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "invalid data type for einsum");
+ Py_DECREF(ret);
+ ret = NULL;
+ }
+ else if (NpyIter_GetIterSize(iter) != 0) {
+ NpyIter_IterNext_Fn iternext;
+ char **dataptr;
+ npy_intp *stride;
+ npy_intp *countptr;
+ int needs_api = NpyIter_IterationNeedsAPI(iter);
+ NPY_BEGIN_THREADS_DEF;
+
+ iternext = NpyIter_GetIterNext(iter, NULL);
+ if (iternext == NULL) {
+ NpyIter_Deallocate(iter);
+ Py_DECREF(ret);
+ goto fail;
+ }
+ dataptr = NpyIter_GetDataPtrArray(iter);
+ stride = NpyIter_GetInnerStrideArray(iter);
+ countptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
+ }
+ do {
+ sop(nop, dataptr, stride, *countptr);
+ } while(iternext(iter));
+ if (!needs_api) {
+ NPY_END_THREADS;
+ }
+
+ /* If the API was needed, it may have thrown an error */
+ if (needs_api && PyErr_Occurred()) {
+ Py_DECREF(ret);
+ ret = NULL;
+ }
+ }
+
+ NpyIter_Deallocate(iter);
+ for (iop = 0; iop < nop; ++iop) {
+ Py_DECREF(op[iop]);
+ }
+
+ return ret;
+
+fail:
+ for (iop = 0; iop < nop; ++iop) {
+ Py_XDECREF(op[iop]);
+ }
+
+ return NULL;
+}
diff --git a/numpy/core/src/multiarray/flagsobject.c b/numpy/core/src/multiarray/flagsobject.c
index ca27ef083..7e7dc3e2d 100644
--- a/numpy/core/src/multiarray/flagsobject.c
+++ b/numpy/core/src/multiarray/flagsobject.c
@@ -576,13 +576,13 @@ arrayflags_richcompare(PyObject *self, PyObject *other, int cmp_op)
if (PyObject_TypeCheck(other, &PyArrayFlags_Type)) {
cmp = arrayflags_compare((PyArrayFlagsObject *)self,
(PyArrayFlagsObject *)other);
- }
- if (cmp_op == Py_EQ) {
- result = (cmp == 0) ? Py_True : Py_False;
- }
- else if (cmp_op == Py_NE) {
- result = (cmp != 0) ? Py_True : Py_False;
+ if (cmp_op == Py_EQ) {
+ result = (cmp == 0) ? Py_True : Py_False;
+ }
+ else if (cmp_op == Py_NE) {
+ result = (cmp != 0) ? Py_True : Py_False;
+ }
}
Py_INCREF(result);
diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c
index a636383a6..8367329d1 100644
--- a/numpy/core/src/multiarray/getset.c
+++ b/numpy/core/src/multiarray/getset.c
@@ -418,10 +418,21 @@ array_descr_set(PyArrayObject *self, PyObject *arg)
}
if (newtype->elsize == 0) {
- PyErr_SetString(PyExc_TypeError,
- "data-type must not be 0-sized");
- Py_DECREF(newtype);
- return -1;
+ /* Allow a void view */
+ if (newtype->type_num == NPY_VOID) {
+ PyArray_DESCR_REPLACE(newtype);
+ if (newtype == NULL) {
+ return -1;
+ }
+ newtype->elsize = self->descr->elsize;
+ }
+ /* But no other flexible types */
+ else {
+ PyErr_SetString(PyExc_TypeError,
+ "data-type must not be 0-sized");
+ Py_DECREF(newtype);
+ return -1;
+ }
}
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 5c896dc76..8e25f8f3b 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -15,6 +15,7 @@
#include "common.h"
#include "ctors.h"
+#include "lowlevel_strided_loops.h"
#define PyAO PyArrayObject
#define _check_axis PyArray_CheckAxis
@@ -1682,75 +1683,222 @@ PyArray_Compress(PyArrayObject *self, PyObject *condition, int axis,
}
/*NUMPY_API
+ * Counts the number of non-zero elements in the array
+ *
+ * Returns -1 on error.
+ */
+NPY_NO_EXPORT npy_intp
+PyArray_CountNonzero(PyArrayObject *self)
+{
+ PyArray_NonzeroFunc *nonzero = self->descr->f->nonzero;
+ char *data;
+ npy_intp stride, count;
+ npy_intp nonzero_count = 0;
+
+ NpyIter *iter;
+ NpyIter_IterNext_Fn iternext;
+ char **dataptr;
+ npy_intp *strideptr, *innersizeptr;
+
+ /* If it's a trivial one-dimensional loop, don't use an iterator */
+ if (PyArray_TRIVIALLY_ITERABLE(self)) {
+ PyArray_PREPARE_TRIVIAL_ITERATION(self, count, data, stride);
+
+ while (count--) {
+ if (nonzero(data, self)) {
+ ++nonzero_count;
+ }
+ data += stride;
+ }
+
+ return nonzero_count;
+ }
+
+ /*
+ * If the array has size zero, return zero (the iterator rejects
+ * size zero arrays)
+ */
+ if (PyArray_SIZE(self) == 0) {
+ return 0;
+ }
+
+ /* Otherwise create and use an iterator to count the nonzeros */
+ iter = NpyIter_New(self, NPY_ITER_READONLY|
+ NPY_ITER_NO_INNER_ITERATION|
+ NPY_ITER_REFS_OK,
+ NPY_KEEPORDER, NPY_NO_CASTING,
+ NULL, 0, NULL, 0);
+ if (iter == NULL) {
+ return -1;
+ }
+
+ /* Get the pointers for inner loop iteration */
+ iternext = NpyIter_GetIterNext(iter, NULL);
+ if (iternext == NULL) {
+ NpyIter_Deallocate(iter);
+ return -1;
+ }
+ dataptr = NpyIter_GetDataPtrArray(iter);
+ strideptr = NpyIter_GetInnerStrideArray(iter);
+ innersizeptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+ /* Iterate over all the elements to count the nonzeros */
+ do {
+ data = *dataptr;
+ stride = *strideptr;
+ count = *innersizeptr;
+
+ while (count--) {
+ if (nonzero(data, self)) {
+ ++nonzero_count;
+ }
+ data += stride;
+ }
+
+ } while(iternext(iter));
+
+ NpyIter_Deallocate(iter);
+
+ return nonzero_count;
+}
+
+/*NUMPY_API
* Nonzero
+ *
+ * TODO: In NumPy 2.0, should make the iteration order a parameter.
*/
NPY_NO_EXPORT PyObject *
PyArray_Nonzero(PyArrayObject *self)
{
- int n = self->nd, j;
- intp count = 0, i, size;
- PyArrayIterObject *it = NULL;
- PyObject *ret = NULL, *item;
- intp *dptr[MAX_DIMS];
-
- it = (PyArrayIterObject *)PyArray_IterNew((PyObject *)self);
- if (it == NULL) {
+ int i, ndim = PyArray_NDIM(self);
+ PyArrayObject *ret = NULL;
+ PyObject *ret_tuple;
+ npy_intp ret_dims[2];
+ PyArray_NonzeroFunc *nonzero = self->descr->f->nonzero;
+ char *data;
+ npy_intp stride, count;
+ npy_intp nonzero_count = PyArray_CountNonzero(self);
+ npy_intp *coords;
+
+ NpyIter *iter;
+ NpyIter_IterNext_Fn iternext;
+ NpyIter_GetCoords_Fn getcoords;
+ char **dataptr;
+ npy_intp *innersizeptr;
+
+ /* Allocate the result as a 2D array */
+ ret_dims[0] = nonzero_count;
+ ret_dims[1] = (ndim == 0) ? 1 : ndim;
+ ret = (PyArrayObject *)PyArray_New(&PyArray_Type, 2, ret_dims,
+ NPY_INTP, NULL, NULL, 0, 0,
+ NULL);
+ if (ret == NULL) {
return NULL;
}
- /* One pass through 'self', counting the non-zero elements */
- size = it->size;
- for (i = 0; i < size; i++) {
- if (self->descr->f->nonzero(it->dataptr, self)) {
- count++;
+
+ /* If it's a one-dimensional result, don't use an iterator */
+ if (ndim <= 1) {
+ npy_intp i;
+
+ coords = (npy_intp *)PyArray_DATA(ret);
+ data = PyArray_BYTES(self);
+ stride = (ndim == 0) ? 0 : PyArray_STRIDE(self, 0);
+ count = (ndim == 0) ? 1 : PyArray_DIM(self, 0);
+
+ for (i = 0; i < count; ++i) {
+ if (nonzero(data, self)) {
+ *coords++ = i;
+ }
+ data += stride;
}
- PyArray_ITER_NEXT(it);
+
+ goto finish;
}
- PyArray_ITER_RESET(it);
- /* Allocate the tuple of coordinates */
- ret = PyTuple_New(n);
- if (ret == NULL) {
- goto fail;
+ /* Build an iterator with coordinates, in C order */
+ iter = NpyIter_New(self, NPY_ITER_READONLY|
+ NPY_ITER_COORDS|
+ NPY_ITER_ZEROSIZE_OK|
+ NPY_ITER_REFS_OK,
+ NPY_CORDER, NPY_NO_CASTING,
+ NULL, 0, NULL, 0);
+
+ if (iter == NULL) {
+ Py_DECREF(ret);
+ return NULL;
}
- for (j = 0; j < n; j++) {
- item = PyArray_New(Py_TYPE(self), 1, &count,
- PyArray_INTP, NULL, NULL, 0, 0,
- (PyObject *)self);
- if (item == NULL) {
- goto fail;
+
+ if (NpyIter_GetIterSize(iter) != 0) {
+ /* Get the pointers for inner loop iteration */
+ iternext = NpyIter_GetIterNext(iter, NULL);
+ if (iternext == NULL) {
+ NpyIter_Deallocate(iter);
+ Py_DECREF(ret);
+ return NULL;
}
- PyTuple_SET_ITEM(ret, j, item);
- dptr[j] = (intp *)PyArray_DATA(item);
- }
- /* A second pass through 'self', recording the indices */
- if (n == 1) {
- for (i = 0; i < size; i++) {
- if (self->descr->f->nonzero(it->dataptr, self)) {
- *(dptr[0])++ = i;
- }
- PyArray_ITER_NEXT(it);
+ getcoords = NpyIter_GetGetCoords(iter, NULL);
+ if (getcoords == NULL) {
+ NpyIter_Deallocate(iter);
+ Py_DECREF(ret);
+ return NULL;
}
+ dataptr = NpyIter_GetDataPtrArray(iter);
+ innersizeptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+ coords = (npy_intp *)PyArray_DATA(ret);
+
+ /* Get the coordinates for each non-zero element */
+ do {
+ if (nonzero(*dataptr, self)) {
+ getcoords(iter, coords);
+ coords += ndim;
+ }
+ } while(iternext(iter));
+ }
+
+ NpyIter_Deallocate(iter);
+
+finish:
+ /* Treat zero-dimensional as shape (1,) */
+ if (ndim == 0) {
+ ndim = 1;
+ }
+
+ ret_tuple = PyTuple_New(ndim);
+ if (ret_tuple == NULL) {
+ Py_DECREF(ret);
+ return NULL;
+ }
+
+ /* Create views into ret, one for each dimension */
+ if (ndim == 1) {
+        /* Directly switch to one dimension (dimension 1 is 1 anyway) */
+ ret->nd = 1;
+ PyTuple_SET_ITEM(ret_tuple, i, (PyObject *)ret);
}
else {
- /* reset contiguous so that coordinates gets updated */
- it->contiguous = 0;
- for (i = 0; i < size; i++) {
- if (self->descr->f->nonzero(it->dataptr, self)) {
- for (j = 0; j < n; j++) {
- *(dptr[j])++ = it->coordinates[j];
- }
+ for (i = 0; i < ndim; ++i) {
+ npy_intp stride = ndim*NPY_SIZEOF_INTP;
+ PyArrayObject *view;
+
+ view = (PyArrayObject *)PyArray_New(Py_TYPE(self), 1,
+ &nonzero_count,
+ NPY_INTP, &stride,
+ PyArray_BYTES(ret) + i*NPY_SIZEOF_INTP,
+ 0, 0, (PyObject *)self);
+ if (view == NULL) {
+ Py_DECREF(ret);
+ Py_DECREF(ret_tuple);
+ return NULL;
}
- PyArray_ITER_NEXT(it);
+ Py_INCREF(ret);
+ view->base = (PyObject *)ret;
+ PyTuple_SET_ITEM(ret_tuple, i, (PyObject *)view);
}
- }
- Py_DECREF(it);
- return ret;
-
- fail:
- Py_XDECREF(ret);
- Py_XDECREF(it);
- return NULL;
+ Py_DECREF(ret);
+ }
+ return ret_tuple;
}
diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c
index f841006ec..6c1d97e8e 100644
--- a/numpy/core/src/multiarray/iterators.c
+++ b/numpy/core/src/multiarray/iterators.c
@@ -20,10 +20,10 @@
#define RubberIndex -2
#define SingleIndex -3
-NPY_NO_EXPORT intp
-parse_subindex(PyObject *op, intp *step_size, intp *n_steps, intp max)
+NPY_NO_EXPORT npy_intp
+parse_subindex(PyObject *op, npy_intp *step_size, npy_intp *n_steps, npy_intp max)
{
- intp index;
+ npy_intp index;
if (op == Py_None) {
*n_steps = PseudoIndex;
@@ -34,7 +34,7 @@ parse_subindex(PyObject *op, intp *step_size, intp *n_steps, intp max)
index = 0;
}
else if (PySlice_Check(op)) {
- intp stop;
+ npy_intp stop;
if (slice_GetIndices((PySliceObject *)op, max,
&index, &stop, step_size, n_steps) < 0) {
if (!PyErr_Occurred()) {
@@ -77,11 +77,11 @@ parse_subindex(PyObject *op, intp *step_size, intp *n_steps, intp max)
NPY_NO_EXPORT int
parse_index(PyArrayObject *self, PyObject *op,
- intp *dimensions, intp *strides, intp *offset_ptr)
+ npy_intp *dimensions, npy_intp *strides, npy_intp *offset_ptr)
{
int i, j, n;
int nd_old, nd_new, n_add, n_pseudo;
- intp n_steps, start, offset, step_size;
+ npy_intp n_steps, start, offset, step_size;
PyObject *op1 = NULL;
int is_slice;
@@ -180,7 +180,7 @@ parse_index(PyArrayObject *self, PyObject *op,
}
static int
-slice_coerce_index(PyObject *o, intp *v)
+slice_coerce_index(PyObject *o, npy_intp *v)
{
*v = PyArray_PyIntAsIntp(o);
if (error_converting(*v)) {
@@ -193,11 +193,11 @@ slice_coerce_index(PyObject *o, intp *v)
/* This is basically PySlice_GetIndicesEx, but with our coercion
* of indices to integers (plus, that function is new in Python 2.3) */
NPY_NO_EXPORT int
-slice_GetIndices(PySliceObject *r, intp length,
- intp *start, intp *stop, intp *step,
- intp *slicelength)
+slice_GetIndices(PySliceObject *r, npy_intp length,
+ npy_intp *start, npy_intp *stop, npy_intp *step,
+ npy_intp *slicelength)
{
- intp defstop;
+ npy_intp defstop;
if (r->step == Py_None) {
*step = 1;
@@ -363,7 +363,7 @@ PyArray_IterNew(PyObject *obj)
* Get Iterator broadcast to a particular shape
*/
NPY_NO_EXPORT PyObject *
-PyArray_BroadcastToShape(PyObject *obj, intp *dims, int nd)
+PyArray_BroadcastToShape(PyObject *obj, npy_intp *dims, int nd)
{
PyArrayIterObject *it;
int i, diff, j, compat, k;
@@ -451,7 +451,7 @@ PyArray_IterAllButAxis(PyObject *obj, int *inaxis)
}
if (*inaxis < 0) {
int i, minaxis = 0;
- intp minstride = 0;
+ npy_intp minstride = 0;
i = 0;
while (minstride == 0 && i < PyArray_NDIM(obj)) {
minstride = PyArray_STRIDE(obj,i);
@@ -496,8 +496,8 @@ PyArray_RemoveSmallest(PyArrayMultiIterObject *multi)
PyArrayIterObject *it;
int i, j;
int axis;
- intp smallest;
- intp sumstrides[NPY_MAXDIMS];
+ npy_intp smallest;
+ npy_intp sumstrides[NPY_MAXDIMS];
if (multi->nd == 0) {
return -1;
@@ -562,9 +562,9 @@ iter_length(PyArrayIterObject *self)
static PyObject *
iter_subscript_Bool(PyArrayIterObject *self, PyArrayObject *ind)
{
- intp index, strides;
+ npy_intp index, strides;
int itemsize;
- intp count = 0;
+ npy_intp count = 0;
char *dptr, *optr;
PyObject *r;
int swap;
@@ -623,18 +623,18 @@ iter_subscript_Bool(PyArrayIterObject *self, PyArrayObject *ind)
static PyObject *
iter_subscript_int(PyArrayIterObject *self, PyArrayObject *ind)
{
- intp num;
+ npy_intp num;
PyObject *r;
PyArrayIterObject *ind_it;
int itemsize;
int swap;
char *optr;
- intp index;
+ npy_intp index;
PyArray_CopySwapFunc *copyswap;
itemsize = self->ao->descr->elsize;
if (ind->nd == 0) {
- num = *((intp *)ind->data);
+ num = *((npy_intp *)ind->data);
if (num < 0) {
num += self->size;
}
@@ -671,7 +671,7 @@ iter_subscript_int(PyArrayIterObject *self, PyArrayObject *ind)
copyswap = PyArray_DESCR(r)->f->copyswap;
swap = (PyArray_ISNOTSWAPPED(r) != PyArray_ISNOTSWAPPED(self->ao));
while (index--) {
- num = *((intp *)(ind_it->dataptr));
+ num = *((npy_intp *)(ind_it->dataptr));
if (num < 0) {
num += self->size;
}
@@ -700,8 +700,8 @@ NPY_NO_EXPORT PyObject *
iter_subscript(PyArrayIterObject *self, PyObject *ind)
{
PyArray_Descr *indtype = NULL;
- intp start, step_size;
- intp n_steps;
+ npy_intp start, step_size;
+ npy_intp n_steps;
PyObject *r;
char *dptr;
int size;
@@ -739,7 +739,7 @@ iter_subscript(PyArrayIterObject *self, PyObject *ind)
return PyArray_ToScalar(self->dataptr, self->ao);
}
else { /* empty array */
- intp ii = 0;
+ npy_intp ii = 0;
Py_INCREF(self->ao->descr);
r = PyArray_NewFromDescr(Py_TYPE(self->ao),
self->ao->descr,
@@ -848,7 +848,7 @@ static int
iter_ass_sub_Bool(PyArrayIterObject *self, PyArrayObject *ind,
PyArrayIterObject *val, int swap)
{
- intp index, strides;
+ npy_intp index, strides;
char *dptr;
PyArray_CopySwapFunc *copyswap;
@@ -890,15 +890,15 @@ iter_ass_sub_int(PyArrayIterObject *self, PyArrayObject *ind,
PyArrayIterObject *val, int swap)
{
PyArray_Descr *typecode;
- intp num;
+ npy_intp num;
PyArrayIterObject *ind_it;
- intp index;
+ npy_intp index;
PyArray_CopySwapFunc *copyswap;
typecode = self->ao->descr;
copyswap = self->ao->descr->f->copyswap;
if (ind->nd == 0) {
- num = *((intp *)ind->data);
+ num = *((npy_intp *)ind->data);
PyArray_ITER_GOTO1D(self, num);
copyswap(self->dataptr, val->dataptr, swap, self->ao);
return 0;
@@ -909,7 +909,7 @@ iter_ass_sub_int(PyArrayIterObject *self, PyArrayObject *ind,
}
index = ind_it->size;
while (index--) {
- num = *((intp *)(ind_it->dataptr));
+ num = *((npy_intp *)(ind_it->dataptr));
if (num < 0) {
num += self->size;
}
@@ -941,8 +941,8 @@ iter_ass_subscript(PyArrayIterObject *self, PyObject *ind, PyObject *val)
PyArray_Descr *type;
PyArray_Descr *indtype = NULL;
int swap, retval = -1;
- intp start, step_size;
- intp n_steps;
+ npy_intp start, step_size;
+ npy_intp n_steps;
PyObject *obj = NULL;
PyArray_CopySwapFunc *copyswap;
@@ -1122,7 +1122,7 @@ iter_array(PyArrayIterObject *it, PyObject *NPY_UNUSED(op))
{
PyObject *r;
- intp size;
+ npy_intp size;
/* Any argument ignored */
@@ -1155,8 +1155,7 @@ iter_array(PyArrayIterObject *it, PyObject *NPY_UNUSED(op))
if (r == NULL) {
return NULL;
}
- if (_flat_copyinto(r, (PyObject *)it->ao,
- PyArray_CORDER) < 0) {
+ if (PyArray_CopyAnyInto((PyArrayObject *)r, it->ao) < 0) {
Py_DECREF(r);
return NULL;
}
@@ -1226,7 +1225,7 @@ iter_coords_get(PyArrayIterObject *self)
* coordinates not kept track of ---
* need to generate from index
*/
- intp val;
+ npy_intp val;
int i;
val = self->index;
for (i = 0; i < nd; i++) {
@@ -1321,7 +1320,7 @@ NPY_NO_EXPORT int
PyArray_Broadcast(PyArrayMultiIterObject *mit)
{
int i, nd, k, j;
- intp tmp;
+ npy_intp tmp;
PyArrayIterObject *it;
/* Discover the broadcast number of dimensions */
@@ -1944,7 +1943,7 @@ get_ptr_circular(PyArrayIterObject* _iter, npy_intp *coordinates)
* A Neighborhood Iterator object.
*/
NPY_NO_EXPORT PyObject*
-PyArray_NeighborhoodIterNew(PyArrayIterObject *x, intp *bounds,
+PyArray_NeighborhoodIterNew(PyArrayIterObject *x, npy_intp *bounds,
int mode, PyArrayObject* fill)
{
int i;
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
new file mode 100644
index 000000000..bc0912aa5
--- /dev/null
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -0,0 +1,1207 @@
+/*
+ * This file contains low-level loops for copying and byte-swapping
+ * strided data.
+ *
+ * Copyright (c) 2010 by Mark Wiebe (mwwiebe@gmail.com)
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#include "structmember.h"
+
+#define _MULTIARRAYMODULE
+#include <numpy/ndarrayobject.h>
+#include <numpy/ufuncobject.h>
+#include <numpy/npy_cpu.h>
+#include <numpy/halffloat.h>
+
+#include "lowlevel_strided_loops.h"
+
+/* x86 platform works with unaligned reads and writes */
+#if (defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64))
+# define NPY_USE_UNALIGNED_ACCESS 0 //1
+#else
+# define NPY_USE_UNALIGNED_ACCESS 0
+#endif
+
+#define _NPY_NOP1(x) (x)
+#define _NPY_NOP2(x) (x)
+#define _NPY_NOP4(x) (x)
+#define _NPY_NOP8(x) (x)
+
+#define _NPY_SWAP2(x) (((((npy_uint16)x)&0xffu) << 8) | \
+ (((npy_uint16)x) >> 8))
+
+#define _NPY_SWAP4(x) (((((npy_uint32)x)&0xffu) << 24) | \
+ ((((npy_uint32)x)&0xff00u) << 8) | \
+ ((((npy_uint32)x)&0xff0000u) >> 8) | \
+ (((npy_uint32)x) >> 24))
+
+#define _NPY_SWAP_PAIR4(x) (((((npy_uint32)x)&0xffu) << 8) | \
+ ((((npy_uint32)x)&0xff00u) >> 8) | \
+ ((((npy_uint32)x)&0xff0000u) << 8) | \
+ ((((npy_uint32)x)&0xff000000u) >> 8))
+
+#define _NPY_SWAP8(x) (((((npy_uint64)x)&0xffu) << 56) | \
+ ((((npy_uint64)x)&0xff00u) << 40) | \
+ ((((npy_uint64)x)&0xff0000u) << 24) | \
+ ((((npy_uint64)x)&0xff000000u) << 8) | \
+ ((((npy_uint64)x)&0xff00000000u) >> 8) | \
+ ((((npy_uint64)x)&0xff0000000000u) >> 24) | \
+ ((((npy_uint64)x)&0xff000000000000u) >> 40) | \
+ (((npy_uint64)x) >> 56))
+
+#define _NPY_SWAP_PAIR8(x) (((((npy_uint64)x)&0xffu) << 24) | \
+ ((((npy_uint64)x)&0xff00u) << 8) | \
+ ((((npy_uint64)x)&0xff0000u) >> 8) | \
+ ((((npy_uint64)x)&0xff000000u) >> 24) | \
+ ((((npy_uint64)x)&0xff00000000u) << 24) | \
+ ((((npy_uint64)x)&0xff0000000000u) << 8) | \
+ ((((npy_uint64)x)&0xff000000000000u) >> 8) | \
+ ((((npy_uint64)x)&0xff00000000000000u) >> 24))
+
+#define _NPY_SWAP_INPLACE2(x) { \
+ char a = (x)[0]; (x)[0] = (x)[1]; (x)[1] = a; \
+ }
+
+#define _NPY_SWAP_INPLACE4(x) { \
+ char a = (x)[0]; (x)[0] = (x)[3]; (x)[3] = a; \
+ a = (x)[1]; (x)[1] = (x)[2]; (x)[2] = a; \
+ }
+
+#define _NPY_SWAP_INPLACE8(x) { \
+ char a = (x)[0]; (x)[0] = (x)[7]; (x)[7] = a; \
+ a = (x)[1]; (x)[1] = (x)[6]; (x)[6] = a; \
+ a = (x)[2]; (x)[2] = (x)[5]; (x)[5] = a; \
+ a = (x)[3]; (x)[3] = (x)[4]; (x)[4] = a; \
+ }
+
+#define _NPY_SWAP_INPLACE16(x) { \
+ char a = (x)[0]; (x)[0] = (x)[15]; (x)[15] = a; \
+ a = (x)[1]; (x)[1] = (x)[14]; (x)[14] = a; \
+ a = (x)[2]; (x)[2] = (x)[13]; (x)[13] = a; \
+ a = (x)[3]; (x)[3] = (x)[12]; (x)[12] = a; \
+ a = (x)[4]; (x)[4] = (x)[11]; (x)[11] = a; \
+ a = (x)[5]; (x)[5] = (x)[10]; (x)[10] = a; \
+ a = (x)[6]; (x)[6] = (x)[9]; (x)[9] = a; \
+ a = (x)[7]; (x)[7] = (x)[8]; (x)[8] = a; \
+ }
+
+/************* STRIDED COPYING/SWAPPING SPECIALIZED FUNCTIONS *************/
+
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ * #elsize_half = 0, 1, 2, 4, 8#
+ * #type = npy_uint8, npy_uint16, npy_uint32, npy_uint64, npy_uint128#
+ */
+/**begin repeat1
+ * #oper = strided_to_strided, strided_to_contig,
+ * contig_to_strided, contig_to_contig#
+ * #src_contig = 0, 0, 1 ,1#
+ * #dst_contig = 0, 1, 0 ,1#
+ */
+/**begin repeat2
+ * #swap = _NPY_NOP, _NPY_NOP, _NPY_SWAP_INPLACE, _NPY_SWAP,
+ * _NPY_SWAP_INPLACE, _NPY_SWAP_PAIR#
+ * #prefix = , _aligned, _swap, _aligned_swap, _swap_pair, _aligned_swap_pair#
+ * #is_aligned = 0, 1, 0, 1, 0, 1#
+ * #minelsize = 1, 1, 2, 2, 4, 4#
+ * #is_swap = 0, 0, 1, 1, 2, 2#
+ */
+
+#if (@elsize@ >= @minelsize@) && \
+ (@elsize@ > 1 || @is_aligned@) && \
+ (!NPY_USE_UNALIGNED_ACCESS || @is_aligned@)
+
+
+#if @is_swap@ || @src_contig@ == 0 || @dst_contig@ == 0
+static void
+@prefix@_@oper@_size@elsize@(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+ void *NPY_UNUSED(data))
+{
+ /*printf("fn @prefix@_@oper@_size@elsize@\n");*/
+ while (N > 0) {
+#if @is_aligned@
+
+ /* aligned copy and swap */
+# if @elsize@ != 16
+ (*((@type@ *)dst)) = @swap@@elsize@(*((@type@ *)src));
+# else
+# if @is_swap@ == 0
+ (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+ (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+# elif @is_swap@ == 1
+ (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+ (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+# elif @is_swap@ == 2
+ (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+ (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+# endif
+# endif
+
+#else
+
+ /* unaligned copy and swap */
+ memcpy(dst, src, @elsize@);
+# if @is_swap@ == 1
+ @swap@@elsize@(dst);
+# elif @is_swap@ == 2
+ @swap@@elsize_half@(dst);
+ @swap@@elsize_half@(dst + @elsize_half@);
+# endif
+
+#endif
+
+#if @dst_contig@
+ dst += @elsize@;
+#else
+ dst += dst_stride;
+#endif
+
+#if @src_contig@
+ src += @elsize@;
+#else
+ src += src_stride;
+#endif
+
+ --N;
+ }
+}
+#endif
+
+
+/* specialized copy and swap for source stride 0 */
+#if (@src_contig@ == 0) && @is_aligned@
+static void
+@prefix@_@oper@_size@elsize@_srcstride0(char *dst,
+ npy_intp dst_stride,
+ char *src, npy_intp NPY_UNUSED(src_stride),
+ npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+ void *NPY_UNUSED(data))
+{
+#if @elsize@ != 16
+ @type@ temp = @swap@@elsize@(*((@type@ *)src));
+#else
+ npy_uint64 temp0, temp1;
+# if @is_swap@ == 0
+ temp0 = (*((npy_uint64 *)src));
+ temp1 = (*((npy_uint64 *)src + 1));
+# elif @is_swap@ == 1
+ temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+ temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+# elif @is_swap@ == 2
+ temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+ temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+# endif
+#endif
+ while (N > 0) {
+#if @elsize@ != 16
+ *((@type@ *)dst) = temp;
+#else
+ *((npy_uint64 *)dst) = temp0;
+ *((npy_uint64 *)dst + 1) = temp1;
+#endif
+#if @dst_contig@
+ dst += @elsize@;
+#else
+ dst += dst_stride;
+#endif
+ --N;
+ }
+}
+#endif
+
+#endif/* @elsize@ >= @minelsize@ */
+
+/**end repeat2**/
+/**end repeat1**/
+/**end repeat**/
+
+static void
+_strided_to_strided(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ void *NPY_UNUSED(data))
+{
+ while (N > 0) {
+ memcpy(dst, src, src_itemsize);
+ dst += dst_stride;
+ src += src_stride;
+ --N;
+ }
+}
+
+static void
+_swap_strided_to_strided(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ void *NPY_UNUSED(data))
+{
+ char *a, *b, c;
+
+ while (N > 0) {
+ memcpy(dst, src, src_itemsize);
+ /* general in-place swap */
+ a = dst;
+ b = dst + src_itemsize - 1;
+ while (a < b) {
+ c = *a;
+ *a = *b;
+ *b = c;
+ ++a; --b;
+ }
+ dst += dst_stride;
+ src += src_stride;
+ --N;
+ }
+}
+
+static void
+_swap_pair_strided_to_strided(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ void *NPY_UNUSED(data))
+{
+ char *a, *b, c;
+ npy_intp itemsize_half = src_itemsize / 2;
+
+ while (N > 0) {
+ memcpy(dst, src, src_itemsize);
+ /* general in-place swap */
+ a = dst;
+ b = dst + itemsize_half - 1;
+ while (a < b) {
+ c = *a;
+ *a = *b;
+ *b = c;
+ ++a; --b;
+ }
+ /* general in-place swap */
+ a = dst + itemsize_half;
+ b = dst + 2*itemsize_half - 1;
+ while (a < b) {
+ c = *a;
+ *a = *b;
+ *b = c;
+ ++a; --b;
+ }
+ dst += dst_stride;
+ src += src_stride;
+ --N;
+ }
+}
+
+static void
+_strided_to_contig(char *dst, npy_intp NPY_UNUSED(dst_stride),
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ void *NPY_UNUSED(data))
+{
+ while (N > 0) {
+ memcpy(dst, src, src_itemsize);
+ dst += src_itemsize;
+ src += src_stride;
+ --N;
+ }
+}
+
+static void
+_contig_to_strided(char *dst, npy_intp dst_stride,
+ char *src, npy_intp NPY_UNUSED(src_stride),
+ npy_intp N, npy_intp src_itemsize,
+ void *NPY_UNUSED(data))
+{
+ while (N > 0) {
+ memcpy(dst, src, src_itemsize);
+ dst += dst_stride;
+ src += src_itemsize;
+ --N;
+ }
+}
+
+static void
+_contig_to_contig(char *dst, npy_intp NPY_UNUSED(dst_stride),
+ char *src, npy_intp NPY_UNUSED(src_stride),
+ npy_intp N, npy_intp src_itemsize,
+ void *NPY_UNUSED(data))
+{
+ memcpy(dst, src, src_itemsize*N);
+}
+
+
+NPY_NO_EXPORT PyArray_StridedTransferFn *
+PyArray_GetStridedCopyFn(npy_intp aligned, npy_intp src_stride,
+ npy_intp dst_stride, npy_intp itemsize)
+{
+/*
+ * Skip the "unaligned" versions on CPUs which support unaligned
+ * memory accesses.
+ */
+#if !NPY_USE_UNALIGNED_ACCESS
+ if (aligned) {
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+ /* contiguous dst */
+ if (itemsize != 0 && dst_stride == itemsize) {
+ /* constant src */
+ if (src_stride == 0) {
+ switch (itemsize) {
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return
+ &_aligned_strided_to_contig_size@elsize@_srcstride0;
+/**end repeat**/
+ }
+ }
+ /* contiguous src */
+ else if (src_stride == itemsize) {
+ return &_contig_to_contig;
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_aligned_strided_to_contig_size@elsize@;
+/**end repeat**/
+ }
+ }
+
+ return &_strided_to_contig;
+ }
+ /* general dst */
+ else {
+ /* constant src */
+ if (src_stride == 0) {
+ switch (itemsize) {
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return
+ &_aligned_strided_to_strided_size@elsize@_srcstride0;
+/**end repeat**/
+ }
+ }
+ /* contiguous src */
+ else if (src_stride == itemsize) {
+ switch (itemsize) {
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_aligned_contig_to_strided_size@elsize@;
+/**end repeat**/
+ }
+
+ return &_contig_to_strided;
+ }
+ else {
+ switch (itemsize) {
+/**begin repeat
+ * #elsize = 1, 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_aligned_strided_to_strided_size@elsize@;
+/**end repeat**/
+ }
+ }
+ }
+
+#if !NPY_USE_UNALIGNED_ACCESS
+ }
+ else {
+ /* contiguous dst */
+ if (itemsize != 0 && dst_stride == itemsize) {
+ /* contiguous src */
+ if (itemsize != 0 && src_stride == itemsize) {
+ return &_contig_to_contig;
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+ case 1:
+ return &_aligned_strided_to_contig_size1;
+/**begin repeat
+ * #elsize = 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_strided_to_contig_size@elsize@;
+/**end repeat**/
+ }
+ }
+
+ return &_strided_to_contig;
+ }
+ /* general dst */
+ else {
+ /* contiguous src */
+ if (itemsize != 0 && src_stride == itemsize) {
+ switch (itemsize) {
+ case 1:
+ return &_aligned_contig_to_strided_size1;
+/**begin repeat
+ * #elsize = 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_contig_to_strided_size@elsize@;
+/**end repeat**/
+ }
+
+ return &_contig_to_strided;
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+ case 1:
+ return &_aligned_strided_to_strided_size1;
+/**begin repeat
+ * #elsize = 2, 4, 8, 16#
+ */
+ case @elsize@:
+ return &_strided_to_strided_size@elsize@;
+/**end repeat**/
+ }
+ }
+ }
+ }
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+ return &_strided_to_strided;
+}
+
+/*
+ * PyArray_GetStridedCopySwapFn and PyArray_GetStridedCopySwapPairFn are
+ * nearly identical, so can do a repeat for them.
+ */
+/**begin repeat
+ * #function = PyArray_GetStridedCopySwapFn, PyArray_GetStridedCopySwapPairFn#
+ * #tag = , _pair#
+ * #not_pair = 1, 0#
+ */
+
+NPY_NO_EXPORT PyArray_StridedTransferFn *
+@function@(npy_intp aligned, npy_intp src_stride,
+ npy_intp dst_stride, npy_intp itemsize)
+{
+/*
+ * Skip the "unaligned" versions on CPUs which support unaligned
+ * memory accesses.
+ */
+#if !NPY_USE_UNALIGNED_ACCESS
+ if (aligned) {
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+ /* contiguous dst */
+ if (itemsize != 0 && dst_stride == itemsize) {
+ /* constant src */
+ if (src_stride == 0) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return
+ &_aligned_swap@tag@_strided_to_contig_size@elsize@_srcstride0;
+#endif
+/**end repeat1**/
+ }
+ }
+ /* contiguous src */
+ else if (src_stride == itemsize) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_aligned_swap@tag@_contig_to_contig_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_aligned_swap@tag@_strided_to_contig_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+ }
+ /* general dst */
+ else {
+ /* constant src */
+ if (src_stride == 0) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return
+ &_aligned_swap@tag@_strided_to_strided_size@elsize@_srcstride0;
+#endif
+/**end repeat1**/
+ }
+ }
+ /* contiguous src */
+ else if (src_stride == itemsize) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_aligned_swap@tag@_contig_to_strided_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+
+ return &_contig_to_strided;
+ }
+ else {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_aligned_swap@tag@_strided_to_strided_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+ }
+
+#if !NPY_USE_UNALIGNED_ACCESS
+ }
+ else {
+ /* contiguous dst */
+ if (itemsize != 0 && dst_stride == itemsize) {
+ /* contiguous src */
+ if (itemsize != 0 && src_stride == itemsize) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_swap@tag@_contig_to_contig_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_swap@tag@_strided_to_contig_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+
+ return &_strided_to_contig;
+ }
+ /* general dst */
+ else {
+ /* contiguous src */
+ if (itemsize != 0 && src_stride == itemsize) {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_swap@tag@_contig_to_strided_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+
+ return &_contig_to_strided;
+ }
+ /* general src */
+ else {
+ switch (itemsize) {
+/**begin repeat1
+ * #elsize = 2, 4, 8, 16#
+ */
+#if @not_pair@ || @elsize@ > 2
+ case @elsize@:
+ return &_swap@tag@_strided_to_strided_size@elsize@;
+#endif
+/**end repeat1**/
+ }
+ }
+ }
+ }
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+ return &_swap@tag@_strided_to_strided;
+}
+
+/**end repeat**/
+
+/************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/
+
+/**begin repeat
+ *
+ * #NAME1 = BOOL,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #name1 = bool,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * byte, short, int, long, longlong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #rname1 = bool,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * byte, short, int, long, longlong,
+ * half, float, double, longdouble,
+ * float, double, longdouble#
+ * #is_bool1 = 1, 0*17#
+ * #is_half1 = 0*11, 1, 0*6#
+ * #is_float1 = 0*12, 1, 0, 0, 1, 0, 0#
+ * #is_double1 = 0*13, 1, 0, 0, 1, 0#
+ * #is_complex1 = 0*15, 1*3#
+ */
+
+/**begin repeat1
+ *
+ * #NAME2 = BOOL,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #name2 = bool,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * byte, short, int, long, longlong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #rname2 = bool,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * byte, short, int, long, longlong,
+ * half, float, double, longdouble,
+ * float, double, longdouble#
+ * #is_bool2 = 1, 0*17#
+ * #is_half2 = 0*11, 1, 0*6#
+ * #is_float2 = 0*12, 1, 0, 0, 1, 0, 0#
+ * #is_double2 = 0*13, 1, 0, 0, 1, 0#
+ * #is_complex2 = 0*15, 1*3#
+ */
+
+/**begin repeat2
+ * #prefix = _aligned,,_aligned_contig,_contig#
+ * #aligned = 1,0,1,0#
+ * #contig = 0,0,1,1#
+ */
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !@aligned@)
+
+/* For half types, don't use actual double/float types in conversion */
+#if @is_half1@ || @is_half2@
+
+# if @is_float1@
+# define _TYPE1 npy_uint32
+# elif @is_double1@
+# define _TYPE1 npy_uint64
+# else
+# define _TYPE1 npy_@rname1@
+# endif
+
+# if @is_float2@
+# define _TYPE2 npy_uint32
+# elif @is_double2@
+# define _TYPE2 npy_uint64
+# else
+# define _TYPE2 npy_@rname2@
+# endif
+
+#else
+
+#define _TYPE1 npy_@rname1@
+#define _TYPE2 npy_@rname2@
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if @is_half1@
+
+# if @is_float2@
+# define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+# elif @is_double2@
+# define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+# elif @is_half2@
+# define _CONVERT_FN(x) (x)
+# elif @is_bool2@
+# define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+# else
+# define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+# endif
+
+#elif @is_half2@
+
+# if @is_float1@
+# define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+# elif @is_double1@
+# define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+# else
+# define _CONVERT_FN(x) npy_float_to_half((float)x)
+# endif
+
+#else
+
+# if @is_bool2@
+# define _CONVERT_FN(x) ((npy_bool)(x != 0))
+# else
+# define _CONVERT_FN(x) ((_TYPE2)x)
+# endif
+
+#endif
+
+static void
+@prefix@_cast_@name1@_to_@name2@(
+ char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
+ void *NPY_UNUSED(data))
+{
+#if @is_complex1@
+ _TYPE1 src_value[2];
+#elif !@aligned@
+ _TYPE1 src_value;
+#endif
+#if @is_complex2@
+ _TYPE2 dst_value[2];
+#elif !@aligned@
+ _TYPE2 dst_value;
+#endif
+
+ /*printf("@prefix@_cast_@name1@_to_@name2@\n");*/
+
+ while (N--) {
+#if @aligned@
+# if @is_complex1@
+ src_value[0] = ((_TYPE1 *)src)[0];
+ src_value[1] = ((_TYPE1 *)src)[1];
+# elif !@aligned@
+ src_value = *((_TYPE1 *)src);
+# endif
+#else
+ memcpy(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if @is_complex1@
+# if @is_complex2@
+ dst_value[0] = _CONVERT_FN(src_value[0]);
+ dst_value[1] = _CONVERT_FN(src_value[1]);
+# elif !@aligned@
+ dst_value = _CONVERT_FN(src_value[0]);
+# else
+ *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+# endif
+#else
+# if @is_complex2@
+# if !@aligned@
+ dst_value[0] = _CONVERT_FN(src_value);
+# else
+ dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+# endif
+ dst_value[1] = 0;
+# elif !@aligned@
+ dst_value = _CONVERT_FN(src_value);
+# else
+ *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+# endif
+#endif
+
+#if @aligned@
+# if @is_complex2@
+ ((_TYPE2 *)dst)[0] = dst_value[0];
+ ((_TYPE2 *)dst)[1] = dst_value[1];
+# elif !@aligned@
+ *((_TYPE2 *)dst) = dst_value;
+# endif
+#else
+ memcpy(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if @contig@
+ dst += sizeof(npy_@name2@);
+ src += sizeof(npy_@name1@);
+#else
+ dst += dst_stride;
+ src += src_stride;
+#endif
+ }
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+/**end repeat2**/
+
+/**end repeat1**/
+
+/**end repeat**/
+
+NPY_NO_EXPORT PyArray_StridedTransferFn *
+PyArray_GetStridedNumericCastFn(npy_intp aligned, npy_intp src_stride,
+ npy_intp dst_stride,
+ int src_type_num, int dst_type_num)
+{
+ switch (src_type_num) {
+/**begin repeat
+ *
+ * #NAME1 = BOOL,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #name1 = bool,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * byte, short, int, long, longlong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ */
+
+ case NPY_@NAME1@:
+ //printf("test fn %d - second %d\n", NPY_@NAME1@, dst_type_num);
+ switch (dst_type_num) {
+/**begin repeat1
+ *
+ * #NAME2 = BOOL,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #name2 = bool,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * byte, short, int, long, longlong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ */
+
+ case NPY_@NAME2@:
+ //printf("ret fn %d %d\n", NPY_@NAME1@, NPY_@NAME2@);
+# if NPY_USE_UNALIGNED_ACCESS
+ if (src_stride == sizeof(npy_@name1@) &&
+ dst_stride == sizeof(npy_@name2@)) {
+ return &_aligned_contig_cast_@name1@_to_@name2@;
+ }
+ else {
+ return &_aligned_cast_@name1@_to_@name2@;
+ }
+# else
+ if (src_stride == sizeof(npy_@name1@) &&
+ dst_stride == sizeof(npy_@name2@)) {
+ return aligned ?
+ &_aligned_contig_cast_@name1@_to_@name2@ :
+ &_contig_cast_@name1@_to_@name2@;
+ }
+ else {
+ return aligned ? &_aligned_cast_@name1@_to_@name2@ :
+ &_cast_@name1@_to_@name2@;
+ }
+# endif
+
+/**end repeat1**/
+ }
+ //printf("switched test fn %d - second %d\n", NPY_@NAME1@, dst_type_num);
+
+/**end repeat**/
+ }
+
+ return NULL;
+}
+
+
+/************** STRIDED TRANSFER FUNCTION MEMORY MANAGEMENT **************/
+
+typedef void (*_npy_stridedtransfer_dealloc)(void *);
+NPY_NO_EXPORT void
+PyArray_FreeStridedTransferData(void *transferdata)
+{
+ if (transferdata != NULL) {
+ _npy_stridedtransfer_dealloc dealloc =
+ *((_npy_stridedtransfer_dealloc *)transferdata);
+ dealloc(transferdata);
+ }
+}
+
+typedef void *(*_npy_stridedtransfer_copy)(void *);
+NPY_NO_EXPORT void *
+PyArray_CopyStridedTransferData(void *transferdata)
+{
+ if (transferdata != NULL) {
+ _npy_stridedtransfer_copy copy =
+ *((_npy_stridedtransfer_copy *)transferdata + 1);
+ return copy(transferdata);
+ }
+
+ return NULL;
+}
+
+/****************** PRIMITIVE FLAT TO/FROM NDIM FUNCTIONS ******************/
+
+NPY_NO_EXPORT npy_intp
+PyArray_TransferNDimToStrided(npy_intp ndim,
+ char *dst, npy_intp dst_stride,
+ char *src, npy_intp *src_strides, npy_intp src_strides_inc,
+ npy_intp *coords, npy_intp coords_inc,
+ npy_intp *shape, npy_intp shape_inc,
+ npy_intp count, npy_intp src_itemsize,
+ PyArray_StridedTransferFn *stransfer,
+ void *data)
+{
+ npy_intp i, M, N, coord0, shape0, src_stride0, coord1, shape1, src_stride1;
+
+ /* Finish off dimension 0 */
+ coord0 = coords[0];
+ shape0 = shape[0];
+ src_stride0 = src_strides[0];
+ N = shape0 - coord0;
+ if (N >= count) {
+ stransfer(dst, dst_stride, src, src_stride0, count, src_itemsize, data);
+ return 0;
+ }
+ stransfer(dst, dst_stride, src, src_stride0, N, src_itemsize, data);
+ count -= N;
+
+ /* If it's 1-dimensional, there's no more to copy */
+ if (ndim == 1) {
+ return count;
+ }
+
+ /* Adjust the src and dst pointers */
+ coord1 = (coords + coords_inc)[0];
+ shape1 = (shape + shape_inc)[0];
+ src_stride1 = (src_strides + src_strides_inc)[0];
+ src = src - coord0*src_stride0 + src_stride1;
+ dst += N*dst_stride;
+
+ /* Finish off dimension 1 */
+ M = (shape1 - coord1 - 1);
+ N = shape0*M;
+ for (i = 0; i < M; ++i) {
+ if (shape0 >= count) {
+ stransfer(dst, dst_stride, src, src_stride0,
+ count, src_itemsize, data);
+ return 0;
+ }
+ else {
+ stransfer(dst, dst_stride, src, src_stride0,
+ shape0, src_itemsize, data);
+ }
+ count -= shape0;
+ src += src_stride1;
+ dst += shape0*dst_stride;
+ }
+
+ /* If it's 2-dimensional, there's no more to copy */
+ if (ndim == 2) {
+ return count;
+ }
+
+ /* General-case loop for everything else */
+ else {
+ /* Iteration structure for dimensions 2 and up */
+ struct {
+ npy_intp coord, shape, src_stride;
+ } it[NPY_MAXDIMS];
+
+ /* Copy the coordinates and shape */
+ coords += 2*coords_inc;
+ shape += 2*shape_inc;
+ src_strides += 2*src_strides_inc;
+ for (i = 0; i < ndim-2; ++i) {
+ it[i].coord = coords[0];
+ it[i].shape = shape[0];
+ it[i].src_stride = src_strides[0];
+ coords += coords_inc;
+ shape += shape_inc;
+ src_strides += src_strides_inc;
+ }
+
+ for (;;) {
+ /* Adjust the src pointer from the dimension 0 and 1 loop */
+ src = src - shape1*src_stride1;
+
+ /* Increment to the next coordinate */
+ for (i = 0; i < ndim-2; ++i) {
+ src += it[i].src_stride;
+ if (++it[i].coord >= it[i].shape) {
+ it[i].coord = 0;
+ src -= it[i].src_stride*it[i].shape;
+ }
+ else {
+ break;
+ }
+ }
+ /* If the last dimension rolled over, we're done */
+ if (i == ndim-2) {
+ return count;
+ }
+
+ /* A loop for dimensions 0 and 1 */
+ for (i = 0; i < shape1; ++i) {
+ if (shape0 >= count) {
+ stransfer(dst, dst_stride, src, src_stride0,
+ count, src_itemsize, data);
+ return 0;
+ }
+ else {
+ stransfer(dst, dst_stride, src, src_stride0,
+ shape0, src_itemsize, data);
+ }
+ count -= shape0;
+ src += src_stride1;
+ dst += shape0*dst_stride;
+ }
+ }
+ }
+}
+
+NPY_NO_EXPORT npy_intp
+PyArray_TransferStridedToNDim(npy_intp ndim,
+ char *dst, npy_intp *dst_strides, npy_intp dst_strides_inc,
+ char *src, npy_intp src_stride,
+ npy_intp *coords, npy_intp coords_inc,
+ npy_intp *shape, npy_intp shape_inc,
+ npy_intp count, npy_intp src_itemsize,
+ PyArray_StridedTransferFn *stransfer,
+ void *data)
+{
+ npy_intp i, M, N, coord0, shape0, dst_stride0, coord1, shape1, dst_stride1;
+
+ /* Finish off dimension 0 */
+ coord0 = coords[0];
+ shape0 = shape[0];
+ dst_stride0 = dst_strides[0];
+ N = shape0 - coord0;
+ if (N >= count) {
+ stransfer(dst, dst_stride0, src, src_stride, count, src_itemsize, data);
+ return 0;
+ }
+ stransfer(dst, dst_stride0, src, src_stride, N, src_itemsize, data);
+ count -= N;
+
+ /* If it's 1-dimensional, there's no more to copy */
+ if (ndim == 1) {
+ return count;
+ }
+
+ /* Adjust the src and dst pointers */
+ coord1 = (coords + coords_inc)[0];
+ shape1 = (shape + shape_inc)[0];
+ dst_stride1 = (dst_strides + dst_strides_inc)[0];
+ dst = dst - coord0*dst_stride0 + dst_stride1;
+ src += N*src_stride;
+
+ /* Finish off dimension 1 */
+ M = (shape1 - coord1 - 1);
+ N = shape0*M;
+ for (i = 0; i < M; ++i) {
+ if (shape0 >= count) {
+ stransfer(dst, dst_stride0, src, src_stride,
+ count, src_itemsize, data);
+ return 0;
+ }
+ else {
+ stransfer(dst, dst_stride0, src, src_stride,
+ shape0, src_itemsize, data);
+ }
+ count -= shape0;
+ dst += dst_stride1;
+ src += shape0*src_stride;
+ }
+
+ /* If it's 2-dimensional, there's no more to copy */
+ if (ndim == 2) {
+ return count;
+ }
+
+ /* General-case loop for everything else */
+ else {
+ /* Iteration structure for dimensions 2 and up */
+ struct {
+ npy_intp coord, shape, dst_stride;
+ } it[NPY_MAXDIMS];
+
+ /* Copy the coordinates and shape */
+ coords += 2*coords_inc;
+ shape += 2*shape_inc;
+ dst_strides += 2*dst_strides_inc;
+ for (i = 0; i < ndim-2; ++i) {
+ it[i].coord = coords[0];
+ it[i].shape = shape[0];
+ it[i].dst_stride = dst_strides[0];
+ coords += coords_inc;
+ shape += shape_inc;
+ dst_strides += dst_strides_inc;
+ }
+
+ for (;;) {
+ /* Adjust the dst pointer from the dimension 0 and 1 loop */
+ dst = dst - shape1*dst_stride1;
+
+ /* Increment to the next coordinate */
+ for (i = 0; i < ndim-2; ++i) {
+ dst += it[i].dst_stride;
+ if (++it[i].coord >= it[i].shape) {
+ it[i].coord = 0;
+ dst -= it[i].dst_stride*it[i].shape;
+ }
+ else {
+ break;
+ }
+ }
+ /* If the last dimension rolled over, we're done */
+ if (i == ndim-2) {
+ return count;
+ }
+
+ /* A loop for dimensions 0 and 1 */
+ for (i = 0; i < shape1; ++i) {
+ if (shape0 >= count) {
+ stransfer(dst, dst_stride0, src, src_stride,
+ count, src_itemsize, data);
+ return 0;
+ }
+ else {
+ stransfer(dst, dst_stride0, src, src_stride,
+ shape0, src_itemsize, data);
+ }
+ count -= shape0;
+ dst += dst_stride1;
+ src += shape0*src_stride;
+ }
+ }
+ }
+}
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.h b/numpy/core/src/multiarray/lowlevel_strided_loops.h
new file mode 100644
index 000000000..5fc42bc40
--- /dev/null
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.h
@@ -0,0 +1,397 @@
+#ifndef __LOWLEVEL_STRIDED_LOOPS_H
+#define __LOWLEVEL_STRIDED_LOOPS_H
+
+/*
+ * NOTE: This API should remain private for the time being, to allow
+ * for further refinement. I think the 'aligned' mechanism
+ * needs changing, for example.
+ */
+
+/*
+ * This function pointer is for functions that transfer an arbitrarily strided
+ * input to an arbitrarily strided output. It may be a fully general
+ * function, or a specialized function when the strides or item size
+ * have special values.
+ *
+ * Examples of transfer functions are a straight copy, a byte-swap,
+ * and a casting operation.
+ *
+ * The 'transferdata' parameter is slightly special, and must always contain
+ * a pointer to deallocation and copying routines at its beginning. The function
+ * PyArray_FreeStridedTransferData should be used to deallocate such
+ * pointers, and calls the first function pointer, while the function
+ * PyArray_CopyStridedTransferData should be used to copy it.
+ *
+ */
+typedef void (PyArray_StridedTransferFn)(char *dst, npy_intp dst_stride,
+ char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize,
+ void *transferdata);
+
+/*
+ * Deallocates a PyArray_StridedTransferFn data object. See
+ * the comment with the function typedef for more details.
+ */
+NPY_NO_EXPORT void
+PyArray_FreeStridedTransferData(void *transferdata);
+
+/*
+ * Copies a PyArray_StridedTransferFn data object. See
+ * the comment with the function typedef for more details.
+ */
+NPY_NO_EXPORT void *
+PyArray_CopyStridedTransferData(void *transferdata);
+
+/*
+ * Gives back a function pointer to a specialized function for copying
+ * strided memory. Returns NULL if there is a problem with the inputs.
+ *
+ * aligned:
+ * Should be 1 if the src and dst pointers are always aligned,
+ * 0 otherwise.
+ * src_stride:
+ * Should be the src stride if it will always be the same,
+ * NPY_MAX_INTP otherwise.
+ * dst_stride:
+ * Should be the dst stride if it will always be the same,
+ * NPY_MAX_INTP otherwise.
+ * itemsize:
+ * Should be the item size if it will always be the same, 0 otherwise.
+ *
+ */
+NPY_NO_EXPORT PyArray_StridedTransferFn *
+PyArray_GetStridedCopyFn(npy_intp aligned, npy_intp src_stride,
+ npy_intp dst_stride, npy_intp itemsize);
+
+/*
+ * Gives back a function pointer to a specialized function for copying
+ * and swapping strided memory. This assumes each element is a single
+ * value to be swapped.
+ *
+ * For information on the 'aligned', 'src_stride' and 'dst_stride' parameters
+ * see above.
+ *
+ * Parameters are as for PyArray_GetStridedCopyFn.
+ */
+NPY_NO_EXPORT PyArray_StridedTransferFn *
+PyArray_GetStridedCopySwapFn(npy_intp aligned, npy_intp src_stride,
+ npy_intp dst_stride, npy_intp itemsize);
+
+/*
+ * Gives back a function pointer to a specialized function for copying
+ * and swapping strided memory. This assumes each element is a pair
+ * of values, each of which needs to be swapped.
+ *
+ * For information on the 'aligned', 'src_stride' and 'dst_stride' parameters
+ * see above.
+ *
+ * Parameters are as for PyArray_GetStridedCopyFn.
+ */
+NPY_NO_EXPORT PyArray_StridedTransferFn *
+PyArray_GetStridedCopySwapPairFn(npy_intp aligned, npy_intp src_stride,
+ npy_intp dst_stride, npy_intp itemsize);
+
+/*
+ * Gives back a transfer function and transfer data pair which copies
+ * the data from source to dest, truncating it if the data doesn't
+ * fit, and padding with zero bytes if there's too much space.
+ *
+ * For information on the 'aligned', 'src_stride' and 'dst_stride' parameters
+ * see above.
+ *
+ * Returns NPY_SUCCEED or NPY_FAIL
+ */
+NPY_NO_EXPORT int
+PyArray_GetStridedZeroPadCopyFn(int aligned,
+ npy_intp src_stride, npy_intp dst_stride,
+ npy_intp src_itemsize, npy_intp dst_itemsize,
+ PyArray_StridedTransferFn **outstransfer,
+ void **outtransferdata);
+
+/*
+ * For casts between built-in numeric types,
+ * this produces a function pointer for casting from src_type_num
+ * to dst_type_num. If a conversion is unsupported, returns NULL
+ * without setting a Python exception.
+ */
+NPY_NO_EXPORT PyArray_StridedTransferFn *
+PyArray_GetStridedNumericCastFn(npy_intp aligned, npy_intp src_stride,
+ npy_intp dst_stride,
+ int src_type_num, int dst_type_num);
+
+/*
+ * If it's possible, gives back a transfer function which casts and/or
+ * byte swaps data with the dtype 'src_dtype' into data with the dtype
+ * 'dst_dtype'. If the outtransferdata is populated with a non-NULL value,
+ * it must be deallocated with the ``PyArray_FreeStridedTransferData``
+ * function when the transfer function is no longer required.
+ *
+ * aligned:
+ * Should be 1 if the src and dst pointers are always aligned,
+ * 0 otherwise.
+ * src_stride:
+ * Should be the src stride if it will always be the same,
+ * NPY_MAX_INTP otherwise.
+ * dst_stride:
+ * Should be the dst stride if it will always be the same,
+ * NPY_MAX_INTP otherwise.
+ * src_dtype:
+ * The data type of source data. If this is NULL, a transfer
+ * function which sets the destination to zeros is produced.
+ * dst_dtype:
+ * The data type of destination data. If this is NULL and
+ * move_references is 1, a transfer function which decrements
+ * source data references is produced.
+ * move_references:
+ * If 0, the destination data gets new reference ownership.
+ * If 1, the references from the source data are moved to
+ * the destination data.
+ * out_stransfer:
+ * The resulting transfer function is placed here.
+ * out_transferdata:
+ * The auxiliary data for the transfer function is placed here.
+ * When finished with the transfer function, the caller must call
+ * ``PyArray_FreeStridedTransferData`` on this data.
+ * out_needs_api:
+ * If this is non-NULL, and the transfer function produced needs
+ * to call into the (Python) API, this gets set to 1. This
+ * remains untouched if no API access is required.
+ *
+ * WARNING: If you set move_references to 1, it is best that src_stride is
+ * never zero when calling the transfer function. Otherwise, the
+ * first destination reference will get the value and all the rest
+ * will get NULL.
+ *
+ * Returns NPY_SUCCEED or NPY_FAIL.
+ */
+NPY_NO_EXPORT int
+PyArray_GetDTypeTransferFunction(int aligned,
+ npy_intp src_stride, npy_intp dst_stride,
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ int move_references,
+ PyArray_StridedTransferFn **out_stransfer,
+ void **out_transferdata,
+ int *out_needs_api);
+
+/*
+ * These two functions copy or convert the data of an n-dimensional array
+ * to/from a 1-dimensional strided buffer. These functions will only call
+ * 'stransfer' with the provided dst_stride/src_stride and
+ * dst_strides[0]/src_strides[0], so the caller can use those values to
+ * specialize the function.
+ *
+ * The return value is the number of elements it couldn't copy. A return value
+ * of 0 means all elements were copied, a larger value means the end of
+ * the n-dimensional array was reached before 'count' elements were copied.
+ *
+ * ndim:
+ * The number of dimensions of the n-dimensional array.
+ * dst/src:
+ * The destination or src starting pointer.
+ * dst_stride/src_stride:
+ * The stride of the 1-dimensional strided buffer
+ * dst_strides/src_strides:
+ * The strides of the n-dimensional array.
+ * dst_strides_inc/src_strides_inc:
+ * How much to add to the ..._strides pointer to get to the next stride.
+ * coords:
+ * The starting coordinates in the n-dimensional array.
+ * coords_inc:
+ * How much to add to the coords pointer to get to the next coordinate.
+ * shape:
+ * The shape of the n-dimensional array.
+ * shape_inc:
+ * How much to add to the shape pointer to get to the next shape entry.
+ * count:
+ * How many elements to transfer
+ * src_itemsize:
+ *        How big each element is. If transferring between elements of different
+ * sizes, for example a casting operation, the 'stransfer' function
+ * should be specialized for that, in which case 'stransfer' will use
+ * this parameter as the source item size.
+ * stransfer:
+ * The strided transfer function.
+ * transferdata:
+ * An auxiliary data pointer passed to the strided transfer function.
+ * If a non-NULL value is returned, it must be deallocated with the
+ * function PyArray_FreeStridedTransferData.
+ */
+NPY_NO_EXPORT npy_intp
+PyArray_TransferNDimToStrided(npy_intp ndim,
+ char *dst, npy_intp dst_stride,
+ char *src, npy_intp *src_strides, npy_intp src_strides_inc,
+ npy_intp *coords, npy_intp coords_inc,
+ npy_intp *shape, npy_intp shape_inc,
+ npy_intp count, npy_intp src_itemsize,
+ PyArray_StridedTransferFn *stransfer,
+ void *transferdata);
+
+NPY_NO_EXPORT npy_intp
+PyArray_TransferStridedToNDim(npy_intp ndim,
+ char *dst, npy_intp *dst_strides, npy_intp dst_strides_inc,
+ char *src, npy_intp src_stride,
+ npy_intp *coords, npy_intp coords_inc,
+ npy_intp *shape, npy_intp shape_inc,
+ npy_intp count, npy_intp src_itemsize,
+ PyArray_StridedTransferFn *stransfer,
+ void *transferdata);
+
+/*
+ * TRIVIAL ITERATION
+ *
+ * In some cases when the iteration order isn't important, iteration over
+ * arrays is trivial. This is the case when:
+ * * The array has 0 or 1 dimensions.
+ * * The array is C or Fortran contiguous.
+ * Use of an iterator can be skipped when this occurs. These macros assist
+ * in detecting and taking advantage of the situation. Note that it may
+ * be worthwhile to further check if the stride is a contiguous stride
+ * and take advantage of that.
+ *
+ * Here is example code for a single array:
+ *
+ *  if (PyArray_TRIVIALLY_ITERABLE(self)) {
+ * char *data;
+ * npy_intp count, stride;
+ *
+ * PyArray_PREPARE_TRIVIAL_ITERATION(self, count, data, stride);
+ *
+ * while (count--) {
+ * // Use the data pointer
+ *
+ * data += stride;
+ * }
+ * }
+ * else {
+ * // Create iterator, etc...
+ * }
+ *
+ * Here is example code for a pair of arrays:
+ *
+ *  if (PyArray_TRIVIALLY_ITERABLE_PAIR(a1, a2)) {
+ * char *data1, *data2;
+ * npy_intp count, stride1, stride2;
+ *
+ * PyArray_PREPARE_TRIVIAL_PAIR_ITERATION(a1, a2, count,
+ * data1, data2, stride1, stride2);
+ *
+ * while (count--) {
+ * // Use the data1 and data2 pointers
+ *
+ * data1 += stride1;
+ * data2 += stride2;
+ * }
+ * }
+ * else {
+ * // Create iterator, etc...
+ * }
+ */
+
+/*
+ * Note: Equivalently iterable macro requires one of arr1 or arr2 be
+ * trivially iterable to be valid.
+ */
+#define PyArray_EQUIVALENTLY_ITERABLE(arr1, arr2) ( \
+ PyArray_NDIM(arr1) == PyArray_NDIM(arr2) && \
+ PyArray_CompareLists(PyArray_DIMS(arr1), \
+ PyArray_DIMS(arr2), \
+ PyArray_NDIM(arr1)) && \
+ (arr1->flags&(NPY_CONTIGUOUS|NPY_FORTRAN)) == \
+ (arr2->flags&(NPY_CONTIGUOUS|NPY_FORTRAN)) \
+ )
+
+#define PyArray_TRIVIALLY_ITERABLE(arr) ( \
+ PyArray_NDIM(arr) <= 1 || \
+ PyArray_CHKFLAGS(arr, NPY_CONTIGUOUS) || \
+ PyArray_CHKFLAGS(arr, NPY_FORTRAN) \
+ )
+#define PyArray_PREPARE_TRIVIAL_ITERATION(arr, count, data, stride) \
+ count = PyArray_SIZE(arr), \
+ data = PyArray_BYTES(arr), \
+ stride = ((PyArray_NDIM(arr) == 0) ? 0 : \
+ (PyArray_CHKFLAGS(arr, NPY_FORTRAN) ? \
+ PyArray_STRIDE(arr, 0) : \
+ PyArray_STRIDE(arr, \
+ PyArray_NDIM(arr)-1)))
+
+#define PyArray_TRIVIALLY_ITERABLE_PAIR(arr1, arr2) (\
+ PyArray_TRIVIALLY_ITERABLE(arr1) && \
+ (PyArray_NDIM(arr2) == 0 || \
+ PyArray_EQUIVALENTLY_ITERABLE(arr1, arr2) || \
+ (PyArray_NDIM(arr1) == 0 && \
+ PyArray_TRIVIALLY_ITERABLE(arr2) \
+ ) \
+ ) \
+ )
+#define PyArray_PREPARE_TRIVIAL_PAIR_ITERATION(arr1, arr2, \
+ count, \
+ data1, data2, \
+ stride1, stride2) { \
+ npy_intp size1 = PyArray_SIZE(arr1); \
+ npy_intp size2 = PyArray_SIZE(arr2); \
+ count = ((size1 > size2) || size1 == 0) ? size1 : size2; \
+ data1 = PyArray_BYTES(arr1); \
+ data2 = PyArray_BYTES(arr2); \
+ stride1 = (size1 == 1 ? 0 : \
+ (PyArray_CHKFLAGS(arr1, NPY_FORTRAN) ? \
+ PyArray_STRIDE(arr1, 0) : \
+ PyArray_STRIDE(arr1, \
+ PyArray_NDIM(arr1)-1))); \
+ stride2 = (size2 == 1 ? 0 : \
+ (PyArray_CHKFLAGS(arr2, NPY_FORTRAN) ? \
+ PyArray_STRIDE(arr2, 0) : \
+ PyArray_STRIDE(arr2, \
+ PyArray_NDIM(arr2)-1))); \
+ }
+
+#define PyArray_TRIVIALLY_ITERABLE_TRIPLE(arr1, arr2, arr3) (\
+ PyArray_TRIVIALLY_ITERABLE(arr1) && \
+ ((PyArray_NDIM(arr2) == 0 && \
+ (PyArray_NDIM(arr3) == 0 || \
+ PyArray_EQUIVALENTLY_ITERABLE(arr1, arr3) \
+ ) \
+ ) || \
+ (PyArray_EQUIVALENTLY_ITERABLE(arr1, arr2) && \
+ (PyArray_NDIM(arr3) == 0 || \
+ PyArray_EQUIVALENTLY_ITERABLE(arr1, arr3) \
+ ) \
+ ) || \
+ (PyArray_NDIM(arr1) == 0 && \
+ PyArray_TRIVIALLY_ITERABLE(arr2) && \
+ (PyArray_NDIM(arr3) == 0 || \
+ PyArray_EQUIVALENTLY_ITERABLE(arr2, arr3) \
+ ) \
+ ) \
+ ) \
+ )
+
+#define PyArray_PREPARE_TRIVIAL_TRIPLE_ITERATION(arr1, arr2, arr3, \
+ count, \
+ data1, data2, data3, \
+ stride1, stride2, stride3) { \
+ npy_intp size1 = PyArray_SIZE(arr1); \
+ npy_intp size2 = PyArray_SIZE(arr2); \
+ npy_intp size3 = PyArray_SIZE(arr3); \
+ count = ((size1 > size2) || size1 == 0) ? size1 : size2; \
+ count = ((size3 > count) || size3 == 0) ? size3 : count; \
+ data1 = PyArray_BYTES(arr1); \
+ data2 = PyArray_BYTES(arr2); \
+ data3 = PyArray_BYTES(arr3); \
+ stride1 = (size1 == 1 ? 0 : \
+ (PyArray_CHKFLAGS(arr1, NPY_FORTRAN) ? \
+ PyArray_STRIDE(arr1, 0) : \
+ PyArray_STRIDE(arr1, \
+ PyArray_NDIM(arr1)-1))); \
+ stride2 = (size2 == 1 ? 0 : \
+ (PyArray_CHKFLAGS(arr2, NPY_FORTRAN) ? \
+ PyArray_STRIDE(arr2, 0) : \
+ PyArray_STRIDE(arr2, \
+ PyArray_NDIM(arr2)-1))); \
+ stride3 = (size3 == 1 ? 0 : \
+ (PyArray_CHKFLAGS(arr3, NPY_FORTRAN) ? \
+ PyArray_STRIDE(arr3, 0) : \
+ PyArray_STRIDE(arr3, \
+ PyArray_NDIM(arr3)-1))); \
+ }
+
+#endif
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 6ccac9f6e..688510e14 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -57,12 +57,10 @@ array_take(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"indices", "axis", "out", "mode", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&O&O&", kwlist,
- &indices, PyArray_AxisConverter,
- &dimension,
- PyArray_OutputConverter,
- &out,
- PyArray_ClipmodeConverter,
- &mode))
+ &indices,
+ PyArray_AxisConverter, &dimension,
+ PyArray_OutputConverter, &out,
+ PyArray_ClipmodeConverter, &mode))
return NULL;
return _ARET(PyArray_TakeFrom(self, indices, dimension, out, mode));
@@ -90,9 +88,9 @@ array_put(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"indices", "values", "mode", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|O&", kwlist,
- &indices, &values,
- PyArray_ClipmodeConverter,
- &mode))
+ &indices,
+ &values,
+ PyArray_ClipmodeConverter, &mode))
return NULL;
return PyArray_PutTo(self, values, indices, mode);
}
@@ -203,10 +201,8 @@ array_argmax(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"axis", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&", kwlist,
- PyArray_AxisConverter,
- &axis,
- PyArray_OutputConverter,
- &out))
+ PyArray_AxisConverter, &axis,
+ PyArray_OutputConverter, &out))
return NULL;
return _ARET(PyArray_ArgMax(self, axis, out));
@@ -220,10 +216,8 @@ array_argmin(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"axis", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&", kwlist,
- PyArray_AxisConverter,
- &axis,
- PyArray_OutputConverter,
- &out))
+ PyArray_AxisConverter, &axis,
+ PyArray_OutputConverter, &out))
return NULL;
return _ARET(PyArray_ArgMin(self, axis, out));
@@ -237,10 +231,8 @@ array_max(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"axis", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&", kwlist,
- PyArray_AxisConverter,
- &axis,
- PyArray_OutputConverter,
- &out))
+ PyArray_AxisConverter, &axis,
+ PyArray_OutputConverter, &out))
return NULL;
return PyArray_Max(self, axis, out);
@@ -254,10 +246,8 @@ array_ptp(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"axis", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&", kwlist,
- PyArray_AxisConverter,
- &axis,
- PyArray_OutputConverter,
- &out))
+ PyArray_AxisConverter, &axis,
+ PyArray_OutputConverter, &out))
return NULL;
return PyArray_Ptp(self, axis, out);
@@ -272,10 +262,8 @@ array_min(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"axis", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&", kwlist,
- PyArray_AxisConverter,
- &axis,
- PyArray_OutputConverter,
- &out))
+ PyArray_AxisConverter, &axis,
+ PyArray_OutputConverter, &out))
return NULL;
return PyArray_Min(self, axis, out);
@@ -335,8 +323,8 @@ array_getfield(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"dtype", "offset", 0};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|i", kwlist,
- PyArray_DescrConverter,
- &dtype, &offset)) {
+ PyArray_DescrConverter, &dtype,
+ &offset)) {
Py_XDECREF(dtype);
return NULL;
}
@@ -388,8 +376,9 @@ array_setfield(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"value", "dtype", "offset", 0};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&|i", kwlist,
- &value, PyArray_DescrConverter,
- &dtype, &offset)) {
+ &value,
+ PyArray_DescrConverter, &dtype,
+ &offset)) {
Py_XDECREF(dtype);
return NULL;
}
@@ -459,7 +448,8 @@ array_byteswap(PyArrayObject *self, PyObject *args)
{
Bool inplace = FALSE;
- if (!PyArg_ParseTuple(args, "|O&", PyArray_BoolConverter, &inplace)) {
+ if (!PyArg_ParseTuple(args, "|O&",
+ PyArray_BoolConverter, &inplace)) {
return NULL;
}
return PyArray_Byteswap(self, inplace);
@@ -482,8 +472,7 @@ array_tostring(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"order", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&", kwlist,
- PyArray_OrderConverter,
- &order)) {
+ PyArray_OrderConverter, &order)) {
return NULL;
}
return PyArray_ToString(self, order);
@@ -504,7 +493,9 @@ array_tofile(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"file", "sep", "format", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|ss", kwlist,
- &file, &sep, &format)) {
+ &file,
+ &sep,
+ &format)) {
return NULL;
}
@@ -744,12 +735,35 @@ array_setscalar(PyArrayObject *self, PyObject *args) {
return Py_None;
}
+/* Sets the array values from another array as if they were flat */
+static PyObject *
+array_setasflat(PyArrayObject *self, PyObject *args)
+{
+ PyObject *arr_in;
+ PyArrayObject *arr;
+
+ if (!PyArg_ParseTuple(args, "O", &arr_in)) {
+ return NULL;
+ }
+
+ arr = (PyArrayObject *)PyArray_FromAny(arr_in, NULL, 0, 0, 0, NULL);
+ if (arr == NULL) {
+ return NULL;
+ }
+
+ if (PyArray_CopyAnyInto(self, arr) != 0) {
+ Py_DECREF(arr);
+ return NULL;
+ }
+
+ Py_DECREF(arr);
+ Py_RETURN_NONE;
+}
static PyObject *
-array_cast(PyArrayObject *self, PyObject *args)
+array_astype(PyArrayObject *self, PyObject *args)
{
PyArray_Descr *descr = NULL;
- PyObject *obj;
if (!PyArg_ParseTuple(args, "O&", PyArray_DescrConverter,
&descr)) {
@@ -757,19 +771,6 @@ array_cast(PyArrayObject *self, PyObject *args)
return NULL;
}
- if (PyArray_EquivTypes(descr, self->descr)) {
- obj = _ARET(PyArray_NewCopy(self,NPY_ANYORDER));
- Py_XDECREF(descr);
- return obj;
- }
- if (descr->names != NULL) {
- int flags;
- flags = NPY_FORCECAST;
- if (PyArray_ISFORTRAN(self)) {
- flags |= NPY_FORTRAN;
- }
- return PyArray_FromArray(self, descr, flags);
- }
return PyArray_CastToType(self, descr, PyArray_ISFORTRAN(self));
}
@@ -865,8 +866,8 @@ array_getarray(PyArrayObject *self, PyObject *args)
PyArray_Descr *newtype = NULL;
PyObject *ret;
- if (!PyArg_ParseTuple(args, "|O&", PyArray_DescrConverter,
- &newtype)) {
+ if (!PyArg_ParseTuple(args, "|O&",
+ PyArray_DescrConverter, &newtype)) {
Py_XDECREF(newtype);
return NULL;
}
@@ -914,15 +915,15 @@ array_getarray(PyArrayObject *self, PyObject *args)
static PyObject *
array_copy(PyArrayObject *self, PyObject *args, PyObject *kwds)
{
- PyArray_ORDER fortran=PyArray_CORDER;
+ PyArray_ORDER order = PyArray_CORDER;
static char *kwlist[] = {"order", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&", kwlist,
- PyArray_OrderConverter, &fortran)) {
+ PyArray_OrderConverter, &order)) {
return NULL;
}
- return PyArray_NewCopy(self, fortran);
+ return PyArray_NewCopy(self, order);
}
#include <stdio.h>
@@ -976,8 +977,8 @@ array_repeat(PyArrayObject *self, PyObject *args, PyObject *kwds) {
static char *kwlist[] = {"repeats", "axis", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&", kwlist,
- &repeats, PyArray_AxisConverter,
- &axis)) {
+ &repeats,
+ PyArray_AxisConverter, &axis)) {
return NULL;
}
return _ARET(PyArray_Repeat(self, repeats, axis));
@@ -1015,15 +1016,16 @@ array_sort(PyArrayObject *self, PyObject *args, PyObject *kwds)
{
int axis=-1;
int val;
- PyArray_SORTKIND which = PyArray_QUICKSORT;
+ PyArray_SORTKIND sortkind = PyArray_QUICKSORT;
PyObject *order = NULL;
PyArray_Descr *saved = NULL;
PyArray_Descr *newd;
static char *kwlist[] = {"axis", "kind", "order", NULL};
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iO&O", kwlist, &axis,
- PyArray_SortkindConverter, &which,
- &order)) {
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iO&O", kwlist,
+ &axis,
+ PyArray_SortkindConverter, &sortkind,
+ &order)) {
return NULL;
}
if (order == Py_None) {
@@ -1053,7 +1055,7 @@ array_sort(PyArrayObject *self, PyObject *args, PyObject *kwds)
self->descr = newd;
}
- val = PyArray_Sort(self, axis, which);
+ val = PyArray_Sort(self, axis, sortkind);
if (order != NULL) {
Py_XDECREF(self->descr);
self->descr = saved;
@@ -1069,14 +1071,14 @@ static PyObject *
array_argsort(PyArrayObject *self, PyObject *args, PyObject *kwds)
{
int axis = -1;
- PyArray_SORTKIND which = PyArray_QUICKSORT;
+ PyArray_SORTKIND sortkind = PyArray_QUICKSORT;
PyObject *order = NULL, *res;
PyArray_Descr *newd, *saved=NULL;
static char *kwlist[] = {"axis", "kind", "order", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&O", kwlist,
PyArray_AxisConverter, &axis,
- PyArray_SortkindConverter, &which,
+ PyArray_SortkindConverter, &sortkind,
&order)) {
return NULL;
}
@@ -1107,7 +1109,7 @@ array_argsort(PyArrayObject *self, PyObject *args, PyObject *kwds)
self->descr = newd;
}
- res = PyArray_ArgSort(self, axis, which);
+ res = PyArray_ArgSort(self, axis, sortkind);
if (order != NULL) {
Py_XDECREF(self->descr);
self->descr = saved;
@@ -1355,14 +1357,19 @@ array_setstate(PyArrayObject *self, PyObject *args)
/* This will free any memory associated with a and
use the string in setstate as the (writeable) memory.
*/
- if (!PyArg_ParseTuple(args, "(iO!O!iO)", &version, &PyTuple_Type,
- &shape, &PyArrayDescr_Type, &typecode,
- &fortran, &rawdata)) {
+ if (!PyArg_ParseTuple(args, "(iO!O!iO)",
+ &version,
+ &PyTuple_Type, &shape,
+ &PyArrayDescr_Type, &typecode,
+ &fortran,
+ &rawdata)) {
PyErr_Clear();
version = 0;
- if (!PyArg_ParseTuple(args, "(O!O!iO)", &PyTuple_Type,
- &shape, &PyArrayDescr_Type, &typecode,
- &fortran, &rawdata)) {
+ if (!PyArg_ParseTuple(args, "(O!O!iO)",
+ &PyTuple_Type, &shape,
+ &PyArrayDescr_Type, &typecode,
+ &fortran,
+ &rawdata)) {
return NULL;
}
}
@@ -1675,11 +1682,9 @@ array_mean(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"axis", "dtype", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&O&", kwlist,
- PyArray_AxisConverter,
- &axis, PyArray_DescrConverter2,
- &dtype,
- PyArray_OutputConverter,
- &out)) {
+ PyArray_AxisConverter, &axis,
+ PyArray_DescrConverter2, &dtype,
+ PyArray_OutputConverter, &out)) {
Py_XDECREF(dtype);
return NULL;
}
@@ -1699,11 +1704,9 @@ array_sum(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"axis", "dtype", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&O&", kwlist,
- PyArray_AxisConverter,
- &axis, PyArray_DescrConverter2,
- &dtype,
- PyArray_OutputConverter,
- &out)) {
+ PyArray_AxisConverter, &axis,
+ PyArray_DescrConverter2, &dtype,
+ PyArray_OutputConverter, &out)) {
Py_XDECREF(dtype);
return NULL;
}
@@ -1724,11 +1727,9 @@ array_cumsum(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"axis", "dtype", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&O&", kwlist,
- PyArray_AxisConverter,
- &axis, PyArray_DescrConverter2,
- &dtype,
- PyArray_OutputConverter,
- &out)) {
+ PyArray_AxisConverter, &axis,
+ PyArray_DescrConverter2, &dtype,
+ PyArray_OutputConverter, &out)) {
Py_XDECREF(dtype);
return NULL;
}
@@ -1748,11 +1749,9 @@ array_prod(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"axis", "dtype", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&O&", kwlist,
- PyArray_AxisConverter,
- &axis, PyArray_DescrConverter2,
- &dtype,
- PyArray_OutputConverter,
- &out)) {
+ PyArray_AxisConverter, &axis,
+ PyArray_DescrConverter2, &dtype,
+ PyArray_OutputConverter, &out)) {
Py_XDECREF(dtype);
return NULL;
}
@@ -1772,11 +1771,9 @@ array_cumprod(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"axis", "dtype", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&O&", kwlist,
- PyArray_AxisConverter,
- &axis, PyArray_DescrConverter2,
- &dtype,
- PyArray_OutputConverter,
- &out)) {
+ PyArray_AxisConverter, &axis,
+ PyArray_DescrConverter2, &dtype,
+ PyArray_OutputConverter, &out)) {
Py_XDECREF(dtype);
return NULL;
}
@@ -1818,10 +1815,8 @@ array_any(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"axis", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&", kwlist,
- PyArray_AxisConverter,
- &axis,
- PyArray_OutputConverter,
- &out))
+ PyArray_AxisConverter, &axis,
+ PyArray_OutputConverter, &out))
return NULL;
return PyArray_Any(self, axis, out);
@@ -1836,10 +1831,8 @@ array_all(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"axis", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&", kwlist,
- PyArray_AxisConverter,
- &axis,
- PyArray_OutputConverter,
- &out))
+ PyArray_AxisConverter, &axis,
+ PyArray_OutputConverter, &out))
return NULL;
return PyArray_All(self, axis, out);
@@ -1857,11 +1850,10 @@ array_stddev(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"axis", "dtype", "out", "ddof", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&O&i", kwlist,
- PyArray_AxisConverter,
- &axis, PyArray_DescrConverter2,
- &dtype,
- PyArray_OutputConverter,
- &out, &ddof)) {
+ PyArray_AxisConverter, &axis,
+ PyArray_DescrConverter2, &dtype,
+ PyArray_OutputConverter, &out,
+ &ddof)) {
Py_XDECREF(dtype);
return NULL;
}
@@ -1883,11 +1875,10 @@ array_variance(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"axis", "dtype", "out", "ddof", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&O&O&i", kwlist,
- PyArray_AxisConverter,
- &axis, PyArray_DescrConverter2,
- &dtype,
- PyArray_OutputConverter,
- &out, &ddof)) {
+ PyArray_AxisConverter, &axis,
+ PyArray_DescrConverter2, &dtype,
+ PyArray_OutputConverter, &out,
+ &ddof)) {
Py_XDECREF(dtype);
return NULL;
}
@@ -1907,10 +1898,9 @@ array_compress(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"condition", "axis", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&O&", kwlist,
- &condition, PyArray_AxisConverter,
- &axis,
- PyArray_OutputConverter,
- &out)) {
+ &condition,
+ PyArray_AxisConverter, &axis,
+ PyArray_OutputConverter, &out)) {
return NULL;
}
return _ARET(PyArray_Compress(self, condition, axis, out));
@@ -1937,7 +1927,9 @@ array_trace(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"offset", "axis1", "axis2", "dtype", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iiiO&O&", kwlist,
- &offset, &axis1, &axis2,
+ &offset,
+ &axis1,
+ &axis2,
PyArray_DescrConverter2, &dtype,
PyArray_OutputConverter, &out)) {
Py_XDECREF(dtype);
@@ -1960,9 +1952,9 @@ array_clip(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"min", "max", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OOO&", kwlist,
- &min, &max,
- PyArray_OutputConverter,
- &out)) {
+ &min,
+ &max,
+ PyArray_OutputConverter, &out)) {
return NULL;
}
if (max == NULL && min == NULL) {
@@ -1994,7 +1986,9 @@ array_diagonal(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"offset", "axis1", "axis2", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iii", kwlist,
- &offset, &axis1, &axis2)) {
+ &offset,
+ &axis1,
+ &axis2)) {
return NULL;
}
return _ARET(PyArray_Diagonal(self, offset, axis1, axis2));
@@ -2004,28 +1998,28 @@ array_diagonal(PyArrayObject *self, PyObject *args, PyObject *kwds)
static PyObject *
array_flatten(PyArrayObject *self, PyObject *args, PyObject *kwds)
{
- PyArray_ORDER fortran = PyArray_CORDER;
+ PyArray_ORDER order = PyArray_CORDER;
static char *kwlist[] = {"order", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&", kwlist,
- PyArray_OrderConverter, &fortran)) {
+ PyArray_OrderConverter, &order)) {
return NULL;
}
- return PyArray_Flatten(self, fortran);
+ return PyArray_Flatten(self, order);
}
static PyObject *
array_ravel(PyArrayObject *self, PyObject *args, PyObject *kwds)
{
- PyArray_ORDER fortran = PyArray_CORDER;
+ PyArray_ORDER order = PyArray_CORDER;
static char *kwlist[] = {"order", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&", kwlist,
- PyArray_OrderConverter, &fortran)) {
+ PyArray_OrderConverter, &order)) {
return NULL;
}
- return PyArray_Ravel(self, fortran);
+ return PyArray_Ravel(self, order);
}
@@ -2037,8 +2031,8 @@ array_round(PyArrayObject *self, PyObject *args, PyObject *kwds)
static char *kwlist[] = {"decimals", "out", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iO&", kwlist,
- &decimals, PyArray_OutputConverter,
- &out)) {
+ &decimals,
+ PyArray_OutputConverter, &out)) {
return NULL;
}
return _ARET(PyArray_Round(self, decimals, out));
@@ -2056,7 +2050,9 @@ array_setflags(PyArrayObject *self, PyObject *args, PyObject *kwds)
int flagback = self->flags;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OOO", kwlist,
- &write, &align, &uic))
+ &write,
+ &align,
+ &uic))
return NULL;
if (align != Py_None) {
@@ -2181,7 +2177,7 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = {
(PyCFunction)array_argsort,
METH_VARARGS | METH_KEYWORDS, NULL},
{"astype",
- (PyCFunction)array_cast,
+ (PyCFunction)array_astype,
METH_VARARGS, NULL},
{"byteswap",
(PyCFunction)array_byteswap,
@@ -2231,6 +2227,9 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = {
{"itemset",
(PyCFunction) array_setscalar,
METH_VARARGS, NULL},
+ {"setasflat",
+ (PyCFunction) array_setasflat,
+ METH_VARARGS, NULL},
{"max",
(PyCFunction)array_max,
METH_VARARGS | METH_KEYWORDS, NULL},
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index d4dba719c..695a17a30 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -42,6 +42,11 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
#include "number.h"
#include "scalartypes.h"
#include "numpymemoryview.h"
+#include "convert_datatype.h"
+#include "new_iterator_pywrap.h"
+
+/* Only here for API compatibility */
+NPY_NO_EXPORT PyTypeObject PyBigArray_Type;
/*NUMPY_API
* Get Priority from object
@@ -84,10 +89,10 @@ PyArray_MultiplyIntList(int *l1, int n)
/*NUMPY_API
* Multiply a List
*/
-NPY_NO_EXPORT intp
-PyArray_MultiplyList(intp *l1, int n)
+NPY_NO_EXPORT npy_intp
+PyArray_MultiplyList(npy_intp *l1, int n)
{
- intp s = 1;
+ npy_intp s = 1;
while (n--) {
s *= (*l1++);
@@ -98,15 +103,15 @@ PyArray_MultiplyList(intp *l1, int n)
/*NUMPY_API
* Multiply a List of Non-negative numbers with over-flow detection.
*/
-NPY_NO_EXPORT intp
-PyArray_OverflowMultiplyList(intp *l1, int n)
+NPY_NO_EXPORT npy_intp
+PyArray_OverflowMultiplyList(npy_intp *l1, int n)
{
- intp prod = 1;
- intp imax = NPY_MAX_INTP;
+ npy_intp prod = 1;
+ npy_intp imax = NPY_MAX_INTP;
int i;
for (i = 0; i < n; i++) {
- intp dim = l1[i];
+ npy_intp dim = l1[i];
if (dim == 0) {
return 0;
@@ -124,10 +129,10 @@ PyArray_OverflowMultiplyList(intp *l1, int n)
* Produce a pointer into array
*/
NPY_NO_EXPORT void *
-PyArray_GetPtr(PyArrayObject *obj, intp* ind)
+PyArray_GetPtr(PyArrayObject *obj, npy_intp* ind)
{
int n = obj->nd;
- intp *strides = obj->strides;
+ npy_intp *strides = obj->strides;
char *dptr = obj->data;
while (n--) {
@@ -140,7 +145,7 @@ PyArray_GetPtr(PyArrayObject *obj, intp* ind)
* Compare Lists
*/
NPY_NO_EXPORT int
-PyArray_CompareLists(intp *l1, intp *l2, int n)
+PyArray_CompareLists(npy_intp *l1, npy_intp *l2, int n)
{
int i;
@@ -167,11 +172,11 @@ PyArray_CompareLists(intp *l1, intp *l2, int n)
* steals a reference to typedescr -- can be NULL
*/
NPY_NO_EXPORT int
-PyArray_AsCArray(PyObject **op, void *ptr, intp *dims, int nd,
+PyArray_AsCArray(PyObject **op, void *ptr, npy_intp *dims, int nd,
PyArray_Descr* typedescr)
{
PyArrayObject *ap;
- intp n, m, i, j;
+ npy_intp n, m, i, j;
char **ptr2;
char ***ptr3;
@@ -215,7 +220,7 @@ PyArray_AsCArray(PyObject **op, void *ptr, intp *dims, int nd,
}
*((char ****)ptr) = ptr3;
}
- memcpy(dims, ap->dimensions, nd*sizeof(intp));
+ memcpy(dims, ap->dimensions, nd*sizeof(npy_intp));
*op = (PyObject *)ap;
return 0;
@@ -232,7 +237,7 @@ PyArray_AsCArray(PyObject **op, void *ptr, intp *dims, int nd,
NPY_NO_EXPORT int
PyArray_As1D(PyObject **op, char **ptr, int *d1, int typecode)
{
- intp newd1;
+ npy_intp newd1;
PyArray_Descr *descr;
char msg[] = "PyArray_As1D: use PyArray_AsCArray.";
@@ -253,7 +258,7 @@ PyArray_As1D(PyObject **op, char **ptr, int *d1, int typecode)
NPY_NO_EXPORT int
PyArray_As2D(PyObject **op, char ***ptr, int *d1, int *d2, int typecode)
{
- intp newdims[2];
+ npy_intp newdims[2];
PyArray_Descr *descr;
char msg[] = "PyArray_As1D: use PyArray_AsCArray.";
@@ -347,7 +352,7 @@ PyArray_Concatenate(PyObject *op, int axis)
char *data;
PyTypeObject *subtype;
double prior1, prior2;
- intp numbytes;
+ npy_intp numbytes;
n = PySequence_Length(op);
if (n == -1) {
@@ -579,7 +584,7 @@ PyArray_CanCoerceScalar(int thistype, int neededtype,
*/
static PyArrayObject *
new_array_for_sum(PyArrayObject *ap1, PyArrayObject *ap2,
- int nd, intp dimensions[], int typenum)
+ int nd, npy_intp dimensions[], int typenum)
{
PyArrayObject *ret;
PyTypeObject *subtype;
@@ -615,11 +620,11 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
{
PyArrayObject *ap1, *ap2, *ret = NULL;
PyArrayIterObject *it1, *it2;
- intp i, j, l;
+ npy_intp i, j, l;
int typenum, nd, axis;
- intp is1, is2, os;
+ npy_intp is1, is2, os;
char *op;
- intp dimensions[MAX_DIMS];
+ npy_intp dimensions[MAX_DIMS];
PyArray_DotFunc *dot;
PyArray_Descr *typec;
NPY_BEGIN_THREADS_DEF;
@@ -723,11 +728,11 @@ PyArray_MatrixProduct(PyObject *op1, PyObject *op2)
{
PyArrayObject *ap1, *ap2, *ret = NULL;
PyArrayIterObject *it1, *it2;
- intp i, j, l;
+ npy_intp i, j, l;
int typenum, nd, axis, matchDim;
- intp is1, is2, os;
+ npy_intp is1, is2, os;
char *op;
- intp dimensions[MAX_DIMS];
+ npy_intp dimensions[MAX_DIMS];
PyArray_DotFunc *dot;
PyArray_Descr *typec;
NPY_BEGIN_THREADS_DEF;
@@ -735,6 +740,7 @@ PyArray_MatrixProduct(PyObject *op1, PyObject *op2)
typenum = PyArray_ObjectType(op1, 0);
typenum = PyArray_ObjectType(op2, typenum);
typec = PyArray_DescrFromType(typenum);
+
Py_INCREF(typec);
ap1 = (PyArrayObject *)PyArray_FromAny(op1, typec, 0, 0, ALIGNED, NULL);
if (ap1 == NULL) {
@@ -853,8 +859,8 @@ PyArray_CopyAndTranspose(PyObject *op)
{
PyObject *ret, *arr;
int nd;
- intp dims[2];
- intp i,j;
+ npy_intp dims[2];
+ npy_intp i,j;
int elsize, str2;
char *iptr;
char *optr;
@@ -920,9 +926,9 @@ _pyarray_correlate(PyArrayObject *ap1, PyArrayObject *ap2, int typenum,
int mode, int *inverted)
{
PyArrayObject *ret;
- intp length;
- intp i, n1, n2, n, n_left, n_right;
- intp is1, is2, os;
+ npy_intp length;
+ npy_intp i, n1, n2, n, n_left, n_right;
+ npy_intp is1, is2, os;
char *ip1, *ip2, *op;
PyArray_DotFunc *dot;
@@ -951,7 +957,7 @@ _pyarray_correlate(PyArrayObject *ap1, PyArrayObject *ap2, int typenum,
n_left = n_right = 0;
break;
case 1:
- n_left = (intp)(n/2);
+ n_left = (npy_intp)(n/2);
n_right = n - n_left - 1;
break;
case 2:
@@ -1025,11 +1031,11 @@ clean_ret:
static int
_pyarray_revert(PyArrayObject *ret)
{
- intp length;
- intp i;
+ npy_intp length;
+ npy_intp i;
PyArray_CopySwapFunc *copyswap;
char *tmp = NULL, *sw1, *sw2;
- intp os;
+ npy_intp os;
char *op;
length = ret->dimensions[0];
@@ -1196,14 +1202,15 @@ array_putmask(PyObject *NPY_UNUSED(module), PyObject *args, PyObject *kwds)
}
/*NUMPY_API
- * Convert an object to FORTRAN / C / ANY
+ * Convert an object to FORTRAN / C / ANY / KEEP
*/
NPY_NO_EXPORT int
PyArray_OrderConverter(PyObject *object, NPY_ORDER *val)
{
char *str;
+ /* Leave the desired default from the caller for NULL/Py_None */
if (object == NULL || object == Py_None) {
- *val = PyArray_ANYORDER;
+ return PY_SUCCEED;
}
else if (PyUnicode_Check(object)) {
PyObject *tmp;
@@ -1215,10 +1222,10 @@ PyArray_OrderConverter(PyObject *object, NPY_ORDER *val)
}
else if (!PyBytes_Check(object) || PyBytes_GET_SIZE(object) < 1) {
if (PyObject_IsTrue(object)) {
- *val = PyArray_FORTRANORDER;
+ *val = NPY_FORTRANORDER;
}
else {
- *val = PyArray_CORDER;
+ *val = NPY_CORDER;
}
if (PyErr_Occurred()) {
return PY_FAIL;
@@ -1228,13 +1235,16 @@ PyArray_OrderConverter(PyObject *object, NPY_ORDER *val)
else {
str = PyBytes_AS_STRING(object);
if (str[0] == 'C' || str[0] == 'c') {
- *val = PyArray_CORDER;
+ *val = NPY_CORDER;
}
else if (str[0] == 'F' || str[0] == 'f') {
- *val = PyArray_FORTRANORDER;
+ *val = NPY_FORTRANORDER;
}
else if (str[0] == 'A' || str[0] == 'a') {
- *val = PyArray_ANYORDER;
+ *val = NPY_ANYORDER;
+ }
+ else if (str[0] == 'K' || str[0] == 'k') {
+ *val = NPY_KEEPORDER;
}
else {
PyErr_SetString(PyExc_TypeError,
@@ -1405,10 +1415,16 @@ _equivalent_subarrays(PyArray_ArrayDescr *sub1, PyArray_ArrayDescr *sub2)
NPY_NO_EXPORT unsigned char
PyArray_EquivTypes(PyArray_Descr *typ1, PyArray_Descr *typ2)
{
- int typenum1 = typ1->type_num;
- int typenum2 = typ2->type_num;
- int size1 = typ1->elsize;
- int size2 = typ2->elsize;
+ int typenum1, typenum2, size1, size2;
+
+ if (typ1 == typ2) {
+ return TRUE;
+ }
+
+ typenum1 = typ1->type_num;
+ typenum2 = typ2->type_num;
+ size1 = typ1->elsize;
+ size2 = typ2->elsize;
if (size1 != size2) {
return FALSE;
@@ -1455,8 +1471,8 @@ PyArray_EquivTypenums(int typenum1, int typenum2)
static PyObject *
_prepend_ones(PyArrayObject *arr, int nd, int ndmin)
{
- intp newdims[MAX_DIMS];
- intp newstrides[MAX_DIMS];
+ npy_intp newdims[MAX_DIMS];
+ npy_intp newstrides[MAX_DIMS];
int i, k, num;
PyObject *ret;
@@ -1481,10 +1497,10 @@ _prepend_ones(PyArrayObject *arr, int nd, int ndmin)
#define _ARET(x) PyArray_Return((PyArrayObject *)(x))
-#define STRIDING_OK(op, order) ((order) == PyArray_ANYORDER || \
- ((order) == PyArray_CORDER && \
+#define STRIDING_OK(op, order) ((order) == NPY_ANYORDER || \
+ ((order) == NPY_CORDER && \
PyArray_ISCONTIGUOUS(op)) || \
- ((order) == PyArray_FORTRANORDER && \
+ ((order) == NPY_FORTRANORDER && \
PyArray_ISFORTRAN(op)))
static PyObject *
@@ -1498,7 +1514,7 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws)
int ndmin = 0, nd;
PyArray_Descr *type = NULL;
PyArray_Descr *oldtype = NULL;
- NPY_ORDER order=PyArray_ANYORDER;
+ NPY_ORDER order = NPY_ANYORDER;
int flags = 0;
if (PyTuple_GET_SIZE(args) > 2) {
@@ -1559,11 +1575,11 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws)
if (copy) {
flags = ENSURECOPY;
}
- if (order == PyArray_CORDER) {
+ if (order == NPY_CORDER) {
flags |= CONTIGUOUS;
}
- else if ((order == PyArray_FORTRANORDER)
- /* order == PyArray_ANYORDER && */
+ else if ((order == NPY_FORTRANORDER)
+ /* order == NPY_ANYORDER && */
|| (PyArray_Check(op) && PyArray_ISFORTRAN(op))) {
flags |= FORTRAN;
}
@@ -1598,7 +1614,7 @@ array_empty(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
static char *kwlist[] = {"shape","dtype","order",NULL};
PyArray_Descr *typecode = NULL;
PyArray_Dims shape = {NULL, 0};
- NPY_ORDER order = PyArray_CORDER;
+ NPY_ORDER order = NPY_CORDER;
Bool fortran;
PyObject *ret = NULL;
@@ -1608,12 +1624,20 @@ array_empty(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
PyArray_OrderConverter, &order)) {
goto fail;
}
- if (order == PyArray_FORTRANORDER) {
- fortran = TRUE;
- }
- else {
- fortran = FALSE;
+
+ switch (order) {
+ case NPY_CORDER:
+ fortran = FALSE;
+ break;
+ case NPY_FORTRANORDER:
+ fortran = TRUE;
+ break;
+ default:
+ PyErr_SetString(PyExc_ValueError,
+ "only 'C' or 'F' order is permitted");
+ goto fail;
}
+
ret = PyArray_Empty(shape.len, shape.ptr, typecode, fortran);
PyDimMem_FREE(shape.ptr);
return ret;
@@ -1624,6 +1648,33 @@ array_empty(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
return NULL;
}
+static PyObject *
+array_empty_like(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
+{
+
+ static char *kwlist[] = {"prototype","dtype","order",NULL};
+ PyArrayObject *prototype = NULL;
+ PyArray_Descr *dtype = NULL;
+ NPY_ORDER order = NPY_KEEPORDER;
+ PyObject *ret = NULL;
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&", kwlist,
+ PyArray_Converter, &prototype,
+ PyArray_DescrConverter2, &dtype,
+ PyArray_OrderConverter, &order)) {
+ goto fail;
+ }
+ /* steals the reference to dtype if it's not NULL */
+ ret = PyArray_NewLikeArray(prototype, order, dtype);
+ Py_DECREF(prototype);
+ return ret;
+
+ fail:
+ Py_XDECREF(prototype);
+ Py_XDECREF(dtype);
+ return NULL;
+}
+
/*
* This function is needed for supporting Pickles of
* numpy scalar objects.
@@ -1694,7 +1745,7 @@ array_zeros(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
static char *kwlist[] = {"shape","dtype","order",NULL}; /* XXX ? */
PyArray_Descr *typecode = NULL;
PyArray_Dims shape = {NULL, 0};
- NPY_ORDER order = PyArray_CORDER;
+ NPY_ORDER order = NPY_CORDER;
Bool fortran = FALSE;
PyObject *ret = NULL;
@@ -1704,12 +1755,20 @@ array_zeros(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
PyArray_OrderConverter, &order)) {
goto fail;
}
- if (order == PyArray_FORTRANORDER) {
- fortran = TRUE;
- }
- else {
- fortran = FALSE;
+
+ switch (order) {
+ case NPY_CORDER:
+ fortran = FALSE;
+ break;
+ case NPY_FORTRANORDER:
+ fortran = TRUE;
+ break;
+ default:
+ PyErr_SetString(PyExc_ValueError,
+ "only 'C' or 'F' order is permitted");
+ goto fail;
}
+
ret = PyArray_Zeros(shape.len, shape.ptr, typecode, (int) fortran);
PyDimMem_FREE(shape.ptr);
return ret;
@@ -1721,6 +1780,29 @@ array_zeros(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
}
static PyObject *
+array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+ PyObject *array_in;
+ PyArrayObject *array;
+ npy_intp count;
+
+ if (!PyArg_ParseTuple(args, "O", &array_in)) {
+ return NULL;
+ }
+
+ array = (PyArrayObject *)PyArray_FromAny(array_in, NULL, 0, 0, 0, NULL);
+ if (array == NULL) {
+ return NULL;
+ }
+
+ count = PyArray_CountNonzero(array);
+
+ Py_DECREF(array);
+
+ return (count == -1) ? NULL : PyInt_FromSsize_t(count);
+}
+
+static PyObject *
array_fromstring(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds)
{
char *data;
@@ -1736,7 +1818,7 @@ array_fromstring(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds
Py_XDECREF(descr);
return NULL;
}
- return PyArray_FromString(data, (intp)s, descr, (intp)nin, sep);
+ return PyArray_FromString(data, (npy_intp)s, descr, (npy_intp)nin, sep);
}
@@ -1777,7 +1859,7 @@ array_fromfile(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds)
if (type == NULL) {
type = PyArray_DescrFromType(PyArray_DEFAULT);
}
- ret = PyArray_FromFile(fp, type, (intp) nin, sep);
+ ret = PyArray_FromFile(fp, type, (npy_intp) nin, sep);
ok = npy_PyFile_DupClose(file, fp);
Py_DECREF(file);
if (ok < 0) {
@@ -1801,7 +1883,7 @@ array_fromiter(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds)
Py_XDECREF(descr);
return NULL;
}
- return PyArray_FromIter(iter, descr, (intp)nin);
+ return PyArray_FromIter(iter, descr, (npy_intp)nin);
}
static PyObject *
@@ -1821,7 +1903,7 @@ array_frombuffer(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds
if (type == NULL) {
type = PyArray_DescrFromType(PyArray_DEFAULT);
}
- return PyArray_FromBuffer(obj, type, (intp)nin, (intp)offset);
+ return PyArray_FromBuffer(obj, type, (npy_intp)nin, (npy_intp)offset);
}
static PyObject *
@@ -1861,6 +1943,122 @@ array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
}
static PyObject *
+array_einsum(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
+{
+ char *subscripts;
+ int i, nop;
+ PyArrayObject *op[NPY_MAXARGS];
+ NPY_ORDER order = NPY_KEEPORDER;
+ NPY_CASTING casting = NPY_SAFE_CASTING;
+ PyArrayObject *out = NULL;
+ PyArray_Descr *dtype = NULL;
+ PyObject *ret = NULL;
+
+ nop = PyTuple_GET_SIZE(args) - 1;
+ if (nop <= 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "must specify the einstein sum subscripts string "
+ "and at least one operand");
+ return NULL;
+ }
+ else if (nop > NPY_MAXARGS) {
+ PyErr_SetString(PyExc_ValueError, "too many operands");
+ return NULL;
+ }
+
+ /* Get the subscripts string */
+ subscripts = PyString_AsString(PyTuple_GET_ITEM(args, 0));
+ if (subscripts == NULL) {
+ return NULL;
+ }
+
+ /* Set the operands to NULL */
+ for (i = 0; i < nop; ++i) {
+ op[i] = NULL;
+ }
+
+ /* Get the operands */
+ for (i = 0; i < nop; ++i) {
+ PyObject *obj = PyTuple_GET_ITEM(args, i+1);
+ if (PyArray_Check(obj)) {
+ Py_INCREF(obj);
+ op[i] = (PyArrayObject *)obj;
+ }
+ else {
+ op[i] = (PyArrayObject *)PyArray_FromAny(obj,
+ NULL, 0, 0, NPY_ENSUREARRAY, NULL);
+ if (op[i] == NULL) {
+ goto finish;
+ }
+ }
+ }
+
+ /* Get the keyword arguments */
+ if (kwds != NULL) {
+ PyObject *key, *value;
+ Py_ssize_t pos = 0;
+ while (PyDict_Next(kwds, &pos, &key, &value)) {
+ char *str = PyString_AsString(key);
+
+ if (str == NULL) {
+ PyErr_Clear();
+ PyErr_SetString(PyExc_TypeError, "invalid keyword");
+ goto finish;
+ }
+
+ if (strcmp(str,"out") == 0) {
+ if (PyArray_Check(value)) {
+ out = (PyArrayObject *)value;
+ }
+ else {
+ PyErr_SetString(PyExc_TypeError,
+ "keyword parameter out must be an "
+ "array for einsum");
+ goto finish;
+ }
+ }
+ else if (strcmp(str,"order") == 0) {
+ if (!PyArray_OrderConverter(value, &order)) {
+ goto finish;
+ }
+ }
+ else if (strcmp(str,"casting") == 0) {
+ if (!PyArray_CastingConverter(value, &casting)) {
+ goto finish;
+ }
+ }
+ else if (strcmp(str,"dtype") == 0) {
+ if (!PyArray_DescrConverter2(value, &dtype)) {
+ goto finish;
+ }
+ }
+ else {
+ PyErr_Format(PyExc_TypeError,
+ "'%s' is an invalid keyword for einsum",
+ str);
+ goto finish;
+ }
+ }
+ }
+
+ ret = (PyObject *)PyArray_EinsteinSum(subscripts, nop, op, dtype,
+ order, casting, out);
+
+ /* If no output was supplied, possibly convert to a scalar */
+ if (ret != NULL && out == NULL) {
+ ret = _ARET(ret);
+ }
+
+finish:
+ for (i = 0; i < nop; ++i) {
+ Py_XDECREF(op[i]);
+ }
+ Py_XDECREF(dtype);
+
+ return ret;
+}
+
+static PyObject *
array_fastCopyAndTranspose(PyObject *NPY_UNUSED(dummy), PyObject *args)
{
PyObject *a0;
@@ -1923,7 +2121,7 @@ array_arange(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws) {
NPY_NO_EXPORT unsigned int
PyArray_GetNDArrayCVersion(void)
{
- return (unsigned int)NPY_VERSION;
+ return (unsigned int)NPY_ABI_VERSION;
}
/*NUMPY_API
@@ -1932,7 +2130,7 @@ PyArray_GetNDArrayCVersion(void)
NPY_NO_EXPORT unsigned int
PyArray_GetNDArrayCFeatureVersion(void)
{
- return (unsigned int)NPY_FEATURE_VERSION;
+ return (unsigned int)NPY_API_VERSION;
}
static PyObject *
@@ -2160,23 +2358,51 @@ static PyObject *
array_can_cast_safely(PyObject *NPY_UNUSED(self), PyObject *args,
PyObject *kwds)
{
+ PyObject *from_obj = NULL;
PyArray_Descr *d1 = NULL;
PyArray_Descr *d2 = NULL;
Bool ret;
PyObject *retobj = NULL;
- static char *kwlist[] = {"from", "to", NULL};
+ NPY_CASTING casting = NPY_SAFE_CASTING;
+ static char *kwlist[] = {"from", "to", "casting", NULL};
- if(!PyArg_ParseTupleAndKeywords(args, kwds, "O&O&", kwlist,
- PyArray_DescrConverter, &d1, PyArray_DescrConverter, &d2)) {
+ if(!PyArg_ParseTupleAndKeywords(args, kwds, "OO&|O&", kwlist,
+ &from_obj,
+ PyArray_DescrConverter2, &d2,
+ PyArray_CastingConverter, &casting)) {
goto finish;
}
- if (d1 == NULL || d2 == NULL) {
+ if (d2 == NULL) {
PyErr_SetString(PyExc_TypeError,
"did not understand one of the types; 'None' not accepted");
goto finish;
}
- ret = PyArray_CanCastTo(d1, d2);
+ /* If the first parameter is an object or scalar, use CanCastArrayTo */
+ if (PyArray_Check(from_obj)) {
+ ret = PyArray_CanCastArrayTo((PyArrayObject *)from_obj, d2, casting);
+ }
+ else if (PyArray_IsScalar(from_obj, Generic) ||
+ PyArray_IsPythonNumber(from_obj)) {
+ PyArrayObject *arr;
+ arr = (PyArrayObject *)PyArray_FromAny(from_obj,
+ NULL, 0, 0, 0, NULL);
+ if (arr == NULL) {
+ goto finish;
+ }
+ ret = PyArray_CanCastArrayTo(arr, d2, casting);
+ Py_DECREF(arr);
+ }
+ /* Otherwise use CanCastTypeTo */
+ else {
+ if (!PyArray_DescrConverter2(from_obj, &d1) || d1 == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "did not understand one of the types; 'None' not accepted");
+ goto finish;
+ }
+ ret = PyArray_CanCastTypeTo(d1, d2, casting);
+ }
+
retobj = ret ? Py_True : Py_False;
Py_INCREF(retobj);
@@ -2186,6 +2412,118 @@ array_can_cast_safely(PyObject *NPY_UNUSED(self), PyObject *args,
return retobj;
}
+static PyObject *
+array_promote_types(PyObject *NPY_UNUSED(dummy), PyObject *args)
+{
+ PyArray_Descr *d1 = NULL;
+ PyArray_Descr *d2 = NULL;
+ PyObject *ret = NULL;
+ if(!PyArg_ParseTuple(args, "O&O&",
+ PyArray_DescrConverter2, &d1, PyArray_DescrConverter2, &d2)) {
+ goto finish;
+ }
+
+ if (d1 == NULL || d2 == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "did not understand one of the types");
+ goto finish;
+ }
+
+ ret = (PyObject *)PyArray_PromoteTypes(d1, d2);
+
+ finish:
+ Py_XDECREF(d1);
+ Py_XDECREF(d2);
+ return ret;
+}
+
+static PyObject *
+array_min_scalar_type(PyObject *NPY_UNUSED(dummy), PyObject *args)
+{
+ PyObject *array_in = NULL;
+ PyArrayObject *array;
+ PyObject *ret = NULL;
+
+ if(!PyArg_ParseTuple(args, "O", &array_in)) {
+ return NULL;
+ }
+
+ array = (PyArrayObject *)PyArray_FromAny(array_in, NULL, 0, 0, 0, NULL);
+ if (array == NULL) {
+ return NULL;
+ }
+
+ ret = (PyObject *)PyArray_MinScalarType(array);
+ Py_DECREF(array);
+ return ret;
+}
+
+static PyObject *
+array_result_type(PyObject *NPY_UNUSED(dummy), PyObject *args)
+{
+ npy_intp i, len, narr = 0, ndtypes = 0;
+ PyArrayObject *arr[NPY_MAXARGS];
+ PyArray_Descr *dtypes[NPY_MAXARGS];
+ PyObject *ret = NULL;
+
+ len = PyTuple_GET_SIZE(args);
+ if (len == 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "at least one array or dtype is required");
+ goto finish;
+ }
+
+ for (i = 0; i < len; ++i) {
+ PyObject *obj = PyTuple_GET_ITEM(args, i);
+ if (PyArray_Check(obj)) {
+ if (narr == NPY_MAXARGS) {
+ PyErr_SetString(PyExc_ValueError,
+ "too many arguments");
+ goto finish;
+ }
+ Py_INCREF(obj);
+ arr[narr] = (PyArrayObject *)obj;
+ ++narr;
+ }
+ else if (PyArray_IsScalar(obj, Generic) ||
+ PyArray_IsPythonNumber(obj)) {
+ if (narr == NPY_MAXARGS) {
+ PyErr_SetString(PyExc_ValueError,
+ "too many arguments");
+ goto finish;
+ }
+ arr[narr] = (PyArrayObject *)PyArray_FromAny(obj,
+ NULL, 0, 0, 0, NULL);
+ if (arr[narr] == NULL) {
+ goto finish;
+ }
+ ++narr;
+ }
+ else {
+ if (ndtypes == NPY_MAXARGS) {
+ PyErr_SetString(PyExc_ValueError,
+ "too many arguments");
+ goto finish;
+ }
+ if (!PyArray_DescrConverter2(obj, &dtypes[ndtypes])) {
+ goto finish;
+ }
+ ++ndtypes;
+ }
+ }
+
+ ret = (PyObject *)PyArray_ResultType(narr, arr, ndtypes, dtypes);
+
+finish:
+ for (i = 0; i < narr; ++i) {
+ Py_DECREF(arr[i]);
+ }
+ for (i = 0; i < ndtypes; ++i) {
+ Py_DECREF(dtypes[i]);
+ }
+ return ret;
+}
+
#if !defined(NPY_PY3K)
static PyObject *
new_buffer(PyObject *NPY_UNUSED(dummy), PyObject *args)
@@ -2731,15 +3069,24 @@ static struct PyMethodDef array_module_methods[] = {
{"array",
(PyCFunction)_array_fromobject,
METH_VARARGS|METH_KEYWORDS, NULL},
+ {"nested_iters",
+ (PyCFunction)NpyIter_NestedIters,
+ METH_VARARGS|METH_KEYWORDS, NULL},
{"arange",
(PyCFunction)array_arange,
METH_VARARGS|METH_KEYWORDS, NULL},
{"zeros",
(PyCFunction)array_zeros,
METH_VARARGS|METH_KEYWORDS, NULL},
+ {"count_nonzero",
+ (PyCFunction)array_count_nonzero,
+ METH_VARARGS, NULL},
{"empty",
(PyCFunction)array_empty,
METH_VARARGS|METH_KEYWORDS, NULL},
+ {"empty_like",
+ (PyCFunction)array_empty_like,
+ METH_VARARGS|METH_KEYWORDS, NULL},
{"scalar",
(PyCFunction)array_scalar,
METH_VARARGS|METH_KEYWORDS, NULL},
@@ -2767,6 +3114,9 @@ static struct PyMethodDef array_module_methods[] = {
{"dot",
(PyCFunction)array_matrixproduct,
METH_VARARGS, NULL},
+ {"einsum",
+ (PyCFunction)array_einsum,
+ METH_VARARGS|METH_KEYWORDS, NULL},
{"_fastCopyAndTranspose",
(PyCFunction)array_fastCopyAndTranspose,
METH_VARARGS, NULL},
@@ -2785,6 +3135,15 @@ static struct PyMethodDef array_module_methods[] = {
{"can_cast",
(PyCFunction)array_can_cast_safely,
METH_VARARGS | METH_KEYWORDS, NULL},
+ {"promote_types",
+ (PyCFunction)array_promote_types,
+ METH_VARARGS, NULL},
+ {"min_scalar_type",
+ (PyCFunction)array_min_scalar_type,
+ METH_VARARGS, NULL},
+ {"result_type",
+ (PyCFunction)array_result_type,
+ METH_VARARGS, NULL},
#if !defined(NPY_PY3K)
{"newbuffer",
(PyCFunction)new_buffer,
@@ -3067,6 +3426,7 @@ PyMODINIT_FUNC initmultiarray(void) {
goto err;
}
PyArrayIter_Type.tp_iter = PyObject_SelfIter;
+ NpyIter_Type.tp_iter = PyObject_SelfIter;
PyArrayMultiIter_Type.tp_iter = PyObject_SelfIter;
PyArrayMultiIter_Type.tp_free = _pya_free;
if (PyType_Ready(&PyArrayIter_Type) < 0) {
@@ -3082,6 +3442,9 @@ PyMODINIT_FUNC initmultiarray(void) {
if (PyType_Ready(&PyArrayNeighborhoodIter_Type) < 0) {
return RETVAL;
}
+ if (PyType_Ready(&NpyIter_Type) < 0) {
+ return RETVAL;
+ }
PyArrayDescr_Type.tp_hash = PyArray_DescrHash;
if (PyType_Ready(&PyArrayDescr_Type) < 0) {
@@ -3159,6 +3522,8 @@ PyMODINIT_FUNC initmultiarray(void) {
Py_INCREF(&PyArrayIter_Type);
PyDict_SetItemString(d, "flatiter", (PyObject *)&PyArrayIter_Type);
Py_INCREF(&PyArrayMultiIter_Type);
+ PyDict_SetItemString(d, "newiter", (PyObject *)&NpyIter_Type);
+ Py_INCREF(&NpyIter_Type);
PyDict_SetItemString(d, "broadcast",
(PyObject *)&PyArrayMultiIter_Type);
Py_INCREF(&PyArrayDescr_Type);
diff --git a/numpy/core/src/multiarray/multiarraymodule_onefile.c b/numpy/core/src/multiarray/multiarraymodule_onefile.c
index ab6b4cc80..ceef0a56e 100644
--- a/numpy/core/src/multiarray/multiarraymodule_onefile.c
+++ b/numpy/core/src/multiarray/multiarraymodule_onefile.c
@@ -35,6 +35,12 @@
#include "conversion_utils.c"
#include "buffer.c"
+#include "new_iterator.c"
+#include "new_iterator_pywrap.c"
+#include "lowlevel_strided_loops.c"
+#include "dtype_transfer.c"
+#include "einsum.c"
+
#ifndef Py_UNICODE_WIDE
#include "ucsnarrow.c"
diff --git a/numpy/core/src/multiarray/new_iterator.c.src b/numpy/core/src/multiarray/new_iterator.c.src
new file mode 100644
index 000000000..89de0821b
--- /dev/null
+++ b/numpy/core/src/multiarray/new_iterator.c.src
@@ -0,0 +1,5990 @@
+/*
+ * This file implements the new, highly flexible iterator for NumPy.
+ *
+ * Copyright (c) 2010-2011 by Mark Wiebe (mwwiebe@gmail.com)
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#include "structmember.h"
+
+#define _MULTIARRAYMODULE
+#include <numpy/ndarrayobject.h>
+#include "convert_datatype.h"
+
+#include "lowlevel_strided_loops.h"
+
+/********** PRINTF DEBUG TRACING **************/
+#define NPY_IT_DBG_TRACING 0
+
+#if NPY_IT_DBG_TRACING
+#define NPY_IT_DBG_PRINTF(...) printf(__VA_ARGS__)
+#else
+#define NPY_IT_DBG_PRINTF(...)
+#endif
+/**********************************************/
+
+/* Rounds up a number of bytes to be divisible by sizeof intp */
+#if NPY_SIZEOF_INTP == 4
+#define NPY_INTP_ALIGNED(size) ((size + 0x3)&(-0x4))
+#else
+#define NPY_INTP_ALIGNED(size) ((size + 0x7)&(-0x8))
+#endif
+
+/* Internal iterator flags */
+
+/* The perm is the identity */
+#define NPY_ITFLAG_IDENTPERM 0x0001
+/* The perm has negative entries (indicating flipped axes) */
+#define NPY_ITFLAG_NEGPERM 0x0002
+/* The iterator is tracking an index */
+#define NPY_ITFLAG_HASINDEX 0x0004
+/* The iterator is tracking coordinates */
+#define NPY_ITFLAG_HASCOORDS 0x0008
+/* The iteration order was forced on construction */
+#define NPY_ITFLAG_FORCEDORDER 0x0010
+/* The inner loop is handled outside the iterator */
+#define NPY_ITFLAG_NOINNER 0x0020
+/* The iterator is ranged */
+#define NPY_ITFLAG_RANGE 0x0040
+/* The iterator is buffered */
+#define NPY_ITFLAG_BUFFER 0x0080
+/* The iterator should grow the buffered inner loop when possible */
+#define NPY_ITFLAG_GROWINNER 0x0100
+/* There is just one iteration, can specialize iternext for that */
+#define NPY_ITFLAG_ONEITERATION 0x0200
+/* Delay buffer allocation until first Reset* call */
+#define NPY_ITFLAG_DELAYBUF 0x0400
+/* Iteration needs API access during iternext */
+#define NPY_ITFLAG_NEEDSAPI 0x0800
+/* Iteration includes one or more operands being reduced */
+#define NPY_ITFLAG_REDUCE 0x1000
+/* Reduce iteration doesn't need to recalculate reduce loops next time */
+#define NPY_ITFLAG_REUSE_REDUCE_LOOPS 0x2000
+
+/* Internal iterator per-operand iterator flags */
+
+/* The operand will be written to */
+#define NPY_OP_ITFLAG_WRITE 0x01
+/* The operand will be read from */
+#define NPY_OP_ITFLAG_READ 0x02
+/* The operand needs type conversion/byte swapping/alignment */
+#define NPY_OP_ITFLAG_CAST 0x04
+/* The operand never needs buffering */
+#define NPY_OP_ITFLAG_BUFNEVER 0x08
+/* The operand is aligned */
+#define NPY_OP_ITFLAG_ALIGNED 0x10
+/* The operand is being reduced */
+#define NPY_OP_ITFLAG_REDUCE 0x20
+
+/*
+ * The data layout of the iterator is fully specified by
+ * a triple (itflags, ndim, niter). These three variables
+ * are expected to exist in all functions calling these macros,
+ * either as true variables initialized to the correct values
+ * from the iterator, or as constants in the case of specialized
+ * functions such as the various iternext functions.
+ */
+
+struct NpyIter_InternalOnly {
+ /* Initial fixed position data */
+ npy_uint32 itflags;
+ npy_uint16 ndim, niter;
+ npy_intp itersize, iterstart, iterend;
+ /* iterindex is only used if RANGED or BUFFERED is set */
+ npy_intp iterindex;
+ /* The rest is variable */
+ char iter_flexdata;
+};
+
+typedef struct NpyIter_AD NpyIter_AxisData;
+typedef struct NpyIter_BD NpyIter_BufferData;
+
+/* Byte sizes of the iterator members */
+#define NIT_PERM_SIZEOF(itflags, ndim, niter) \
+ NPY_INTP_ALIGNED(NPY_MAXDIMS)
+#define NIT_DTYPES_SIZEOF(itflags, ndim, niter) \
+ ((NPY_SIZEOF_INTP)*(niter))
+#define NIT_RESETDATAPTR_SIZEOF(itflags, ndim, niter) \
+ ((NPY_SIZEOF_INTP)*(niter+1))
+#define NIT_BASEOFFSETS_SIZEOF(itflags, ndim, niter) \
+ ((NPY_SIZEOF_INTP)*(niter+1))
+#define NIT_OPERANDS_SIZEOF(itflags, ndim, niter) \
+ ((NPY_SIZEOF_INTP)*(niter))
+#define NIT_OPITFLAGS_SIZEOF(itflags, ndim, niter) \
+ (NPY_INTP_ALIGNED(niter))
+#define NIT_BUFFERDATA_SIZEOF(itflags, ndim, niter) \
+ ((itflags&NPY_ITFLAG_BUFFER) ? ((NPY_SIZEOF_INTP)*(6 + 9*niter)) : 0)
+
+/* Byte offsets of the iterator members starting from iter->iter_flexdata */
+#define NIT_PERM_OFFSET() \
+ (0)
+#define NIT_DTYPES_OFFSET(itflags, ndim, niter) \
+ (NIT_PERM_OFFSET() + \
+ NIT_PERM_SIZEOF(itflags, ndim, niter))
+#define NIT_RESETDATAPTR_OFFSET(itflags, ndim, niter) \
+ (NIT_DTYPES_OFFSET(itflags, ndim, niter) + \
+ NIT_DTYPES_SIZEOF(itflags, ndim, niter))
+#define NIT_BASEOFFSETS_OFFSET(itflags, ndim, niter) \
+ (NIT_RESETDATAPTR_OFFSET(itflags, ndim, niter) + \
+ NIT_RESETDATAPTR_SIZEOF(itflags, ndim, niter))
+#define NIT_OPERANDS_OFFSET(itflags, ndim, niter) \
+ (NIT_BASEOFFSETS_OFFSET(itflags, ndim, niter) + \
+ NIT_BASEOFFSETS_SIZEOF(itflags, ndim, niter))
+#define NIT_OPITFLAGS_OFFSET(itflags, ndim, niter) \
+ (NIT_OPERANDS_OFFSET(itflags, ndim, niter) + \
+ NIT_OPERANDS_SIZEOF(itflags, ndim, niter))
+#define NIT_BUFFERDATA_OFFSET(itflags, ndim, niter) \
+ (NIT_OPITFLAGS_OFFSET(itflags, ndim, niter) + \
+ NIT_OPITFLAGS_SIZEOF(itflags, ndim, niter))
+#define NIT_AXISDATA_OFFSET(itflags, ndim, niter) \
+ (NIT_BUFFERDATA_OFFSET(itflags, ndim, niter) + \
+ NIT_BUFFERDATA_SIZEOF(itflags, ndim, niter))
+
+/* Internal-only ITERATOR DATA MEMBER ACCESS */
+#define NIT_ITFLAGS(iter) \
+ ((iter)->itflags)
+#define NIT_NDIM(iter) \
+ ((iter)->ndim)
+#define NIT_NITER(iter) \
+ ((iter)->niter)
+#define NIT_ITERSIZE(iter) \
+ (iter->itersize)
+#define NIT_ITERSTART(iter) \
+ (iter->iterstart)
+#define NIT_ITEREND(iter) \
+ (iter->iterend)
+#define NIT_ITERINDEX(iter) \
+ (iter->iterindex)
+#define NIT_PERM(iter) ((char*)( \
+ &(iter)->iter_flexdata + NIT_PERM_OFFSET()))
+#define NIT_DTYPES(iter) ((PyArray_Descr **)( \
+ &(iter)->iter_flexdata + NIT_DTYPES_OFFSET(itflags, ndim, niter)))
+#define NIT_RESETDATAPTR(iter) ((char **)( \
+ &(iter)->iter_flexdata + NIT_RESETDATAPTR_OFFSET(itflags, ndim, niter)))
+#define NIT_BASEOFFSETS(iter) ((npy_intp *)( \
+ &(iter)->iter_flexdata + NIT_BASEOFFSETS_OFFSET(itflags, ndim, niter)))
+#define NIT_OPERANDS(iter) ((PyArrayObject **)( \
+ &(iter)->iter_flexdata + NIT_OPERANDS_OFFSET(itflags, ndim, niter)))
+#define NIT_OPITFLAGS(iter) ( \
+ &(iter)->iter_flexdata + NIT_OPITFLAGS_OFFSET(itflags, ndim, niter))
+#define NIT_BUFFERDATA(iter) ((NpyIter_BufferData *)( \
+ &(iter)->iter_flexdata + NIT_BUFFERDATA_OFFSET(itflags, ndim, niter)))
+#define NIT_AXISDATA(iter) ((NpyIter_AxisData *)( \
+ &(iter)->iter_flexdata + NIT_AXISDATA_OFFSET(itflags, ndim, niter)))
+
+/* Internal-only BUFFERDATA MEMBER ACCESS */
+struct NpyIter_BD {
+ npy_intp buffersize, size, bufiterend,
+ reduce_pos, reduce_outersize, reduce_outerdim;
+ npy_intp bd_flexdata;
+};
+#define NBF_BUFFERSIZE(bufferdata) ((bufferdata)->buffersize)
+#define NBF_SIZE(bufferdata) ((bufferdata)->size)
+#define NBF_BUFITEREND(bufferdata) ((bufferdata)->bufiterend)
+#define NBF_REDUCE_POS(bufferdata) ((bufferdata)->reduce_pos)
+#define NBF_REDUCE_OUTERSIZE(bufferdata) ((bufferdata)->reduce_outersize)
+#define NBF_REDUCE_OUTERDIM(bufferdata) ((bufferdata)->reduce_outerdim)
+#define NBF_STRIDES(bufferdata) ( \
+ &(bufferdata)->bd_flexdata + 0)
+#define NBF_PTRS(bufferdata) ((char **) \
+ (&(bufferdata)->bd_flexdata + 1*(niter)))
+#define NBF_REDUCE_OUTERSTRIDES(bufferdata) ( \
+ (&(bufferdata)->bd_flexdata + 2*(niter)))
+#define NBF_REDUCE_OUTERPTRS(bufferdata) ((char **) \
+ (&(bufferdata)->bd_flexdata + 3*(niter)))
+#define NBF_READTRANSFERFN(bufferdata) ((PyArray_StridedTransferFn **) \
+ (&(bufferdata)->bd_flexdata + 4*(niter)))
+#define NBF_READTRANSFERDATA(bufferdata) ((void **) \
+ (&(bufferdata)->bd_flexdata + 5*(niter)))
+#define NBF_WRITETRANSFERFN(bufferdata) ((PyArray_StridedTransferFn **) \
+ (&(bufferdata)->bd_flexdata + 6*(niter)))
+#define NBF_WRITETRANSFERDATA(bufferdata) ((void **) \
+ (&(bufferdata)->bd_flexdata + 7*(niter)))
+#define NBF_BUFFERS(bufferdata) ((char **) \
+ (&(bufferdata)->bd_flexdata + 8*(niter)))
+
+/* Internal-only AXISDATA MEMBER ACCESS. */
+struct NpyIter_AD {
+ npy_intp shape, coord;
+ npy_intp ad_flexdata;
+};
+#define NAD_SHAPE(axisdata) ((axisdata)->shape)
+#define NAD_COORD(axisdata) ((axisdata)->coord)
+#define NAD_STRIDES(axisdata) ( \
+ &(axisdata)->ad_flexdata + 0)
+#define NAD_PTRS(axisdata) ((char **) \
+ &(axisdata)->ad_flexdata + 1*(niter+1))
+
+#define NAD_NSTRIDES() \
+ ((niter) + ((itflags&NPY_ITFLAG_HASINDEX) ? 1 : 0))
+
+/* Size of one AXISDATA struct within the iterator */
+#define NIT_AXISDATA_SIZEOF(itflags, ndim, niter) (( \
+ /* intp shape */ \
+ 1 + \
+ /* intp coord */ \
+ 1 + \
+ /* intp stride[niter+1] AND char* ptr[niter+1] */ \
+ 2*((niter)+1) \
+ )*NPY_SIZEOF_INTP )
+
+/*
+ * Macro to advance an AXISDATA pointer by a specified count.
+ * Requires that sizeof_axisdata be previously initialized
+ * to NIT_AXISDATA_SIZEOF(itflags, ndim, niter).
+ */
+#define NIT_ADVANCE_AXISDATA(axisdata, count) \
+ (*((char **)(&axisdata))) += (count)*sizeof_axisdata
+#define NIT_INDEX_AXISDATA(axisdata, index) ((NpyIter_AxisData *) \
+ (((char *)(axisdata)) + (index)*sizeof_axisdata))
+
+/* Size of the whole iterator */
+#define NIT_SIZEOF_ITERATOR(itflags, ndim, niter) ( \
+ sizeof(struct NpyIter_InternalOnly) + \
+ NIT_AXISDATA_OFFSET(itflags, ndim, niter) + \
+ NIT_AXISDATA_SIZEOF(itflags, ndim, niter)*(ndim))
+
+/* Internal helper functions */
+static int
+npyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags);
+static int
+npyiter_check_op_axes(npy_intp niter, npy_intp oa_ndim, npy_intp **op_axes);
+static npy_intp
+npyiter_calculate_ndim(npy_intp niter, PyArrayObject **op_in,
+ npy_intp oa_ndim);
+static int
+npyiter_check_per_op_flags(npy_uint32 flags, char *op_itflags);
+static int
+npyiter_prepare_one_operand(PyArrayObject **op,
+ char **op_dataptr,
+ PyArray_Descr *op_request_dtype,
+ PyArray_Descr** op_dtype,
+ npy_uint32 flags,
+ npy_uint32 op_flags, char *op_itflags);
+static int
+npyiter_prepare_operands(npy_intp niter, PyArrayObject **op_in,
+ PyArrayObject **op,
+ char **op_dataptr,
+ PyArray_Descr **op_request_dtypes,
+ PyArray_Descr **op_dtype,
+ npy_uint32 flags,
+ npy_uint32 *op_flags, char *op_itflags);
+static int
+npyiter_check_casting(npy_intp niter, PyArrayObject **op,
+ PyArray_Descr **op_dtype,
+ NPY_CASTING casting,
+ char *op_itflags);
+static int
+npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags,
+ char **op_dataptr,
+ npy_uint32 *op_flags, npy_intp **op_axes,
+ int output_scalars);
+static void
+npyiter_replace_axisdata(NpyIter *iter, npy_intp iiter,
+ PyArrayObject *op,
+ npy_intp op_ndim, char *op_dataptr,
+ npy_intp *op_axes);
+static void
+npyiter_compute_index_strides(NpyIter *iter, npy_uint32 flags);
+static void
+npyiter_apply_forced_iteration_order(NpyIter *iter, NPY_ORDER order);
+
+static void
+npyiter_flip_negative_strides(NpyIter *iter);
+static void
+npyiter_reverse_axis_ordering(NpyIter *iter);
+static void
+npyiter_find_best_axis_ordering(NpyIter *iter);
+static void
+npyiter_coalesce_axes(NpyIter *iter);
+
+static PyArray_Descr *
+npyiter_get_common_dtype(npy_intp niter, PyArrayObject **op,
+ char *op_itflags, PyArray_Descr **op_dtype,
+ PyArray_Descr **op_request_dtypes,
+ int only_inputs, int output_scalars);
+
+static PyArrayObject *
+npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype,
+ npy_uint32 flags, char *op_itflags,
+ npy_intp op_ndim, npy_intp *shape,
+ PyArray_Descr *op_dtype, npy_intp *op_axes);
+static int
+npyiter_allocate_arrays(NpyIter *iter,
+ npy_uint32 flags,
+ PyArray_Descr **op_dtype, PyTypeObject *subtype,
+ npy_uint32 *op_flags, char *op_itflags,
+ npy_intp **op_axes, int output_scalars);
+static void
+npyiter_get_priority_subtype(npy_intp niter, PyArrayObject **op,
+ char *op_itflags,
+ double *subtype_priority, PyTypeObject **subtype);
+
+static int
+npyiter_allocate_transfer_functions(NpyIter *iter);
+static int
+npyiter_allocate_buffers(NpyIter *iter, char **errmsg);
+static void npyiter_goto_iterindex(NpyIter *iter, npy_intp iterindex);
+static void
+npyiter_copy_from_buffers(NpyIter *iter);
+static void
+npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs);
+static npy_intp
+npyiter_checkreducesize(NpyIter *iter, npy_intp count,
+ npy_intp *reduce_innersize,
+ npy_intp *reduce_outerdim);
+
+/*NUMPY_API
+ * Allocate a new iterator for multiple array objects
+ */
+NPY_NO_EXPORT NpyIter *
+NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
+ NPY_ORDER order, NPY_CASTING casting,
+ npy_uint32 *op_flags,
+ PyArray_Descr **op_request_dtypes,
+ npy_intp oa_ndim, npy_intp **op_axes, npy_intp buffersize)
+{
+ npy_uint32 itflags = NPY_ITFLAG_IDENTPERM;
+ npy_intp idim, ndim;
+ npy_intp iiter;
+
+ /* The iterator being constructed */
+ NpyIter *iter;
+
+ /* Per-operand values */
+ PyArrayObject **op;
+ PyArray_Descr **op_dtype;
+ char *op_itflags;
+ char **op_dataptr;
+
+ char *perm;
+ NpyIter_BufferData *bufferdata = NULL;
+ int any_allocate = 0, any_missing_dtypes = 0,
+ output_scalars = 0, need_subtype = 0;
+
+ /* The subtype for automatically allocated outputs */
+ double subtype_priority = NPY_PRIORITY;
+ PyTypeObject *subtype = &PyArray_Type;
+
+ if (niter > NPY_MAXARGS) {
+ PyErr_Format(PyExc_ValueError,
+ "Cannot construct an iterator with more than %d operands "
+ "(%d were requested)", (int)NPY_MAXARGS, (int)niter);
+ return NULL;
+ }
+
+ /* Error check 'oa_ndim' and 'op_axes', which must be used together */
+ if (!npyiter_check_op_axes(niter, oa_ndim, op_axes)) {
+ return NULL;
+ }
+
+ /* Check the global iterator flags */
+ if (!npyiter_check_global_flags(flags, &itflags)) {
+ return NULL;
+ }
+
+ /* Calculate how many dimensions the iterator should have */
+ ndim = npyiter_calculate_ndim(niter, op_in, oa_ndim);
+
+ /* If 'ndim' is zero, any outputs should be scalars */
+ if (ndim == 0) {
+ output_scalars = 1;
+ ndim = 1;
+ }
+
+ /* Allocate memory for the iterator */
+ iter = (NpyIter*)
+ PyArray_malloc(NIT_SIZEOF_ITERATOR(itflags, ndim, niter));
+
+ /* Fill in the basic data */
+ NIT_ITFLAGS(iter) = itflags;
+ NIT_NDIM(iter) = ndim;
+ NIT_NITER(iter) = niter;
+ NIT_ITERINDEX(iter) = 0;
+ memset(NIT_BASEOFFSETS(iter), 0, (niter+1)*NPY_SIZEOF_INTP);
+
+ op = NIT_OPERANDS(iter);
+ op_dtype = NIT_DTYPES(iter);
+ op_itflags = NIT_OPITFLAGS(iter);
+ op_dataptr = NIT_RESETDATAPTR(iter);
+
+ /* Prepare all the operands */
+ if (!npyiter_prepare_operands(niter, op_in, op, op_dataptr,
+ op_request_dtypes, op_dtype,
+ flags,
+ op_flags, op_itflags)) {
+ PyArray_free(iter);
+ return NULL;
+ }
+ /* Set resetindex to zero as well (it's just after the resetdataptr) */
+ op_dataptr[niter] = 0;
+
+ /*
+ * Initialize buffer data (must set the buffers and transferdata
+ * to NULL before we might deallocate the iterator).
+ */
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ bufferdata = NIT_BUFFERDATA(iter);
+ NBF_SIZE(bufferdata) = 0;
+ memset(NBF_BUFFERS(bufferdata), 0, niter*NPY_SIZEOF_INTP);
+ memset(NBF_READTRANSFERDATA(bufferdata), 0, niter*NPY_SIZEOF_INTP);
+ memset(NBF_WRITETRANSFERDATA(bufferdata), 0, niter*NPY_SIZEOF_INTP);
+ }
+
+ /* Fill in the AXISDATA arrays and set the ITERSIZE field */
+ if (!npyiter_fill_axisdata(iter, flags, op_itflags, op_dataptr,
+ op_flags, op_axes, output_scalars)) {
+ NpyIter_Deallocate(iter);
+ return NULL;
+ }
+
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ /*
+ * If buffering is enabled and no buffersize was given, use a default
+ * chosen to be big enough to get some amortization benefits, but
+ * small enough to be cache-friendly.
+ */
+ if (buffersize <= 0) {
+ buffersize = 1 << 12;
+ }
+ /* No point in a buffer bigger than the iteration size */
+ if (buffersize > NIT_ITERSIZE(iter)) {
+ buffersize = NIT_ITERSIZE(iter);
+ }
+ NBF_BUFFERSIZE(bufferdata) = buffersize;
+ }
+
+ /*
+ * If an index was requested, compute the strides for it.
+ * Note that we must do this before changing the order of the
+ * axes
+ */
+ npyiter_compute_index_strides(iter, flags);
+
+ /* Initialize the perm to the identity */
+ perm = NIT_PERM(iter);
+ for(idim = 0; idim < ndim; ++idim) {
+ perm[idim] = (char)idim;
+ }
+
+ /*
+ * If an iteration order is being forced, apply it.
+ */
+ npyiter_apply_forced_iteration_order(iter, order);
+ itflags = NIT_ITFLAGS(iter);
+
+ /* Set some flags for allocated outputs */
+ for (iiter = 0; iiter < niter; ++iiter) {
+ if (op[iiter] == NULL) {
+ /* Flag this so later we can avoid flipping axes */
+ any_allocate = 1;
+ /* If a subtype may be used, indicate so */
+ if (!(op_flags[iiter]&NPY_ITER_NO_SUBTYPE)) {
+ need_subtype = 1;
+ }
+ /*
+ * If the data type wasn't provided, will need to
+ * calculate it.
+ */
+ if (op_dtype[iiter] == NULL) {
+ any_missing_dtypes = 1;
+ }
+ }
+ }
+
+ /*
+ * If the ordering was not forced, reorder the axes
+ * and flip negative strides to find the best one.
+ */
+ if (!(itflags&NPY_ITFLAG_FORCEDORDER)) {
+ if (ndim > 1) {
+ npyiter_find_best_axis_ordering(iter);
+ }
+ /*
+ * If there's an output being allocated, we must not negate
+ * any strides.
+ */
+ if (!any_allocate && !(flags&NPY_ITER_DONT_REVERSE_AXES)) {
+ npyiter_flip_negative_strides(iter);
+ }
+ itflags = NIT_ITFLAGS(iter);
+ }
+
+ if (need_subtype) {
+ npyiter_get_priority_subtype(niter, op, op_itflags,
+ &subtype_priority, &subtype);
+ }
+
+ /*
+ * If an automatically allocated output didn't have a specified
+ * dtype, we need to figure it out now, before allocating the outputs.
+ */
+ if (any_missing_dtypes || (flags&NPY_ITER_COMMON_DTYPE)) {
+ PyArray_Descr *dtype;
+ int only_inputs = !(flags&NPY_ITER_COMMON_DTYPE);
+
+ op = NIT_OPERANDS(iter);
+ op_dtype = NIT_DTYPES(iter);
+
+ dtype = npyiter_get_common_dtype(niter, op,
+ op_itflags, op_dtype,
+ op_request_dtypes,
+ only_inputs,
+ output_scalars);
+ if (dtype == NULL) {
+ NpyIter_Deallocate(iter);
+ return NULL;
+ }
+ if (flags&NPY_ITER_COMMON_DTYPE) {
+ NPY_IT_DBG_PRINTF("Iterator: Replacing all data types\n");
+ /* Replace all the data types */
+ for (iiter = 0; iiter < niter; ++iiter) {
+ Py_XDECREF(op_dtype[iiter]);
+ Py_INCREF(dtype);
+ op_dtype[iiter] = dtype;
+ }
+ }
+ else {
+ NPY_IT_DBG_PRINTF("Iterator: Setting unset output data types\n");
+ /* Replace the NULL data types */
+ for (iiter = 0; iiter < niter; ++iiter) {
+ if (op_dtype[iiter] == NULL) {
+ Py_INCREF(dtype);
+ op_dtype[iiter] = dtype;
+ }
+ }
+ }
+ Py_DECREF(dtype);
+ }
+
+ /*
+ * All of the data types have been settled, so it's time
+ * to check that data type conversions are following the
+ * casting rules.
+ */
+ if (!npyiter_check_casting(niter, op, op_dtype, casting, op_itflags)) {
+ NpyIter_Deallocate(iter);
+ return NULL;
+ }
+
+ /*
+ * At this point, the iteration order has been finalized, so
+ * any allocation of ops that were NULL, or any temporary
+ * copying due to casting/byte order/alignment can be
+ * done now using a memory layout matching the iterator.
+ */
+ if (!npyiter_allocate_arrays(iter, flags, op_dtype, subtype, op_flags,
+ op_itflags, op_axes, output_scalars)) {
+ NpyIter_Deallocate(iter);
+ return NULL;
+ }
+
+ /*
+ * Finally, if coords weren't requested,
+ * it may be possible to coalesce some axes together.
+ */
+ if (ndim > 1 && !(itflags&NPY_ITFLAG_HASCOORDS)) {
+ npyiter_coalesce_axes(iter);
+ /*
+ * The operation may have changed the layout, so we have to
+ * get the internal pointers again.
+ */
+ itflags = NIT_ITFLAGS(iter);
+ ndim = NIT_NDIM(iter);
+ op = NIT_OPERANDS(iter);
+ op_dtype = NIT_DTYPES(iter);
+ op_itflags = NIT_OPITFLAGS(iter);
+ op_dataptr = NIT_RESETDATAPTR(iter);
+ }
+
+ /*
+ * Now that the axes are finished, check whether we can apply
+ * the single iteration optimization to the iternext function.
+ */
+ if (!(itflags&NPY_ITFLAG_BUFFER)) {
+ NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+ if (itflags&NPY_ITFLAG_NOINNER) {
+ if (NIT_ITERSIZE(iter) == NAD_SHAPE(axisdata)) {
+ NIT_ITFLAGS(iter) |= NPY_ITFLAG_ONEITERATION;
+ }
+ }
+ else if (NIT_ITERSIZE(iter) == 1) {
+ NIT_ITFLAGS(iter) |= NPY_ITFLAG_ONEITERATION;
+ }
+ }
+
+ /*
+ * If REFS_OK was specified, check whether there are any
+ * reference arrays and flag it if so.
+ */
+ if (flags&NPY_ITER_REFS_OK) {
+ for (iiter = 0; iiter < niter; ++iiter) {
+ PyArray_Descr *rdt = op_dtype[iiter];
+ if ((rdt->flags&(NPY_ITEM_REFCOUNT|
+ NPY_ITEM_IS_POINTER|
+ NPY_NEEDS_PYAPI)) != 0) {
+ /* Iteration needs API access */
+ NIT_ITFLAGS(iter) |= NPY_ITFLAG_NEEDSAPI;
+ }
+ }
+ }
+
+ /* If buffering is set without delayed allocation */
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ if (!npyiter_allocate_transfer_functions(iter)) {
+ NpyIter_Deallocate(iter);
+ return NULL;
+ }
+ if (itflags&NPY_ITFLAG_DELAYBUF) {
+ bufferdata = NIT_BUFFERDATA(iter);
+ /* Make the data pointers NULL */
+ memset(NBF_PTRS(bufferdata), 0, niter*NPY_SIZEOF_INTP);
+ }
+ else {
+ /* Allocate the buffers */
+ if (!npyiter_allocate_buffers(iter, NULL)) {
+ NpyIter_Deallocate(iter);
+ return NULL;
+ }
+
+ /* Prepare the next buffers and set iterend/size */
+ npyiter_copy_to_buffers(iter, NULL);
+ }
+ }
+
+ return iter;
+}
+
+/*NUMPY_API
+ * Allocate a new iterator for one array object
+ */
+NPY_NO_EXPORT NpyIter *
+NpyIter_New(PyArrayObject *op, npy_uint32 flags,
+ NPY_ORDER order, NPY_CASTING casting,
+ PyArray_Descr* dtype,
+ npy_intp a_ndim, npy_intp *axes, npy_intp buffersize)
+{
+ /* Split the flags into separate global and op flags */
+ npy_uint32 op_flags = flags&NPY_ITER_PER_OP_FLAGS;
+ flags &= NPY_ITER_GLOBAL_FLAGS;
+
+ if (a_ndim > 0) {
+ return NpyIter_MultiNew(1, &op, flags, order, casting,
+ &op_flags, &dtype,
+ a_ndim, &axes, buffersize);
+ }
+ else {
+ return NpyIter_MultiNew(1, &op, flags, order, casting,
+ &op_flags, &dtype,
+ 0, NULL, buffersize);
+ }
+}
+
+/*NUMPY_API
+ * Makes a copy of the iterator
+ */
+NPY_NO_EXPORT NpyIter *
+NpyIter_Copy(NpyIter *iter)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ npy_intp ndim = NIT_NDIM(iter);
+ npy_intp iiter, niter = NIT_NITER(iter);
+ int out_of_memory = 0;
+
+ npy_intp size;
+ NpyIter *newiter;
+ PyArrayObject **objects;
+ PyArray_Descr **dtypes;
+
+ /* Allocate memory for the new iterator */
+ size = NIT_SIZEOF_ITERATOR(itflags, ndim, niter);
+ newiter = (NpyIter*)PyArray_malloc(size);
+
+ /* Copy the raw values to the new iterator */
+ memcpy(newiter, iter, size);
+
+ /* Take ownership of references to the operands and dtypes */
+ objects = NIT_OPERANDS(newiter);
+ dtypes = NIT_DTYPES(newiter);
+ for (iiter = 0; iiter < niter; ++iiter) {
+ Py_INCREF(objects[iiter]);
+ Py_INCREF(dtypes[iiter]);
+ }
+
+ /* Allocate buffers and make copies of the transfer data if necessary */
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ NpyIter_BufferData *bufferdata;
+ npy_intp buffersize, itemsize;
+ char **buffers;
+ void **readtransferdata, **writetransferdata;
+
+ bufferdata = NIT_BUFFERDATA(newiter);
+ buffers = NBF_BUFFERS(bufferdata);
+ readtransferdata = NBF_READTRANSFERDATA(bufferdata);
+ writetransferdata = NBF_WRITETRANSFERDATA(bufferdata);
+ buffersize = NBF_BUFFERSIZE(bufferdata);
+
+ for (iiter = 0; iiter < niter; ++iiter) {
+ if (buffers[iiter] != NULL) {
+ if (out_of_memory) {
+ buffers[iiter] = NULL;
+ }
+ else {
+ itemsize = dtypes[iiter]->elsize;
+ buffers[iiter] = PyArray_malloc(itemsize*buffersize);
+ if (buffers[iiter] == NULL) {
+ out_of_memory = 1;
+ }
+ }
+ }
+
+ if (readtransferdata[iiter] != NULL) {
+ if (out_of_memory) {
+ readtransferdata[iiter] = NULL;
+ }
+ else {
+ readtransferdata[iiter] =
+ PyArray_CopyStridedTransferData(readtransferdata[iiter]);
+ if (readtransferdata[iiter] == NULL) {
+ out_of_memory = 1;
+ }
+ }
+ }
+
+ if (writetransferdata[iiter] != NULL) {
+ if (out_of_memory) {
+ writetransferdata[iiter] = NULL;
+ }
+ else {
+ writetransferdata[iiter] =
+ PyArray_CopyStridedTransferData(writetransferdata[iiter]);
+ if (writetransferdata[iiter] == NULL) {
+ out_of_memory = 1;
+ }
+ }
+ }
+ }
+
+ /* Initialize the buffers to the current iterindex */
+ if (!out_of_memory && NBF_SIZE(bufferdata) > 0) {
+ npyiter_goto_iterindex(newiter, NIT_ITERINDEX(newiter));
+
+ /* Prepare the next buffers and set iterend/size */
+ npyiter_copy_to_buffers(newiter, NULL);
+ }
+ }
+
+ if (out_of_memory) {
+ NpyIter_Deallocate(newiter);
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ return newiter;
+}
+
+/*NUMPY_API
+ * Deallocate an iterator
+ */
+NPY_NO_EXPORT int
+NpyIter_Deallocate(NpyIter *iter)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ /*npy_intp ndim = NIT_NDIM(iter);*/
+ npy_intp iiter, niter = NIT_NITER(iter);
+
+ PyArray_Descr **dtype = NIT_DTYPES(iter);
+ PyArrayObject **object = NIT_OPERANDS(iter);
+
+ /* Deallocate any buffers and buffering data */
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+ char **buffers;
+ void **transferdata;
+
+ /* buffers */
+ buffers = NBF_BUFFERS(bufferdata);
+ for(iiter = 0; iiter < niter; ++iiter, ++buffers) {
+ if (*buffers) {
+ PyArray_free(*buffers);
+ }
+ }
+ /* read bufferdata */
+ transferdata = NBF_READTRANSFERDATA(bufferdata);
+ for(iiter = 0; iiter < niter; ++iiter, ++transferdata) {
+ if (*transferdata) {
+ PyArray_FreeStridedTransferData(*transferdata);
+ }
+ }
+ /* write bufferdata */
+ transferdata = NBF_WRITETRANSFERDATA(bufferdata);
+ for(iiter = 0; iiter < niter; ++iiter, ++transferdata) {
+ if (*transferdata) {
+ PyArray_FreeStridedTransferData(*transferdata);
+ }
+ }
+ }
+
+ /* Deallocate all the dtypes and objects that were iterated */
+ for(iiter = 0; iiter < niter; ++iiter, ++dtype, ++object) {
+ Py_XDECREF(*dtype);
+ Py_XDECREF(*object);
+ }
+
+ /* Deallocate the iterator memory */
+ PyArray_free(iter);
+
+ return NPY_SUCCEED;
+}
+
+/*NUMPY_API
+ * Removes an axis from iteration. This requires that NPY_ITER_COORDS
+ * was set for iterator creation, and does not work if buffering is
+ * enabled. This function also resets the iterator to its initial state.
+ *
+ * Returns NPY_SUCCEED or NPY_FAIL.
+ */
+NPY_NO_EXPORT int
+NpyIter_RemoveAxis(NpyIter *iter, npy_intp axis)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ npy_intp idim, ndim = NIT_NDIM(iter);
+ npy_intp iiter, niter = NIT_NITER(iter);
+
+ npy_intp xdim = 0;
+ char *perm = NIT_PERM(iter);
+ NpyIter_AxisData *axisdata0 = NIT_AXISDATA(iter), *axisdata;
+ npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+
+ npy_intp *baseoffsets = NIT_BASEOFFSETS(iter);
+ char **resetdataptr = NIT_RESETDATAPTR(iter);
+
+ if (!(itflags&NPY_ITFLAG_HASCOORDS)) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "Iterator RemoveAxis may only be called "
+ "if coordinates are being tracked");
+ return NPY_FAIL;
+ }
+ else if (itflags&NPY_ITFLAG_HASINDEX) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "Iterator RemoveAxis may not be called on "
+ "an index is being tracked");
+ return NPY_FAIL;
+ }
+ else if (itflags&NPY_ITFLAG_BUFFER) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "Iterator RemoveAxis may not be called on "
+ "a buffered iterator");
+ return NPY_FAIL;
+ }
+ else if (axis < 0 || axis >= ndim) {
+ PyErr_SetString(PyExc_ValueError,
+ "axis out of bounds in iterator RemoveAxis");
+ return NPY_FAIL;
+ }
+
+ /* Reverse axis, since the iterator treats them that way */
+ axis = ndim-1-axis;
+
+ /* First find the axis in question */
+ for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata0, 1)) {
+ /* If this is it, and it's iterated forward, done */
+ if (perm[idim] == axis) {
+ xdim = idim;
+ break;
+ }
+ /* If this is it, but it's iterated backward, must reverse the axis */
+ else if (-1-perm[idim] == axis) {
+ npy_intp *strides = NAD_STRIDES(axisdata0);
+ npy_intp shape = NAD_SHAPE(axisdata0), offset;
+
+ xdim = idim;
+
+ /*
+ * Adjust baseoffsets and resetbaseptr back to the start of
+ * this axis.
+ */
+ for (iiter = 0; iiter < niter; ++iiter) {
+ offset = (shape-1)*strides[iiter];
+ baseoffsets[iiter] += offset;
+ resetdataptr[iiter] += offset;
+ }
+ break;
+ }
+ }
+
+ if (idim == ndim) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "internal error in iterator perm");
+ return NPY_FAIL;
+ }
+
+ /* Adjust the permutation */
+ for (idim = 0; idim < ndim-1; ++idim) {
+ char p = (idim < xdim) ? perm[idim] : perm[idim+1];
+ if (p >= 0) {
+ if (p > axis) {
+ --p;
+ }
+ }
+ else if (p <= 0) {
+ if (p < -1-axis) {
+ ++p;
+ }
+ }
+ perm[idim] = p;
+ }
+
+ /* Adjust the iteration size */
+ NIT_ITERSIZE(iter) /= NAD_SHAPE(axisdata0);
+
+ /* Shift all the axisdata structures by one */
+ axisdata = NIT_INDEX_AXISDATA(axisdata0, 1);
+ memmove(axisdata0, axisdata, (ndim-1-xdim)*sizeof_axisdata);
+
+ /* If there is more than one dimension, shrink the iterator */
+ if (ndim > 1) {
+ NIT_NDIM(iter) = ndim-1;
+ }
+ /* Otherwise convert it to a singleton dimension */
+ else {
+ npy_intp *strides = NAD_STRIDES(axisdata0);
+ NAD_SHAPE(axisdata0) = 1;
+ for (iiter = 0; iiter < niter; ++iiter) {
+ strides[iiter] = 0;
+ }
+ NIT_ITFLAGS(iter) |= NPY_ITFLAG_ONEITERATION;
+ }
+
+ return NpyIter_Reset(iter, NULL);
+}
+
+/*NUMPY_API
+ * Removes coords support from an iterator.
+ *
+ * Returns NPY_SUCCEED or NPY_FAIL.
+ */
+NPY_NO_EXPORT int
+NpyIter_RemoveCoords(NpyIter *iter)
+{
+ npy_uint32 itflags;
+
+ /* Make sure the iterator is reset */
+ if (NpyIter_Reset(iter, NULL) != NPY_SUCCEED) {
+ return NPY_FAIL;
+ }
+
+ itflags = NIT_ITFLAGS(iter);
+ if (itflags&NPY_ITFLAG_HASCOORDS) {
+ NIT_ITFLAGS(iter) = itflags & ~NPY_ITFLAG_HASCOORDS;
+ npyiter_coalesce_axes(iter);
+ }
+
+ return NPY_SUCCEED;
+}
+
+/*NUMPY_API
+ * Removes the inner loop handling (so HasInnerLoop returns false)
+ */
+NPY_NO_EXPORT int
+NpyIter_RemoveInnerLoop(NpyIter *iter)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ /*npy_intp ndim = NIT_NDIM(iter);*/
+ npy_intp niter = NIT_NITER(iter);
+
+ /* Check conditions under which this can be done */
+ if (itflags&(NPY_ITFLAG_HASINDEX|NPY_ITFLAG_HASCOORDS)) {
+ PyErr_SetString(PyExc_ValueError,
+ "Iterator flag NO_INNER_ITERATION cannot be used "
+ "if coords or an index is being tracked");
+ return NPY_FAIL;
+ }
+ if ((itflags&(NPY_ITFLAG_BUFFER|NPY_ITFLAG_RANGE|NPY_ITFLAG_NOINNER))
+ == (NPY_ITFLAG_RANGE|NPY_ITFLAG_NOINNER)) {
+ PyErr_SetString(PyExc_ValueError,
+ "Iterator flag NO_INNER_ITERATION cannot be used "
+ "with ranged iteration unless buffering is also enabled");
+ return NPY_FAIL;
+ }
+ /* Set the flag */
+ if (!(itflags&NPY_ITFLAG_NOINNER)) {
+ itflags |= NPY_ITFLAG_NOINNER;
+ NIT_ITFLAGS(iter) = itflags;
+
+ /*
+ * Check whether we can apply the single iteration
+ * optimization to the iternext function.
+ */
+ if (!(itflags&NPY_ITFLAG_BUFFER)) {
+ NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+ if (NIT_ITERSIZE(iter) == NAD_SHAPE(axisdata)) {
+ NIT_ITFLAGS(iter) |= NPY_ITFLAG_ONEITERATION;
+ }
+ }
+ }
+
+ /* Reset the iterator */
+ return NpyIter_Reset(iter, NULL);
+}
+
+/*NUMPY_API
+ * Resets the iterator to its initial state
+ *
+ * If errmsg is non-NULL, it should point to a variable which will
+ * receive the error message, and no Python exception will be set.
+ * This is so that the function can be called from code not holding
+ * the GIL.
+ *
+ * Returns NPY_SUCCEED, or NPY_FAIL if delayed buffer allocation fails
+ * (the only failure path).
+ */
+NPY_NO_EXPORT int
+NpyIter_Reset(NpyIter *iter, char **errmsg)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+    npy_intp niter = NIT_NITER(iter);
+
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        NpyIter_BufferData *bufferdata;
+
+        /* If buffer allocation was delayed, do it now */
+        if (itflags&NPY_ITFLAG_DELAYBUF) {
+            if (!npyiter_allocate_buffers(iter, errmsg)) {
+                return NPY_FAIL;
+            }
+            NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_DELAYBUF;
+        }
+        else {
+            /*
+             * If the iterindex is already right, no need to
+             * do anything (the current buffer contents are still valid)
+             */
+            bufferdata = NIT_BUFFERDATA(iter);
+            if (NIT_ITERINDEX(iter) == NIT_ITERSTART(iter) &&
+                    NBF_BUFITEREND(bufferdata) <= NIT_ITEREND(iter) &&
+                    NBF_SIZE(bufferdata) > 0) {
+                return NPY_SUCCEED;
+            }
+
+            /* Copy any data from the buffers back to the arrays */
+            npyiter_copy_from_buffers(iter);
+        }
+    }
+
+    /* Move all the per-axis pointers/coords back to ITERSTART */
+    npyiter_goto_iterindex(iter, NIT_ITERSTART(iter));
+
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        /* Prepare the next buffers and set iterend/size */
+        npyiter_copy_to_buffers(iter, NULL);
+    }
+
+    return NPY_SUCCEED;
+}
+
+/*NUMPY_API
+ * Resets the iterator to its initial state, with new base data pointers
+ *
+ * 'baseptrs' must hold one pointer per operand (niter entries); each is
+ * combined with the iterator's stored per-operand base offset.
+ *
+ * If errmsg is non-NULL, it should point to a variable which will
+ * receive the error message, and no Python exception will be set.
+ * This is so that the function can be called from code not holding
+ * the GIL.
+ */
+NPY_NO_EXPORT int
+NpyIter_ResetBasePointers(NpyIter *iter, char **baseptrs, char **errmsg)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+    npy_intp iiter, niter = NIT_NITER(iter);
+
+    char **resetdataptr = NIT_RESETDATAPTR(iter);
+    npy_intp *baseoffsets = NIT_BASEOFFSETS(iter);
+
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        /* If buffer allocation was delayed, do it now */
+        if (itflags&NPY_ITFLAG_DELAYBUF) {
+            if (!npyiter_allocate_buffers(iter, errmsg)) {
+                return NPY_FAIL;
+            }
+            NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_DELAYBUF;
+        }
+        else {
+            /* Copy any data from the buffers back to the arrays */
+            npyiter_copy_from_buffers(iter);
+        }
+    }
+
+    /* The new data pointers for resetting */
+    for (iiter = 0; iiter < niter; ++iiter) {
+        resetdataptr[iiter] = baseptrs[iiter] + baseoffsets[iiter];
+    }
+
+    npyiter_goto_iterindex(iter, NIT_ITERSTART(iter));
+
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        /* Prepare the next buffers and set iterend/size */
+        npyiter_copy_to_buffers(iter, NULL);
+    }
+
+    return NPY_SUCCEED;
+}
+
<full_update>
+/*NUMPY_API
+ * Resets the iterator to a new iterator index range
+ *
+ * The range is the half-open interval [istart, iend); it is only valid
+ * when NPY_ITER_RANGED was requested in the constructor.
+ *
+ * If errmsg is non-NULL, it should point to a variable which will
+ * receive the error message, and no Python exception will be set.
+ * This is so that the function can be called from code not holding
+ * the GIL.
+ */
+NPY_NO_EXPORT int
+NpyIter_ResetToIterIndexRange(NpyIter *iter,
+                              npy_intp istart, npy_intp iend, char **errmsg)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+    /*npy_intp niter = NIT_NITER(iter);*/
+
+    if (!(itflags&NPY_ITFLAG_RANGE)) {
+        if (errmsg == NULL) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Cannot call ResetToIterIndexRange on an iterator without "
+                    "requesting ranged iteration support in the constructor");
+        }
+        else {
+            *errmsg = "Cannot call ResetToIterIndexRange on an iterator "
+                      "without requesting ranged iteration support in the "
+                      "constructor";
+        }
+        return NPY_FAIL;
+    }
+
+    if (istart < 0 || iend > NIT_ITERSIZE(iter)) {
+        if (errmsg == NULL) {
+            PyErr_Format(PyExc_ValueError,
+                    "Out-of-bounds range [%d, %d) passed to "
+                    "ResetToIterIndexRange", (int)istart, (int)iend);
+        }
+        else {
+            *errmsg = "Out-of-bounds range passed to ResetToIterIndexRange";
+        }
+        return NPY_FAIL;
+    }
+    else if (iend < istart) {
+        if (errmsg == NULL) {
+            PyErr_Format(PyExc_ValueError,
+                    "Invalid range [%d, %d) passed to ResetToIterIndexRange",
+                    (int)istart, (int)iend);
+        }
+        else {
+            *errmsg = "Invalid range passed to ResetToIterIndexRange";
+        }
+        return NPY_FAIL;
+    }
+
+    /* Store the new range, then do a full Reset to apply it */
+    NIT_ITERSTART(iter) = istart;
+    NIT_ITEREND(iter) = iend;
+
+    return NpyIter_Reset(iter, errmsg);
+}
+
+/*NUMPY_API
+ * Sets the iterator to the specified coordinates, which must have the
+ * correct number of entries for 'ndim'. It is only valid
+ * when NPY_ITER_COORDS was passed to the constructor. This operation
+ * fails if the coordinates are out of bounds.
+ *
+ * Incompatible with buffering and with NO_INNER_ITERATION, because in
+ * those modes the iterator does not maintain per-axis coordinates.
+ *
+ * Returns NPY_SUCCEED on success, NPY_FAIL on failure.
+ */
+NPY_NO_EXPORT int
+NpyIter_GotoCoords(NpyIter *iter, npy_intp *coords)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    npy_intp iterindex, factor;
+    NpyIter_AxisData *axisdata;
+    npy_intp sizeof_axisdata;
+    char *perm;
+
+    if (!(itflags&NPY_ITFLAG_HASCOORDS)) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot call GotoCoords on an iterator without "
+                "requesting coordinates in the constructor");
+        return NPY_FAIL;
+    }
+
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot call GotoCoords on an iterator which "
+                "is buffered");
+        return NPY_FAIL;
+    }
+
+    if (itflags&NPY_ITFLAG_NOINNER) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot call GotoCoords on an iterator which "
+                "has the flag NO_INNER_ITERATION");
+        return NPY_FAIL;
+    }
+
+    perm = NIT_PERM(iter);
+    axisdata = NIT_AXISDATA(iter);
+    /* NOTE: sizeof_axisdata is consumed by the NIT_ADVANCE_AXISDATA macro */
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+
+    /*
+     * Compute the iterindex corresponding to the coordinates.
+     * 'coords' is in the caller's axis order; 'perm' maps each internal
+     * axisdata entry back to a caller axis, a negative entry meaning
+     * the axis was reversed internally.
+     */
+    iterindex = 0;
+    factor = 1;
+    for (idim = 0; idim < ndim; ++idim) {
+        char p = perm[idim];
+        npy_intp i, shape;
+
+        shape = NAD_SHAPE(axisdata);
+        if (p < 0) {
+            /* If the perm entry is negative, reverse the coordinate */
+            i = shape - coords[ndim+p] - 1;
+        }
+        else {
+            i = coords[ndim-p-1];
+        }
+
+        /* Bounds-check this coordinate */
+        if (i >= 0 && i < shape) {
+            iterindex += factor * i;
+            factor *= shape;
+        }
+        else {
+            PyErr_SetString(PyExc_IndexError,
+                    "Iterator GotoCoords called with out-of-bounds "
+                    "coordinates.");
+            return NPY_FAIL;
+        }
+
+        NIT_ADVANCE_AXISDATA(axisdata, 1);
+    }
+
+    /* With ranged iteration the target must lie inside [ITERSTART, ITEREND) */
+    if (iterindex < NIT_ITERSTART(iter) || iterindex >= NIT_ITEREND(iter)) {
+        PyErr_SetString(PyExc_IndexError,
+                "Iterator GotoCoords called with coordinates outside the "
+                "iteration range.");
+        return NPY_FAIL;
+    }
+
+    npyiter_goto_iterindex(iter, iterindex);
+
+    return NPY_SUCCEED;
+}
+
+/*NUMPY_API
+ * If the iterator is tracking an index, sets the iterator
+ * to the specified index.
+ *
+ * Incompatible with buffering and with NO_INNER_ITERATION (same
+ * restrictions as GotoCoords).
+ *
+ * Returns NPY_SUCCEED on success, NPY_FAIL on failure.
+ */
+NPY_NO_EXPORT int
+NpyIter_GotoIndex(NpyIter *iter, npy_intp index)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    npy_intp iterindex, factor;
+    NpyIter_AxisData *axisdata;
+    npy_intp sizeof_axisdata;
+
+    if (!(itflags&NPY_ITFLAG_HASINDEX)) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot call GotoIndex on an iterator without "
+                "requesting an index in the constructor");
+        return NPY_FAIL;
+    }
+
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot call GotoIndex on an iterator which "
+                "is buffered");
+        return NPY_FAIL;
+    }
+
+    if (itflags&NPY_ITFLAG_NOINNER) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot call GotoIndex on an iterator which "
+                "has the flag NO_INNER_ITERATION");
+        return NPY_FAIL;
+    }
+
+    if (index < 0 || index >= NIT_ITERSIZE(iter)) {
+        PyErr_SetString(PyExc_IndexError,
+                "Iterator GotoIndex called with an out-of-bounds "
+                "index.");
+        return NPY_FAIL;
+    }
+
+    axisdata = NIT_AXISDATA(iter);
+    /* NOTE: sizeof_axisdata is consumed by the NIT_ADVANCE_AXISDATA macro */
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+
+    /*
+     * Compute the iterindex corresponding to the index, by decomposing
+     * the flat index along each axis using the index "operand's" stride
+     * (stored at slot 'niter' of each axisdata's stride array).
+     */
+    iterindex = 0;
+    factor = 1;
+    for (idim = 0; idim < ndim; ++idim) {
+        npy_intp i, shape, iterstride;
+
+        iterstride = NAD_STRIDES(axisdata)[niter];
+        shape = NAD_SHAPE(axisdata);
+
+        /* Extract the coordinate from the index */
+        if (iterstride == 0) {
+            /* Zero stride: this axis does not contribute to the index */
+            i = 0;
+        }
+        else if (iterstride < 0) {
+            /* Negative stride: the axis was reversed internally */
+            i = shape - (index/(-iterstride))%shape - 1;
+        }
+        else {
+            i = (index/iterstride)%shape;
+        }
+
+        /* Add its contribution to iterindex */
+        iterindex += factor * i;
+        factor *= shape;
+
+        NIT_ADVANCE_AXISDATA(axisdata, 1);
+    }
+
+
+    /* With ranged iteration the target must lie inside [ITERSTART, ITEREND) */
+    if (iterindex < NIT_ITERSTART(iter) || iterindex >= NIT_ITEREND(iter)) {
+        PyErr_SetString(PyExc_IndexError,
+                "Iterator GotoIndex called with an index outside the "
+                "iteration range.");
+        return NPY_FAIL;
+    }
+
+    npyiter_goto_iterindex(iter, iterindex);
+
+    return NPY_SUCCEED;
+}
+
+/*NUMPY_API
+ * Sets the iterator position to the specified iterindex,
+ * which matches the iteration order of the iterator.
+ *
+ * Unlike GotoCoords/GotoIndex this works on buffered iterators,
+ * sliding pointers within the current buffer when possible and
+ * re-buffering otherwise.
+ *
+ * Returns NPY_SUCCEED on success, NPY_FAIL on failure.
+ */
+NPY_NO_EXPORT int
+NpyIter_GotoIterIndex(NpyIter *iter, npy_intp iterindex)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+    npy_intp iiter, niter = NIT_NITER(iter);
+
+    if (itflags&NPY_ITFLAG_NOINNER) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot call GotoIterIndex on an iterator which "
+                "has the flag NO_INNER_ITERATION");
+        return NPY_FAIL;
+    }
+
+    if (iterindex < NIT_ITERSTART(iter) || iterindex >= NIT_ITEREND(iter)) {
+        PyErr_SetString(PyExc_IndexError,
+                "Iterator GotoIterIndex called with an iterindex outside the "
+                "iteration range.");
+        return NPY_FAIL;
+    }
+
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+        npy_intp bufiterend, size;
+
+        size = NBF_SIZE(bufferdata);
+        bufiterend = NBF_BUFITEREND(bufferdata);
+        /*
+         * Check if the new iterindex is already within the buffer.
+         * (In REDUCE mode buffer pointers can't simply be slid, so
+         * that case always re-buffers.)
+         */
+        if (!(itflags&NPY_ITFLAG_REDUCE) && iterindex < bufiterend &&
+                                        iterindex >= bufiterend - size) {
+            npy_intp *strides, delta;
+            char **ptrs;
+
+            strides = NBF_STRIDES(bufferdata);
+            ptrs = NBF_PTRS(bufferdata);
+            /* delta may be negative; the pointer math works either way */
+            delta = iterindex - NIT_ITERINDEX(iter);
+
+            for (iiter = 0; iiter < niter; ++iiter) {
+                ptrs[iiter] += delta * strides[iiter];
+            }
+
+            NIT_ITERINDEX(iter) = iterindex;
+        }
+        /* Start the buffer at the provided iterindex */
+        else {
+            /* Write back to the arrays */
+            npyiter_copy_from_buffers(iter);
+
+            npyiter_goto_iterindex(iter, iterindex);
+
+            /* Prepare the next buffers and set iterend/size */
+            npyiter_copy_to_buffers(iter, NULL);
+        }
+    }
+    else {
+        npyiter_goto_iterindex(iter, iterindex);
+    }
+
+    return NPY_SUCCEED;
+}
+
+/*NUMPY_API
+ * Gets the current iteration index
+ *
+ * When neither RANGE nor BUFFER is enabled the cached iterindex is not
+ * maintained, so it is reconstructed from the per-axis coordinates.
+ */
+NPY_NO_EXPORT npy_intp
+NpyIter_GetIterIndex(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    /* iterindex is only used if NPY_ITER_RANGED or NPY_ITER_BUFFERED was set */
+    if (itflags&(NPY_ITFLAG_RANGE|NPY_ITFLAG_BUFFER)) {
+        return NIT_ITERINDEX(iter);
+    }
+    else {
+        npy_intp iterindex;
+        NpyIter_AxisData *axisdata;
+        npy_intp sizeof_axisdata;
+
+        iterindex = 0;
+        /* NOTE: sizeof_axisdata is consumed by the axisdata macros below */
+        sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+        axisdata = NIT_INDEX_AXISDATA(NIT_AXISDATA(iter), ndim-1);
+
+        /* Horner-style accumulation from the outermost axis inwards */
+        for (idim = ndim-2; idim >= 0; --idim) {
+            iterindex += NAD_COORD(axisdata);
+            NIT_ADVANCE_AXISDATA(axisdata, -1);
+            iterindex *= NAD_SHAPE(axisdata);
+        }
+        iterindex += NAD_COORD(axisdata);
+
+        return iterindex;
+    }
+}
+
+/*
+ * SPECIALIZED iternext functions that handle the non-buffering part.
+ *
+ * The /"*"*begin repeat*"*"/ directives below are expanded by numpy's
+ * conv_template (.c.src) processor, producing one function per
+ * (itflags, ndim, niter) specialization; @name@ tokens are substituted
+ * per repeat entry.
+ */
+
+/**begin repeat
+ * #const_itflags = 0,
+ * NPY_ITFLAG_HASINDEX,
+ * NPY_ITFLAG_NOINNER,
+ * NPY_ITFLAG_RANGE,
+ * NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX#
+ * #tag_itflags = 0, IND, NOINN, RNG, RNGuIND#
+ */
+/**begin repeat1
+ * #const_ndim = 1, 2, NPY_MAXDIMS#
+ * #tag_ndim = 1, 2, ANY#
+ */
+/**begin repeat2
+ * #const_niter = 1, 2, NPY_MAXDIMS#
+ * #tag_niter = 1, 2, ANY#
+ */
+
+/* Specialized iternext (@const_itflags@,@tag_ndim@,@tag_niter@) */
+static int
+npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_niter@(
+                                                      NpyIter *iter)
+{
+    /* itflags/niter are compile-time constants in the specialized cases */
+    const npy_uint32 itflags = @const_itflags@;
+#if @const_ndim@ >= NPY_MAXDIMS
+    npy_intp idim, ndim = NIT_NDIM(iter);
+#endif
+#if @const_niter@ < NPY_MAXDIMS
+    const npy_intp niter = @const_niter@;
+#else
+    npy_intp niter = NIT_NITER(iter);
+#endif
+
+    npy_intp istrides, nstrides, sizeof_axisdata;
+#if @const_ndim@ > 0
+    NpyIter_AxisData *axisdata0;
+#endif
+#if @const_ndim@ > 1
+    NpyIter_AxisData *axisdata1;
+#endif
+#if @const_ndim@ > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (@const_itflags@&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+    /* NOTE: nstrides/sizeof_axisdata feed the NAD_*/NIT_* macros below */
+    nstrides = NAD_NSTRIDES();
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+
+    axisdata0 = NIT_AXISDATA(iter);
+#  if !(@const_itflags@&NPY_ITFLAG_NOINNER)
+    /* Increment coordinate 0 */
+    NAD_COORD(axisdata0)++;
+    /* Increment pointer 0 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if @const_ndim@ == 1
+
+#  if !(@const_itflags@&NPY_ITFLAG_NOINNER)
+    /* Finished when the coordinate equals the shape */
+    return NAD_COORD(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    /* Get rid of unused variable warning */
+    istrides = 0;
+
+    /* With NOINNER, the single inner loop covers the whole 1-d iteration */
+    return 0;
+#  endif
+
+#else
+
+#  if !(@const_itflags@&NPY_ITFLAG_NOINNER)
+    if (NAD_COORD(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment coordinate 1 */
+    NAD_COORD(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_COORD(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st coordinate to 0 */
+        NAD_COORD(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+#  if @const_ndim@ == 2
+    return 0;
+#  else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment coordinate 2 */
+    NAD_COORD(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_COORD(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd coordinates to 0 */
+        NAD_COORD(axisdata0) = 0;
+        NAD_COORD(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    /* Generic carry propagation for the remaining (>= 4th) dimensions */
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the coordinate */
+        NAD_COORD(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_COORD(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the coordinates and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the coordinate to 0 */
+                NAD_COORD(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    /* Every dimension overflowed: iteration is complete */
+    return 0;
+
+#  endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+/**end repeat2**/
+/**end repeat1**/
+/**end repeat**/
+
+
+/**begin repeat
+ * #const_niter = 1, 2, 3, 4, NPY_MAXDIMS#
+ * #tag_niter = 1, 2, 3, 4, ANY#
+ */
+
+/*
+ * Iternext function that handles the reduction buffering part. This
+ * is done with a double loop to avoid frequent re-buffering.
+ *
+ * Returns 1 while iteration continues, 0 once the iteration range is
+ * exhausted.
+ */
+static int
+npyiter_buffered_reduce_iternext_iters@tag_niter@(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+#if @const_niter@ >= NPY_MAXDIMS
+    npy_intp niter = NIT_NITER(iter);
+#else
+    const npy_intp niter = @const_niter@;
+#endif
+
+    npy_intp iiter;
+
+    NpyIter_AxisData *axisdata;
+    NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+    char **ptrs;
+    char *prev_dataptrs[NPY_MAXARGS];
+
+    ptrs = NBF_PTRS(bufferdata);
+
+    /*
+     * If the iterator handles the inner loop, need to increment all
+     * the coordinates and pointers
+     */
+    if (!(itflags&NPY_ITFLAG_NOINNER)) {
+        /* Increment within the buffer */
+        if (++NIT_ITERINDEX(iter) < NBF_BUFITEREND(bufferdata)) {
+            /* NOTE(review): this 'iiter' shadows the outer one — confirm
+               intentional */
+            npy_intp iiter, *strides;
+
+            strides = NBF_STRIDES(bufferdata);
+            for (iiter = 0; iiter < niter; ++iiter) {
+                ptrs[iiter] += strides[iiter];
+            }
+            return 1;
+        }
+    }
+    else {
+        /* Caller consumed the whole inner buffer in one go */
+        NIT_ITERINDEX(iter) += NBF_SIZE(bufferdata);
+    }
+
+    NPY_IT_DBG_PRINTF("Iterator: Finished iteration %d of outer reduce loop\n",
+                            (int)NBF_REDUCE_POS(bufferdata));
+    /* The outer increment for the reduce double loop */
+    if (++NBF_REDUCE_POS(bufferdata) < NBF_REDUCE_OUTERSIZE(bufferdata)) {
+        npy_intp *reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata);
+        char **reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata);
+        for (iiter = 0; iiter < niter; ++iiter) {
+            char *ptr = reduce_outerptrs[iiter] + reduce_outerstrides[iiter];
+            ptrs[iiter] = ptr;
+            reduce_outerptrs[iiter] = ptr;
+        }
+        NBF_BUFITEREND(bufferdata) = NIT_ITERINDEX(iter) + NBF_SIZE(bufferdata);
+        return 1;
+    }
+
+    /*
+     * Save the previously used data pointers.
+     * NOTE(review): element size is NPY_SIZEOF_INTP though the array
+     * holds char* — assumes sizeof(char *) == sizeof(npy_intp); confirm.
+     */
+    axisdata = NIT_AXISDATA(iter);
+    memcpy(prev_dataptrs, NAD_PTRS(axisdata), NPY_SIZEOF_INTP*niter);
+
+    /* Write back to the arrays */
+    npyiter_copy_from_buffers(iter);
+
+    /* Check if we're past the end */
+    if (NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        NBF_SIZE(bufferdata) = 0;
+        return 0;
+    }
+    /* Increment to the next buffer */
+    else {
+        npyiter_goto_iterindex(iter, NIT_ITERINDEX(iter));
+    }
+
+    /* Prepare the next buffers and set iterend/size */
+    npyiter_copy_to_buffers(iter, prev_dataptrs);
+
+    return 1;
+}
+
+/**end repeat**/
+
+/*
+ * iternext function that handles the buffering part (non-reduce case).
+ *
+ * Returns 1 while iteration continues, 0 once the iteration range is
+ * exhausted.
+ */
+static int
+npyiter_buffered_iternext(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+    npy_intp niter = NIT_NITER(iter);
+
+    NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+
+    /*
+     * If the iterator handles the inner loop, need to increment all
+     * the coordinates and pointers
+     */
+    if (!(itflags&NPY_ITFLAG_NOINNER)) {
+        /* Increment within the buffer */
+        if (++NIT_ITERINDEX(iter) < NBF_BUFITEREND(bufferdata)) {
+            npy_intp iiter, *strides;
+            char **ptrs;
+
+            strides = NBF_STRIDES(bufferdata);
+            ptrs = NBF_PTRS(bufferdata);
+            for (iiter = 0; iiter < niter; ++iiter) {
+                ptrs[iiter] += strides[iiter];
+            }
+            return 1;
+        }
+    }
+    else {
+        /* Caller consumed the whole inner buffer in one go */
+        NIT_ITERINDEX(iter) += NBF_SIZE(bufferdata);
+    }
+
+    /* Write back to the arrays */
+    npyiter_copy_from_buffers(iter);
+
+    /* Check if we're past the end */
+    if (NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        NBF_SIZE(bufferdata) = 0;
+        return 0;
+    }
+    /* Increment to the next buffer */
+    else {
+        npyiter_goto_iterindex(iter, NIT_ITERINDEX(iter));
+    }
+
+    /* Prepare the next buffers and set iterend/size */
+    npyiter_copy_to_buffers(iter, NULL);
+
+    return 1;
+}
+
+/**end repeat2**/
+/**end repeat1**/
+/**end repeat**/
+
+/*
+ * Specialization of iternext for when the iteration size is 1: the
+ * single element is already in place after Reset, so the first
+ * iternext call simply reports that iteration is finished.
+ */
+static int
+npyiter_iternext_sizeone(NpyIter *iter)
+{
+    return 0;
+}
+
+/*NUMPY_API
+ * Compute the specialized iteration function for an iterator
+ *
+ * Dispatches on the iterator's flags, ndim and niter to one of the
+ * template-generated specializations above; returns NULL on an
+ * impossible combination.
+ *
+ * If errmsg is non-NULL, it should point to a variable which will
+ * receive the error message, and no Python exception will be set.
+ * This is so that the function can be called from code not holding
+ * the GIL.
+ */
+NPY_NO_EXPORT NpyIter_IterNext_Fn
+NpyIter_GetIterNext(NpyIter *iter, char **errmsg)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    /*
+     * When there is just one iteration and buffering is disabled
+     * the iternext function is very simple.
+     */
+    if (itflags&NPY_ITFLAG_ONEITERATION) {
+        return &npyiter_iternext_sizeone;
+    }
+
+    /*
+     * If buffering is enabled.
+     */
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        if (itflags&NPY_ITFLAG_REDUCE) {
+            /* Specialized per niter; ANY covers niter > 4 */
+            switch (niter) {
+                case 1:
+                    return &npyiter_buffered_reduce_iternext_iters1;
+                case 2:
+                    return &npyiter_buffered_reduce_iternext_iters2;
+                case 3:
+                    return &npyiter_buffered_reduce_iternext_iters3;
+                case 4:
+                    return &npyiter_buffered_reduce_iternext_iters4;
+                default:
+                    return &npyiter_buffered_reduce_iternext_itersANY;
+            }
+        }
+        else {
+            return &npyiter_buffered_iternext;
+        }
+    }
+
+    /*
+     * Ignore all the flags that don't affect the iterator memory
+     * layout or the iternext function. Currently only HASINDEX,
+     * NOINNER, and RANGE affect them here.
+     */
+    itflags &= (NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NOINNER|NPY_ITFLAG_RANGE);
+
+    /* Switch statements let the compiler optimize this most effectively */
+    switch (itflags) {
+        /*
+         * The combinations HASINDEX|NOINNER and RANGE|NOINNER are excluded
+         * by the New functions (i.e. rejected at construction time, so no
+         * specializations exist for them)
+         */
+/**begin repeat
+ * #const_itflags = 0,
+ * NPY_ITFLAG_HASINDEX,
+ * NPY_ITFLAG_NOINNER,
+ * NPY_ITFLAG_RANGE,
+ * NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX#
+ * #tag_itflags = 0, IND, NOINN, RNG, RNGuIND#
+ */
+        case @const_itflags@:
+            switch (ndim) {
+/**begin repeat1
+ * #const_ndim = 1, 2#
+ * #tag_ndim = 1, 2#
+ */
+                case @const_ndim@:
+                    switch (niter) {
+/**begin repeat2
+ * #const_niter = 1, 2#
+ * #tag_niter = 1, 2#
+ */
+                        case @const_niter@:
+                            return &npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_niter@;
+/**end repeat2**/
+                        /* Not specialized on niter */
+                        default:
+                            return &npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_itersANY;
+                    }
+/**end repeat1**/
+                /* Not specialized on ndim */
+                default:
+                    switch (niter) {
+/**begin repeat1
+ * #const_niter = 1, 2#
+ * #tag_niter = 1, 2#
+ */
+                        case @const_niter@:
+                            return &npyiter_iternext_itflags@tag_itflags@_dimsANY_iters@tag_niter@;
+/**end repeat1**/
+                        /* Not specialized on niter */
+                        default:
+                            return &npyiter_iternext_itflags@tag_itflags@_dimsANY_itersANY;
+                    }
+            }
+/**end repeat**/
+    }
+    /* The switch above should have caught all the possibilities. */
+    if (errmsg == NULL) {
+        PyErr_Format(PyExc_ValueError,
+                "GetIterNext internal iterator error - unexpected "
+                "itflags/ndim/niter combination (%04x/%d/%d)",
+                (int)itflags, (int)ndim, (int)niter);
+    }
+    else {
+        *errmsg = "GetIterNext internal iterator error - unexpected "
+                  "itflags/ndim/niter combination";
+    }
+    return NULL;
+}
+
+
+/*
+ * SPECIALIZED getcoord functions — one per relevant itflags combination,
+ * expanded by the .c.src template processor.
+ */
+
+/**begin repeat
+ * #const_itflags = 0,
+ * NPY_ITFLAG_HASINDEX,
+ * NPY_ITFLAG_IDENTPERM,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM,
+ * NPY_ITFLAG_NEGPERM,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM,
+ * NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER#
+ * #tag_itflags = 0, IND, IDP, INDuIDP, NEGP, INDuNEGP,
+ * BUF, INDuBUF, IDPuBUF, INDuIDPuBUF, NEGPuBUF, INDuNEGPuBUF#
+ */
+/* Writes the current coordinates, in caller axis order, into outcoord */
+static void
+npyiter_getcoord_itflags@tag_itflags@(NpyIter *iter, npy_intp *outcoord)
+{
+    const npy_uint32 itflags = @const_itflags@;
+    npy_intp ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    npy_intp idim, sizeof_axisdata;
+    NpyIter_AxisData *axisdata;
+#if !((@const_itflags@)&NPY_ITFLAG_IDENTPERM)
+    char* perm = NIT_PERM(iter);
+#endif
+
+    axisdata = NIT_AXISDATA(iter);
+    /* NOTE: sizeof_axisdata is consumed by the NIT_ADVANCE_AXISDATA macro */
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+#if ((@const_itflags@)&NPY_ITFLAG_IDENTPERM)
+    /* Identity permutation: axisdata order is just reversed axis order */
+    outcoord += ndim-1;
+    for(idim = 0; idim < ndim; ++idim, --outcoord,
+                                    NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        *outcoord = NAD_COORD(axisdata);
+    }
+#elif !((@const_itflags@)&NPY_ITFLAG_NEGPERM)
+    /* Non-trivial permutation, but no reversed axes */
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        char p = perm[idim];
+        outcoord[ndim-p-1] = NAD_COORD(axisdata);
+    }
+#else
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        char p = perm[idim];
+        if (p < 0) {
+            /* If the perm entry is negative, reverse the coordinate */
+            outcoord[ndim+p] = NAD_SHAPE(axisdata) - NAD_COORD(axisdata) - 1;
+        }
+        else {
+            outcoord[ndim-p-1] = NAD_COORD(axisdata);
+        }
+    }
+#endif /* not ident perm */
+}
+/**end repeat**/
+
+/*NUMPY_API
+ * Compute a specialized getcoords function for the iterator
+ *
+ * Requires that coordinates were requested in the constructor and that
+ * any delayed buffer allocation has been resolved by a Reset call.
+ * Returns NULL on failure.
+ *
+ * If errmsg is non-NULL, it should point to a variable which will
+ * receive the error message, and no Python exception will be set.
+ * This is so that the function can be called from code not holding
+ * the GIL.
+ */
+NPY_NO_EXPORT NpyIter_GetCoords_Fn
+NpyIter_GetGetCoords(NpyIter *iter, char **errmsg)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    /* These flags must be correct */
+    if ((itflags&(NPY_ITFLAG_HASCOORDS|NPY_ITFLAG_DELAYBUF)) !=
+                        NPY_ITFLAG_HASCOORDS) {
+        if (!(itflags&NPY_ITFLAG_HASCOORDS)) {
+            if (errmsg == NULL) {
+                PyErr_SetString(PyExc_ValueError,
+                        "Cannot retrieve a GetCoords function for an iterator "
+                        "that doesn't track coordinates.");
+            }
+            else {
+                *errmsg = "Cannot retrieve a GetCoords function for an "
+                          "iterator that doesn't track coordinates.";
+            }
+            return NULL;
+        }
+        else {
+            if (errmsg == NULL) {
+                PyErr_SetString(PyExc_ValueError,
+                        "Cannot retrieve a GetCoords function for an iterator "
+                        "that used DELAY_BUFALLOC before a Reset call");
+            }
+            else {
+                *errmsg = "Cannot retrieve a GetCoords function for an "
+                          "iterator that used DELAY_BUFALLOC before a "
+                          "Reset call";
+            }
+            return NULL;
+        }
+    }
+
+    /*
+     * Only these flags affect the iterator memory layout or
+     * the getcoords behavior. IDENTPERM and NEGPERM are mutually
+     * exclusive, so that reduces the number of cases slightly.
+     */
+    itflags &= (NPY_ITFLAG_HASINDEX |
+                NPY_ITFLAG_IDENTPERM |
+                NPY_ITFLAG_NEGPERM |
+                NPY_ITFLAG_BUFFER);
+
+    switch (itflags) {
+/**begin repeat
+ * #const_itflags = 0,
+ * NPY_ITFLAG_HASINDEX,
+ * NPY_ITFLAG_IDENTPERM,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM,
+ * NPY_ITFLAG_NEGPERM,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM,
+ * NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER,
+ * NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER#
+ * #tag_itflags = 0, IND, IDP, INDuIDP, NEGP, INDuNEGP,
+ * BUF, INDuBUF, IDPuBUF, INDuIDPuBUF, NEGPuBUF, INDuNEGPuBUF#
+ */
+        case @const_itflags@:
+            return npyiter_getcoord_itflags@tag_itflags@;
+/**end repeat**/
+    }
+    /* The switch above should have caught all the possibilities. */
+    if (errmsg == NULL) {
+        PyErr_Format(PyExc_ValueError,
+                "GetGetCoords internal iterator error - unexpected "
+                "itflags/ndim/niter combination (%04x/%d/%d)",
+                (int)itflags, (int)ndim, (int)niter);
+    }
+    else {
+        *errmsg = "GetGetCoords internal iterator error - unexpected "
+                  "itflags/ndim/niter combination";
+    }
+    return NULL;
+
+}
+
+/*NUMPY_API
+ * Whether the buffer allocation is being delayed
+ *
+ * Returns 1 while DELAY_BUFALLOC is pending (before a Reset call
+ * allocates the buffers), 0 otherwise.
+ */
+NPY_NO_EXPORT int
+NpyIter_HasDelayedBufAlloc(NpyIter *iter)
+{
+    return (NIT_ITFLAGS(iter)&NPY_ITFLAG_DELAYBUF) != 0;
+}
+
+/*NUMPY_API
+ * Whether the iterator handles the inner loop
+ *
+ * Returns 1 unless the NO_INNER_ITERATION flag was requested, in which
+ * case the caller is responsible for the inner loop.
+ */
+NPY_NO_EXPORT int
+NpyIter_HasInnerLoop(NpyIter *iter)
+{
+    return (NIT_ITFLAGS(iter)&NPY_ITFLAG_NOINNER) == 0;
+}
+
+/*NUMPY_API
+ * Whether the iterator is tracking coordinates
+ *
+ * Returns 1 if NPY_ITER_COORDS was requested, 0 otherwise.
+ */
+NPY_NO_EXPORT int
+NpyIter_HasCoords(NpyIter *iter)
+{
+    return (NIT_ITFLAGS(iter)&NPY_ITFLAG_HASCOORDS) != 0;
+}
+
+/*NUMPY_API
+ * Whether the iterator is tracking an index
+ *
+ * Returns 1 if an index was requested, 0 otherwise.
+ */
+NPY_NO_EXPORT int
+NpyIter_HasIndex(NpyIter *iter)
+{
+    return (NIT_ITFLAGS(iter)&NPY_ITFLAG_HASINDEX) != 0;
+}
+
+/*NUMPY_API
+ * Whether the iteration loop, and in particular the iternext()
+ * function, needs API access. If this is true, the GIL must
+ * be retained while iterating.
+ *
+ * Returns 1 if Python API access is needed, 0 otherwise.
+ */
+NPY_NO_EXPORT int
+NpyIter_IterationNeedsAPI(NpyIter *iter)
+{
+    return (NIT_ITFLAGS(iter)&NPY_ITFLAG_NEEDSAPI) != 0;
+}
+
+/*NUMPY_API
+ * Gets the number of dimensions being iterated
+ * (the iterator's internal, possibly coalesced, dimension count)
+ */
+NPY_NO_EXPORT npy_intp
+NpyIter_GetNDim(NpyIter *iter)
+{
+    return NIT_NDIM(iter);
+}
+
+/*NUMPY_API
+ * Gets the number of operands being iterated over
+ */
+NPY_NO_EXPORT npy_intp
+NpyIter_GetNIter(NpyIter *iter)
+{
+    return NIT_NITER(iter);
+}
+
+/*NUMPY_API
+ * Gets the number of elements being iterated
+ * (the total size of the broadcast iteration space)
+ */
+NPY_NO_EXPORT npy_intp
+NpyIter_GetIterSize(NpyIter *iter)
+{
+    return NIT_ITERSIZE(iter);
+}
+
+/*NUMPY_API
+ * Whether the iterator is buffered
+ *
+ * Returns 1 if buffering is enabled, 0 otherwise.
+ */
+NPY_NO_EXPORT int
+NpyIter_IsBuffered(NpyIter *iter)
+{
+    return (NIT_ITFLAGS(iter)&NPY_ITFLAG_BUFFER) != 0;
+}
+
+/*NUMPY_API
+ * Whether the inner loop can grow if buffering is unneeded
+ *
+ * Returns 1 if the GROWINNER flag is set, 0 otherwise.
+ */
+NPY_NO_EXPORT int
+NpyIter_IsGrowInner(NpyIter *iter)
+{
+    return (NIT_ITFLAGS(iter)&NPY_ITFLAG_GROWINNER) != 0;
+}
+
+/*NUMPY_API
+ * Gets the size of the buffer, or 0 if buffering is not enabled
+ */
+NPY_NO_EXPORT npy_intp
+NpyIter_GetBufferSize(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+    npy_intp niter = NIT_NITER(iter);
+
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+        return NBF_BUFFERSIZE(bufferdata);
+    }
+    else {
+        return 0;
+    }
+
+}
+
+/*NUMPY_API
+ * Gets the range of iteration indices being iterated
+ * as the half-open interval [*istart, *iend)
+ */
+NPY_NO_EXPORT void
+NpyIter_GetIterIndexRange(NpyIter *iter,
+                          npy_intp *istart, npy_intp *iend)
+{
+    *istart = NIT_ITERSTART(iter);
+    *iend = NIT_ITEREND(iter);
+}
+
+/*NUMPY_API
+ * Gets the broadcast shape (if coords are enabled)
+ *
+ * Writes ndim entries into 'outshape', un-permuting the internal
+ * axisdata order back to the caller's axis order.
+ * Returns NPY_SUCCEED, or NPY_FAIL if coordinates were not requested.
+ */
+NPY_NO_EXPORT int
+NpyIter_GetShape(NpyIter *iter, npy_intp *outshape)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    npy_intp idim, sizeof_axisdata;
+    NpyIter_AxisData *axisdata;
+    char *perm;
+
+    if (!(itflags&NPY_ITFLAG_HASCOORDS)) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot get the shape of an iterator "
+                "without coordinates requested in the constructor");
+        return NPY_FAIL;
+    }
+
+    perm = NIT_PERM(iter);
+    axisdata = NIT_AXISDATA(iter);
+    /* NOTE: sizeof_axisdata is consumed by the NIT_ADVANCE_AXISDATA macro */
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        char p = perm[idim];
+        if (p < 0) {
+            /* Negative perm entry: axis was reversed internally */
+            outshape[ndim+p] = NAD_SHAPE(axisdata);
+        }
+        else {
+            outshape[ndim-p-1] = NAD_SHAPE(axisdata);
+        }
+    }
+
+    return NPY_SUCCEED;
+}
+
+/*NUMPY_API
+ * Get the array of data pointers (1 per object being iterated)
+ *
+ * The returned pointers track the iteration position: buffer pointers
+ * when buffering is enabled, axisdata pointers otherwise.
+ *
+ * This function may be safely called without holding the Python GIL.
+ */
+NPY_NO_EXPORT char **
+NpyIter_GetDataPtrArray(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+    npy_intp niter = NIT_NITER(iter);
+
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+        return NBF_PTRS(bufferdata);
+    }
+    else {
+        NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+        return NAD_PTRS(axisdata);
+    }
+}
+
+/*NUMPY_API
+ * Get the array of data type pointers (1 per object being iterated)
+ */
+NPY_NO_EXPORT PyArray_Descr **
+NpyIter_GetDescrArray(NpyIter *iter)
+{
+    /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+    /*npy_intp niter = NIT_NITER(iter);*/
+
+    return NIT_DTYPES(iter);
+}
+
+/*NUMPY_API
+ * Get the array of objects being iterated
+ * (borrowed references into the iterator's internal storage)
+ */
+NPY_NO_EXPORT PyArrayObject **
+NpyIter_GetOperandArray(NpyIter *iter)
+{
+    /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+    npy_intp niter = NIT_NITER(iter);
+
+    return NIT_OPERANDS(iter);
+}
+
+/*NUMPY_API
+ * Returns a view to the i-th object with the iterator's internal axes
+ *
+ * Returns a new reference, or NULL (with an exception set) if 'i' is
+ * out of bounds or the iterator is buffered.
+ */
+NPY_NO_EXPORT PyArrayObject *
+NpyIter_GetIterView(NpyIter *iter, npy_intp i)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
+    PyArrayObject *obj, *view;
+    PyArray_Descr *dtype;
+    char *dataptr;
+    NpyIter_AxisData *axisdata;
+    npy_intp sizeof_axisdata;
+    int writeable;
+
+    if (i < 0 || i >= niter) {
+        PyErr_SetString(PyExc_IndexError,
+                "index provided for an iterator view was out of bounds");
+        return NULL;
+    }
+
+    /* Don't provide views if buffering is enabled */
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        PyErr_SetString(PyExc_ValueError,
+                "cannot provide an iterator view when buffering is enabled");
+        return NULL;
+    }
+
+    obj = NIT_OPERANDS(iter)[i];
+    dtype = PyArray_DESCR(obj);
+    writeable = NIT_OPITFLAGS(iter)[i]&NPY_OP_ITFLAG_WRITE;
+    dataptr = NIT_RESETDATAPTR(iter)[i];
+    axisdata = NIT_AXISDATA(iter);
+    /* NOTE: sizeof_axisdata is consumed by the NIT_ADVANCE_AXISDATA macro */
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+
+    /*
+     * Retrieve the shape and strides from the axisdata
+     * (axisdata is stored innermost-first, so reverse into C order)
+     */
+    for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        shape[ndim-idim-1] = NAD_SHAPE(axisdata);
+        strides[ndim-idim-1] = NAD_STRIDES(axisdata)[i];
+    }
+
+    /* PyArray_NewFromDescr steals the dtype reference */
+    Py_INCREF(dtype);
+    view = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, dtype, ndim,
+                                shape, strides, dataptr,
+                                writeable ? NPY_WRITEABLE : 0,
+                                NULL);
+    if (view == NULL) {
+        return NULL;
+    }
+    /* Tell the view who owns the data (base keeps 'obj' alive) */
+    Py_INCREF(obj);
+    view->base = (PyObject *)obj;
+    /* Make sure all the flags are good */
+    PyArray_UpdateFlags(view, NPY_UPDATE_ALL);
+
+    return view;
+}
+
+/*NUMPY_API
+ * Get a pointer to the index, if it is being tracked
+ *
+ * Returns NULL (without setting an exception) when the iterator
+ * is not tracking an index.
+ */
+NPY_NO_EXPORT npy_intp *
+NpyIter_GetIndexPtr(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+    npy_intp niter = NIT_NITER(iter);
+
+    NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+
+    if (itflags&NPY_ITFLAG_HASINDEX) {
+        /* The index is just after the data pointers */
+        return (npy_intp*)NAD_PTRS(axisdata) + niter;
+    }
+    else {
+        return NULL;
+    }
+}
+
+/*NUMPY_API
+ * Gets an array of read flags (1 per object being iterated)
+ *
+ * Writes one char per operand into 'outreadflags': 1 when the
+ * operand is readable, 0 otherwise.
+ */
+NPY_NO_EXPORT void
+NpyIter_GetReadFlags(NpyIter *iter, char *outreadflags)
+{
+    npy_intp i;
+    npy_intp niter = NIT_NITER(iter);
+    char *op_itflags = NIT_OPITFLAGS(iter);
+
+    for (i = 0; i < niter; ++i) {
+        outreadflags[i] = (op_itflags[i]&NPY_OP_ITFLAG_READ) ? 1 : 0;
+    }
+}
+
+/*NUMPY_API
+ * Gets an array of write flags (1 per object being iterated)
+ *
+ * Writes one char per operand into 'outwriteflags': 1 when the
+ * operand is writeable, 0 otherwise.
+ */
+NPY_NO_EXPORT void
+NpyIter_GetWriteFlags(NpyIter *iter, char *outwriteflags)
+{
+    npy_intp i;
+    npy_intp niter = NIT_NITER(iter);
+    char *op_itflags = NIT_OPITFLAGS(iter);
+
+    for (i = 0; i < niter; ++i) {
+        outwriteflags[i] = (op_itflags[i]&NPY_OP_ITFLAG_WRITE) ? 1 : 0;
+    }
+}
+
+
+/*NUMPY_API
+ * Get the array of strides for the inner loop (when HasInnerLoop is false)
+ *
+ * When buffering is enabled the strides come from the buffer data,
+ * otherwise from the innermost axisdata entry.
+ *
+ * This function may be safely called without holding the Python GIL.
+ */
+NPY_NO_EXPORT npy_intp *
+NpyIter_GetInnerStrideArray(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+    npy_intp niter = NIT_NITER(iter);
+
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        NpyIter_BufferData *data = NIT_BUFFERDATA(iter);
+        return NBF_STRIDES(data);
+    }
+    else {
+        NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+        return NAD_STRIDES(axisdata);
+    }
+}
+
+/*NUMPY_API
+ * Gets the array of strides for the specified axis. Requires
+ * that the iterator be tracking coordinates, and that buffering
+ * is not enabled.
+ *
+ * Returns NULL (with an exception set) if an error occurs.
+ */
+NPY_NO_EXPORT npy_intp *
+NpyIter_GetAxisStrideArray(NpyIter *iter, npy_intp axis)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    char *perm = NIT_PERM(iter);
+    NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+    /* sizeof_axisdata is consumed by the NIT_ADVANCE_AXISDATA macro */
+    npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+
+    if (!(itflags&NPY_ITFLAG_HASCOORDS)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "Iterator GetAxisStrideArray may only be called "
+                "if coordinates are being tracked");
+        return NULL;
+    }
+    else if (itflags&NPY_ITFLAG_BUFFER) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "Iterator GetAxisStrideArray may not be called on "
+                "a buffered iterator");
+        return NULL;
+    }
+    else if (axis < 0 || axis >= ndim) {
+        /* Message fixed: it previously said "GetStrideAxisArray",
+         * which does not match this function's name. */
+        PyErr_SetString(PyExc_ValueError,
+                "axis out of bounds in iterator GetAxisStrideArray");
+        return NULL;
+    }
+
+    /* Reverse axis, since the iterator treats them that way */
+    axis = ndim-1-axis;
+
+    /* First find the axis in question */
+    for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        if (perm[idim] == axis || -1-perm[idim] == axis) {
+            return NAD_STRIDES(axisdata);
+        }
+    }
+
+    PyErr_SetString(PyExc_RuntimeError,
+            "internal error in iterator perm");
+    return NULL;
+}
+
+/*NUMPY_API
+ * Get an array of strides which are fixed.  Any strides which may
+ * change during iteration receive the value NPY_MAX_INTP. Once
+ * the iterator is ready to iterate, call this to get the strides
+ * which will always be fixed in the inner loop, then choose optimized
+ * inner loop functions which take advantage of those fixed strides.
+ *
+ * 'out_strides' must have room for 'niter' entries.
+ *
+ * This function may be safely called without holding the Python GIL.
+ */
+NPY_NO_EXPORT void
+NpyIter_GetInnerFixedStrideArray(NpyIter *iter, npy_intp *out_strides)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+    npy_intp iiter, niter = NIT_NITER(iter);
+
+    NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        NpyIter_BufferData *data = NIT_BUFFERDATA(iter);
+        char *op_itflags = NIT_OPITFLAGS(iter);
+        npy_intp stride, *strides = NBF_STRIDES(data),
+                *ad_strides = NAD_STRIDES(axisdata);
+        PyArray_Descr **dtypes = NIT_DTYPES(iter);
+
+        for (iiter = 0; iiter < niter; ++iiter) {
+            stride = strides[iiter];
+            /* Operands which are always/never buffered have fixed strides */
+            if (op_itflags[iiter]&
+                                (NPY_OP_ITFLAG_CAST|NPY_OP_ITFLAG_BUFNEVER)) {
+                out_strides[iiter] = stride;
+            }
+            /* Reductions in the inner loop have fixed strides */
+            else if (stride == 0 && (op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE)) {
+                out_strides[iiter] = stride;
+            }
+            /*
+             * Inner loop contiguous array means its stride won't change when
+             * switching between buffering and not buffering
+             */
+            else if (ad_strides[iiter] == dtypes[iiter]->elsize) {
+                out_strides[iiter] = ad_strides[iiter];
+            }
+            /*
+             * Otherwise the strides can change if the operand is sometimes
+             * buffered, sometimes not.
+             */
+            else {
+                out_strides[iiter] = NPY_MAX_INTP;
+            }
+        }
+    }
+    else {
+        /* If there's no buffering, the strides are always fixed */
+        memcpy(out_strides, NAD_STRIDES(axisdata), niter*NPY_SIZEOF_INTP);
+    }
+}
+
+
+/*NUMPY_API
+ * Get a pointer to the size of the inner loop (when HasInnerLoop is false)
+ *
+ * Points at the buffer size when buffering is enabled, otherwise at
+ * the shape entry of the innermost axisdata.
+ *
+ * This function may be safely called without holding the Python GIL.
+ */
+NPY_NO_EXPORT npy_intp *
+NpyIter_GetInnerLoopSizePtr(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+    npy_intp niter = NIT_NITER(iter);
+
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        NpyIter_BufferData *data = NIT_BUFFERDATA(iter);
+        return &NBF_SIZE(data);
+    }
+    else {
+        NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+        return &NAD_SHAPE(axisdata);
+    }
+}
+
+/* Checks 'flags' for the global iterator flags -- C/F index tracking,
+ * coordinates, inner-loop handling, ranged iteration, and buffering
+ * options -- setting the appropriate internal flags in 'itflags'.
+ * Also rejects per-operand flags passed in the global position and
+ * mutually-incompatible flag combinations.
+ *
+ * Returns 1 on success, 0 on error (with an exception set).
+ */
+static int
+npyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags)
+{
+    if ((flags&NPY_ITER_PER_OP_FLAGS) != 0) {
+        PyErr_SetString(PyExc_ValueError,
+                    "A per-operand flag was passed as a global flag "
+                    "to the iterator constructor");
+        return 0;
+    }
+
+    /* Check for an index */
+    if (flags&(NPY_ITER_C_INDEX | NPY_ITER_F_INDEX)) {
+        if ((flags&(NPY_ITER_C_INDEX | NPY_ITER_F_INDEX)) ==
+                    (NPY_ITER_C_INDEX | NPY_ITER_F_INDEX)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Iterator flags C_INDEX and "
+                    "F_INDEX cannot both be specified");
+            return 0;
+        }
+        (*itflags) |= NPY_ITFLAG_HASINDEX;
+    }
+    /* Check if coordinates were requested */
+    if (flags&NPY_ITER_COORDS) {
+        /*
+         * This flag primarily disables dimension manipulations that
+         * would produce a different set of coordinates.
+         */
+        (*itflags) |= NPY_ITFLAG_HASCOORDS;
+    }
+    /* Check if the caller wants to handle inner iteration */
+    if (flags&NPY_ITER_NO_INNER_ITERATION) {
+        if ((*itflags)&(NPY_ITFLAG_HASINDEX|NPY_ITFLAG_HASCOORDS)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Iterator flag NO_INNER_ITERATION cannot be used "
+                    "if coords or an index is being tracked");
+            return 0;
+        }
+        (*itflags) |= NPY_ITFLAG_NOINNER;
+    }
+    /* Ranged */
+    if (flags&NPY_ITER_RANGED) {
+        (*itflags) |= NPY_ITFLAG_RANGE;
+        if ((flags&NPY_ITER_NO_INNER_ITERATION) &&
+                                    !(flags&NPY_ITER_BUFFERED)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Iterator flag RANGED cannot be used with "
+                    "the flag NO_INNER_ITERATION unless "
+                    "BUFFERED is also enabled");
+            return 0;
+        }
+    }
+    /* Buffering */
+    if (flags&NPY_ITER_BUFFERED) {
+        (*itflags) |= NPY_ITFLAG_BUFFER;
+        if (flags&NPY_ITER_GROWINNER) {
+            (*itflags) |= NPY_ITFLAG_GROWINNER;
+        }
+        if (flags&NPY_ITER_DELAY_BUFALLOC) {
+            (*itflags) |= NPY_ITFLAG_DELAYBUF;
+        }
+    }
+
+    return 1;
+}
+
+/*
+ * Computes the number of iteration dimensions: 'oa_ndim' when an
+ * op_axes remapping is in use, otherwise the maximum ndim over the
+ * non-NULL input operands (0 if all are NULL).
+ */
+static npy_intp
+npyiter_calculate_ndim(npy_intp niter, PyArrayObject **op_in,
+                       npy_intp oa_ndim)
+{
+    npy_intp iiter, ndim;
+
+    /* When 'op_axes' is being used, its dimension count wins */
+    if (oa_ndim > 0 ) {
+        return oa_ndim;
+    }
+
+    /* Otherwise take the largest 'ndim' among the operands */
+    ndim = 0;
+    for (iiter = 0; iiter < niter; ++iiter) {
+        PyArrayObject *cur = op_in[iiter];
+        if (cur != NULL && PyArray_NDIM(cur) > ndim) {
+            ndim = PyArray_NDIM(cur);
+        }
+    }
+
+    return ndim;
+}
+
+/*
+ * Validates the 'oa_ndim'/'op_axes' pair: op_axes requires a positive
+ * oa_ndim (and vice versa), oa_ndim must not exceed NPY_MAXDIMS, and
+ * each per-operand axes list may not contain out-of-range or duplicate
+ * non-negative entries.  Negative entries ("newaxis") are not checked
+ * for duplicates.
+ *
+ * Returns 1 on success, 0 on failure (with an exception set).
+ */
+static int
+npyiter_check_op_axes(npy_intp niter, npy_intp oa_ndim, npy_intp **op_axes)
+{
+    char axes_dupcheck[NPY_MAXDIMS];
+    npy_intp iiter, idim;
+
+    if (oa_ndim == 0 && op_axes != NULL) {
+        PyErr_Format(PyExc_ValueError,
+                "If 'op_axes' is not NULL in the iterator constructor, "
+                "'oa_ndim' must be greater than zero");
+        return 0;
+    }
+    else if (oa_ndim > 0) {
+        if (oa_ndim > NPY_MAXDIMS) {
+            PyErr_Format(PyExc_ValueError,
+                    "Cannot construct an iterator with more than %d dimensions "
+                    "(%d were requested for op_axes)",
+                    (int)NPY_MAXDIMS, (int)oa_ndim);
+            return 0;
+        }
+        else if (op_axes == NULL) {
+            PyErr_Format(PyExc_ValueError,
+                    "If 'oa_ndim' is greater than zero in the iterator "
+                    "constructor, then op_axes cannot be NULL");
+            return 0;
+        }
+
+        /* Check that there are no duplicates in op_axes */
+        for (iiter = 0; iiter < niter; ++iiter) {
+            npy_intp *axes = op_axes[iiter];
+            if (axes != NULL) {
+                memset(axes_dupcheck, 0, NPY_MAXDIMS);
+                for (idim = 0; idim < oa_ndim; ++idim) {
+                    npy_intp i = axes[idim];
+                    if (i >= 0) {
+                        if (i >= NPY_MAXDIMS) {
+                            PyErr_Format(PyExc_ValueError,
+                                    "The 'op_axes' provided to the iterator "
+                                    "constructor for operand %d "
+                                    "contained invalid "
+                                    "values %d", (int)iiter, (int)i);
+                            return 0;
+                        } else if(axes_dupcheck[i] == 1) {
+                            PyErr_Format(PyExc_ValueError,
+                                    "The 'op_axes' provided to the iterator "
+                                    "constructor for operand %d "
+                                    "contained duplicate "
+                                    "value %d", (int)iiter, (int)i);
+                            return 0;
+                        }
+                        else {
+                            axes_dupcheck[i] = 1;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return 1;
+}
+
+/*
+ * Checks the per-operand input flags, and fills in op_itflags.
+ *
+ * Note that '*op_itflags' is assigned (not OR'd) by the read/write
+ * checks, so any previous contents are discarded.
+ *
+ * Returns 1 on success, 0 on failure (with an exception set).
+ */
+static int
+npyiter_check_per_op_flags(npy_uint32 op_flags, char *op_itflags)
+{
+    if ((op_flags&NPY_ITER_GLOBAL_FLAGS) != 0) {
+        PyErr_SetString(PyExc_ValueError,
+                    "A global iterator flag was passed as a per-operand flag "
+                    "to the iterator constructor");
+        return 0;
+    }
+
+    /* Check the read/write flags */
+    if (op_flags&NPY_ITER_READONLY) {
+        /* The read/write flags are mutually exclusive */
+        if (op_flags&(NPY_ITER_READWRITE|NPY_ITER_WRITEONLY)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Only one of the iterator flags READWRITE, "
+                    "READONLY, and WRITEONLY may be "
+                    "specified for an operand");
+            return 0;
+        }
+
+        *op_itflags = NPY_OP_ITFLAG_READ;
+    }
+    else if (op_flags&NPY_ITER_READWRITE) {
+        /* The read/write flags are mutually exclusive */
+        if (op_flags&NPY_ITER_WRITEONLY) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Only one of the iterator flags READWRITE, "
+                    "READONLY, and WRITEONLY may be "
+                    "specified for an operand");
+            return 0;
+        }
+
+        *op_itflags = NPY_OP_ITFLAG_READ|NPY_OP_ITFLAG_WRITE;
+    }
+    else if(op_flags&NPY_ITER_WRITEONLY) {
+        *op_itflags = NPY_OP_ITFLAG_WRITE;
+    }
+    else {
+        PyErr_SetString(PyExc_ValueError,
+                "None of the iterator flags READWRITE, "
+                "READONLY, or WRITEONLY were "
+                "specified for an operand");
+        return 0;
+    }
+
+    /* Check the flags for temporary copies */
+    if (((*op_itflags)&NPY_OP_ITFLAG_WRITE) &&
+                (op_flags&(NPY_ITER_COPY|
+                           NPY_ITER_UPDATEIFCOPY)) == NPY_ITER_COPY) {
+        PyErr_SetString(PyExc_ValueError,
+                "If an iterator operand is writeable, must use "
+                "the flag UPDATEIFCOPY instead of "
+                "COPY");
+        return 0;
+    }
+
+    return 1;
+}
+
+/*
+ * Prepares a constructor operand.  Assumes a reference to 'op'
+ * is owned, and that 'op' may be replaced.  On success, fills in
+ * '*op_dataptr', stores a new reference in '*op_dtype', and may set
+ * NPY_OP_ITFLAG_CAST in '*op_itflags'.
+ *
+ * Returns 1 on success, 0 on failure (with an exception set).
+ */
+static int
+npyiter_prepare_one_operand(PyArrayObject **op,
+                        char **op_dataptr,
+                        PyArray_Descr *op_request_dtype,
+                        PyArray_Descr **op_dtype,
+                        npy_uint32 flags,
+                        npy_uint32 op_flags, char *op_itflags)
+{
+    /* NULL operands must be automatically allocated outputs */
+    if (*op == NULL) {
+        /* ALLOCATE should be enabled */
+        if (!(op_flags&NPY_ITER_ALLOCATE)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Iterator operand was NULL, but automatic allocation as an "
+                    "output wasn't requested");
+            return 0;
+        }
+        /* Writing should be enabled */
+        if (!((*op_itflags)&NPY_OP_ITFLAG_WRITE)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Automatic allocation was requested for an iterator "
+                    "operand, but it wasn't flagged for writing");
+            return 0;
+        }
+        /*
+         * Reading should be disabled if buffering is enabled without
+         * also enabling NPY_ITER_DELAY_BUFALLOC.  In all other cases,
+         * the caller may initialize the allocated operand to a value
+         * before beginning iteration.
+         */
+        if (((flags&(NPY_ITER_BUFFERED|
+                        NPY_ITER_DELAY_BUFALLOC)) == NPY_ITER_BUFFERED) &&
+                        ((*op_itflags)&NPY_OP_ITFLAG_READ)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Automatic allocation was requested for an iterator "
+                    "operand, and it was flagged as readable, but buffering "
+                    " without delayed allocation was enabled");
+            return 0;
+        }
+        *op_dataptr = NULL;
+        /* If a requested dtype was provided, use it, otherwise NULL */
+        Py_XINCREF(op_request_dtype);
+        *op_dtype = op_request_dtype;
+
+        return 1;
+    }
+
+    if (PyArray_Check(*op)) {
+        if (((*op_itflags)&NPY_OP_ITFLAG_WRITE) &&
+                (!PyArray_CHKFLAGS(*op, NPY_WRITEABLE))) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Iterator operand was a non-writeable array, but was "
+                    "flagged as writeable");
+            return 0;
+        }
+        if (!(flags&NPY_ITER_ZEROSIZE_OK) && PyArray_SIZE(*op) == 0) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Iteration of zero-sized operands is not enabled");
+            return 0;
+        }
+        *op_dataptr = PyArray_BYTES(*op);
+        /* PyArray_DESCR does not give us a reference */
+        *op_dtype = PyArray_DESCR(*op);
+        if (*op_dtype == NULL) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Iterator input array object has no dtype descr");
+            return 0;
+        }
+        Py_INCREF(*op_dtype);
+        /*
+         * If references weren't specifically allowed, make sure there
+         * are no references in the inputs or requested dtypes.
+         */
+        if (!(flags&NPY_ITER_REFS_OK)) {
+            PyArray_Descr *dt = PyArray_DESCR(*op);
+            if (((dt->flags&(NPY_ITEM_REFCOUNT|
+                           NPY_ITEM_IS_POINTER)) != 0) ||
+                    (dt != *op_dtype &&
+                        (((*op_dtype)->flags&(NPY_ITEM_REFCOUNT|
+                                             NPY_ITEM_IS_POINTER))) != 0)) {
+                PyErr_SetString(PyExc_TypeError,
+                        "Iterator operand or requested dtype holds "
+                        "references, but the REFS_OK flag was not enabled");
+                return 0;
+            }
+        }
+        /*
+         * Checking whether casts are valid is done later, once the
+         * final data types have been selected.  For now, just store the
+         * requested type.
+         */
+        if (op_request_dtype != NULL) {
+            /* We just have a borrowed reference to op_request_dtype */
+            Py_INCREF(op_request_dtype);
+            /* If it's a data type without a size, set the size */
+            if (op_request_dtype->elsize == 0) {
+                /* Work on a fresh copy so the caller's descr isn't mutated */
+                PyArray_DESCR_REPLACE(op_request_dtype);
+                if (op_request_dtype == NULL) {
+                    return 0;
+                }
+
+                /* Derive the size from the operand's dtype, converting
+                 * between 1-byte string and 4-byte unicode units. */
+                if (op_request_dtype->type_num == NPY_STRING) {
+                    switch((*op_dtype)->type_num) {
+                        case NPY_STRING:
+                            op_request_dtype->elsize = (*op_dtype)->elsize;
+                            break;
+                        case NPY_UNICODE:
+                            op_request_dtype->elsize = (*op_dtype)->elsize >> 2;
+                            break;
+                    }
+                }
+                else if (op_request_dtype->type_num == NPY_UNICODE) {
+                    switch((*op_dtype)->type_num) {
+                        case NPY_STRING:
+                            op_request_dtype->elsize = (*op_dtype)->elsize << 2;
+                            break;
+                        case NPY_UNICODE:
+                            op_request_dtype->elsize = (*op_dtype)->elsize;
+                            break;
+                    }
+                }
+                else if (op_request_dtype->type_num == NPY_VOID) {
+                    op_request_dtype->elsize = (*op_dtype)->elsize;
+                }
+            }
+            /* Store the requested dtype */
+            Py_DECREF(*op_dtype);
+            *op_dtype = op_request_dtype;
+        }
+
+        /* Check if the operand is in the byte order requested */
+        if (op_flags&NPY_ITER_NBO) {
+            /* Check byte order */
+            if (!PyArray_ISNBO((*op_dtype)->byteorder)) {
+                PyArray_Descr *nbo_dtype;
+
+                /* Replace with a new descr which is in native byte order */
+                nbo_dtype = PyArray_DescrNewByteorder(*op_dtype, NPY_NATIVE);
+                Py_DECREF(*op_dtype);
+                *op_dtype = nbo_dtype;
+                /*
+                 * Bug fix: PyArray_DescrNewByteorder can fail; bail out
+                 * before the flag/alignment checks dereference *op_dtype.
+                 * The caller's cleanup uses Py_XDECREF, so NULL is safe.
+                 */
+                if (nbo_dtype == NULL) {
+                    return 0;
+                }
+
+                NPY_IT_DBG_PRINTF("Iterator: Setting NPY_OP_ITFLAG_CAST "
+                                    "because of NPY_ITER_NBO\n");
+                /* Indicate that byte order or alignment needs fixing */
+                *op_itflags |= NPY_OP_ITFLAG_CAST;
+            }
+        }
+        /* Check if the operand is aligned */
+        if (op_flags&NPY_ITER_ALIGNED) {
+            /* Check alignment */
+            if (!PyArray_ISALIGNED(*op)) {
+                NPY_IT_DBG_PRINTF("Iterator: Setting NPY_OP_ITFLAG_CAST "
+                                    "because of NPY_ITER_ALIGNED\n");
+                *op_itflags |= NPY_OP_ITFLAG_CAST;
+            }
+        }
+        /*
+         * The check for NPY_ITER_CONTIG can only be done later,
+         * once the final iteration order is settled.
+         */
+    }
+    else {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator inputs must be ndarrays");
+        return 0;
+    }
+
+    return 1;
+}
+
+/*
+ * Process all the operands, copying new references so further processing
+ * can replace the arrays if copying is necessary.
+ *
+ * Fills in 'op', 'op_dataptr', 'op_dtype', and 'op_itflags' for each of
+ * the 'niter' operands.  On failure, every reference acquired so far is
+ * released and 0 is returned; on success, 1 is returned and the caller
+ * owns the 'op' and 'op_dtype' references.
+ */
+static int
+npyiter_prepare_operands(npy_intp niter, PyArrayObject **op_in,
+                    PyArrayObject **op,
+                    char **op_dataptr,
+                    PyArray_Descr **op_request_dtypes,
+                    PyArray_Descr **op_dtype,
+                    npy_uint32 flags,
+                    npy_uint32 *op_flags, char *op_itflags)
+{
+    npy_intp iiter, i;
+
+    for (iiter = 0; iiter < niter; ++iiter) {
+        op[iiter] = op_in[iiter];
+        Py_XINCREF(op[iiter]);
+        op_dtype[iiter] = NULL;
+
+        /* Check the readonly/writeonly flags, and fill in op_itflags */
+        if (!npyiter_check_per_op_flags(op_flags[iiter], &op_itflags[iiter])) {
+            /* Release everything acquired so far (including op[iiter]) */
+            for (i = 0; i <= iiter; ++i) {
+                Py_XDECREF(op[i]);
+                Py_XDECREF(op_dtype[i]);
+            }
+            return 0;
+        }
+
+        /*
+         * Prepare the operand.  This produces an op_dtype[iiter] reference
+         * on success.
+         */
+        if (!npyiter_prepare_one_operand(&op[iiter],
+                        &op_dataptr[iiter],
+                        op_request_dtypes ? op_request_dtypes[iiter] : NULL,
+                        &op_dtype[iiter],
+                        flags,
+                        op_flags[iiter], &op_itflags[iiter])) {
+            for (i = 0; i <= iiter; ++i) {
+                Py_XDECREF(op[i]);
+                Py_XDECREF(op_dtype[i]);
+            }
+            return 0;
+        }
+    }
+
+
+    /* If all the operands were NULL, it's an error */
+    if (op[0] == NULL) {
+        int all_null = 1;
+        for (iiter = 1; iiter < niter; ++iiter) {
+            if (op[iiter] != NULL) {
+                all_null = 0;
+                break;
+            }
+        }
+        if (all_null) {
+            npy_intp i;
+
+            for (i = 0; i < niter; ++i) {
+                Py_XDECREF(op[i]);
+                Py_XDECREF(op_dtype[i]);
+            }
+            PyErr_SetString(PyExc_ValueError,
+                    "At least one iterator input must be non-NULL");
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+/*
+ * Maps an NPY_CASTING value to the quoted keyword-argument spelling
+ * used in error messages ("<unknown>" for unrecognized values).
+ */
+static const char *
+npyiter_casting_to_string(NPY_CASTING casting)
+{
+    const char *name = "<unknown>";
+
+    if (casting == NPY_NO_CASTING) {
+        name = "'no'";
+    }
+    else if (casting == NPY_EQUIV_CASTING) {
+        name = "'equiv'";
+    }
+    else if (casting == NPY_SAFE_CASTING) {
+        name = "'safe'";
+    }
+    else if (casting == NPY_SAME_KIND_CASTING) {
+        name = "'same_kind'";
+    }
+    else if (casting == NPY_UNSAFE_CASTING) {
+        name = "'unsafe'";
+    }
+
+    return name;
+}
+
+/*
+ * Checks that each requested iteration dtype is reachable from the
+ * operand's actual dtype under 'casting': in the read direction
+ * (operand -> iteration type) for readable operands and in the write
+ * direction (iteration type -> operand) for writeable ones.  Sets
+ * NPY_OP_ITFLAG_CAST on operands whose types are not equivalent.
+ *
+ * Returns 1 on success, 0 on failure (with an exception set).
+ */
+static int
+npyiter_check_casting(npy_intp niter, PyArrayObject **op,
+                    PyArray_Descr **op_dtype,
+                    NPY_CASTING casting,
+                    char *op_itflags)
+{
+    npy_intp iiter;
+
+    for(iiter = 0; iiter < niter; ++iiter) {
+        NPY_IT_DBG_PRINTF("Iterator: Checking casting for operand %d\n",
+                            (int)iiter);
+#if NPY_IT_DBG_TRACING
+        printf("op: ");
+        if (op[iiter] != NULL) {
+            PyObject_Print((PyObject *)PyArray_DESCR(op[iiter]), stdout, 0);
+        }
+        else {
+            printf("<null>");
+        }
+        printf(", iter: ");
+        PyObject_Print((PyObject *)op_dtype[iiter], stdout, 0);
+        printf("\n");
+#endif
+        /* If the types aren't equivalent, a cast is necessary */
+        if (op[iiter] != NULL && !PyArray_EquivTypes(PyArray_DESCR(op[iiter]),
+                                     op_dtype[iiter])) {
+            /* Check read (op -> temp) casting */
+            if ((op_itflags[iiter]&NPY_OP_ITFLAG_READ) &&
+                        !PyArray_CanCastArrayTo(op[iiter],
+                                          op_dtype[iiter],
+                                          casting)) {
+                PyErr_Format(PyExc_TypeError,
+                        "Iterator operand %d dtype could not be cast "
+                        "to the requested dtype, according to "
+                        "the casting rule given, %s", (int)iiter,
+                        npyiter_casting_to_string(casting));
+                return 0;
+            }
+            /* Check write (temp -> op) casting */
+            if ((op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) &&
+                        !PyArray_CanCastTypeTo(op_dtype[iiter],
+                                          PyArray_DESCR(op[iiter]),
+                                          casting)) {
+                PyErr_Format(PyExc_TypeError,
+                        "Iterator requested dtype could not be cast "
+                        "to the operand %d dtype, according to "
+                        "the casting rule given, %s", (int)iiter,
+                        npyiter_casting_to_string(casting));
+                return 0;
+            }
+
+            NPY_IT_DBG_PRINTF("Iterator: Setting NPY_OP_ITFLAG_CAST "
+                                "because the types aren't equivalent\n");
+            /* Indicate that this operand needs casting */
+            op_itflags[iiter] |= NPY_OP_ITFLAG_CAST;
+        }
+    }
+
+    return 1;
+}
+
+/*
+ * Builds a Python string like "(2,3)<ending>" from 'n' shape values.
+ * Negative entries represent "newaxis": leading negative entries are
+ * omitted, later ones are printed as ",newaxis".  Used to format the
+ * broadcast-error messages.
+ *
+ * Returns a new reference, or NULL on error.
+ */
+static PyObject *
+npyiter_shape_string(npy_intp n, npy_intp *vals, char *ending)
+{
+    npy_intp i;
+    PyObject *ret, *tmp;
+
+    /*
+     * Negative dimension indicates "newaxis", which can
+     * be discarded for printing if its a leading dimension.
+     * Find the first non-"newaxis" dimension.
+     */
+    i = 0;
+    while (i < n && vals[i] < 0) {
+        ++i;
+    }
+
+    if (i == n) {
+        return PyString_FromFormat("()%s", ending);
+    }
+    else {
+        ret = PyString_FromFormat("(%zd", vals[i++]);
+        if (ret == NULL) {
+            return NULL;
+        }
+    }
+
+    for (; i < n; ++i) {
+        if (vals[i] < 0) {
+            tmp = PyString_FromString(",newaxis");
+        }
+        else {
+            tmp = PyString_FromFormat(",%zd", vals[i]);
+        }
+        if (tmp == NULL) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+
+        /* Concat steals both; sets ret to NULL on failure */
+        PyString_ConcatAndDel(&ret, tmp);
+        if (ret == NULL) {
+            return NULL;
+        }
+    }
+
+    tmp = PyString_FromFormat(")%s", ending);
+    PyString_ConcatAndDel(&ret, tmp);
+    return ret;
+}
+
+/*
+ * Fills in the AXISDATA for the 'niter' operands, broadcasting
+ * the dimensions as necessary.  Also fills
+ * in the ITERSIZE data member.
+ *
+ * If op_axes is not NULL, it should point to an array of ndim-sized
+ * arrays, one for each op.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+static int
+npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, char *op_itflags,
+                    char **op_dataptr,
+                    npy_uint32 *op_flags, npy_intp **op_axes,
+                    int output_scalars)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp iiter, niter = NIT_NITER(iter);
+
+    npy_intp ondim;
+    char *odataptr;
+    NpyIter_AxisData *axisdata0, *axisdata;
+    npy_intp sizeof_axisdata;
+    PyArrayObject **op = NIT_OPERANDS(iter);
+
+    axisdata0 = NIT_AXISDATA(iter);
+    /* NOTE(review): sizeof_axisdata appears to be consumed by the
+     * NIT_ADVANCE_AXISDATA macro -- confirm against its definition. */
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+
+    /* Process the first operand */
+    if (op_axes == NULL || op_axes[0] == NULL) {
+        /* Default broadcasting rules if op_axes is not specified */
+        axisdata = axisdata0;
+        ondim = (op[0] == NULL) ? 0 : PyArray_NDIM(op[0]);
+        odataptr = op_dataptr[0];
+        /* Possible if op_axes are being used, but op_axes[0] is NULL */
+        if (ondim > ndim) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Iterator input has more dimensions than allowed "
+                    "by the 'op_axes' specified");
+            return 0;
+        }
+        /* Axes are stored in reverse (fastest-varying first) order */
+        for (idim = 0; idim < ondim; ++idim) {
+            npy_intp shape;
+
+            /* op[0] != NULL, because we set ondim to 0 in that case */
+            shape = PyArray_DIM(op[0], ondim-idim-1);
+
+            NAD_SHAPE(axisdata) = shape;
+            NAD_COORD(axisdata) = 0;
+            if (shape == 1) {
+                /* A stride of 0 broadcasts this size-1 dimension */
+                NAD_STRIDES(axisdata)[0] = 0;
+            }
+            else {
+                NAD_STRIDES(axisdata)[0] = PyArray_STRIDE(op[0], ondim-idim-1);
+            }
+            NAD_PTRS(axisdata)[0] = odataptr;
+
+            NIT_ADVANCE_AXISDATA(axisdata, 1);
+        }
+        /* Pad the remaining dimensions with broadcast size-1 entries */
+        for (idim = ondim; idim < ndim; ++idim) {
+            NAD_SHAPE(axisdata) = 1;
+            NAD_COORD(axisdata) = 0;
+            NAD_STRIDES(axisdata)[0] = 0;
+            NAD_PTRS(axisdata)[0] = odataptr;
+
+            NIT_ADVANCE_AXISDATA(axisdata, 1);
+        }
+    }
+    else {
+        npy_intp *axes = op_axes[0];
+
+        /* Use op_axes to choose the axes */
+        axisdata = axisdata0;
+        ondim = (op[0] == NULL) ? ndim : PyArray_NDIM(op[0]);
+        odataptr = op_dataptr[0];
+        for (idim = 0; idim < ndim; ++idim) {
+            npy_intp i = axes[ndim-idim-1];
+            if (i < 0) {
+                /* Negative op_axes entry means "newaxis" for this operand */
+                NAD_SHAPE(axisdata) = 1;
+                NAD_COORD(axisdata) = 0;
+                NAD_STRIDES(axisdata)[0] = 0;
+                NAD_PTRS(axisdata)[0] = odataptr;
+            }
+            else if (i < ondim) {
+                npy_intp shape;
+
+                if (op[0] != NULL) {
+                    shape = PyArray_DIM(op[0], i);
+                }
+                else {
+                    shape = 1;
+                }
+
+                NAD_SHAPE(axisdata) = shape;
+                NAD_COORD(axisdata) = 0;
+                if (shape == 1) {
+                    NAD_STRIDES(axisdata)[0] = 0;
+                }
+                else {
+                    NAD_STRIDES(axisdata)[0] = PyArray_STRIDE(op[0], i);
+                }
+                NAD_PTRS(axisdata)[0] = odataptr;
+            }
+            else {
+                PyErr_Format(PyExc_ValueError,
+                        "Iterator input op_axes[0][%d] (==%d) is not a valid "
+                        "axis of op[0], which has %d dimensions",
+                        (int)(ndim-idim-1), (int)i, (int)ondim);
+                return 0;
+            }
+
+            NIT_ADVANCE_AXISDATA(axisdata, 1);
+        }
+    }
+
+    /*
+     * Process the rest of the operands, using the broadcasting rules
+     * to combine them.
+     */
+    for (iiter = 1; iiter < niter; ++iiter) {
+        if (op_axes == NULL || op_axes[iiter] == NULL) {
+            axisdata = axisdata0;
+            ondim = (op[iiter] == NULL) ? 0 : PyArray_NDIM(op[iiter]);
+            odataptr = op_dataptr[iiter];
+            /* Possible if op_axes are being used, but op_axes[iiter] is NULL */
+            if (ondim > ndim) {
+                PyErr_SetString(PyExc_ValueError,
+                        "input operand has more dimensions than allowed "
+                        "by the axis remapping");
+                return 0;
+            }
+            for (idim = 0; idim < ondim; ++idim) {
+                npy_intp shape;
+
+                /* op[iiter] != NULL, because we set ondim to 0 in that case */
+                shape = PyArray_DIM(op[iiter], ondim-idim-1);
+
+                if (shape == 1) {
+                    NAD_STRIDES(axisdata)[iiter] = 0;
+                }
+                else {
+                    /* Size-1 entries grow to match; others must agree */
+                    if (NAD_SHAPE(axisdata) == 1) {
+                        NAD_SHAPE(axisdata) = shape;
+                    }
+                    else if (NAD_SHAPE(axisdata) != shape) {
+                        goto broadcast_error;
+                    }
+                    NAD_STRIDES(axisdata)[iiter] = PyArray_STRIDE(
+                                                    op[iiter], ondim-idim-1);
+                }
+                NAD_PTRS(axisdata)[iiter] = odataptr;
+
+                NIT_ADVANCE_AXISDATA(axisdata, 1);
+            }
+            for (idim = ondim; idim < ndim; ++idim) {
+                NAD_STRIDES(axisdata)[iiter] = 0;
+                NAD_PTRS(axisdata)[iiter] = odataptr;
+
+                NIT_ADVANCE_AXISDATA(axisdata, 1);
+            }
+        }
+        else {
+            npy_intp *axes = op_axes[iiter];
+
+            /* Use op_axes to choose the axes */
+            axisdata = axisdata0;
+            ondim = (op[iiter] == NULL) ? ndim : PyArray_NDIM(op[iiter]);
+            odataptr = op_dataptr[iiter];
+            for (idim = 0; idim < ndim; ++idim) {
+                npy_intp i = axes[ndim-idim-1];
+                if (i < 0) {
+                    NAD_STRIDES(axisdata)[iiter] = 0;
+                    NAD_PTRS(axisdata)[iiter] = odataptr;
+                }
+                else if (i < ondim) {
+                    npy_intp shape;
+
+                    if (op[iiter] != NULL) {
+                        shape = PyArray_DIM(op[iiter], i);
+                    }
+                    else {
+                        shape = 1;
+                    }
+
+                    if (shape == 1) {
+                        NAD_STRIDES(axisdata)[iiter] = 0;
+                    }
+                    else {
+                        if (NAD_SHAPE(axisdata) == 1) {
+                            NAD_SHAPE(axisdata) = shape;
+                        }
+                        else if (NAD_SHAPE(axisdata) != shape) {
+                            goto broadcast_error;
+                        }
+                        NAD_STRIDES(axisdata)[iiter] =
+                                        PyArray_STRIDE(op[iiter], i);
+                    }
+                    NAD_PTRS(axisdata)[iiter] = odataptr;
+                }
+                else {
+                    PyErr_Format(PyExc_ValueError,
+                            "Iterator input op_axes[%d][%d] (==%d) is not a "
+                            "valid axis of op[%d], which has %d dimensions ",
+                            (int)iiter, (int)(ndim-idim-1), (int)i,
+                            (int)iiter, (int)ondim);
+                    return 0;
+                }
+
+                NIT_ADVANCE_AXISDATA(axisdata, 1);
+            }
+        }
+    }
+
+    /* Go through and check for operands for broadcasting and reduction */
+    for (iiter = 0; iiter < niter; ++iiter) {
+        if (op[iiter] != NULL) {
+            /*
+             * If broadcasting is disallowed for this operand,
+             * unless scalars are output, which means all operands are scalar
+             * and no broadcasting errors could occur
+             */
+            if ((op_flags[iiter]&NPY_ITER_NO_BROADCAST) && !output_scalars) {
+                npy_intp *axes;
+
+                axes = op_axes ? op_axes[iiter] : NULL;
+                axisdata = axisdata0;
+                for (idim = 0; idim < ndim; ++idim) {
+                    npy_intp i;
+                    if (axes) {
+                        i = axes[ndim-idim-1];
+                    }
+                    else {
+                        i = ndim-idim-1;
+                    }
+                    /* The operand's own dim must equal the broadcast dim */
+                    if (i >= 0 && i < PyArray_NDIM(op[iiter])) {
+                        if (PyArray_DIM(op[iiter], i) != NAD_SHAPE(axisdata)) {
+                            goto operand_different_than_broadcast;
+                        }
+                    }
+                    /*
+                     * Also disallow broadcasting by adding additional
+                     * dimensions, unless that part of the broadcasting
+                     * was done explicitly through op_axes.
+                     */
+                    else if (!axes) {
+                        goto operand_different_than_broadcast;
+                    }
+                    /* If its writeable, this may cause a reduction */
+                    else if ((op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) &&
+                                NAD_SHAPE(axisdata) > 1) {
+                        if (!(flags&NPY_ITER_REDUCE_OK)) {
+                            PyErr_SetString(PyExc_ValueError,
+                                    "operand requires a reduction, but "
+                                    "reduction is not enabled");
+                            return 0;
+                        }
+                        if (!(op_itflags[iiter]&NPY_OP_ITFLAG_READ)) {
+                            PyErr_SetString(PyExc_ValueError,
+                                    "operand requires a reduction, but "
+                                    "is flagged as write-only, not "
+                                    "read-write");
+                            return 0;
+                        }
+                        NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE;
+                        op_itflags[iiter] |= NPY_OP_ITFLAG_REDUCE;
+                    }
+
+                    NIT_ADVANCE_AXISDATA(axisdata, 1);
+                }
+            }
+            /* Check whether this operand includes any reduction */
+            else if (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) {
+                axisdata = axisdata0;
+                for (idim = 0; idim < ndim; ++idim) {
+                    /*
+                     * If the stride is 0 and the shape is bigger than
+                     * one, that's a reduction.
+                     */
+                    if (NAD_SHAPE(axisdata) > 1 &&
+                                NAD_STRIDES(axisdata)[iiter] == 0) {
+                        if (!(flags&NPY_ITER_REDUCE_OK)) {
+                            PyErr_SetString(PyExc_ValueError,
+                                    "operand requires a reduction, but "
+                                    "reduction is not enabled");
+                            return 0;
+                        }
+                        if (!(op_itflags[iiter]&NPY_OP_ITFLAG_READ)) {
+                            PyErr_SetString(PyExc_ValueError,
+                                    "operand requires a reduction, but "
+                                    "is flagged as write-only, not "
+                                    "read-write");
+                            return 0;
+                        }
+                        NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE;
+                        op_itflags[iiter] |= NPY_OP_ITFLAG_REDUCE;
+                        break;
+                    }
+
+                    NIT_ADVANCE_AXISDATA(axisdata, 1);
+                }
+            }
+        }
+    }
+
+    /* Now fill in the ITERSIZE member */
+    /* NOTE(review): no overflow check on this product -- confirm callers
+     * guard against shapes whose product exceeds NPY_MAX_INTP. */
+    NIT_ITERSIZE(iter) = 1;
+    axisdata = axisdata0;
+    for (idim = 0; idim < ndim; ++idim) {
+        NIT_ITERSIZE(iter) *= NAD_SHAPE(axisdata);
+
+        NIT_ADVANCE_AXISDATA(axisdata, 1);
+    }
+    /* The range defaults to everything */
+    NIT_ITERSTART(iter) = 0;
+    NIT_ITEREND(iter) = NIT_ITERSIZE(iter);
+
+    return 1;
+
+broadcast_error: {
+        PyObject *errmsg, *tmp;
+        npy_intp remdims[NPY_MAXDIMS];
+        char *tmpstr;
+
+        if (op_axes == NULL) {
+            errmsg = PyString_FromString("operands could not be broadcast "
+                                         "together with shapes ");
+            for (iiter = 0; iiter < niter; ++iiter) {
+                if (op[iiter] != NULL) {
+                    tmp = npyiter_shape_string(PyArray_NDIM(op[iiter]),
+                                                    PyArray_DIMS(op[iiter]),
+                                                    " ");
+                    if (tmp == NULL) {
+                        return 0;
+                    }
+                    PyString_ConcatAndDel(&errmsg, tmp);
+                    if (errmsg == NULL) {
+                        return 0;
+                    }
+                }
+            }
+            PyErr_SetObject(PyExc_ValueError, errmsg);
+        }
+        else {
+            errmsg = PyString_FromString("operands could not be broadcast "
+                                         "together with remapped shapes "
+                                         "[original->remapped]: ");
+            for (iiter = 0; iiter < niter; ++iiter) {
+                if (op[iiter] != NULL) {
+                    npy_intp *axes = op_axes[iiter];
+
+                    tmpstr = (axes == NULL) ? " " : "->";
+                    tmp = npyiter_shape_string(PyArray_NDIM(op[iiter]),
+                                                    PyArray_DIMS(op[iiter]),
+                                                    tmpstr);
+                    if (tmp == NULL) {
+                        return 0;
+                    }
+                    PyString_ConcatAndDel(&errmsg, tmp);
+                    if (errmsg == NULL) {
+                        return 0;
+                    }
+
+                    if (axes != NULL) {
+                        for (idim = 0; idim < ndim; ++idim) {
+                            npy_intp i = axes[ndim-idim-1];
+
+                            if (i >= 0 && i < PyArray_NDIM(op[iiter])) {
+                                remdims[idim] = PyArray_DIM(op[iiter], i);
+                            }
+                            else {
+                                remdims[idim] = -1;
+                            }
+                        }
+                        tmp = npyiter_shape_string(ndim, remdims, " ");
+                        if (tmp == NULL) {
+                            return 0;
+                        }
+                        PyString_ConcatAndDel(&errmsg, tmp);
+                        if (errmsg == NULL) {
+                            return 0;
+                        }
+                    }
+                }
+            }
+            PyErr_SetObject(PyExc_ValueError, errmsg);
+        }
+
+        return 0;
+    }
+
+operand_different_than_broadcast: {
+        npy_intp remdims[NPY_MAXDIMS];
+        PyObject *errmsg, *tmp;
+
+        /* Start of error message */
+        if (op_flags[iiter]&NPY_ITER_READONLY) {
+            errmsg = PyString_FromString("non-broadcastable operand "
+                                         "with shape ");
+        }
+        else {
+            errmsg = PyString_FromString("non-broadcastable output "
+                                         "operand with shape ");
+        }
+        if (errmsg == NULL) {
+            return 0;
+        }
+
+        /* Operand shape */
+        tmp = npyiter_shape_string(PyArray_NDIM(op[iiter]),
+                                        PyArray_DIMS(op[iiter]), "");
+        if (tmp == NULL) {
+            return 0;
+        }
+        PyString_ConcatAndDel(&errmsg, tmp);
+        if (errmsg == NULL) {
+            return 0;
+        }
+        /* Remapped operand shape */
+        if (op_axes != NULL && op_axes[iiter] != NULL) {
+            npy_intp *axes = op_axes[iiter];
+
+            for (idim = 0; idim < ndim; ++idim) {
+                npy_intp i = axes[ndim-idim-1];
+
+                if (i >= 0 && i < PyArray_NDIM(op[iiter])) {
+                    remdims[idim] = PyArray_DIM(op[iiter], i);
+                }
+                else {
+                    remdims[idim] = -1;
+                }
+            }
+
+            tmp = PyString_FromString(" [remapped to ");
+            if (tmp == NULL) {
+                return 0;
+            }
+            PyString_ConcatAndDel(&errmsg, tmp);
+            if (errmsg == NULL) {
+                return 0;
+            }
+
+            tmp = npyiter_shape_string(ndim, remdims, "]");
+            if (tmp == NULL) {
+                return 0;
+            }
+            PyString_ConcatAndDel(&errmsg, tmp);
+            if (errmsg == NULL) {
+                return 0;
+            }
+        }
+
+        tmp = PyString_FromString(" doesn't match the broadcast shape ");
+        if (tmp == NULL) {
+            return 0;
+        }
+        PyString_ConcatAndDel(&errmsg, tmp);
+        if (errmsg == NULL) {
+            return 0;
+        }
+
+        /* Fill in the broadcast shape */
+        axisdata = NIT_AXISDATA(iter);
+        for (idim = 0; idim < ndim; ++idim) {
+            remdims[idim] = NAD_SHAPE(axisdata);
+
+            NIT_ADVANCE_AXISDATA(axisdata, 1);
+        }
+
+        /* Broadcast shape */
+        tmp = npyiter_shape_string(ndim, remdims, "");
+        if (tmp == NULL) {
+            return 0;
+        }
+        PyString_ConcatAndDel(&errmsg, tmp);
+        if (errmsg == NULL) {
+            return 0;
+        }
+
+        PyErr_SetObject(PyExc_ValueError, errmsg);
+
+        return 0;
+    }
+}
+
+/*
+ * Replaces the AXISDATA for the iiter'th operand, broadcasting
+ * the dimensions as necessary. Assumes the replacement array is
+ * exactly the same shape as the original array used when
+ * npy_fill_axisdata was called.
+ *
+ * If op_axes is not NULL, it should point to an ndim-sized
+ * array.
+ *
+ * Strides are only rewritten for axes whose operand dimension is
+ * not 1, so broadcast axes keep whatever stride is already stored
+ * in the AXISDATA. Axes flipped by the perm get a negated stride,
+ * and the base data pointer is advanced to the last element along
+ * that axis so iteration visits the same memory in reverse.
+ */
+static void
+npyiter_replace_axisdata(NpyIter *iter, npy_intp iiter,
+                    PyArrayObject *op,
+                    npy_intp op_ndim, char *op_dataptr,
+                    npy_intp *op_axes)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    NpyIter_AxisData *axisdata0, *axisdata;
+    npy_intp sizeof_axisdata;
+    char *perm;
+    npy_intp baseoffset = 0;
+
+    perm = NIT_PERM(iter);
+    axisdata0 = NIT_AXISDATA(iter);
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+
+    /*
+     * Replace just the strides which were non-zero, and compute
+     * the base data address.
+     */
+    axisdata = axisdata0;
+
+    for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        char p;
+        npy_intp i, shape;
+
+        /* Apply the perm to get the original axis */
+        p = perm[idim];
+        if (p < 0) {
+            /* A negative perm entry encodes an axis iterated in reverse */
+            i = ndim+p;
+        }
+        else {
+            i = ndim-p-1;
+        }
+
+        /* Apply op_axes */
+        if (op_axes != NULL) {
+            /* Caller-provided mapping from iterator axis to operand axis */
+            i = op_axes[i];
+        }
+        else {
+            /* Default mapping aligns trailing dimensions, like broadcasting */
+            i -= (ndim - op_ndim);
+        }
+
+        if (i >= 0 && i < op_ndim) {
+            shape = PyArray_DIM(op, i);
+            if (shape != 1) {
+                npy_intp stride = PyArray_STRIDE(op, i);
+                if (p < 0) {
+                    /* If the perm entry is negative, flip the axis */
+                    NAD_STRIDES(axisdata)[iiter] = -stride;
+                    /* Offset the base pointer to this axis' last element */
+                    baseoffset += stride*(shape-1);
+                }
+                else {
+                    NAD_STRIDES(axisdata)[iiter] = stride;
+                }
+            }
+        }
+    }
+
+    op_dataptr += baseoffset;
+
+    /* Now the base data pointer is calculated, set it everywhere it's needed */
+    NIT_RESETDATAPTR(iter)[iiter] = op_dataptr;
+    NIT_BASEOFFSETS(iter)[iiter] = baseoffset;
+    axisdata = axisdata0;
+    /* Every AXISDATA entry carries this operand's current data pointer */
+    for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        NAD_PTRS(axisdata)[iiter] = op_dataptr;
+    }
+}
+
+/*
+ * Computes the iterator's index strides and initializes the index values
+ * to zero.
+ *
+ * This must be called before the axes (i.e. the AXISDATA array) may
+ * be reordered.
+ *
+ * The index stride and its running value live in the extra slot
+ * [niter] of each AXISDATA's strides/ptrs arrays, just after the
+ * per-operand entries.
+ */
+static void
+npyiter_compute_index_strides(NpyIter *iter, npy_uint32 flags)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    npy_intp indexstride;
+    NpyIter_AxisData *axisdata;
+    npy_intp sizeof_axisdata;
+
+    /*
+     * If there is only one element being iterated, we just have
+     * to touch the first AXISDATA because nothing will ever be
+     * incremented.
+     */
+    if (NIT_ITERSIZE(iter) == 1) {
+        if (itflags&NPY_ITFLAG_HASINDEX) {
+            axisdata = NIT_AXISDATA(iter);
+            NAD_PTRS(axisdata)[niter] = 0;
+        }
+        return;
+    }
+
+    if (flags&NPY_ITER_C_INDEX) {
+        sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+        axisdata = NIT_AXISDATA(iter);
+        indexstride = 1;
+        /*
+         * Walk the AXISDATA forward, accumulating the product of the
+         * shapes so each axis' index stride is the number of elements
+         * covered by the axes before it.
+         */
+        for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+            npy_intp shape = NAD_SHAPE(axisdata);
+
+            if (shape == 1) {
+                /* A size-1 axis never advances the index */
+                NAD_STRIDES(axisdata)[niter] = 0;
+            }
+            else {
+                NAD_STRIDES(axisdata)[niter] = indexstride;
+            }
+            NAD_PTRS(axisdata)[niter] = 0;
+            indexstride *= shape;
+        }
+    }
+    else if (flags&NPY_ITER_F_INDEX) {
+        sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+        /* For a Fortran-order index, walk the AXISDATA in reverse */
+        axisdata = NIT_INDEX_AXISDATA(NIT_AXISDATA(iter), ndim-1);
+        indexstride = 1;
+        for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, -1)) {
+            npy_intp shape = NAD_SHAPE(axisdata);
+
+            if (shape == 1) {
+                NAD_STRIDES(axisdata)[niter] = 0;
+            }
+            else {
+                NAD_STRIDES(axisdata)[niter] = indexstride;
+            }
+            NAD_PTRS(axisdata)[niter] = 0;
+            indexstride *= shape;
+        }
+    }
+}
+
+/*
+ * Forces the iteration order when 'order' is anything other than
+ * NPY_KEEPORDER, recording that fact in the iterator flags.
+ * NPY_KEEPORDER leaves the iterator free to choose its own order.
+ */
+static void
+npyiter_apply_forced_iteration_order(NpyIter *iter, NPY_ORDER order)
+{
+    npy_intp ndim = NIT_NDIM(iter);
+    npy_intp iiter, niter = NIT_NITER(iter);
+
+    if (order == NPY_CORDER) {
+        /* The AXISDATA is already arranged for C order */
+        NIT_ITFLAGS(iter) |= NPY_ITFLAG_FORCEDORDER;
+    }
+    else if (order == NPY_FORTRANORDER) {
+        NIT_ITFLAGS(iter) |= NPY_ITFLAG_FORCEDORDER;
+        /* A reversal only changes anything with 2 or more dimensions */
+        if (ndim > 1) {
+            npyiter_reverse_axis_ordering(iter);
+        }
+    }
+    else if (order == NPY_ANYORDER) {
+        NIT_ITFLAGS(iter) |= NPY_ITFLAG_FORCEDORDER;
+        if (ndim > 1) {
+            PyArrayObject **op = NIT_OPERANDS(iter);
+            int forder = 1;
+
+            /* Pick Fortran order only when every operand is F-contiguous */
+            for (iiter = 0; iiter < niter; ++iiter, ++op) {
+                if (*op && !PyArray_CHKFLAGS(*op, NPY_F_CONTIGUOUS)) {
+                    forder = 0;
+                    break;
+                }
+            }
+            if (forder) {
+                npyiter_reverse_axis_ordering(iter);
+            }
+        }
+    }
+    /* NPY_KEEPORDER: deliberately leave the forced-order flag unset */
+}
+
+
+/*
+ * This function negates any strides in the iterator
+ * which are negative. When iterating more than one
+ * object, it only flips strides when they are all
+ * negative or zero.
+ *
+ * Flipping a dimension also moves the affected base pointers to
+ * that axis' last element, so the same elements are still visited,
+ * just in the opposite direction.
+ */
+static void
+npyiter_flip_negative_strides(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp iiter, niter = NIT_NITER(iter);
+
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+    NpyIter_AxisData *axisdata, *axisdata0;
+    npy_intp *baseoffsets;
+    npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+    int any_flipped = 0;
+
+    axisdata0 = axisdata = NIT_AXISDATA(iter);
+    baseoffsets = NIT_BASEOFFSETS(iter);
+    for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_intp *strides = NAD_STRIDES(axisdata);
+        int any_negative = 0;
+
+        /*
+         * Check the signs of all the strides, excluding
+         * the index stride at the end. The loop breaks out early on
+         * the first positive stride, so 'iiter == niter' below means
+         * no positive stride was seen.
+         */
+        for (iiter = 0; iiter < niter; ++iiter) {
+            if (strides[iiter] < 0) {
+                any_negative = 1;
+            }
+            else if (strides[iiter] != 0) {
+                break;
+            }
+        }
+        /*
+         * If at least one stride is negative and none are positive,
+         * flip all the strides for this dimension.
+         */
+        if (any_negative && iiter == niter) {
+            npy_intp shapem1 = NAD_SHAPE(axisdata) - 1;
+
+            for (istrides = 0; istrides < nstrides; ++istrides) {
+                npy_intp stride = strides[istrides];
+
+                /* Adjust the base pointers to start at the end */
+                baseoffsets[istrides] += shapem1 * stride;
+                /* Flip the stride */
+                strides[istrides] = -stride;
+            }
+            /* Make the perm entry negative, so getcoords knows it's flipped */
+            NIT_PERM(iter)[idim] = -1-NIT_PERM(iter)[idim];
+
+            any_flipped = 1;
+        }
+    }
+
+    /*
+     * If any strides were flipped, the base pointers were adjusted
+     * in the first AXISDATA, and need to be copied to all the rest
+     */
+    if (any_flipped) {
+        char **resetdataptr = NIT_RESETDATAPTR(iter);
+
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            resetdataptr[istrides] += baseoffsets[istrides];
+        }
+        axisdata = axisdata0;
+        for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+            char **ptrs = NAD_PTRS(axisdata);
+            for (istrides = 0; istrides < nstrides; ++istrides) {
+                ptrs[istrides] = resetdataptr[istrides];
+            }
+        }
+        /*
+         * Indicate that some of the perm entries are negative,
+         * and that it's not (strictly speaking) the identity perm.
+         */
+        NIT_ITFLAGS(iter) = (NIT_ITFLAGS(iter)|NPY_ITFLAG_NEGPERM) &
+                            ~NPY_ITFLAG_IDENTPERM;
+    }
+}
+
+/*
+ * Reverses the order of the AXISDATA entries in place and records
+ * the permutation that was applied in NIT_PERM. Since the result
+ * is no longer the identity permutation, the IDENTPERM flag is
+ * cleared.
+ */
+static void
+npyiter_reverse_axis_ordering(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    npy_intp i, temp, size;
+    npy_intp *first, *last;
+    char *perm;
+
+    /*
+     * Each AXISDATA entry is treated as a flat array of npy_intp
+     * words, so whole entries can be swapped element by element.
+     */
+    size = NIT_AXISDATA_SIZEOF(itflags, ndim, niter)/NPY_SIZEOF_INTP;
+    first = (npy_intp*)NIT_AXISDATA(iter);
+    last = first + (ndim-1)*size;
+
+    /* This loop reverses the order of the AXISDATA array */
+    while (first < last) {
+        for (i = 0; i < size; ++i) {
+            temp = first[i];
+            first[i] = last[i];
+            last[i] = temp;
+        }
+        first += size;
+        last -= size;
+    }
+
+    /* Store the perm we applied */
+    perm = NIT_PERM(iter);
+    for(i = ndim-1; i >= 0; --i, ++perm) {
+        *perm = (char)i;
+    }
+
+    NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_IDENTPERM;
+}
+
+/* Returns the absolute value of x */
+static npy_intp intp_abs(npy_intp x)
+{
+    if (x < 0) {
+        return -x;
+    }
+    return x;
+}
+
+/*
+ * Reorders the AXISDATA so that, wherever the operands' strides
+ * agree, smaller strides come before larger ones. Because the
+ * AXISDATA is stored reversed from C order, this finds a more
+ * memory-friendly traversal order. The applied permutation is
+ * accumulated into NIT_PERM.
+ */
+static void
+npyiter_find_best_axis_ordering(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp iiter, niter = NIT_NITER(iter);
+
+    npy_intp i0, i1, ipos;
+    char j0, j1;
+    char *perm;
+    NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+    npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+    int permuted = 0;
+
+    perm = NIT_PERM(iter);
+
+    /*
+     * Do a custom stable insertion sort. Note that because
+     * the AXISDATA has been reversed from C order, this
+     * is sorting from smallest stride to biggest stride.
+     */
+    for (i0 = 1; i0 < ndim; ++i0) {
+        npy_intp *strides0;
+
+        /* 'ipos' is where perm[i0] will get inserted */
+        ipos = i0;
+        j0 = perm[i0];
+
+        strides0 = NAD_STRIDES(NIT_INDEX_AXISDATA(axisdata, j0));
+        for (i1 = i0-1; i1 >= 0; --i1) {
+            int ambig = 1, shouldswap = 0;
+            npy_intp *strides1;
+
+            j1 = perm[i1];
+
+            strides1 = NAD_STRIDES(NIT_INDEX_AXISDATA(axisdata, j1));
+
+            /* Compare entry i0 to entry i1 across all operands */
+            for (iiter = 0; iiter < niter; ++iiter) {
+                if (strides0[iiter] != 0 && strides1[iiter] != 0) {
+                    if (intp_abs(strides1[iiter]) <=
+                                        intp_abs(strides0[iiter])) {
+                        /*
+                         * Set swap even if it's not ambiguous already,
+                         * because in the case of conflicts between
+                         * different operands, C-order wins.
+                         */
+                        shouldswap = 0;
+                    }
+                    else {
+                        /* Only set swap if it's still ambiguous */
+                        if (ambig) {
+                            shouldswap = 1;
+                        }
+                    }
+
+                    /*
+                     * A comparison has been done, so it's
+                     * no longer ambiguous
+                     */
+                    ambig = 0;
+                }
+            }
+            /*
+             * If the comparison was unambiguous, either shift
+             * 'ipos' to 'i1' or stop looking for an insertion
+             * point
+             */
+            if (!ambig) {
+                if (shouldswap) {
+                    ipos = i1;
+                }
+                else {
+                    break;
+                }
+            }
+        }
+
+        /* Insert perm[i0] into the right place */
+        if (ipos != i0) {
+            for (i1 = i0; i1 > ipos; --i1) {
+                perm[i1] = perm[i1-1];
+            }
+            perm[ipos] = j0;
+            permuted = 1;
+        }
+    }
+
+    /* Apply the computed permutation to the AXISDATA array */
+    if (permuted == 1) {
+        npy_intp i, size = sizeof_axisdata/NPY_SIZEOF_INTP;
+        NpyIter_AxisData *ad_i;
+
+        /* Use the coord as a flag, set each to 1 */
+        ad_i = axisdata;
+        for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(ad_i, 1)) {
+            NAD_COORD(ad_i) = 1;
+        }
+        /* Apply the permutation by following the cycles */
+        for (idim = 0; idim < ndim; ++idim) {
+            ad_i = NIT_INDEX_AXISDATA(axisdata, idim);
+
+            /* If this axis hasn't been touched yet, process it */
+            if (NAD_COORD(ad_i) == 1) {
+                char pidim = perm[idim], qidim;
+                npy_intp tmp;
+                NpyIter_AxisData *ad_p, *ad_q;
+
+                if (pidim != idim) {
+                    /* Follow the cycle, copying the data */
+                    for (i = 0; i < size; ++i) {
+                        qidim = (char)idim;
+                        pidim = perm[idim];
+                        ad_q = ad_i;
+                        tmp = *((npy_intp*)ad_q + i);
+                        while (pidim != idim) {
+                            ad_p = NIT_INDEX_AXISDATA(axisdata, pidim);
+                            *((npy_intp*)ad_q + i) = *((npy_intp*)ad_p + i);
+
+                            qidim = pidim;
+                            ad_q = ad_p;
+                            pidim = perm[(int)pidim];
+                        }
+                        *((npy_intp*)ad_q + i) = tmp;
+                    }
+                    /* Follow the cycle again, marking it as done */
+                    pidim = perm[idim];
+
+                    while (pidim != idim) {
+                        NAD_COORD(NIT_INDEX_AXISDATA(axisdata, pidim)) = 0;
+                        pidim = perm[(int)pidim];
+                    }
+                }
+                NAD_COORD(ad_i) = 0;
+            }
+        }
+        /* Clear the identity perm flag */
+        NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_IDENTPERM;
+    }
+}
+
+/*
+ * Combines adjacent AXISDATA entries whose strides form a single
+ * contiguous run for every operand (and for the index, if any),
+ * reducing the number of dimensions the iterator has to track.
+ */
+static void
+npyiter_coalesce_axes(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+    NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+    npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+    NpyIter_AxisData *ad_compress;
+    npy_intp new_ndim = 1;
+
+    /* The HASCOORDS or IDENTPERM flags do not apply after coalescing */
+    NIT_ITFLAGS(iter) &= ~(NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_HASCOORDS);
+
+    /* 'ad_compress' trails 'axisdata', accumulating merged axes */
+    axisdata = NIT_AXISDATA(iter);
+    ad_compress = axisdata;
+
+    for (idim = 0; idim < ndim-1; ++idim) {
+        int can_coalesce = 1;
+        npy_intp shape0 = NAD_SHAPE(ad_compress);
+        npy_intp shape1 = NAD_SHAPE(NIT_INDEX_AXISDATA(axisdata, 1));
+        npy_intp *strides0 = NAD_STRIDES(ad_compress);
+        npy_intp *strides1 = NAD_STRIDES(NIT_INDEX_AXISDATA(axisdata, 1));
+
+        /*
+         * Check that all the axes can be coalesced: either one of the
+         * two axes is a broadcast (size 1, stride 0) axis, or the next
+         * stride continues exactly where this axis' run of elements ends.
+         */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            if (!((shape0 == 1 && strides0[istrides] == 0) ||
+                  (shape1 == 1 && strides1[istrides] == 0)) &&
+                        (strides0[istrides]*shape0 != strides1[istrides])) {
+                can_coalesce = 0;
+                break;
+            }
+        }
+
+        if (can_coalesce) {
+            npy_intp *strides = NAD_STRIDES(ad_compress);
+
+            /* Merge the next axis into the compressed one */
+            NIT_ADVANCE_AXISDATA(axisdata, 1);
+            NAD_SHAPE(ad_compress) *= NAD_SHAPE(axisdata);
+            for (istrides = 0; istrides < nstrides; ++istrides) {
+                /* A zero (broadcast) stride takes the merged axis' stride */
+                if (strides[istrides] == 0) {
+                    strides[istrides] = NAD_STRIDES(axisdata)[istrides];
+                }
+            }
+        }
+        else {
+            NIT_ADVANCE_AXISDATA(axisdata, 1);
+            NIT_ADVANCE_AXISDATA(ad_compress, 1);
+            if (ad_compress != axisdata) {
+                memcpy(ad_compress, axisdata, sizeof_axisdata);
+            }
+            ++new_ndim;
+        }
+    }
+
+    /*
+     * If the number of axes shrunk, reset the perm and
+     * compress the data into the new layout.
+     */
+    if (new_ndim < ndim) {
+        char *perm = NIT_PERM(iter);
+
+        /* Reset to an identity perm */
+        for (idim = 0; idim < new_ndim; ++idim) {
+            perm[idim] = (char)idim;
+        }
+        NIT_NDIM(iter) = new_ndim;
+    }
+}
+
+/*
+ * Allocates a temporary array which can be used to replace op
+ * in the iteration. Its dtype will be op_dtype.
+ *
+ * The result array has a memory ordering which matches the iterator,
+ * which may or may not match that of op. The parameter 'shape' may be
+ * NULL, in which case it is filled in from the iterator's shape.
+ *
+ * This function must be called before any axes are coalesced.
+ *
+ * Parameters:
+ *   subtype    - array type to allocate (PyArray_Type or a subclass)
+ *   flags      - the iterator's construction flags (NPY_ITER_*)
+ *   op_itflags - pointer to this operand's per-op flags; may get
+ *                NPY_OP_ITFLAG_REDUCE set here
+ *   op_ndim    - number of dimensions for the new array (when shape
+ *                is NULL, it is replaced with the iterator's ndim)
+ *   shape      - the desired shape, or NULL to take it from the iterator
+ *   op_dtype   - dtype for the new array (a reference is taken)
+ *   op_axes    - optional iterator-axis -> operand-axis mapping
+ *
+ * Returns the new array, or NULL with a Python error set.
+ */
+static PyArrayObject *
+npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype,
+                npy_uint32 flags, char *op_itflags,
+                npy_intp op_ndim, npy_intp *shape,
+                PyArray_Descr *op_dtype, npy_intp *op_axes)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    char *perm = NIT_PERM(iter);
+    npy_intp new_shape[NPY_MAXDIMS], strides[NPY_MAXDIMS],
+             stride = op_dtype->elsize;
+    /* Axes the perm marked as flipped need a reversed view at the end */
+    char reversestride[NPY_MAXDIMS], anyreverse = 0;
+    NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+    npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+    npy_intp tmp_op_axes = -1;
+
+    PyArrayObject *ret;
+
+    /*
+     * If it's an automatically allocated output, start by assuming
+     * the shape will have the same length as the iterator
+     */
+    if (shape == NULL) {
+        /*
+         * If it's a scalar output, trigger scalar allocation below
+         * by making an op_axes of -1
+         */
+        if (op_ndim == 0 && ndim == 1 && op_axes == NULL) {
+            op_axes = &tmp_op_axes;
+        }
+        op_ndim = ndim;
+    }
+
+    /* Initially no strides have been set (NPY_MAX_INTP == unset) */
+    for (idim = 0; idim < op_ndim; ++idim) {
+        strides[idim] = NPY_MAX_INTP;
+        reversestride[idim] = 0;
+    }
+
+    /*
+     * Walk the iterator's axes from fastest-varying outward, handing
+     * out strides so the new array's memory layout matches the
+     * iteration order.
+     */
+    for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_intp i;
+        char p;
+
+        /* Apply the perm to get the original axis */
+        p = perm[idim];
+        if (p < 0) {
+            i = ndim+p;
+        }
+        else {
+            i = ndim-p-1;
+        }
+
+        /* Apply op_axes */
+        if (op_axes != NULL) {
+            i = op_axes[i];
+        }
+        else {
+            i -= (ndim - op_ndim);
+        }
+
+        if (i >= 0) {
+            NPY_IT_DBG_PRINTF("Iterator: Setting allocated stride %d "
+                                "for iterator dimension %d to %d\n", (int)i,
+                                (int)idim, (int)stride);
+            strides[i] = stride;
+            if (p < 0) {
+                /* Remember that this axis was flipped by the perm */
+                reversestride[i] = 1;
+                anyreverse = 1;
+            }
+            if (shape == NULL) {
+                new_shape[i] = NAD_SHAPE(axisdata);
+                stride *= new_shape[i];
+            }
+            else {
+                stride *= shape[i];
+            }
+        }
+    }
+
+    /*
+     * If custom axes were specified, some dimensions may not have been used.
+     * Add the REDUCE itflag if this creates a reduction situation.
+     */
+    if (shape == NULL) {
+        npy_intp new_ndim = -1;
+
+        axisdata = NIT_AXISDATA(iter);
+        for (idim = 0; idim < op_ndim; ++idim) {
+            NPY_IT_DBG_PRINTF("Iterator: Checking allocated output "
+                                "dimension %d with stride %d\n",
+                                (int)idim, (int)strides[idim]);
+            if (strides[idim] == NPY_MAX_INTP) {
+                NPY_IT_DBG_PRINTF("Iterator: The axis wasn't used, "
+                                "and its dimension is %d\n",
+                                (int)NAD_SHAPE(axisdata));
+                /*
+                 * If deleting this axis produces a reduction, but
+                 * reduction wasn't enabled, throw an error
+                 */
+                if (NAD_SHAPE(axisdata) != 1) {
+                    if (!(flags&NPY_ITER_REDUCE_OK)) {
+                        PyErr_SetString(PyExc_ValueError,
+                                "output requires a reduction, but "
+                                "reduction is not enabled");
+                        return NULL;
+                    }
+                    if (!((*op_itflags)&NPY_OP_ITFLAG_READ)) {
+                        PyErr_SetString(PyExc_ValueError,
+                                "output requires a reduction, but "
+                                "is flagged as write-only, not read-write");
+                        return NULL;
+                    }
+
+                    NPY_IT_DBG_PRINTF("Iterator: Indicating that a reduction "
+                                "is occurring\n");
+                    /* Indicate that a reduction is occurring */
+                    NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE;
+                    (*op_itflags) |= NPY_OP_ITFLAG_REDUCE;
+                }
+
+                /* If we didn't get the number of dimensions yet, set it */
+                if (new_ndim == -1) {
+                    new_ndim = idim;
+                }
+            }
+            /*
+             * If there's a gap in the array's dimensions, it's an error.
+             * For example, op_axes of [0,2] for the automatically allocated
+             * output.
+             */
+            else if (new_ndim != -1) {
+                PyErr_SetString(PyExc_ValueError,
+                        "automatically allocated reduction output array "
+                        "specified with an inconsistent axis mapping");
+                return NULL;
+            }
+
+            NIT_ADVANCE_AXISDATA(axisdata, 1);
+        }
+
+        /* Trim the unused trailing dimensions off the output */
+        if (new_ndim >= 0) {
+            op_ndim = new_ndim;
+        }
+    }
+    else {
+        /*
+         * NOTE: 'idim' is reused by the nested loops below; the
+         * 'break' at the end leaves this outer loop immediately
+         * afterwards, so the outer iteration is intentionally
+         * abandoned once the first unset stride is found.
+         */
+        for (idim = 0; idim < op_ndim; ++idim) {
+            if (strides[idim] == NPY_MAX_INTP) {
+                npy_intp factor, new_strides[NPY_MAXDIMS],
+                         itemsize;
+
+                /* Fill in the missing strides in C order */
+                factor = 1;
+                itemsize = op_dtype->elsize;
+                for (idim = op_ndim-1; idim >= 0; --idim) {
+                    if (strides[idim] == NPY_MAX_INTP) {
+                        new_strides[idim] = factor * itemsize;
+                        factor *= shape[idim];
+                    }
+                }
+
+                /*
+                 * Copy the missing strides, and multiply the existing strides
+                 * by the calculated factor. This way, the missing strides
+                 * are tighter together in memory, which is good for nested
+                 * loops.
+                 */
+                for (idim = 0; idim < op_ndim; ++idim) {
+                    if (strides[idim] == NPY_MAX_INTP) {
+                        strides[idim] = new_strides[idim];
+                    }
+                    else {
+                        strides[idim] *= factor;
+                    }
+                }
+
+                break;
+            }
+        }
+    }
+
+    /* If shape was NULL, set it to the shape we calculated */
+    if (shape == NULL) {
+        shape = new_shape;
+    }
+
+    /* Allocate the temporary array */
+    Py_INCREF(op_dtype);
+    ret = (PyArrayObject *)PyArray_NewFromDescr(subtype, op_dtype, op_ndim,
+                               shape, strides, NULL, 0, NULL);
+    if (ret == NULL) {
+        return NULL;
+    }
+
+    /* If there are any reversed axes, create a view that reverses them */
+    if (anyreverse) {
+        char *dataptr = PyArray_DATA(ret);
+        PyArrayObject *newret;
+
+        /* Point at the last element and negate the stride for each one */
+        for (idim = 0; idim < op_ndim; ++idim) {
+            if (reversestride[idim]) {
+                dataptr += strides[idim]*(shape[idim]-1);
+                strides[idim] = -strides[idim];
+            }
+        }
+        Py_INCREF(op_dtype);
+        newret = (PyArrayObject *)PyArray_NewFromDescr(subtype,
+                               op_dtype, op_ndim,
+                               shape, strides, dataptr,
+                               NPY_WRITEABLE, NULL);
+        if (newret == NULL) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+        /* The view keeps the allocation alive through its base */
+        newret->base = (PyObject *)ret;
+        ret = newret;
+    }
+
+    /* Make sure all the flags are good */
+    PyArray_UpdateFlags(ret, NPY_UPDATE_ALL);
+
+    /* Double-check that the subtype didn't mess with the dimensions */
+    if (subtype != &PyArray_Type) {
+        if (PyArray_NDIM(ret) != op_ndim ||
+                    !PyArray_CompareLists(shape, PyArray_DIMS(ret), op_ndim)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                    "Iterator automatic output has an array subtype "
+                    "which changed the dimensions of the output");
+            Py_DECREF(ret);
+            return NULL;
+        }
+    }
+
+    return ret;
+}
+
+/*
+ * Makes sure every operand has a concrete array to iterate:
+ * allocates missing outputs, makes temporary (UPDATEIFCOPY)
+ * copies when a cast was requested with COPY flags, and otherwise
+ * verifies that buffering can perform any needed conversion.
+ * Also validates the NPY_ITER_CONTIG request and determines when
+ * buffering can be skipped entirely (NPY_OP_ITFLAG_BUFNEVER).
+ *
+ * Returns 1 on success, 0 on failure with a Python error set.
+ */
+static int
+npyiter_allocate_arrays(NpyIter *iter,
+                        npy_uint32 flags,
+                        PyArray_Descr **op_dtype, PyTypeObject *subtype,
+                        npy_uint32 *op_flags, char *op_itflags,
+                        npy_intp **op_axes, int output_scalars)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp iiter, niter = NIT_NITER(iter);
+
+    NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+    PyArrayObject **op = NIT_OPERANDS(iter);
+
+    for (iiter = 0; iiter < niter; ++iiter) {
+        /* Case 1: a NULL operand is an output we must allocate */
+        if (op[iiter] == NULL) {
+            PyArrayObject *out;
+            PyTypeObject *op_subtype;
+            npy_intp ondim = output_scalars ? 0 : ndim;
+
+            /* Check whether the subtype was disabled */
+            op_subtype = (op_flags[iiter]&NPY_ITER_NO_SUBTYPE) ?
+                                                &PyArray_Type : subtype;
+
+            /* Allocate the output array */
+            out = npyiter_new_temp_array(iter, op_subtype,
+                                        flags, &op_itflags[iiter],
+                                        ondim,
+                                        NULL,
+                                        op_dtype[iiter],
+                                        op_axes ? op_axes[iiter] : NULL);
+            if (out == NULL) {
+                return 0;
+            }
+
+            op[iiter] = out;
+
+            /*
+             * Now we need to replace the pointers and strides with values
+             * from the new array.
+             */
+            npyiter_replace_axisdata(iter, iiter, op[iiter], ondim,
+                    PyArray_DATA(op[iiter]), op_axes ? op_axes[iiter] : NULL);
+
+            /* New arrays are aligned and need no cast */
+            op_itflags[iiter] |= NPY_OP_ITFLAG_ALIGNED;
+            op_itflags[iiter] &= ~NPY_OP_ITFLAG_CAST;
+        }
+        /* Case 2: a cast is needed and copying was permitted */
+        else if ((op_itflags[iiter]&NPY_OP_ITFLAG_CAST) &&
+                   (op_flags[iiter]&(NPY_ITER_COPY|NPY_ITER_UPDATEIFCOPY))) {
+            PyArrayObject *temp;
+            npy_intp ondim = PyArray_NDIM(op[iiter]);
+
+            /* Allocate the temporary array, if possible */
+            temp = npyiter_new_temp_array(iter, &PyArray_Type,
+                                        flags, &op_itflags[iiter],
+                                        ondim,
+                                        PyArray_DIMS(op[iiter]),
+                                        op_dtype[iiter],
+                                        op_axes ? op_axes[iiter] : NULL);
+            if (temp == NULL) {
+                return 0;
+            }
+
+            /* If the data will be read, copy it into temp */
+            if (op_itflags[iiter]&NPY_OP_ITFLAG_READ) {
+                if (PyArray_CopyInto(temp, op[iiter]) != 0) {
+                    Py_DECREF(temp);
+                    return 0;
+                }
+            }
+            /* If the data will be written to, set UPDATEIFCOPY */
+            if (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) {
+                /* The original is write-protected until temp is written back */
+                PyArray_FLAGS(temp) |= NPY_UPDATEIFCOPY;
+                PyArray_FLAGS(op[iiter]) &= ~NPY_WRITEABLE;
+                Py_INCREF(op[iiter]);
+                temp->base = (PyObject *)op[iiter];
+            }
+
+            Py_DECREF(op[iiter]);
+            op[iiter] = temp;
+
+            /*
+             * Now we need to replace the pointers and strides with values
+             * from the temporary array.
+             */
+            npyiter_replace_axisdata(iter, iiter, op[iiter], ondim,
+                    PyArray_DATA(op[iiter]), op_axes ? op_axes[iiter] : NULL);
+
+            /* The temporary copy is aligned and needs no cast */
+            op_itflags[iiter] |= NPY_OP_ITFLAG_ALIGNED;
+            op_itflags[iiter] &= ~NPY_OP_ITFLAG_CAST;
+        }
+        /* Case 3: the operand is used as-is; buffering must cover casts */
+        else {
+            /*
+             * Buffering must be enabled for casting/conversion if copy
+             * wasn't specified.
+             */
+            if ((op_itflags[iiter]&NPY_OP_ITFLAG_CAST) &&
+                                  !(itflags&NPY_ITFLAG_BUFFER)) {
+                PyErr_SetString(PyExc_TypeError,
+                        "Iterator operand required copying or buffering, "
+                        "but neither copying nor buffering was enabled");
+                return 0;
+            }
+
+            /*
+             * If the operand is aligned, any buffering can use aligned
+             * optimizations.
+             */
+            if (PyArray_ISALIGNED(op[iiter])) {
+                op_itflags[iiter] |= NPY_OP_ITFLAG_ALIGNED;
+            }
+        }
+
+        /* Here we can finally check for contiguous iteration */
+        if (op_flags[iiter]&NPY_ITER_CONTIG) {
+            NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+            npy_intp stride = NAD_STRIDES(axisdata)[iiter];
+
+            /* Contiguous means the innermost stride equals the itemsize */
+            if (stride != op_dtype[iiter]->elsize) {
+                NPY_IT_DBG_PRINTF("Iterator: Setting NPY_OP_ITFLAG_CAST "
+                                    "because of NPY_ITER_CONTIG\n");
+                op_itflags[iiter] |= NPY_OP_ITFLAG_CAST;
+                if (!(itflags&NPY_ITFLAG_BUFFER)) {
+                    PyErr_SetString(PyExc_TypeError,
+                            "Iterator operand required buffering, "
+                            "to be contiguous as requested, but "
+                            "buffering is not enabled");
+                    return 0;
+                }
+            }
+        }
+
+        /*
+         * If no alignment, byte swap, or casting is needed, and
+         * the inner stride of this operand works for the whole
+         * array, we can set NPY_OP_ITFLAG_BUFNEVER.
+         * But, if buffering is enabled, write-buffering must be
+         * one-to-one, because the buffering write back won't combine
+         * values correctly. This test doesn't catch everything, but it will
+         * catch the most common case of a broadcasting a write-buffered
+         * dimension.
+         */
+        if ((itflags&NPY_ITFLAG_BUFFER) &&
+                                (!(op_itflags[iiter]&NPY_OP_ITFLAG_CAST) ||
+                                 (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE))) {
+            int is_one_to_one = 1;
+            npy_intp stride, shape, innerstride = 0, innershape;
+            NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+            npy_intp sizeof_axisdata =
+                                NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+            /* Find stride of the first non-empty shape */
+            for (idim = 0; idim < ndim; ++idim) {
+                innershape = NAD_SHAPE(axisdata);
+                if (innershape != 1) {
+                    innerstride = NAD_STRIDES(axisdata)[iiter];
+                    /* A zero stride on a non-1 axis means broadcasting */
+                    if (innerstride == 0) {
+                        is_one_to_one = 0;
+                    }
+                    break;
+                }
+                NIT_ADVANCE_AXISDATA(axisdata, 1);
+            }
+            ++idim;
+            NIT_ADVANCE_AXISDATA(axisdata, 1);
+            /* Check that everything could have coalesced together */
+            for (; idim < ndim; ++idim) {
+                stride = NAD_STRIDES(axisdata)[iiter];
+                shape = NAD_SHAPE(axisdata);
+                if (shape != 1) {
+                    if (stride == 0) {
+                        is_one_to_one = 0;
+                    }
+                    /*
+                     * If N times the inner stride doesn't equal this
+                     * stride, the multi-dimensionality is needed.
+                     */
+                    if (innerstride*innershape != stride) {
+                        break;
+                    }
+                    else {
+                        innershape *= shape;
+                    }
+                }
+                NIT_ADVANCE_AXISDATA(axisdata, 1);
+            }
+            /*
+             * If we looped all the way to the end, one stride works.
+             * Set that stride, because it may not belong to the first
+             * dimension.
+             */
+            if (idim == ndim && !(op_itflags[iiter]&NPY_OP_ITFLAG_CAST)) {
+                op_itflags[iiter] |= NPY_OP_ITFLAG_BUFNEVER;
+                NBF_STRIDES(bufferdata)[iiter] = innerstride;
+            }
+            else if (!is_one_to_one &&
+                     (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) &&
+                     !(flags&NPY_ITER_REDUCE_OK)) {
+                NPY_IT_DBG_PRINTF("Iterator: %d %d %d\n",
+                        (int)(!is_one_to_one),
+                        (int)((op_itflags[iiter]&NPY_OP_ITFLAG_WRITE)),
+                        (int)(!(flags&NPY_ITER_REDUCE_OK)));
+                PyErr_SetString(PyExc_ValueError,
+                        "Iterator operand requires write buffering, "
+                        "but has dimensions which have been broadcasted "
+                        "and would be combined incorrectly");
+                return 0;
+            }
+        }
+    }
+
+    return 1;
+}
+
+/*
+ * The __array_priority__ attribute of the inputs determines
+ * the subtype of any output arrays. Scans the readable operands
+ * and records the Python type and priority of the one with the
+ * highest __array_priority__ in *subtype/*subtype_priority.
+ */
+static void
+npyiter_get_priority_subtype(npy_intp niter, PyArrayObject **op,
+                            char *op_itflags,
+                            double *subtype_priority,
+                            PyTypeObject **subtype)
+{
+    npy_intp i;
+
+    for (i = 0; i < niter; ++i) {
+        double priority;
+
+        /* Only readable, non-NULL operands take part */
+        if (op[i] == NULL || !(op_itflags[i]&NPY_OP_ITFLAG_READ)) {
+            continue;
+        }
+
+        priority = PyArray_GetPriority((PyObject *)op[i], 0.0);
+        if (priority > *subtype_priority) {
+            *subtype_priority = priority;
+            *subtype = Py_TYPE(op[i]);
+        }
+    }
+}
+
+/*
+ * Calculates a dtype that all the types can be promoted to, using the
+ * ufunc rules. If only_inputs is 1, it leaves any operands that
+ * are not read from out of the calculation.
+ *
+ * Returns the result of PyArray_ResultType (NULL on error).
+ *
+ * NOTE(review): the 'output_scalars' parameter is not used anywhere
+ * in this function's body.
+ */
+static PyArray_Descr *
+npyiter_get_common_dtype(npy_intp niter, PyArrayObject **op,
+                        char *op_itflags, PyArray_Descr **op_dtype,
+                        PyArray_Descr **op_request_dtypes,
+                        int only_inputs, int output_scalars)
+{
+    npy_intp iiter;
+    npy_intp narrs = 0, ndtypes = 0;
+    PyArrayObject *arrs[NPY_MAXARGS];
+    PyArray_Descr *dtypes[NPY_MAXARGS];
+
+    NPY_IT_DBG_PRINTF("Iterator: Getting a common data type from operands\n");
+
+    for (iiter = 0; iiter < niter; ++iiter) {
+        if (op_dtype[iiter] != NULL &&
+                    (!only_inputs || (op_itflags[iiter]&NPY_OP_ITFLAG_READ))) {
+            /* If no dtype was requested and the op is a scalar, pass the op */
+            if ((op_request_dtypes == NULL ||
+                            op_request_dtypes[iiter] == NULL) &&
+                                            PyArray_NDIM(op[iiter]) == 0) {
+                arrs[narrs++] = op[iiter];
+            }
+            /* Otherwise just pass in the dtype */
+            else {
+                dtypes[ndtypes++] = op_dtype[iiter];
+            }
+        }
+    }
+
+    return PyArray_ResultType(narrs, arrs, ndtypes, dtypes);
+}
+
+/*
+ * Allocates the read/write dtype transfer functions used when the
+ * iterator buffers an operand. Operands flagged BUFNEVER keep NULL
+ * transfer functions. Write-only operands whose dtype holds
+ * references instead get a decref-only function, so references are
+ * released when buffers are recycled.
+ *
+ * Returns 1 on success, 0 on failure (PyArray_GetDTypeTransferFunction
+ * sets the error). On failure, all transfer data allocated so far is
+ * freed.
+ */
+static int
+npyiter_allocate_transfer_functions(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+    npy_intp iiter = 0, niter = NIT_NITER(iter);
+
+    npy_intp i;
+    char *op_itflags = NIT_OPITFLAGS(iter);
+    NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+    NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+    PyArrayObject **op = NIT_OPERANDS(iter);
+    PyArray_Descr **op_dtype = NIT_DTYPES(iter);
+    npy_intp *strides = NAD_STRIDES(axisdata), op_stride;
+    PyArray_StridedTransferFn **readtransferfn = NBF_READTRANSFERFN(bufferdata),
+                        **writetransferfn = NBF_WRITETRANSFERFN(bufferdata);
+    void **readtransferdata = NBF_READTRANSFERDATA(bufferdata),
+         **writetransferdata = NBF_WRITETRANSFERDATA(bufferdata);
+
+    PyArray_StridedTransferFn *stransfer = NULL;
+    void *transferdata = NULL;
+    int needs_api = 0;
+
+    for (iiter = 0; iiter < niter; ++iiter) {
+        char flags = op_itflags[iiter];
+        op_stride = strides[iiter];
+
+        /*
+         * Start this operand's slots out as NULL, so the cleanup loop
+         * at 'fail' can safely examine every slot up to and including
+         * the operand being processed when the error occurred.
+         */
+        readtransferfn[iiter] = NULL;
+        readtransferdata[iiter] = NULL;
+        writetransferfn[iiter] = NULL;
+        writetransferdata[iiter] = NULL;
+
+        /*
+         * If we have determined that a buffer may be needed,
+         * allocate the appropriate transfer functions
+         */
+        if (!(flags&NPY_OP_ITFLAG_BUFNEVER)) {
+            if (flags&NPY_OP_ITFLAG_READ) {
+                int move_references = 0;
+                /* Converts from the operand's dtype into the buffer */
+                if (PyArray_GetDTypeTransferFunction(
+                                    (flags&NPY_OP_ITFLAG_ALIGNED) != 0,
+                                    op_stride,
+                                    op_dtype[iiter]->elsize,
+                                    PyArray_DESCR(op[iiter]),
+                                    op_dtype[iiter],
+                                    move_references,
+                                    &stransfer,
+                                    &transferdata,
+                                    &needs_api) != NPY_SUCCEED) {
+                    goto fail;
+                }
+                readtransferfn[iiter] = stransfer;
+                readtransferdata[iiter] = transferdata;
+            }
+            if (flags&NPY_OP_ITFLAG_WRITE) {
+                int move_references = 1;
+                /* Converts from the buffer back into the operand's dtype */
+                if (PyArray_GetDTypeTransferFunction(
+                                    (flags&NPY_OP_ITFLAG_ALIGNED) != 0,
+                                    op_dtype[iiter]->elsize,
+                                    op_stride,
+                                    op_dtype[iiter],
+                                    PyArray_DESCR(op[iiter]),
+                                    move_references,
+                                    &stransfer,
+                                    &transferdata,
+                                    &needs_api) != NPY_SUCCEED) {
+                    goto fail;
+                }
+                writetransferfn[iiter] = stransfer;
+                writetransferdata[iiter] = transferdata;
+            }
+            /* If no write back but there are references make a decref fn */
+            else if (PyDataType_REFCHK(op_dtype[iiter])) {
+                /*
+                 * By passing NULL to dst_type and setting move_references
+                 * to 1, we get back a function that just decrements the
+                 * src references.
+                 */
+                if (PyArray_GetDTypeTransferFunction(
+                                    (flags&NPY_OP_ITFLAG_ALIGNED) != 0,
+                                    op_dtype[iiter]->elsize, 0,
+                                    op_dtype[iiter], NULL,
+                                    1,
+                                    &stransfer,
+                                    &transferdata,
+                                    &needs_api) != NPY_SUCCEED) {
+                    goto fail;
+                }
+                writetransferfn[iiter] = stransfer;
+                writetransferdata[iiter] = transferdata;
+            }
+        }
+    }
+
+    /* If any of the dtype transfer functions needed the API, flag it */
+    if (needs_api) {
+        NIT_ITFLAGS(iter) |= NPY_ITFLAG_NEEDSAPI;
+    }
+
+    return 1;
+
+fail:
+    /*
+     * Free everything allocated so far. Indexing must use the loop
+     * variable 'i', not 'iiter' (the previous version freed the same
+     * slot repeatedly and leaked the rest). The range includes 'iiter'
+     * itself, so read-transfer data already attached to the failing
+     * operand is released too.
+     */
+    for (i = 0; i <= iiter; ++i) {
+        if (readtransferdata[i] != NULL) {
+            PyArray_FreeStridedTransferData(readtransferdata[i]);
+            readtransferdata[i] = NULL;
+        }
+        if (writetransferdata[i] != NULL) {
+            PyArray_FreeStridedTransferData(writetransferdata[i]);
+            writetransferdata[i] = NULL;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Allocates a data buffer of BUFFERSIZE elements for every operand
+ * that may need buffering (any operand whose BUFNEVER flag is not set).
+ * Returns 1 on success, 0 on failure.
+ *
+ * If errmsg is non-NULL, it should point to a variable which will
+ * receive the error message, and no Python exception will be set.
+ * This is so that the function can be called from code not holding
+ * the GIL.
+ */
+static int
+npyiter_allocate_buffers(NpyIter *iter, char **errmsg)
+{
+    /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/
+    /*npy_intp ndim = NIT_NDIM(iter);*/
+    npy_intp iiter = 0, niter = NIT_NITER(iter);
+
+    npy_intp i;
+    char *op_itflags = NIT_OPITFLAGS(iter);
+    NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+    PyArray_Descr **op_dtype = NIT_DTYPES(iter);
+    npy_intp buffersize = NBF_BUFFERSIZE(bufferdata);
+    char *buffer, **buffers = NBF_BUFFERS(bufferdata);
+
+    for (iiter = 0; iiter < niter; ++iiter) {
+        char flags = op_itflags[iiter];
+
+        /*
+         * If we have determined that a buffer may be needed,
+         * allocate one.
+         */
+        if (!(flags&NPY_OP_ITFLAG_BUFNEVER)) {
+            /* Each buffer holds 'buffersize' elements of the op's dtype */
+            npy_intp itemsize = op_dtype[iiter]->elsize;
+            buffer = PyArray_malloc(itemsize*buffersize);
+            if (buffer == NULL) {
+                if (errmsg == NULL) {
+                    PyErr_NoMemory();
+                }
+                else {
+                    /* Caller may not hold the GIL: report via errmsg only */
+                    *errmsg = "out of memory";
+                }
+                goto fail;
+            }
+            buffers[iiter] = buffer;
+        }
+    }
+
+    return 1;
+
+fail:
+    /*
+     * Free only the buffers allocated before the failure (i < iiter).
+     * NOTE(review): BUFNEVER slots are skipped above, so the NULL check
+     * presumably relies on the buffers array being zero-initialized at
+     * iterator construction -- confirm against the allocation code.
+     */
+    for (i = 0; i < iiter; ++i) {
+        if (buffers[i] != NULL) {
+            PyArray_free(buffers[i]);
+            buffers[i] = NULL;
+        }
+    }
+    return 0;
+}
+
+/*
+ * This sets the AXISDATA portion of the iterator to the specified
+ * iterindex, updating the pointers as well. This function does
+ * no error checking.
+ */
+static void
+npyiter_goto_iterindex(NpyIter *iter, npy_intp iterindex)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp niter = NIT_NITER(iter);
+
+    char **dataptr;
+    NpyIter_AxisData *axisdata;
+    /*
+     * NOTE: sizeof_axisdata appears unused, but is presumably read
+     * implicitly by the NIT_ADVANCE_AXISDATA macro -- do not remove
+     * without checking the macro definition.
+     */
+    npy_intp sizeof_axisdata;
+    npy_intp istrides, nstrides, i, shape;
+
+    axisdata = NIT_AXISDATA(iter);
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+    nstrides = NAD_NSTRIDES();
+
+    NIT_ITERINDEX(iter) = iterindex;
+
+    if (iterindex == 0) {
+        /*
+         * Fast path for a full reset: every coordinate becomes zero and
+         * every per-axis pointer is the operand's reset data pointer.
+         */
+        dataptr = NIT_RESETDATAPTR(iter);
+
+        for (idim = 0; idim < ndim; ++idim) {
+            char **ptrs;
+            NAD_COORD(axisdata) = 0;
+            ptrs = NAD_PTRS(axisdata);
+            for (istrides = 0; istrides < nstrides; ++istrides) {
+                ptrs[istrides] = dataptr[istrides];
+            }
+
+            NIT_ADVANCE_AXISDATA(axisdata, 1);
+        }
+    }
+    else {
+        /*
+         * Set the coordinates, from the fastest-changing to the
+         * slowest-changing.
+         */
+        axisdata = NIT_AXISDATA(iter);
+        shape = NAD_SHAPE(axisdata);
+        i = iterindex;
+        iterindex /= shape;
+        /* i - iterindex*shape == i % shape, reusing the quotient above */
+        NAD_COORD(axisdata) = i - iterindex * shape;
+        for (idim = 0; idim < ndim-1; ++idim) {
+            NIT_ADVANCE_AXISDATA(axisdata, 1);
+
+            shape = NAD_SHAPE(axisdata);
+            i = iterindex;
+            iterindex /= shape;
+            NAD_COORD(axisdata) = i - iterindex * shape;
+        }
+
+        dataptr = NIT_RESETDATAPTR(iter);
+
+        /*
+         * Accumulate the successive pointers with their
+         * offsets in the opposite order, starting from the
+         * original data pointers.
+         */
+        for (idim = 0; idim < ndim; ++idim) {
+            npy_intp *strides;
+            char **ptrs;
+
+            strides = NAD_STRIDES(axisdata);
+            ptrs = NAD_PTRS(axisdata);
+
+            i = NAD_COORD(axisdata);
+
+            for (istrides = 0; istrides < nstrides; ++istrides) {
+                ptrs[istrides] = dataptr[istrides] + i*strides[istrides];
+            }
+
+            /* The next axis in the walk builds on this axis's pointers */
+            dataptr = ptrs;
+
+            NIT_ADVANCE_AXISDATA(axisdata, -1);
+        }
+    }
+}
+
+/*
+ * This gets called after the buffers have been exhausted, and
+ * their data needs to be written back to the arrays. The coordinates
+ * must be positioned for the beginning of the buffer.
+ */
+static void
+npyiter_copy_from_buffers(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp ndim = NIT_NDIM(iter);
+    npy_intp iiter, niter = NIT_NITER(iter);
+
+    char *op_itflags = NIT_OPITFLAGS(iter);
+    NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+    NpyIter_AxisData *axisdata = NIT_AXISDATA(iter),
+                    *reduce_outeraxisdata = NULL;
+
+    PyArray_Descr **dtypes = NIT_DTYPES(iter);
+    npy_intp transfersize = NBF_SIZE(bufferdata),
+                buffersize = NBF_BUFFERSIZE(bufferdata);
+    npy_intp *strides = NBF_STRIDES(bufferdata),
+             *ad_strides = NAD_STRIDES(axisdata);
+    /*
+     * NOTE: sizeof_axisdata appears unused here, but may be read
+     * implicitly by the NIT_INDEX_AXISDATA macro -- verify before
+     * removing.
+     */
+    npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+    char **ptrs = NBF_PTRS(bufferdata), **ad_ptrs = NAD_PTRS(axisdata);
+    char **buffers = NBF_BUFFERS(bufferdata);
+    char *buffer;
+
+    npy_intp reduce_outerdim = 0;
+    npy_intp *reduce_outerstrides = NULL;
+    char **reduce_outerptrs = NULL;
+
+    PyArray_StridedTransferFn *stransfer = NULL;
+    void *transferdata = NULL;
+
+    /* AXISDATA entry size in units of npy_intp, for the NDim transfers */
+    npy_intp axisdata_incr = NIT_AXISDATA_SIZEOF(itflags, ndim, niter) /
+                                    NPY_SIZEOF_INTP;
+
+    /* If we're past the end, nothing to copy */
+    if (NBF_SIZE(bufferdata) == 0) {
+        return;
+    }
+
+    NPY_IT_DBG_PRINTF("Iterator: Copying buffers to outputs\n");
+
+    if (itflags&NPY_ITFLAG_REDUCE) {
+        /* With reductions, the total transfer covers the outer loop too */
+        reduce_outerdim = NBF_REDUCE_OUTERDIM(bufferdata);
+        reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata);
+        reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata);
+        reduce_outeraxisdata = NIT_INDEX_AXISDATA(axisdata, reduce_outerdim);
+        transfersize *= NBF_REDUCE_OUTERSIZE(bufferdata);
+    }
+
+    for (iiter = 0; iiter < niter; ++iiter) {
+        stransfer = NBF_WRITETRANSFERFN(bufferdata)[iiter];
+        transferdata = NBF_WRITETRANSFERDATA(bufferdata)[iiter];
+        buffer = buffers[iiter];
+        /*
+         * Copy the data back to the arrays.  If the type has refs,
+         * this function moves them so the buffer's refs are released.
+         */
+        if ((stransfer != NULL) && (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE)) {
+            /* Copy back only if the pointer was pointing to the buffer */
+            npy_intp delta = (ptrs[iiter] - buffer);
+            /*
+             * NOTE(review): this bound uses buffersize while the decref
+             * branch below uses transfersize -- confirm the asymmetry is
+             * intentional.
+             */
+            if (0 <= delta && delta <= buffersize*dtypes[iiter]->elsize) {
+                npy_intp op_transfersize;
+
+                npy_intp src_stride, *dst_strides, *dst_coords, *dst_shape;
+                npy_intp ndim_transfer;
+
+                NPY_IT_DBG_PRINTF("Iterator: Operand %d was buffered\n",
+                                        (int)iiter);
+
+                /*
+                 * If this operand is being reduced in the inner loop,
+                 * its buffering stride was set to zero, and just
+                 * one element was copied.
+                 */
+                if (op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE) {
+                    if (strides[iiter] == 0) {
+                        if (reduce_outerstrides[iiter] == 0) {
+                            /* Single element: both loop levels reduced */
+                            op_transfersize = 1;
+                            src_stride = 0;
+                            dst_strides = &src_stride;
+                            dst_coords = &NAD_COORD(reduce_outeraxisdata);
+                            dst_shape = &NAD_SHAPE(reduce_outeraxisdata);
+                            ndim_transfer = 1;
+                        }
+                        else {
+                            /* Inner loop reduced, outer loop advances */
+                            op_transfersize = NBF_REDUCE_OUTERSIZE(bufferdata);
+                            src_stride = reduce_outerstrides[iiter];
+                            dst_strides =
+                                    &NAD_STRIDES(reduce_outeraxisdata)[iiter];
+                            dst_coords = &NAD_COORD(reduce_outeraxisdata);
+                            dst_shape = &NAD_SHAPE(reduce_outeraxisdata);
+                            ndim_transfer = ndim - reduce_outerdim;
+                        }
+                    }
+                    else {
+                        if (reduce_outerstrides[iiter] == 0) {
+                            /* Outer loop reduced: copy only the inner part */
+                            op_transfersize = NBF_SIZE(bufferdata);
+                            src_stride = strides[iiter];
+                            dst_strides = &ad_strides[iiter];
+                            dst_coords = &NAD_COORD(axisdata);
+                            dst_shape = &NAD_SHAPE(axisdata);
+                            ndim_transfer = reduce_outerdim ?
+                                                reduce_outerdim : 1;
+                        }
+                        else {
+                            /* No reduction on this operand's strides */
+                            op_transfersize = transfersize;
+                            src_stride = strides[iiter];
+                            dst_strides = &ad_strides[iiter];
+                            dst_coords = &NAD_COORD(axisdata);
+                            dst_shape = &NAD_SHAPE(axisdata);
+                            ndim_transfer = ndim;
+                        }
+                    }
+                }
+                else {
+                    /* Non-reduce operand: copy the whole transfer */
+                    op_transfersize = transfersize;
+                    src_stride = strides[iiter];
+                    dst_strides = &ad_strides[iiter];
+                    dst_coords = &NAD_COORD(axisdata);
+                    dst_shape = &NAD_SHAPE(axisdata);
+                    ndim_transfer = ndim;
+                }
+
+                NPY_IT_DBG_PRINTF("Iterator: Copying buffer to "
+                                    "operand %d (%d items)\n",
+                                    (int)iiter, (int)op_transfersize);
+
+                PyArray_TransferStridedToNDim(ndim_transfer,
+                        ad_ptrs[iiter], dst_strides, axisdata_incr,
+                        buffer, src_stride,
+                        dst_coords, axisdata_incr,
+                        dst_shape, axisdata_incr,
+                        op_transfersize, dtypes[iiter]->elsize,
+                        stransfer,
+                        transferdata);
+            }
+        }
+        /* If there's no copy back, we may have to decrement refs.  In
+         * this case, the transfer function has a 'decsrcref' transfer
+         * function, so we can use it to do the decrement.
+         */
+        else if (stransfer != NULL) {
+            /* Decrement refs only if the pointer was pointing to the buffer */
+            npy_intp delta = (ptrs[iiter] - buffer);
+            if (0 <= delta && delta <= transfersize*dtypes[iiter]->elsize) {
+                NPY_IT_DBG_PRINTF("Iterator: Freeing refs and zeroing buffer "
+                                    "of operand %d\n", (int)iiter);
+                /* Decrement refs */
+                stransfer(NULL, 0, buffer, dtypes[iiter]->elsize,
+                            transfersize, dtypes[iiter]->elsize,
+                            transferdata);
+                /*
+                 * Zero out the memory for safety.  For instance,
+                 * if during iteration some Python code copied an
+                 * array pointing into the buffer, it will get None
+                 * values for its references after this.
+                 */
+                memset(buffer, 0, dtypes[iiter]->elsize*transfersize);
+            }
+        }
+    }
+
+    NPY_IT_DBG_PRINTF("Iterator: Finished copying buffers to outputs\n");
+}
+
+/*
+ * This gets called after the iterator has been positioned to coordinates
+ * for the start of a buffer.  It decides which operands need a buffer,
+ * and copies the data into the buffers.
+ */
+static void
+npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp ndim = NIT_NDIM(iter);
+    npy_intp iiter, niter = NIT_NITER(iter);
+
+    char *op_itflags = NIT_OPITFLAGS(iter);
+    NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+    NpyIter_AxisData *axisdata = NIT_AXISDATA(iter),
+                    *reduce_outeraxisdata = NULL;
+
+    PyArray_Descr **dtypes = NIT_DTYPES(iter);
+    PyArrayObject **operands = NIT_OPERANDS(iter);
+    npy_intp *strides = NBF_STRIDES(bufferdata),
+             *ad_strides = NAD_STRIDES(axisdata);
+    /*
+     * NOTE: sizeof_axisdata appears unused here, but may be read
+     * implicitly by the NIT_INDEX_AXISDATA macro -- verify before
+     * removing.
+     */
+    npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+    char **ptrs = NBF_PTRS(bufferdata), **ad_ptrs = NAD_PTRS(axisdata);
+    char **buffers = NBF_BUFFERS(bufferdata);
+    npy_intp iterindex, iterend, transfersize,
+            singlestridesize, reduce_innersize = 0, reduce_outerdim = 0;
+    int is_onestride = 0, any_buffered = 0;
+
+    npy_intp *reduce_outerstrides = NULL;
+    char **reduce_outerptrs = NULL;
+
+    PyArray_StridedTransferFn *stransfer = NULL;
+    void *transferdata = NULL;
+
+    /*
+     * Have to get this flag before npyiter_checkreducesize sets
+     * it for the next iteration.
+     */
+    npy_bool reuse_reduce_loops = (prev_dataptrs != NULL) &&
+                    ((itflags&NPY_ITFLAG_REUSE_REDUCE_LOOPS) != 0);
+
+    /* AXISDATA entry size in units of npy_intp, for the NDim transfers */
+    npy_intp axisdata_incr = NIT_AXISDATA_SIZEOF(itflags, ndim, niter) /
+                                    NPY_SIZEOF_INTP;
+
+    NPY_IT_DBG_PRINTF("Iterator: Copying inputs to buffers\n");
+
+    /* Calculate the size if using any buffers */
+    iterindex = NIT_ITERINDEX(iter);
+    iterend = NIT_ITEREND(iter);
+    transfersize = NBF_BUFFERSIZE(bufferdata);
+    if (transfersize > iterend - iterindex) {
+        transfersize = iterend - iterindex;
+    }
+
+    /* If last time around, the reduce loop structure was full, we reuse it */
+    if (reuse_reduce_loops) {
+        reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata);
+        reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata);
+        reduce_outerdim = NBF_REDUCE_OUTERDIM(bufferdata);
+        reduce_outeraxisdata = NIT_INDEX_AXISDATA(axisdata, reduce_outerdim);
+        reduce_innersize = NBF_SIZE(bufferdata);
+        NBF_REDUCE_POS(bufferdata) = 0;
+        transfersize = NBF_REDUCE_OUTERSIZE(bufferdata)*reduce_innersize;
+        NBF_BUFITEREND(bufferdata) = iterindex + reduce_innersize;
+
+        NPY_IT_DBG_PRINTF("Reused reduce transfersize: %d innersize: %d "
+                    "itersize: %d\n",
+                    (int)transfersize,
+                    (int)reduce_innersize,
+                    (int)NpyIter_GetIterSize(iter));
+    }
+    /*
+     * If there are any reduction operands, we may have to make
+     * the size smaller so we don't copy the same value into
+     * a buffer twice, as the buffering does not have a mechanism
+     * to combine values itself.
+     */
+    else if (itflags&NPY_ITFLAG_REDUCE) {
+        NPY_IT_DBG_PRINTF("Iterator: Calculating reduce loops\n");
+        transfersize = npyiter_checkreducesize(iter, transfersize,
+                                                &reduce_innersize,
+                                                &reduce_outerdim);
+        NPY_IT_DBG_PRINTF("Reduce transfersize: %d innersize: %d "
+                    "itersize: %d\n",
+                    (int)transfersize,
+                    (int)reduce_innersize,
+                    (int)NpyIter_GetIterSize(iter));
+
+        reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata);
+        reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata);
+        reduce_outeraxisdata = NIT_INDEX_AXISDATA(axisdata, reduce_outerdim);
+        NBF_SIZE(bufferdata) = reduce_innersize;
+        NBF_REDUCE_POS(bufferdata) = 0;
+        NBF_REDUCE_OUTERSIZE(bufferdata) = transfersize/reduce_innersize;
+        NBF_REDUCE_OUTERDIM(bufferdata) = reduce_outerdim;
+        NBF_BUFITEREND(bufferdata) = iterindex + reduce_innersize;
+    }
+    else {
+        NBF_SIZE(bufferdata) = transfersize;
+        NBF_BUFITEREND(bufferdata) = iterindex + transfersize;
+    }
+
+    /* Calculate the maximum size if using a single stride and no buffers */
+    singlestridesize = NAD_SHAPE(axisdata)-NAD_COORD(axisdata);
+    if (singlestridesize > iterend - iterindex) {
+        singlestridesize = iterend - iterindex;
+    }
+    if (singlestridesize >= transfersize) {
+        is_onestride = 1;
+    }
+
+    for (iiter = 0; iiter < niter; ++iiter) {
+        /*
+         * If the buffer is write-only, these two are NULL, and the buffer
+         * pointers will be set up but the read copy won't be done
+         */
+        stransfer = NBF_READTRANSFERFN(bufferdata)[iiter];
+        transferdata = NBF_READTRANSFERDATA(bufferdata)[iiter];
+        /*
+         * Dispatch on the combination of BUFNEVER/CAST/REDUCE flags to
+         * decide whether this operand iterates directly over the array
+         * data or through its buffer.  Setting stransfer to NULL in a
+         * case means "no copy needed" for this operand.
+         */
+        switch (op_itflags[iiter]&
+                        (NPY_OP_ITFLAG_BUFNEVER|
+                         NPY_OP_ITFLAG_CAST|
+                         NPY_OP_ITFLAG_REDUCE)) {
+            /* Never need to buffer this operand */
+            case NPY_OP_ITFLAG_BUFNEVER:
+                ptrs[iiter] = ad_ptrs[iiter];
+                if (itflags&NPY_ITFLAG_REDUCE) {
+                    reduce_outerstrides[iiter] = reduce_innersize *
+                                                 strides[iiter];
+                    reduce_outerptrs[iiter] = ptrs[iiter];
+                }
+                /*
+                 * Should not adjust the stride - ad_strides[iiter]
+                 * could be zero, but strides[iiter] was initialized
+                 * to the first non-trivial stride.
+                 */
+                stransfer = NULL;
+                break;
+            /* Never need to buffer this operand */
+            case NPY_OP_ITFLAG_BUFNEVER|NPY_OP_ITFLAG_REDUCE:
+                ptrs[iiter] = ad_ptrs[iiter];
+                reduce_outerptrs[iiter] = ptrs[iiter];
+                reduce_outerstrides[iiter] = 0;
+                /*
+                 * Should not adjust the stride - ad_strides[iiter]
+                 * could be zero, but strides[iiter] was initialized
+                 * to the first non-trivial stride.
+                 */
+                stransfer = NULL;
+                break;
+            /* Just a copy */
+            case 0:
+                /*
+                 * No copyswap or cast was requested, so all we're
+                 * doing is copying the data to fill the buffer and
+                 * produce a single stride.  If the underlying data
+                 * already does that, no need to copy it.
+                 */
+                if (is_onestride) {
+                    ptrs[iiter] = ad_ptrs[iiter];
+                    strides[iiter] = ad_strides[iiter];
+                    stransfer = NULL;
+                }
+                /* If some other op is reduced, we have a double reduce loop */
+                else if ((itflags&NPY_ITFLAG_REDUCE) &&
+                                (reduce_outerdim == 1) &&
+                                (transfersize/reduce_innersize <=
+                                            NAD_SHAPE(reduce_outeraxisdata) -
+                                            NAD_COORD(reduce_outeraxisdata))) {
+                    ptrs[iiter] = ad_ptrs[iiter];
+                    reduce_outerptrs[iiter] = ptrs[iiter];
+                    strides[iiter] = ad_strides[iiter];
+                    reduce_outerstrides[iiter] =
+                                    NAD_STRIDES(reduce_outeraxisdata)[iiter];
+                    stransfer = NULL;
+                }
+                else {
+                    /* In this case, the buffer is being used */
+                    ptrs[iiter] = buffers[iiter];
+                    strides[iiter] = dtypes[iiter]->elsize;
+                    if (itflags&NPY_ITFLAG_REDUCE) {
+                        reduce_outerstrides[iiter] = reduce_innersize *
+                                                     strides[iiter];
+                        reduce_outerptrs[iiter] = ptrs[iiter];
+                    }
+                }
+                break;
+            /* Just a copy, but with a reduction */
+            case NPY_OP_ITFLAG_REDUCE:
+                if (ad_strides[iiter] == 0) {
+                    strides[iiter] = 0;
+                    /* It's all in one stride in the inner loop dimension */
+                    if (is_onestride) {
+                        NPY_IT_DBG_PRINTF("reduce op %d all one stride\n", (int)iiter);
+                        ptrs[iiter] = ad_ptrs[iiter];
+                        reduce_outerstrides[iiter] = 0;
+                        stransfer = NULL;
+                    }
+                    /* It's all in one stride in the reduce outer loop */
+                    else if ((reduce_outerdim > 0) &&
+                                    (transfersize/reduce_innersize <=
+                                            NAD_SHAPE(reduce_outeraxisdata) -
+                                            NAD_COORD(reduce_outeraxisdata))) {
+                        NPY_IT_DBG_PRINTF("reduce op %d all one outer stride\n",
+                                            (int)iiter);
+                        ptrs[iiter] = ad_ptrs[iiter];
+                        /* Outer reduce loop advances by one item */
+                        reduce_outerstrides[iiter] =
+                                NAD_STRIDES(reduce_outeraxisdata)[iiter];
+                        stransfer = NULL;
+                    }
+                    /* In this case, the buffer is being used */
+                    else {
+                        NPY_IT_DBG_PRINTF("reduce op %d must buffer\n", (int)iiter);
+                        ptrs[iiter] = buffers[iiter];
+                        /* Both outer and inner reduce loops have stride 0 */
+                        if (NAD_STRIDES(reduce_outeraxisdata)[iiter] == 0) {
+                            reduce_outerstrides[iiter] = 0;
+                        }
+                        /* Outer reduce loop advances by one item */
+                        else {
+                            reduce_outerstrides[iiter] = dtypes[iiter]->elsize;
+                        }
+                    }
+
+                }
+                else if (is_onestride) {
+                    NPY_IT_DBG_PRINTF("reduce op %d all one stride in dim 0\n", (int)iiter);
+                    ptrs[iiter] = ad_ptrs[iiter];
+                    strides[iiter] = ad_strides[iiter];
+                    reduce_outerstrides[iiter] = 0;
+                    stransfer = NULL;
+                }
+                else {
+                    /* It's all in one stride in the reduce outer loop */
+                    if ((reduce_outerdim > 0) &&
+                                    (transfersize/reduce_innersize <=
+                                            NAD_SHAPE(reduce_outeraxisdata) -
+                                            NAD_COORD(reduce_outeraxisdata))) {
+                        ptrs[iiter] = ad_ptrs[iiter];
+                        strides[iiter] = ad_strides[iiter];
+                        /* Outer reduce loop advances by one item */
+                        reduce_outerstrides[iiter] =
+                                NAD_STRIDES(reduce_outeraxisdata)[iiter];
+                        stransfer = NULL;
+                    }
+                    /* In this case, the buffer is being used */
+                    else {
+                        ptrs[iiter] = buffers[iiter];
+                        strides[iiter] = dtypes[iiter]->elsize;
+
+                        if (NAD_STRIDES(reduce_outeraxisdata)[iiter] == 0) {
+                            /* Reduction in outer reduce loop */
+                            reduce_outerstrides[iiter] = 0;
+                        }
+                        else {
+                            /* Advance to next items in outer reduce loop */
+                            reduce_outerstrides[iiter] = reduce_innersize *
+                                                        dtypes[iiter]->elsize;
+                        }
+                    }
+                }
+                reduce_outerptrs[iiter] = ptrs[iiter];
+                break;
+            default:
+                /* In this case, the buffer is being used */
+                if (!(op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE)) {
+                    ptrs[iiter] = buffers[iiter];
+                    strides[iiter] = dtypes[iiter]->elsize;
+                    if (itflags&NPY_ITFLAG_REDUCE) {
+                        reduce_outerstrides[iiter] = reduce_innersize *
+                                                     strides[iiter];
+                        reduce_outerptrs[iiter] = ptrs[iiter];
+                    }
+                }
+                /* The buffer is being used with reduction */
+                else {
+                    ptrs[iiter] = buffers[iiter];
+                    if (ad_strides[iiter] == 0) {
+                        NPY_IT_DBG_PRINTF("cast op %d has innermost stride 0\n", (int)iiter);
+                        strides[iiter] = 0;
+                        /* Both outer and inner reduce loops have stride 0 */
+                        if (NAD_STRIDES(reduce_outeraxisdata)[iiter] == 0) {
+                            NPY_IT_DBG_PRINTF("cast op %d has outermost stride 0\n", (int)iiter);
+                            reduce_outerstrides[iiter] = 0;
+                        }
+                        /* Outer reduce loop advances by one item */
+                        else {
+                            NPY_IT_DBG_PRINTF("cast op %d has outermost stride !=0\n", (int)iiter);
+                            reduce_outerstrides[iiter] = dtypes[iiter]->elsize;
+                        }
+                    }
+                    else {
+                        NPY_IT_DBG_PRINTF("cast op %d has innermost stride !=0\n", (int)iiter);
+                        strides[iiter] = dtypes[iiter]->elsize;
+
+                        if (NAD_STRIDES(reduce_outeraxisdata)[iiter] == 0) {
+                            NPY_IT_DBG_PRINTF("cast op %d has outermost stride 0\n", (int)iiter);
+                            /* Reduction in outer reduce loop */
+                            reduce_outerstrides[iiter] = 0;
+                        }
+                        else {
+                            NPY_IT_DBG_PRINTF("cast op %d has outermost stride !=0\n", (int)iiter);
+                            /* Advance to next items in outer reduce loop */
+                            reduce_outerstrides[iiter] = reduce_innersize *
+                                                        dtypes[iiter]->elsize;
+                        }
+                    }
+                    reduce_outerptrs[iiter] = ptrs[iiter];
+                }
+                break;
+        }
+
+        /* A non-NULL stransfer here means this operand must be copied in */
+        if (stransfer != NULL) {
+            npy_intp src_itemsize = PyArray_DESCR(operands[iiter])->elsize;
+            npy_intp op_transfersize;
+
+            npy_intp dst_stride, *src_strides, *src_coords, *src_shape;
+            npy_intp ndim_transfer;
+
+            npy_bool skip_transfer = 0;
+
+            any_buffered = 1;
+
+            /*
+             * If this operand is being reduced in the inner loop,
+             * set its buffering stride to zero, and just copy
+             * one element.
+             */
+            if (op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE) {
+                if (ad_strides[iiter] == 0) {
+                    strides[iiter] = 0;
+                    if (reduce_outerstrides[iiter] == 0) {
+                        op_transfersize = 1;
+                        dst_stride = 0;
+                        src_strides = &dst_stride;
+                        src_coords = &NAD_COORD(reduce_outeraxisdata);
+                        src_shape = &NAD_SHAPE(reduce_outeraxisdata);
+                        ndim_transfer = 1;
+
+                        /*
+                         * When we're reducing a single element, and
+                         * it's still the same element, don't overwrite
+                         * it even when reuse reduce loops is unset.
+                         * This preserves the precision of the
+                         * intermediate calculation.
+                         */
+                        if (prev_dataptrs &&
+                                    prev_dataptrs[iiter] == ad_ptrs[iiter]) {
+                            NPY_IT_DBG_PRINTF("Iterator: skipping operand %d"
+                                    " copy because it's a 1-element reduce\n",
+                                    (int)iiter);
+
+                            skip_transfer = 1;
+                        }
+                    }
+                    else {
+                        op_transfersize = NBF_REDUCE_OUTERSIZE(bufferdata);
+                        dst_stride = reduce_outerstrides[iiter];
+                        src_strides = &NAD_STRIDES(reduce_outeraxisdata)[iiter];
+                        src_coords = &NAD_COORD(reduce_outeraxisdata);
+                        src_shape = &NAD_SHAPE(reduce_outeraxisdata);
+                        ndim_transfer = ndim - reduce_outerdim;
+                    }
+                }
+                else {
+                    if (reduce_outerstrides[iiter] == 0) {
+                        op_transfersize = NBF_SIZE(bufferdata);
+                        dst_stride = strides[iiter];
+                        src_strides = &ad_strides[iiter];
+                        src_coords = &NAD_COORD(axisdata);
+                        src_shape = &NAD_SHAPE(axisdata);
+                        ndim_transfer = reduce_outerdim ? reduce_outerdim : 1;
+                    }
+                    else {
+                        op_transfersize = transfersize;
+                        dst_stride = strides[iiter];
+                        src_strides = &ad_strides[iiter];
+                        src_coords = &NAD_COORD(axisdata);
+                        src_shape = &NAD_SHAPE(axisdata);
+                        ndim_transfer = ndim;
+                    }
+                }
+            }
+            else {
+                op_transfersize = transfersize;
+                dst_stride = strides[iiter];
+                src_strides = &ad_strides[iiter];
+                src_coords = &NAD_COORD(axisdata);
+                src_shape = &NAD_SHAPE(axisdata);
+                ndim_transfer = ndim;
+            }
+
+            /*
+             * If the whole buffered loop structure remains the same,
+             * and the source pointer for this data didn't change,
+             * we don't have to copy the data again.
+             */
+            if (reuse_reduce_loops && prev_dataptrs[iiter] == ad_ptrs[iiter]) {
+                NPY_IT_DBG_PRINTF("Iterator: skipping operands %d "
+                        "copy (%d items) because loops are reused and the data "
+                        "pointer didn't change\n",
+                        (int)iiter, (int)op_transfersize);
+                skip_transfer = 1;
+            }
+
+            /* If the data type requires zero-initialization */
+            if (PyDataType_FLAGCHK(dtypes[iiter], NPY_NEEDS_INIT)) {
+                NPY_IT_DBG_PRINTF("Iterator: Buffer requires init, "
+                                    "memsetting to 0\n");
+                memset(ptrs[iiter], 0, dtypes[iiter]->elsize*op_transfersize);
+                /* Can't skip the transfer in this case */
+                skip_transfer = 0;
+            }
+
+            if (!skip_transfer) {
+                NPY_IT_DBG_PRINTF("Iterator: Copying operand %d to "
+                                "buffer (%d items)\n",
+                                (int)iiter, (int)op_transfersize);
+
+                PyArray_TransferNDimToStrided(ndim_transfer,
+                        ptrs[iiter], dst_stride,
+                        ad_ptrs[iiter], src_strides, axisdata_incr,
+                        src_coords, axisdata_incr,
+                        src_shape, axisdata_incr,
+                        op_transfersize, src_itemsize,
+                        stransfer,
+                        transferdata);
+            }
+        }
+        else if (ptrs[iiter] == buffers[iiter]) {
+            /* If the data type requires zero-initialization */
+            if (PyDataType_FLAGCHK(dtypes[iiter], NPY_NEEDS_INIT)) {
+                NPY_IT_DBG_PRINTF("Iterator: Write-only buffer for "
+                                    "operand %d requires init, "
+                                    "memsetting to 0\n", (int)iiter);
+                memset(ptrs[iiter], 0, dtypes[iiter]->elsize*transfersize);
+            }
+        }
+
+    }
+
+    /*
+     * If buffering wasn't needed, we can grow the inner
+     * loop to as large as possible.
+     *
+     * TODO: Could grow REDUCE loop too with some more logic above.
+     */
+    if (!any_buffered && (itflags&NPY_ITFLAG_GROWINNER) &&
+                        !(itflags&NPY_ITFLAG_REDUCE)) {
+        if (singlestridesize > transfersize) {
+            NPY_IT_DBG_PRINTF("Iterator: Expanding inner loop size "
+                    "from %d to %d since buffering wasn't needed\n",
+                    (int)NBF_SIZE(bufferdata), (int)singlestridesize);
+            NBF_SIZE(bufferdata) = singlestridesize;
+            NBF_BUFITEREND(bufferdata) = iterindex + singlestridesize;
+        }
+    }
+
+    NPY_IT_DBG_PRINTF("Any buffering needed: %d\n", any_buffered);
+
+    NPY_IT_DBG_PRINTF("Iterator: Finished copying inputs to buffers "
+                        "(buffered size is %d)\n", (int)NBF_SIZE(bufferdata));
+}
+
+/*
+ * This checks how much space can be buffered without encountering the
+ * same value twice, or for operands whose innermost stride is zero,
+ * without encountering a different value.  By reducing the buffered
+ * amount to this size, reductions can be safely buffered.
+ *
+ * Reductions are buffered with two levels of looping, to avoid
+ * frequent copying to the buffers.  The return value is the over-all
+ * buffer size, and when the flag NPY_ITFLAG_REDUCE is set, reduce_innersize
+ * receives the size of the inner of the two levels of looping.
+ *
+ * The value placed in reduce_outerdim is the index into the AXISDATA
+ * for where the second level of the double loop begins.
+ *
+ * The return value is always a multiple of the value placed in
+ * reduce_innersize.
+ */
+static npy_intp
+npyiter_checkreducesize(NpyIter *iter, npy_intp count,
+                        npy_intp *reduce_innersize,
+                        npy_intp *reduce_outerdim)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    npy_intp idim, ndim = NIT_NDIM(iter);
+    npy_intp iiter, niter = NIT_NITER(iter);
+
+    NpyIter_AxisData *axisdata;
+    /*
+     * NOTE: sizeof_axisdata appears unused, but is presumably read
+     * implicitly by the NIT_ADVANCE_AXISDATA macro -- do not remove
+     * without checking the macro definition.
+     */
+    npy_intp sizeof_axisdata;
+    npy_intp coord, shape, *strides;
+    /* reducespace: elements visitable before a value repeats/changes */
+    npy_intp reducespace = 1, factor;
+    npy_bool nonzerocoord = 0;
+
+    char *op_itflags = NIT_OPITFLAGS(iter);
+    /* Per-operand flag: REDUCE operand with stride 0 at current level */
+    char stride0op[NPY_MAXARGS];
+
+    /* Default to no outer axis */
+    *reduce_outerdim = 0;
+
+    /* If there's only one dimension, no need to calculate anything */
+    if (ndim == 1) {
+        *reduce_innersize = count;
+        return count;
+    }
+
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+    axisdata = NIT_AXISDATA(iter);
+
+    /* Indicate which REDUCE operands have stride 0 in the inner loop */
+    strides = NAD_STRIDES(axisdata);
+    for (iiter = 0; iiter < niter; ++iiter) {
+        stride0op[iiter] = (op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE) &&
+                           (strides[iiter] == 0);
+    }
+    shape = NAD_SHAPE(axisdata);
+    coord = NAD_COORD(axisdata);
+    reducespace += (shape-coord-1);
+    factor = shape;
+    NIT_ADVANCE_AXISDATA(axisdata, 1);
+
+    /* Go forward through axisdata, calculating the space available */
+    for (idim = 1; idim < ndim && reducespace < count;
+                                    ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        strides = NAD_STRIDES(axisdata);
+        for (iiter = 0; iiter < niter; ++iiter) {
+            /*
+             * If a reduce stride switched from zero to non-zero, or
+             * vice versa, that's the point where the data will stop
+             * being the same element or will repeat, and if the
+             * buffer starts with all zero coordinates up to this
+             * point, gives us the reduce_innersize.
+             */
+            if((stride0op[iiter] && (strides[iiter] != 0)) ||
+                        (!stride0op[iiter] &&
+                                    (strides[iiter] == 0) &&
+                                    (op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE))) {
+                NPY_IT_DBG_PRINTF("Iterator: Reduce operation limits "
+                                    "buffer to %d\n", (int)reducespace);
+                /*
+                 * If we already found more elements than count, or
+                 * the starting coordinate wasn't zero, the two-level
+                 * looping is unnecessary/can't be done, so return.
+                 */
+                if (count <= reducespace) {
+                    *reduce_innersize = count;
+                    return count;
+                }
+                else if (nonzerocoord) {
+                    if (reducespace < count) {
+                        count = reducespace;
+                    }
+                    *reduce_innersize = count;
+                    return count;
+                }
+                else {
+                    /* Inner level found; go set up the outer level */
+                    *reduce_innersize = reducespace;
+                    break;
+                }
+            }
+        }
+        /* If we broke out of the loop early, we found reduce_innersize */
+        if (iiter != niter) {
+            break;
+        }
+
+        shape = NAD_SHAPE(axisdata);
+        coord = NAD_COORD(axisdata);
+        if (coord != 0) {
+            nonzerocoord = 1;
+        }
+        reducespace += (shape-coord-1) * factor;
+        factor *= shape;
+    }
+
+    /* If there was any non-zero coordinate, can't do the double loop */
+    if (nonzerocoord) {
+        if (reducespace < count) {
+            count = reducespace;
+        }
+        *reduce_innersize = count;
+        /* In this case, we can't reuse the reduce loops */
+        NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_REUSE_REDUCE_LOOPS;
+        return count;
+    }
+
+    /* In this case, we can reuse the reduce loops */
+    NIT_ITFLAGS(iter) |= NPY_ITFLAG_REUSE_REDUCE_LOOPS;
+
+    *reduce_innersize = reducespace;
+    /* 'count' now counts outer-loop iterations, not single elements */
+    count /= reducespace;
+
+    /*
+     * Continue through the rest of the dimensions.  If there are
+     * two separated reduction axes, we may have to cut the buffer
+     * short again.
+     */
+    *reduce_outerdim = idim;
+    reducespace = 1;
+    factor = 1;
+    /* Indicate which REDUCE operands have stride 0 at the current level */
+    strides = NAD_STRIDES(axisdata);
+    for (iiter = 0; iiter < niter; ++iiter) {
+        stride0op[iiter] = (op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE) &&
+                           (strides[iiter] == 0);
+    }
+    shape = NAD_SHAPE(axisdata);
+    coord = NAD_COORD(axisdata);
+    reducespace += (shape-coord-1) * factor;
+    factor *= shape;
+    NIT_ADVANCE_AXISDATA(axisdata, 1);
+    ++idim;
+
+    for (; idim < ndim && reducespace < count;
+                                    ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        strides = NAD_STRIDES(axisdata);
+        for (iiter = 0; iiter < niter; ++iiter) {
+            /*
+             * If a reduce stride switched from zero to non-zero, or
+             * vice versa, that's the point where the data will stop
+             * being the same element or will repeat, and if the
+             * buffer starts with all zero coordinates up to this
+             * point, gives us the reduce_innersize.
+             */
+            if((stride0op[iiter] && (strides[iiter] != 0)) ||
+                        (!stride0op[iiter] &&
+                                    (strides[iiter] == 0) &&
+                                    (op_itflags[iiter]&NPY_OP_ITFLAG_REDUCE))) {
+                NPY_IT_DBG_PRINTF("Iterator: Reduce operation limits "
+                                    "buffer to %d\n", (int)reducespace);
+                /*
+                 * This terminates the outer level of our double loop.
+                 */
+                if (count <= reducespace) {
+                    return count * (*reduce_innersize);
+                }
+                else {
+                    return reducespace * (*reduce_innersize);
+                }
+            }
+        }
+
+        shape = NAD_SHAPE(axisdata);
+        coord = NAD_COORD(axisdata);
+        if (coord != 0) {
+            nonzerocoord = 1;
+        }
+        reducespace += (shape-coord-1) * factor;
+        factor *= shape;
+    }
+
+    if (reducespace < count) {
+        count = reducespace;
+    }
+    /* Return value is (outer iterations) * (inner size) */
+    return count * (*reduce_innersize);
+}
+
+
+
+/*NUMPY_API
+ * For debugging
+ */
+NPY_NO_EXPORT void
+NpyIter_DebugPrint(NpyIter *iter)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ npy_intp idim, ndim = NIT_NDIM(iter);
+ npy_intp iiter, niter = NIT_NITER(iter);
+
+ NpyIter_AxisData *axisdata;
+ npy_intp sizeof_axisdata;
+
+ PyGILState_STATE gilstate = PyGILState_Ensure();
+
+ printf("\n------ BEGIN ITERATOR DUMP ------\n");
+ printf("| Iterator Address: %p\n", iter);
+ printf("| ItFlags: ");
+ if (itflags&NPY_ITFLAG_IDENTPERM)
+ printf("IDENTPERM ");
+ if (itflags&NPY_ITFLAG_NEGPERM)
+ printf("NEGPERM ");
+ if (itflags&NPY_ITFLAG_HASINDEX)
+ printf("HASINDEX ");
+ if (itflags&NPY_ITFLAG_HASCOORDS)
+ printf("HASCOORDS ");
+ if (itflags&NPY_ITFLAG_FORCEDORDER)
+ printf("FORCEDORDER ");
+ if (itflags&NPY_ITFLAG_NOINNER)
+ printf("NOINNER ");
+ if (itflags&NPY_ITFLAG_RANGE)
+ printf("RANGE ");
+ if (itflags&NPY_ITFLAG_BUFFER)
+ printf("BUFFER ");
+ if (itflags&NPY_ITFLAG_GROWINNER)
+ printf("GROWINNER ");
+ if (itflags&NPY_ITFLAG_ONEITERATION)
+ printf("ONEITERATION ");
+ if (itflags&NPY_ITFLAG_DELAYBUF)
+ printf("DELAYBUF ");
+ if (itflags&NPY_ITFLAG_NEEDSAPI)
+ printf("NEEDSAPI ");
+ if (itflags&NPY_ITFLAG_REDUCE)
+ printf("REDUCE ");
+ if (itflags&NPY_ITFLAG_REUSE_REDUCE_LOOPS)
+ printf("REUSE_REDUCE_LOOPS ");
+ printf("\n");
+ printf("| NDim: %d\n", (int)ndim);
+ printf("| NIter: %d\n", (int)niter);
+ printf("| IterSize: %d\n", (int)NIT_ITERSIZE(iter));
+ printf("| IterStart: %d\n", (int)NIT_ITERSTART(iter));
+ printf("| IterEnd: %d\n", (int)NIT_ITEREND(iter));
+ printf("| IterIndex: %d\n", (int)NIT_ITERINDEX(iter));
+ printf("| Iterator SizeOf: %d\n",
+ (int)NIT_SIZEOF_ITERATOR(itflags, ndim, niter));
+ printf("| BufferData SizeOf: %d\n",
+ (int)NIT_BUFFERDATA_SIZEOF(itflags, ndim, niter));
+ printf("| AxisData SizeOf: %d\n",
+ (int)NIT_AXISDATA_SIZEOF(itflags, ndim, niter));
+ printf("|\n");
+
+ printf("| Perm: ");
+ for (idim = 0; idim < ndim; ++idim) {
+ printf("%d ", (int)NIT_PERM(iter)[idim]);
+ }
+ printf("\n");
+ printf("| DTypes: ");
+ for (iiter = 0; iiter < niter; ++iiter) {
+ printf("%p ", NIT_DTYPES(iter)[iiter]);
+ }
+ printf("\n");
+ printf("| DTypes: ");
+ for (iiter = 0; iiter < niter; ++iiter) {
+ if (NIT_DTYPES(iter)[iiter] != NULL)
+ PyObject_Print((PyObject*)NIT_DTYPES(iter)[iiter], stdout, 0);
+ else
+ printf("(nil) ");
+ printf(" ");
+ }
+ printf("\n");
+ printf("| InitDataPtrs: ");
+ for (iiter = 0; iiter < niter; ++iiter) {
+ printf("%p ", NIT_RESETDATAPTR(iter)[iiter]);
+ }
+ printf("\n");
+ printf("| BaseOffsets: ");
+ for (iiter = 0; iiter < niter; ++iiter) {
+ printf("%i ", (int)NIT_BASEOFFSETS(iter)[iiter]);
+ }
+ printf("\n");
+ if (itflags&NPY_ITFLAG_HASINDEX) {
+ printf("| InitIndex: %d\n",
+ (int)(npy_intp)NIT_RESETDATAPTR(iter)[niter]);
+ }
+ printf("| Operands: ");
+ for (iiter = 0; iiter < niter; ++iiter) {
+ printf("%p ", NIT_OPERANDS(iter)[iiter]);
+ }
+ printf("\n");
+ printf("| Operand DTypes: ");
+ for (iiter = 0; iiter < niter; ++iiter) {
+ PyArray_Descr *dtype;
+ if (NIT_OPERANDS(iter)[iiter] != NULL) {
+ dtype = PyArray_DESCR(NIT_OPERANDS(iter)[iiter]);
+ if (dtype != NULL)
+ PyObject_Print((PyObject *)dtype, stdout, 0);
+ else
+ printf("(nil) ");
+ }
+ else {
+ printf("(op nil) ");
+ }
+ printf(" ");
+ }
+ printf("\n");
+ printf("| OpItFlags:\n");
+ for (iiter = 0; iiter < niter; ++iiter) {
+ printf("| Flags[%d]: ", (int)iiter);
+ if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_READ)
+ printf("READ ");
+ if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_WRITE)
+ printf("WRITE ");
+ if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_CAST)
+ printf("CAST ");
+ if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_BUFNEVER)
+ printf("BUFNEVER ");
+ if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_ALIGNED)
+ printf("ALIGNED ");
+ if ((NIT_OPITFLAGS(iter)[iiter])&NPY_OP_ITFLAG_REDUCE)
+ printf("REDUCE ");
+ printf("\n");
+ }
+ printf("|\n");
+
+ if (itflags&NPY_ITFLAG_BUFFER) {
+ NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+ printf("| BufferData:\n");
+ printf("| BufferSize: %d\n", (int)NBF_BUFFERSIZE(bufferdata));
+ printf("| Size: %d\n", (int)NBF_SIZE(bufferdata));
+ printf("| BufIterEnd: %d\n", (int)NBF_BUFITEREND(bufferdata));
+ if (itflags&NPY_ITFLAG_REDUCE) {
+ printf("| REDUCE Pos: %d\n",
+ (int)NBF_REDUCE_POS(bufferdata));
+ printf("| REDUCE OuterSize: %d\n",
+ (int)NBF_REDUCE_OUTERSIZE(bufferdata));
+ printf("| REDUCE OuterDim: %d\n",
+ (int)NBF_REDUCE_OUTERDIM(bufferdata));
+ }
+ printf("| Strides: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%d ", (int)NBF_STRIDES(bufferdata)[iiter]);
+ printf("\n");
+ printf("| Ptrs: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%p ", NBF_PTRS(bufferdata)[iiter]);
+ printf("\n");
+ if (itflags&NPY_ITFLAG_REDUCE) {
+ printf("| REDUCE Outer Strides: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%d ", (int)NBF_REDUCE_OUTERSTRIDES(bufferdata)[iiter]);
+ printf("\n");
+ printf("| REDUCE Outer Ptrs: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%p ", NBF_REDUCE_OUTERPTRS(bufferdata)[iiter]);
+ printf("\n");
+ }
+ printf("| ReadTransferFn: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%p ", NBF_READTRANSFERFN(bufferdata)[iiter]);
+ printf("\n");
+ printf("| ReadTransferData: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%p ", NBF_READTRANSFERDATA(bufferdata)[iiter]);
+ printf("\n");
+ printf("| WriteTransferFn: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%p ", NBF_WRITETRANSFERFN(bufferdata)[iiter]);
+ printf("\n");
+ printf("| WriteTransferData: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%p ", NBF_WRITETRANSFERDATA(bufferdata)[iiter]);
+ printf("\n");
+ printf("| Buffers: ");
+ for (iiter = 0; iiter < niter; ++iiter)
+ printf("%p ", NBF_BUFFERS(bufferdata)[iiter]);
+ printf("\n");
+ printf("|\n");
+ }
+
+ axisdata = NIT_AXISDATA(iter);
+ sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, niter);
+ for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+ printf("| AxisData[%d]:\n", (int)idim);
+ printf("| Shape: %d\n", (int)NAD_SHAPE(axisdata));
+ printf("| Coord: %d\n", (int)NAD_COORD(axisdata));
+ printf("| Strides: ");
+ for (iiter = 0; iiter < niter; ++iiter) {
+ printf("%d ", (int)NAD_STRIDES(axisdata)[iiter]);
+ }
+ printf("\n");
+ if (itflags&NPY_ITFLAG_HASINDEX) {
+ printf("| Index Stride: %d\n", (int)NAD_STRIDES(axisdata)[niter]);
+ }
+ printf("| Ptrs: ");
+ for (iiter = 0; iiter < niter; ++iiter) {
+ printf("%p ", NAD_PTRS(axisdata)[iiter]);
+ }
+ printf("\n");
+ if (itflags&NPY_ITFLAG_HASINDEX) {
+ printf("| Index Value: %d\n",
+ (int)((npy_intp*)NAD_PTRS(axisdata))[niter]);
+ }
+ }
+
+ printf("------- END ITERATOR DUMP -------\n");
+
+ PyGILState_Release(gilstate);
+}
+
diff --git a/numpy/core/src/multiarray/new_iterator_pywrap.c b/numpy/core/src/multiarray/new_iterator_pywrap.c
new file mode 100644
index 000000000..948ff0be6
--- /dev/null
+++ b/numpy/core/src/multiarray/new_iterator_pywrap.c
@@ -0,0 +1,2253 @@
+/*
+ * This file implements the CPython wrapper of the new NumPy iterator.
+ *
+ * Copyright (c) 2010 by Mark Wiebe (mwwiebe@gmail.com)
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#include "structmember.h"
+
+#define _MULTIARRAYMODULE
+#include <numpy/ndarrayobject.h>
+
+#include "npy_config.h"
+
+#include "numpy/npy_3kcompat.h"
+
+/*
+ * Python object wrapping a C-level NpyIter. Besides the iterator itself
+ * it caches pointers obtained from the NpyIter_Get* accessors so the
+ * Python-visible methods can avoid repeated lookups.
+ */
+typedef struct NewNpyArrayIterObject_tag NewNpyArrayIterObject;
+
+struct NewNpyArrayIterObject_tag {
+ PyObject_HEAD
+ /* The iterator */
+ NpyIter *iter;
+ /* Flag indicating iteration started/stopped */
+ char started, finished;
+ /* Child to update for nested iteration */
+ NewNpyArrayIterObject *nested_child;
+ /* Cached values from the iterator */
+ NpyIter_IterNext_Fn iternext;
+ NpyIter_GetCoords_Fn getcoords;
+ char **dataptrs;
+ PyArray_Descr **dtypes;
+ PyArrayObject **operands;
+ npy_intp *innerstrides, *innerloopsizeptr;
+ char readflags[NPY_MAXARGS];
+ char writeflags[NPY_MAXARGS];
+};
+
+/*
+ * Refreshes the cached function pointers and internal data pointers on
+ * the wrapper from self->iter. Must be re-run whenever an operation
+ * invalidates them (e.g. RemoveAxis/RemoveCoords, see callers below).
+ */
+void npyiter_cache_values(NewNpyArrayIterObject *self)
+{
+ NpyIter *iter = self->iter;
+
+ /* iternext and getcoords functions */
+ self->iternext = NpyIter_GetIterNext(iter, NULL);
+ /* getcoords is unavailable until delayed buffers have been allocated */
+ if (NpyIter_HasCoords(iter) && !NpyIter_HasDelayedBufAlloc(iter)) {
+ self->getcoords = NpyIter_GetGetCoords(iter, NULL);
+ }
+ else {
+ self->getcoords = NULL;
+ }
+
+ /* Internal data pointers */
+ self->dataptrs = NpyIter_GetDataPtrArray(iter);
+ self->dtypes = NpyIter_GetDescrArray(iter);
+ self->operands = NpyIter_GetOperandArray(iter);
+
+ /* Inner stride/size pointers only exist without an inner loop */
+ if (NpyIter_HasInnerLoop(iter)) {
+ self->innerstrides = NULL;
+ self->innerloopsizeptr = NULL;
+ }
+ else {
+ self->innerstrides = NpyIter_GetInnerStrideArray(iter);
+ self->innerloopsizeptr = NpyIter_GetInnerLoopSizePtr(iter);
+ }
+
+ /* The read/write settings */
+ NpyIter_GetReadFlags(iter, self->readflags);
+ NpyIter_GetWriteFlags(iter, self->writeflags);
+}
+
+/*
+ * tp_new: allocates the wrapper with no iterator attached.
+ * The actual NpyIter is created later by npyiter_init (tp_init).
+ */
+static PyObject *
+npyiter_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds)
+{
+ NewNpyArrayIterObject *self;
+
+ self = (NewNpyArrayIterObject *)subtype->tp_alloc(subtype, 0);
+ if (self != NULL) {
+ self->iter = NULL;
+ self->nested_child = NULL;
+ }
+
+ return (PyObject *)self;
+}
+
+/*
+ * Converts a list/tuple of flag-name strings into NPY_ITER_* global
+ * flag bits, OR-ing the result into *flags. NULL or None means no
+ * flags. Returns 1 on success, 0 with an exception set on failure.
+ */
+static int
+NpyIter_GlobalFlagsConverter(PyObject *flags_in, npy_uint32 *flags)
+{
+ npy_uint32 tmpflags = 0;
+ int iflags, nflags;
+
+ PyObject *f;
+ char *str = NULL;
+ Py_ssize_t length = 0;
+ npy_uint32 flag;
+
+ if (flags_in == NULL || flags_in == Py_None) {
+ *flags = 0;
+ return 1;
+ }
+
+ if (!PyTuple_Check(flags_in) && !PyList_Check(flags_in)) {
+ PyErr_SetString(PyExc_ValueError,
+ "Iterator global flags must be a list or tuple of strings");
+ return 0;
+ }
+
+ nflags = PySequence_Size(flags_in);
+
+ for (iflags = 0; iflags < nflags; ++iflags) {
+ f = PySequence_GetItem(flags_in, iflags);
+ if (f == NULL) {
+ return 0;
+ }
+ if (PyString_AsStringAndSize(f, &str, &length) == -1) {
+ Py_DECREF(f);
+ return 0;
+ }
+ /* Use switch statements to quickly isolate the right flag */
+ flag = 0;
+ switch (str[0]) {
+ case 'b':
+ if (strcmp(str, "buffered") == 0) {
+ flag = NPY_ITER_BUFFERED;
+ }
+ break;
+ case 'c':
+ /* disambiguate on str[5]: c_index/coords/common_dtype */
+ if (length >= 6) switch (str[5]) {
+ case 'e':
+ if (strcmp(str, "c_index") == 0) {
+ flag = NPY_ITER_C_INDEX;
+ }
+ break;
+ case 's':
+ if (strcmp(str, "coords") == 0) {
+ flag = NPY_ITER_COORDS;
+ }
+ break;
+ case 'n':
+ if (strcmp(str, "common_dtype") == 0) {
+ flag = NPY_ITER_COMMON_DTYPE;
+ }
+ break;
+ }
+ break;
+ case 'd':
+ if (strcmp(str, "delay_bufalloc") == 0) {
+ flag = NPY_ITER_DELAY_BUFALLOC;
+ }
+ break;
+ case 'f':
+ if (strcmp(str, "f_index") == 0) {
+ flag = NPY_ITER_F_INDEX;
+ }
+ break;
+ case 'g':
+ if (strcmp(str, "growinner") == 0) {
+ flag = NPY_ITER_GROWINNER;
+ }
+ break;
+ case 'n':
+ if (strcmp(str, "no_inner_iteration") == 0) {
+ flag = NPY_ITER_NO_INNER_ITERATION;
+ }
+ break;
+ case 'r':
+ if (strcmp(str, "ranged") == 0) {
+ flag = NPY_ITER_RANGED;
+ }
+ else if (strcmp(str, "refs_ok") == 0) {
+ flag = NPY_ITER_REFS_OK;
+ }
+ else if (strcmp(str, "reduce_ok") == 0) {
+ flag = NPY_ITER_REDUCE_OK;
+ }
+ break;
+ case 'z':
+ if (strcmp(str, "zerosize_ok") == 0) {
+ flag = NPY_ITER_ZEROSIZE_OK;
+ }
+ break;
+ }
+ if (flag == 0) {
+ PyErr_Format(PyExc_ValueError,
+ "Unexpected iterator global flag \"%s\"", str);
+ Py_DECREF(f);
+ return 0;
+ }
+ else {
+ tmpflags |= flag;
+ }
+ Py_DECREF(f);
+ }
+
+ /* OR into the output only after all strings validated */
+ *flags |= tmpflags;
+ return 1;
+}
+
+/* TODO: Use PyArray_OrderConverter once 'K' is added there */
+/*
+ * Converts a single-character string 'C'/'F'/'A'/'K' into the
+ * corresponding NPY_ORDER value. Returns 1 on success, 0 on error.
+ */
+static int
+npyiter_order_converter(PyObject *order_in, NPY_ORDER *order)
+{
+ char *str = NULL;
+ Py_ssize_t length = 0;
+
+ if (PyString_AsStringAndSize(order_in, &str, &length) == -1) {
+ return 0;
+ }
+
+ /* Only exactly-one-character strings are accepted */
+ if (length == 1) switch (str[0]) {
+ case 'C':
+ *order = NPY_CORDER;
+ return 1;
+ case 'F':
+ *order = NPY_FORTRANORDER;
+ return 1;
+ case 'A':
+ *order = NPY_ANYORDER;
+ return 1;
+ case 'K':
+ *order = NPY_KEEPORDER;
+ return 1;
+ }
+
+ PyErr_SetString(PyExc_ValueError,
+ "order must be one of 'C', 'F', 'A', or 'K'");
+ return 0;
+}
+
+/*NUMPY_API
+ * Convert any Python object, *obj*, to an NPY_CASTING enum.
+ * TODO: Move elsewhere
+ */
+NPY_NO_EXPORT int
+PyArray_CastingConverter(PyObject *obj, NPY_CASTING *casting)
+{
+ char *str = NULL;
+ Py_ssize_t length = 0;
+
+ if (PyString_AsStringAndSize(obj, &str, &length) == -1) {
+ return 0;
+ }
+ /*
+ * Dispatch on str[2]; for "no" (length 2) str[2] is the NUL
+ * terminator, handled by the 'case 0' branch.
+ */
+ if (length >= 2) switch (str[2]) {
+ case 0:
+ if (strcmp(str, "no") == 0) {
+ *casting = NPY_NO_CASTING;
+ return 1;
+ }
+ break;
+ case 'u':
+ if (strcmp(str, "equiv") == 0) {
+ *casting = NPY_EQUIV_CASTING;
+ return 1;
+ }
+ break;
+ case 'f':
+ if (strcmp(str, "safe") == 0) {
+ *casting = NPY_SAFE_CASTING;
+ return 1;
+ }
+ break;
+ case 'm':
+ if (strcmp(str, "same_kind") == 0) {
+ *casting = NPY_SAME_KIND_CASTING;
+ return 1;
+ }
+ break;
+ case 's':
+ if (strcmp(str, "unsafe") == 0) {
+ *casting = NPY_UNSAFE_CASTING;
+ return 1;
+ }
+ break;
+ }
+
+ PyErr_SetString(PyExc_ValueError,
+ "casting must be one of 'no', 'equiv', 'safe', "
+ "'same_kind', or 'unsafe'");
+ return 0;
+
+}
+
+/*
+ * Converts one list/tuple of per-operand flag-name strings into
+ * NPY_ITER_* operand flag bits stored in *op_flags. Returns 1 on
+ * success, 0 with an exception set on failure.
+ */
+static int
+NpyIter_OpFlagsConverter(PyObject *op_flags_in,
+ npy_uint32 *op_flags)
+{
+ int iflags, nflags;
+ npy_uint32 flag;
+
+ if (!PyTuple_Check(op_flags_in) && !PyList_Check(op_flags_in)) {
+ PyErr_SetString(PyExc_ValueError,
+ "op_flags must be a tuple or array of per-op flag-tuples");
+ return 0;
+ }
+
+ nflags = PySequence_Size(op_flags_in);
+
+ *op_flags = 0;
+ for (iflags = 0; iflags < nflags; ++iflags) {
+ PyObject *f;
+ char *str = NULL;
+ Py_ssize_t length = 0;
+
+ f = PySequence_GetItem(op_flags_in, iflags);
+ if (f == NULL) {
+ return 0;
+ }
+
+ if (PyString_AsStringAndSize(f, &str, &length) == -1) {
+ Py_DECREF(f);
+ PyErr_SetString(PyExc_ValueError,
+ "op_flags must be a tuple or array of per-op flag-tuples");
+ return 0;
+ }
+
+ /* Use switch statements to quickly isolate the right flag */
+ flag = 0;
+ switch (str[0]) {
+ case 'a':
+ if (strcmp(str, "allocate") == 0) {
+ flag = NPY_ITER_ALLOCATE;
+ }
+ if (strcmp(str, "aligned") == 0) {
+ flag = NPY_ITER_ALIGNED;
+ }
+ break;
+ case 'c':
+ if (strcmp(str, "copy") == 0) {
+ flag = NPY_ITER_COPY;
+ }
+ if (strcmp(str, "contig") == 0) {
+ flag = NPY_ITER_CONTIG;
+ }
+ break;
+ case 'n':
+ switch (str[1]) {
+ case 'b':
+ if (strcmp(str, "nbo") == 0) {
+ flag = NPY_ITER_NBO;
+ }
+ break;
+ case 'o':
+ if (strcmp(str, "no_subtype") == 0) {
+ flag = NPY_ITER_NO_SUBTYPE;
+ }
+ else if (strcmp(str, "no_broadcast") == 0) {
+ flag = NPY_ITER_NO_BROADCAST;
+ }
+ break;
+ }
+ break;
+ case 'r':
+ /* disambiguate readonly/readwrite on str[4] */
+ if (length > 4) switch (str[4]) {
+ case 'o':
+ if (strcmp(str, "readonly") == 0) {
+ flag = NPY_ITER_READONLY;
+ }
+ break;
+ case 'w':
+ if (strcmp(str, "readwrite") == 0) {
+ flag = NPY_ITER_READWRITE;
+ }
+ break;
+ }
+ break;
+ case 'u':
+ if (strcmp(str, "updateifcopy") == 0) {
+ flag = NPY_ITER_UPDATEIFCOPY;
+ }
+ break;
+ case 'w':
+ if (strcmp(str, "writeonly") == 0) {
+ flag = NPY_ITER_WRITEONLY;
+ }
+ break;
+ }
+ if (flag == 0) {
+ PyErr_Format(PyExc_ValueError,
+ "Unexpected per-op iterator flag \"%s\"", str);
+ Py_DECREF(f);
+ return 0;
+ }
+ else {
+ *op_flags |= flag;
+ }
+ Py_DECREF(f);
+ }
+
+ return 1;
+}
+
+/*
+ * Fills op_flags_array[0..niter-1] from op_flags_in, which is either a
+ * sequence of niter per-operand flag lists, or a single flag list that
+ * is replicated to every operand. Returns 1 on success, 0 on failure.
+ */
+static int
+npyiter_convert_op_flags_array(PyObject *op_flags_in,
+ npy_uint32 *op_flags_array, npy_intp niter)
+{
+ npy_intp iiter;
+
+ if (!PyTuple_Check(op_flags_in) && !PyList_Check(op_flags_in)) {
+ PyErr_SetString(PyExc_ValueError,
+ "op_flags must be a tuple or array of per-op flag-tuples");
+ return 0;
+ }
+
+ if (PySequence_Size(op_flags_in) != niter) {
+ goto try_single_flags;
+ }
+
+ for (iiter = 0; iiter < niter; ++iiter) {
+ PyObject *f = PySequence_GetItem(op_flags_in, iiter);
+ if (f == NULL) {
+ return 0;
+ }
+ /* If the first item is a string, try as one set of flags */
+ if (iiter == 0 && (PyString_Check(f) || PyUnicode_Check(f))) {
+ Py_DECREF(f);
+ goto try_single_flags;
+ }
+ if (NpyIter_OpFlagsConverter(f,
+ &op_flags_array[iiter]) != 1) {
+ Py_DECREF(f);
+ return 0;
+ }
+
+ Py_DECREF(f);
+ }
+
+ return 1;
+
+try_single_flags:
+ if (NpyIter_OpFlagsConverter(op_flags_in,
+ &op_flags_array[0]) != 1) {
+ return 0;
+ }
+
+ /* Replicate the single flag set to every operand */
+ for (iiter = 1; iiter < niter; ++iiter) {
+ op_flags_array[iiter] = op_flags_array[0];
+ }
+
+ return 1;
+}
+
+/*
+ * Fills op_dtypes[0..niter-1] with new references to PyArray_Descr
+ * objects converted from op_dtypes_in (a sequence of niter dtypes, or
+ * a single dtype replicated). On success each non-NULL entry is owned
+ * by the caller. Returns 1 on success, 0 on failure.
+ */
+static int
+npyiter_convert_dtypes(PyObject *op_dtypes_in,
+ PyArray_Descr **op_dtypes,
+ npy_intp niter)
+{
+ npy_intp iiter;
+
+ /*
+ * If the input isn't a tuple of dtypes, try converting it as-is
+ * to a dtype, and replicating to all operands.
+ */
+ if ((!PyTuple_Check(op_dtypes_in) && !PyList_Check(op_dtypes_in)) ||
+ PySequence_Size(op_dtypes_in) != niter) {
+ goto try_single_dtype;
+ }
+
+ for (iiter = 0; iiter < niter; ++iiter) {
+ PyObject *dtype = PySequence_GetItem(op_dtypes_in, iiter);
+ if (dtype == NULL) {
+ npy_intp i;
+ /* release the descrs converted so far */
+ for (i = 0; i < iiter; ++i ) {
+ Py_XDECREF(op_dtypes[i]);
+ }
+ return 0;
+ }
+
+ /* Try converting the object to a descr */
+ if (PyArray_DescrConverter2(dtype, &op_dtypes[iiter]) != 1) {
+ npy_intp i;
+ for (i = 0; i < iiter; ++i ) {
+ Py_XDECREF(op_dtypes[i]);
+ }
+ Py_DECREF(dtype);
+ PyErr_Clear();
+ goto try_single_dtype;
+ }
+
+ Py_DECREF(dtype);
+ }
+
+ return 1;
+
+try_single_dtype:
+ if (PyArray_DescrConverter2(op_dtypes_in, &op_dtypes[0]) == 1) {
+ /* all entries share one descr, with one reference each */
+ for (iiter = 1; iiter < niter; ++iiter) {
+ op_dtypes[iiter] = op_dtypes[0];
+ Py_XINCREF(op_dtypes[iiter]);
+ }
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Converts op_axes_in (a sequence of niter entries, each None or an
+ * axis tuple/list) into the op_axes / *oa_ndim form NpyIter_MultiNew
+ * expects. None entries become NULL; None axis values become -1
+ * (newaxis). Returns 1 on success, 0 with an exception on failure.
+ */
+static int
+npyiter_convert_op_axes(PyObject *op_axes_in, npy_intp niter,
+ npy_intp **op_axes, npy_intp *oa_ndim)
+{
+ PyObject *a;
+ npy_intp iiter;
+
+ if ((!PyTuple_Check(op_axes_in) && !PyList_Check(op_axes_in)) ||
+ PySequence_Size(op_axes_in) != niter) {
+ PyErr_SetString(PyExc_ValueError,
+ "op_axes must be a tuple/list matching the number of ops");
+ return 0;
+ }
+
+ *oa_ndim = 0;
+
+ /* Copy the tuples into op_axes */
+ for (iiter = 0; iiter < niter; ++iiter) {
+ npy_intp idim;
+ a = PySequence_GetItem(op_axes_in, iiter);
+ if (a == NULL) {
+ return 0;
+ }
+ if (a == Py_None) {
+ op_axes[iiter] = NULL;
+ } else {
+ if (!PyTuple_Check(a) && !PyList_Check(a)) {
+ PyErr_SetString(PyExc_ValueError,
+ "Each entry of op_axes must be None "
+ "or a tuple/list");
+ Py_DECREF(a);
+ return 0;
+ }
+ /* The first non-None entry fixes the number of dimensions */
+ if (*oa_ndim == 0) {
+ *oa_ndim = PySequence_Size(a);
+ if (*oa_ndim == 0) {
+ /* NOTE(review): the two returns below leak the
+ * reference to 'a' (no Py_DECREF before return) */
+ PyErr_SetString(PyExc_ValueError,
+ "op_axes must have at least one dimension");
+ return 0;
+ }
+ if (*oa_ndim > NPY_MAXDIMS) {
+ PyErr_SetString(PyExc_ValueError,
+ "Too many dimensions in op_axes");
+ return 0;
+ }
+ }
+ if (PySequence_Size(a) != *oa_ndim) {
+ PyErr_SetString(PyExc_ValueError,
+ "Each entry of op_axes must have the same size");
+ Py_DECREF(a);
+ return 0;
+ }
+ for (idim = 0; idim < *oa_ndim; ++idim) {
+ PyObject *v = PySequence_GetItem(a, idim);
+ if (v == NULL) {
+ Py_DECREF(a);
+ return 0;
+ }
+ /* numpy.newaxis is None */
+ if (v == Py_None) {
+ op_axes[iiter][idim] = -1;
+ }
+ else {
+ op_axes[iiter][idim] = PyInt_AsLong(v);
+ if (op_axes[iiter][idim]==-1 &&
+ PyErr_Occurred()) {
+ Py_DECREF(a);
+ Py_DECREF(v);
+ return 0;
+ }
+ }
+ Py_DECREF(v);
+ }
+ Py_DECREF(a);
+ }
+ }
+
+ /* All entries were None — there is nothing to apply */
+ if (*oa_ndim == 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "If op_axes is provided, at least one list of axes "
+ "must be contained within it");
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Converts the operand array and op_flags array into the form NpyIter_MultiNew
+ * needs. Sets niter, and on success, each op[i] owns a reference
+ * to an array object.
+ */
+static int
+npyiter_convert_ops(PyObject *op_in, PyObject *op_flags_in,
+ PyArrayObject **op, npy_uint32 *op_flags,
+ npy_intp *niter_out)
+{
+ npy_intp iiter, niter;
+
+ /* niter and op */
+ if (PyTuple_Check(op_in) || PyList_Check(op_in)) {
+ niter = PySequence_Size(op_in);
+ if (niter == 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "Must provide at least one operand");
+ return 0;
+ }
+ if (niter > NPY_MAXARGS) {
+ PyErr_SetString(PyExc_ValueError, "Too many operands");
+ return 0;
+ }
+
+ for (iiter = 0; iiter < niter; ++iiter) {
+ PyObject *item = PySequence_GetItem(op_in, iiter);
+ if (item == NULL) {
+ npy_intp i;
+ for (i = 0; i < iiter; ++i) {
+ Py_XDECREF(op[i]);
+ }
+ return 0;
+ }
+ else if (item == Py_None) {
+ /* None operands become NULL, typically to be allocated */
+ Py_DECREF(item);
+ item = NULL;
+ }
+ /* This is converted to an array after op flags are retrieved */
+ op[iiter] = (PyArrayObject *)item;
+ }
+ }
+ else {
+ niter = 1;
+ /* Is converted to an array after op flags are retrieved */
+ Py_INCREF(op_in);
+ op[0] = (PyArrayObject *)op_in;
+ }
+
+ *niter_out = niter;
+
+ /* op_flags */
+ if (op_flags_in == NULL || op_flags_in == Py_None) {
+ for (iiter = 0; iiter < niter; ++iiter) {
+ /*
+ * By default, make NULL operands writeonly and flagged for
+ * allocation, and everything else readonly. To write
+ * to a provided operand, you must specify the write flag manually.
+ */
+ if (op[iiter] == NULL) {
+ op_flags[iiter] = NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE;
+ }
+ else {
+ op_flags[iiter] = NPY_ITER_READONLY;
+ }
+ }
+ }
+ else if (npyiter_convert_op_flags_array(op_flags_in,
+ op_flags, niter) != 1) {
+ for (iiter = 0; iiter < niter; ++iiter) {
+ Py_XDECREF(op[iiter]);
+ }
+ return 0;
+ }
+
+ /* Now that we have the flags - convert all the ops to arrays */
+ for (iiter = 0; iiter < niter; ++iiter) {
+ if (op[iiter] != NULL) {
+ PyArrayObject *ao;
+ int fromanyflags = 0;
+
+ if (op_flags[iiter]&(NPY_ITER_READWRITE|NPY_ITER_WRITEONLY)) {
+ fromanyflags = NPY_UPDATEIFCOPY;
+ }
+ ao = (PyArrayObject *)PyArray_FromAny((PyObject *)op[iiter],
+ NULL, 0, 0, fromanyflags, NULL);
+ if (ao == NULL) {
+ if (PyErr_Occurred() &&
+ PyErr_ExceptionMatches(PyExc_TypeError)) {
+ PyErr_SetString(PyExc_TypeError,
+ "Iterator operand is flagged as writeable, "
+ "but is an object which cannot be written "
+ "back to via UPDATEIFCOPY");
+ }
+ /* NOTE(review): this cleanup loop reuses the outer
+ * 'iiter' and calls Py_DECREF on entries that may be
+ * NULL (None operands above) — should use a separate
+ * index and Py_XDECREF */
+ for (iiter = 0; iiter < niter; ++iiter) {
+ Py_DECREF(op[iiter]);
+ }
+ return 0;
+ }
+ Py_DECREF(op[iiter]);
+ op[iiter] = ao;
+ }
+ }
+
+ return 1;
+}
+
+/*
+ * tp_init: parses the keyword arguments (op, flags, op_flags,
+ * op_dtypes, order, casting, op_axes, buffersize), converts them, and
+ * constructs the NpyIter via NpyIter_MultiNew. Returns 0 on success,
+ * -1 with an exception set on failure.
+ */
+static int
+npyiter_init(NewNpyArrayIterObject *self, PyObject *args, PyObject *kwds)
+{
+ static char *kwlist[] = {"op", "flags", "op_flags", "op_dtypes",
+ "order", "casting", "op_axes", "buffersize",
+ NULL};
+
+ PyObject *op_in = NULL, *op_flags_in = NULL,
+ *op_dtypes_in = NULL, *op_axes_in = NULL;
+
+ npy_intp iiter, niter = 0;
+ PyArrayObject *op[NPY_MAXARGS];
+ npy_uint32 flags = 0;
+ NPY_ORDER order = NPY_KEEPORDER;
+ NPY_CASTING casting = NPY_SAFE_CASTING;
+ npy_uint32 op_flags[NPY_MAXARGS];
+ PyArray_Descr *op_request_dtypes[NPY_MAXARGS];
+ npy_intp oa_ndim = 0;
+ npy_intp op_axes_arrays[NPY_MAXARGS][NPY_MAXDIMS];
+ npy_intp *op_axes[NPY_MAXARGS];
+ int buffersize = 0;
+
+ /* tp_init must not be run twice on the same wrapper */
+ if (self->iter != NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "Iterator was already initialized");
+ return -1;
+ }
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&OOO&O&Oi", kwlist,
+ &op_in,
+ NpyIter_GlobalFlagsConverter, &flags,
+ &op_flags_in,
+ &op_dtypes_in,
+ npyiter_order_converter, &order,
+ PyArray_CastingConverter, &casting,
+ &op_axes_in,
+ &buffersize)) {
+ return -1;
+ }
+
+ /* op and op_flags */
+ if (npyiter_convert_ops(op_in, op_flags_in, op, op_flags, &niter)
+ != 1) {
+ return -1;
+ }
+
+ /* Set the dtypes to all NULL to start as well */
+ memset(op_request_dtypes, 0, sizeof(op_request_dtypes[0])*niter);
+
+ /* op_request_dtypes */
+ if (op_dtypes_in != NULL && op_dtypes_in != Py_None &&
+ npyiter_convert_dtypes(op_dtypes_in,
+ op_request_dtypes, niter) != 1) {
+ goto fail;
+ }
+
+ /* op_axes */
+ if (op_axes_in != NULL && op_axes_in != Py_None) {
+ /* Initialize to point to the op_axes arrays */
+ for (iiter = 0; iiter < niter; ++iiter) {
+ op_axes[iiter] = op_axes_arrays[iiter];
+ }
+
+ if (npyiter_convert_op_axes(op_axes_in, niter,
+ op_axes, &oa_ndim) != 1) {
+ goto fail;
+ }
+ }
+
+ self->iter = NpyIter_MultiNew(niter, op, flags, order, casting, op_flags,
+ op_request_dtypes,
+ oa_ndim, oa_ndim > 0 ? op_axes : NULL,
+ buffersize);
+
+ if (self->iter == NULL) {
+ goto fail;
+ }
+
+ /* Cache some values for the member functions to use */
+ npyiter_cache_values(self);
+
+ /* An empty iteration space starts already finished */
+ if (NpyIter_GetIterSize(self->iter) == 0) {
+ self->started = 1;
+ self->finished = 1;
+ }
+ else {
+ self->started = 0;
+ self->finished = 0;
+ }
+
+ /* Release the references we got to the ops and dtypes */
+ for (iiter = 0; iiter < niter; ++iiter) {
+ Py_XDECREF(op[iiter]);
+ Py_XDECREF(op_request_dtypes[iiter]);
+ }
+
+ return 0;
+
+fail:
+ for (iiter = 0; iiter < niter; ++iiter) {
+ Py_XDECREF(op[iiter]);
+ Py_XDECREF(op_request_dtypes[iiter]);
+ }
+ return -1;
+}
+
+/*
+ * Creates a tuple of nested iterators, one per entry of 'axes', each
+ * iterating the axes named by its entry. The outer iterators get the
+ * copy/allocate flags; buffering and inner-loop flags are reserved for
+ * the innermost iterator. Each iterator's nested_child points to the
+ * next, so stepping an outer one rebases the inner ones.
+ */
+NPY_NO_EXPORT PyObject *
+NpyIter_NestedIters(PyObject *NPY_UNUSED(self),
+ PyObject *args, PyObject *kwds)
+{
+ static char *kwlist[] = {"op", "axes", "flags", "op_flags",
+ "op_dtypes", "order",
+ "casting", "buffersize",
+ NULL};
+
+ PyObject *op_in = NULL, *axes_in = NULL,
+ *op_flags_in = NULL, *op_dtypes_in = NULL;
+
+ npy_intp iiter, niter = 0, inest, nnest = 0;
+ PyArrayObject *op[NPY_MAXARGS];
+ npy_uint32 flags = 0, flags_inner = 0;
+ NPY_ORDER order = NPY_KEEPORDER;
+ NPY_CASTING casting = NPY_SAFE_CASTING;
+ npy_uint32 op_flags[NPY_MAXARGS], op_flags_inner[NPY_MAXARGS];
+ PyArray_Descr *op_request_dtypes[NPY_MAXARGS],
+ *op_request_dtypes_inner[NPY_MAXARGS];
+ npy_intp op_axes_data[NPY_MAXDIMS];
+ npy_intp *nested_op_axes[NPY_MAXDIMS];
+ npy_intp nested_naxes[NPY_MAXDIMS], iaxes, naxes;
+ npy_intp negones[NPY_MAXDIMS];
+ char used_axes[NPY_MAXDIMS];
+ int buffersize = 0;
+
+ PyObject *ret = NULL;
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|O&OOO&O&i", kwlist,
+ &op_in,
+ &axes_in,
+ NpyIter_GlobalFlagsConverter, &flags,
+ &op_flags_in,
+ &op_dtypes_in,
+ npyiter_order_converter, &order,
+ PyArray_CastingConverter, &casting,
+ &buffersize)) {
+ return NULL;
+ }
+
+ /* axes */
+ if (!PyTuple_Check(axes_in) && !PyList_Check(axes_in)) {
+ PyErr_SetString(PyExc_ValueError,
+ "axes must be a tuple of axis arrays");
+ return NULL;
+ }
+ nnest = PySequence_Size(axes_in);
+ if (nnest < 2) {
+ PyErr_SetString(PyExc_ValueError,
+ "axes must have at least 2 entries for nested iteration");
+ return NULL;
+ }
+ /* Validate the axis lists and flatten them into op_axes_data */
+ naxes = 0;
+ memset(used_axes, 0, NPY_MAXDIMS);
+ for (inest = 0; inest < nnest; ++inest) {
+ PyObject *item = PySequence_GetItem(axes_in, inest);
+ npy_intp i;
+ if (item == NULL) {
+ return NULL;
+ }
+ if (!PyTuple_Check(item) && !PyList_Check(item)) {
+ PyErr_SetString(PyExc_ValueError,
+ "Each item in axes must be a an integer tuple");
+ Py_DECREF(item);
+ return NULL;
+ }
+ nested_naxes[inest] = PySequence_Size(item);
+ if (naxes + nested_naxes[inest] > NPY_MAXDIMS) {
+ PyErr_SetString(PyExc_ValueError,
+ "Too many axes given");
+ Py_DECREF(item);
+ return NULL;
+ }
+ for (i = 0; i < nested_naxes[inest]; ++i) {
+ PyObject *v = PySequence_GetItem(item, i);
+ npy_intp axis;
+ if (v == NULL) {
+ Py_DECREF(item);
+ return NULL;
+ }
+ axis = PyInt_AsLong(v);
+ Py_DECREF(v);
+ if (axis < 0 || axis >= NPY_MAXDIMS) {
+ PyErr_SetString(PyExc_ValueError,
+ "An axis is out of bounds");
+ Py_DECREF(item);
+ return NULL;
+ }
+ /*
+ * This check is very important, without it out of bounds
+ * data accesses are possible.
+ */
+ if (used_axes[axis] != 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "An axis is used more than once");
+ Py_DECREF(item);
+ return NULL;
+ }
+ used_axes[axis] = 1;
+ op_axes_data[naxes+i] = axis;
+ }
+ nested_op_axes[inest] = &op_axes_data[naxes];
+ naxes += nested_naxes[inest];
+ Py_DECREF(item);
+ }
+
+ /* op and op_flags */
+ if (npyiter_convert_ops(op_in, op_flags_in, op, op_flags, &niter)
+ != 1) {
+ return NULL;
+ }
+
+ /* Set the dtypes to all NULL to start as well */
+ memset(op_request_dtypes, 0, sizeof(op_request_dtypes[0])*niter);
+ memset(op_request_dtypes_inner, 0,
+ sizeof(op_request_dtypes_inner[0])*niter);
+
+ /* op_request_dtypes */
+ if (op_dtypes_in != NULL && op_dtypes_in != Py_None &&
+ npyiter_convert_dtypes(op_dtypes_in,
+ op_request_dtypes, niter) != 1) {
+ goto fail;
+ }
+
+ ret = PyTuple_New(nnest);
+ if (ret == NULL) {
+ goto fail;
+ }
+
+ /* For broadcasting allocated arrays */
+ for (iaxes = 0; iaxes < naxes; ++iaxes) {
+ negones[iaxes] = -1;
+ }
+
+ /*
+ * Clear any unnecessary ALLOCATE flags, so we can use them
+ * to indicate exactly the allocated outputs. Also, separate
+ * the inner loop flags.
+ */
+ for (iiter = 0; iiter < niter; ++iiter) {
+ if ((op_flags[iiter]&NPY_ITER_ALLOCATE) && op[iiter] != NULL) {
+ op_flags[iiter] &= ~NPY_ITER_ALLOCATE;
+ }
+
+ /*
+ * Clear any flags allowing copies or output allocation for
+ * the inner loop.
+ */
+ op_flags_inner[iiter] = op_flags[iiter] & ~(NPY_ITER_COPY|
+ NPY_ITER_UPDATEIFCOPY|
+ NPY_ITER_ALLOCATE);
+ /*
+ * If buffering is enabled and copying is not,
+ * clear the nbo_aligned flag and strip the data type
+ * for the outer loops.
+ */
+ if ((flags&(NPY_ITER_BUFFERED)) &&
+ !(op_flags[iiter]&(NPY_ITER_COPY|
+ NPY_ITER_UPDATEIFCOPY|
+ NPY_ITER_ALLOCATE))) {
+ op_flags[iiter] &= ~(NPY_ITER_NBO|NPY_ITER_ALIGNED|NPY_ITER_CONTIG);
+ op_request_dtypes_inner[iiter] = op_request_dtypes[iiter];
+ op_request_dtypes[iiter] = NULL;
+ }
+ }
+
+ /* Only the inner loop gets the buffering and no inner flags */
+ flags_inner = flags&~NPY_ITER_COMMON_DTYPE;
+ flags &= ~(NPY_ITER_NO_INNER_ITERATION|
+ NPY_ITER_BUFFERED);
+
+ /* Construct one iterator per nesting level */
+ for (inest = 0; inest < nnest; ++inest) {
+ NewNpyArrayIterObject *iter;
+ npy_intp *op_axes_niter[NPY_MAXARGS];
+
+ /*
+ * All the operands' op_axes are the same, except for
+ * allocated outputs.
+ */
+ for (iiter = 0; iiter < niter; ++iiter) {
+ if (op_flags[iiter]&NPY_ITER_ALLOCATE) {
+ if (inest == 0) {
+ op_axes_niter[iiter] = NULL;
+ }
+ else {
+ op_axes_niter[iiter] = negones;
+ }
+ }
+ else {
+ op_axes_niter[iiter] = nested_op_axes[inest];
+ }
+ }
+
+ /*
+ printf("\n");
+ for (iiter = 0; iiter < niter; ++iiter) {
+ npy_intp i;
+
+ for (i = 0; i < nested_naxes[inest]; ++i) {
+ printf("%d ", (int)op_axes_niter[iiter][i]);
+ }
+ printf("\n");
+ }
+ */
+
+ /* Allocate the iterator */
+ iter = (NewNpyArrayIterObject *)npyiter_new(&NpyIter_Type, NULL, NULL);
+ if (iter == NULL) {
+ Py_DECREF(ret);
+ goto fail;
+ }
+
+ /* Innermost level gets the buffering/inner flags and buffersize */
+ if (inest < nnest-1) {
+ iter->iter = NpyIter_MultiNew(niter, op, flags, order,
+ casting, op_flags, op_request_dtypes,
+ nested_naxes[inest], op_axes_niter,
+ 0);
+ }
+ else {
+ iter->iter = NpyIter_MultiNew(niter, op, flags_inner, order,
+ casting, op_flags_inner,
+ op_request_dtypes_inner,
+ nested_naxes[inest], op_axes_niter,
+ buffersize);
+ }
+
+ if (iter->iter == NULL) {
+ Py_DECREF(ret);
+ goto fail;
+ }
+
+ /* Cache some values for the member functions to use */
+ npyiter_cache_values(iter);
+
+ if (NpyIter_GetIterSize(iter->iter) == 0) {
+ iter->started = 1;
+ iter->finished = 1;
+ }
+ else {
+ iter->started = 0;
+ iter->finished = 0;
+ }
+
+ /*
+ * If there are any allocated outputs or any copies were made,
+ * adjust op so that the other iterators use the same ones.
+ */
+ if (inest == 0) {
+ PyArrayObject **operands = NpyIter_GetOperandArray(iter->iter);
+ for (iiter = 0; iiter < niter; ++iiter) {
+ if (op[iiter] != operands[iiter]) {
+ Py_XDECREF(op[iiter]);
+ op[iiter] = operands[iiter];
+ Py_INCREF(op[iiter]);
+ }
+
+ /*
+ * Clear any flags allowing copies for
+ * the rest of the iterators
+ */
+ op_flags[iiter] &= ~(NPY_ITER_COPY|
+ NPY_ITER_UPDATEIFCOPY);
+ }
+ /* Clear the common dtype flag for the rest of the iterators */
+ flags &= ~NPY_ITER_COMMON_DTYPE;
+ }
+
+ PyTuple_SET_ITEM(ret, inest, (PyObject *)iter);
+ }
+
+ /* Release our references to the ops and dtypes */
+ for (iiter = 0; iiter < niter; ++iiter) {
+ Py_XDECREF(op[iiter]);
+ Py_XDECREF(op_request_dtypes[iiter]);
+ Py_XDECREF(op_request_dtypes_inner[iiter]);
+ }
+
+ /* Set up the nested child references */
+ for (inest = 0; inest < nnest-1; ++inest) {
+ NewNpyArrayIterObject *iter;
+ iter = (NewNpyArrayIterObject *)PyTuple_GET_ITEM(ret, inest);
+ /*
+ * Indicates which iterator to reset with new base pointers
+ * each iteration step.
+ */
+ iter->nested_child =
+ (NewNpyArrayIterObject *)PyTuple_GET_ITEM(ret, inest+1);
+ Py_INCREF(iter->nested_child);
+ /*
+ * Need to do a nested reset so all the iterators point
+ * at the right data
+ */
+ if (NpyIter_ResetBasePointers(iter->nested_child->iter,
+ iter->dataptrs, NULL) != NPY_SUCCEED) {
+ Py_DECREF(ret);
+ return NULL;
+ }
+ }
+
+ return ret;
+
+fail:
+ for (iiter = 0; iiter < niter; ++iiter) {
+ Py_XDECREF(op[iiter]);
+ Py_XDECREF(op_request_dtypes[iiter]);
+ Py_XDECREF(op_request_dtypes_inner[iiter]);
+ }
+ return NULL;
+}
+
+/*
+ * tp_dealloc: frees the C iterator and drops the nested-child
+ * reference before releasing the wrapper itself.
+ */
+static void
+npyiter_dealloc(NewNpyArrayIterObject *self)
+{
+ if (self->iter) {
+ NpyIter_Deallocate(self->iter);
+ self->iter = NULL;
+ Py_XDECREF(self->nested_child);
+ self->nested_child = NULL;
+ }
+ self->ob_type->tp_free((PyObject*)self);
+}
+
+/*
+ * Walks the nested_child chain, rebasing each child iterator on its
+ * parent's current data pointers and resetting its started/finished
+ * state. Returns NPY_SUCCEED or NPY_FAIL.
+ */
+static int
+npyiter_resetbasepointers(NewNpyArrayIterObject *self)
+{
+ while (self->nested_child) {
+ if (NpyIter_ResetBasePointers(self->nested_child->iter,
+ self->dataptrs, NULL) != NPY_SUCCEED) {
+ return NPY_FAIL;
+ }
+ self = self->nested_child;
+ if (NpyIter_GetIterSize(self->iter) == 0) {
+ self->started = 1;
+ self->finished = 1;
+ }
+ else {
+ self->started = 0;
+ self->finished = 0;
+ }
+ }
+
+ return NPY_SUCCEED;
+}
+
+/*
+ * reset() method: rewinds the iterator to its initial state and
+ * propagates the reset to any nested children. Returns None.
+ */
+static PyObject *
+npyiter_reset(NewNpyArrayIterObject *self)
+{
+ if (self->iter == NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "Iterator is invalid");
+ return NULL;
+ }
+
+ if (NpyIter_Reset(self->iter, NULL) != NPY_SUCCEED) {
+ return NULL;
+ }
+ if (NpyIter_GetIterSize(self->iter) == 0) {
+ self->started = 1;
+ self->finished = 1;
+ }
+ else {
+ self->started = 0;
+ self->finished = 0;
+ }
+
+ /* Reset may have allocated delayed buffers, enabling getcoords */
+ if (self->getcoords == NULL && NpyIter_HasCoords(self->iter)) {
+ self->getcoords = NpyIter_GetGetCoords(self->iter, NULL);
+ }
+
+ /* If there is nesting, the nested iterators should be reset */
+ if (npyiter_resetbasepointers(self) != NPY_SUCCEED) {
+ return NULL;
+ }
+
+ Py_RETURN_NONE;
+}
+
+/*
+ * Makes a copy of the iterator. Note that the nesting is not
+ * copied.
+ */
+static PyObject *
+npyiter_copy(NewNpyArrayIterObject *self)
+{
+ NewNpyArrayIterObject *iter;
+
+ if (self->iter == NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "Iterator is invalid");
+ return NULL;
+ }
+
+ /* Allocate the iterator */
+ iter = (NewNpyArrayIterObject *)npyiter_new(&NpyIter_Type, NULL, NULL);
+ if (iter == NULL) {
+ return NULL;
+ }
+
+ /* Copy the C iterator */
+ iter->iter = NpyIter_Copy(self->iter);
+ if (iter->iter == NULL) {
+ Py_DECREF(iter);
+ return NULL;
+ }
+
+ /* Cache some values for the member functions to use */
+ npyiter_cache_values(iter);
+
+ /* The copy continues from the same iteration state */
+ iter->started = self->started;
+ iter->finished = self->finished;
+
+ return (PyObject *)iter;
+}
+
+/*
+ * iternext() method: advances the iterator one step.  Returns True on
+ * success, False once the iteration is exhausted (or if the iterator
+ * is invalid), marking the iterator finished in the latter case.
+ */
+static PyObject *
+npyiter_iternext(NewNpyArrayIterObject *self)
+{
+    if (self->iter != NULL && self->iternext != NULL &&
+                        !self->finished && self->iternext(self->iter)) {
+        /* If there is nesting, the nested iterators should be reset */
+        if (npyiter_resetbasepointers(self) != NPY_SUCCEED) {
+            return NULL;
+        }
+
+        Py_RETURN_TRUE;
+    }
+    else {
+        self->finished = 1;
+        Py_RETURN_FALSE;
+    }
+}
+
+/*
+ * remove_axis(axis) method: removes one iteration axis from the
+ * iterator.  This invalidates the cached member pointers and resets
+ * the iterator, so both are refreshed here.
+ */
+static PyObject *
+npyiter_remove_axis(NewNpyArrayIterObject *self, PyObject *args)
+{
+    int axis = 0;
+
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return NULL;
+    }
+
+    if (!PyArg_ParseTuple(args, "i", &axis)) {
+        return NULL;
+    }
+
+    if (NpyIter_RemoveAxis(self->iter, axis) != NPY_SUCCEED) {
+        return NULL;
+    }
+    /* RemoveAxis invalidates cached values */
+    npyiter_cache_values(self);
+    /* RemoveAxis also resets the iterator */
+    if (NpyIter_GetIterSize(self->iter) == 0) {
+        self->started = 1;
+        self->finished = 1;
+    }
+    else {
+        self->started = 0;
+        self->finished = 0;
+    }
+
+    Py_RETURN_NONE;
+}
+
+/*
+ * remove_coords() method: drops coordinate tracking from the iterator.
+ * This invalidates the cached member pointers and resets the iterator,
+ * so both are refreshed here.
+ */
+static PyObject *
+npyiter_remove_coords(NewNpyArrayIterObject *self)
+{
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return NULL;
+    }
+
+    NpyIter_RemoveCoords(self->iter);
+    /* RemoveCoords invalidates cached values */
+    npyiter_cache_values(self);
+    /* RemoveCoords also resets the iterator */
+    if (NpyIter_GetIterSize(self->iter) == 0) {
+        self->started = 1;
+        self->finished = 1;
+    }
+    else {
+        self->started = 0;
+        self->finished = 0;
+    }
+
+    Py_RETURN_NONE;
+}
+
+/*
+ * remove_inner_loop() method: switches the iterator to element-by-element
+ * iteration (no inner loop exposed to the caller).  This invalidates the
+ * cached member pointers and resets the iterator, so both are refreshed.
+ */
+static PyObject *
+npyiter_remove_inner_loop(NewNpyArrayIterObject *self)
+{
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return NULL;
+    }
+
+    NpyIter_RemoveInnerLoop(self->iter);
+    /* RemoveInnerLoop invalidates cached values */
+    npyiter_cache_values(self);
+    /* RemoveInnerLoop also resets the iterator */
+    if (NpyIter_GetIterSize(self->iter) == 0) {
+        self->started = 1;
+        self->finished = 1;
+    }
+    else {
+        self->started = 0;
+        self->finished = 0;
+    }
+
+    Py_RETURN_NONE;
+}
+
+/*
+ * debug_print() method: dumps the iterator's internal state to stdout.
+ * Purely a debugging aid; always returns None.
+ */
+static PyObject *
+npyiter_debug_print(NewNpyArrayIterObject *self)
+{
+    if (self->iter == NULL) {
+        printf("Iterator: (nil)\n");
+    }
+    else {
+        NpyIter_DebugPrint(self->iter);
+    }
+
+    Py_RETURN_NONE;
+}
+
+NPY_NO_EXPORT PyObject *
+npyiter_seq_item(NewNpyArrayIterObject *self, Py_ssize_t i);
+
+/*
+ * Getter for 'value': the current element(s) of the iteration.  Returns
+ * a single array for one operand, or a tuple of arrays for several.
+ * Delegates per-operand access to npyiter_seq_item.
+ */
+static PyObject *npyiter_value_get(NewNpyArrayIterObject *self)
+{
+    PyObject *ret;
+
+    npy_intp iiter, niter;
+
+    if (self->iter == NULL || self->finished) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is past the end");
+        return NULL;
+    }
+
+    /* Removed unused locals: dtypes/dataptrs were assigned but never
+     * read, since npyiter_seq_item does all the element access. */
+    niter = NpyIter_GetNIter(self->iter);
+
+    /* Return an array or tuple of arrays with the values */
+    if (niter == 1) {
+        ret = npyiter_seq_item(self, 0);
+    }
+    else {
+        ret = PyTuple_New(niter);
+        if (ret == NULL) {
+            return NULL;
+        }
+        for (iiter = 0; iiter < niter; ++iiter) {
+            PyObject *a = npyiter_seq_item(self, iiter);
+            if (a == NULL) {
+                Py_DECREF(ret);
+                return NULL;
+            }
+            PyTuple_SET_ITEM(ret, iiter, a);
+        }
+    }
+
+    return ret;
+}
+
+/*
+ * Getter for 'operands': a tuple holding a new reference to each of the
+ * iterator's operand arrays.
+ */
+static PyObject *npyiter_operands_get(NewNpyArrayIterObject *self)
+{
+    PyObject *ret;
+
+    npy_intp iiter, niter;
+    PyArrayObject **operands;
+
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return NULL;
+    }
+
+    niter = NpyIter_GetNIter(self->iter);
+    operands = self->operands;
+
+    ret = PyTuple_New(niter);
+    if (ret == NULL) {
+        return NULL;
+    }
+    for (iiter = 0; iiter < niter; ++iiter) {
+        PyObject *operand = (PyObject *)operands[iiter];
+
+        /* SET_ITEM steals a reference, so take one first */
+        Py_INCREF(operand);
+        PyTuple_SET_ITEM(ret, iiter, operand);
+    }
+
+    return ret;
+}
+
+/*
+ * Getter for 'itviews': a tuple of array views of each operand as the
+ * iterator traverses it (one view per operand).
+ */
+static PyObject *npyiter_itviews_get(NewNpyArrayIterObject *self)
+{
+    PyObject *ret;
+
+    npy_intp iiter, niter;
+
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return NULL;
+    }
+
+    niter = NpyIter_GetNIter(self->iter);
+
+    ret = PyTuple_New(niter);
+    if (ret == NULL) {
+        return NULL;
+    }
+    for (iiter = 0; iiter < niter; ++iiter) {
+        /* Returns a new reference, stolen by SET_ITEM below */
+        PyArrayObject *view = NpyIter_GetIterView(self->iter, iiter);
+
+        if (view == NULL) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+        PyTuple_SET_ITEM(ret, iiter, (PyObject *)view);
+    }
+
+    return ret;
+}
+
+/*
+ * tp_iternext: Python iteration protocol.  Returns the current value(s)
+ * and advances; returns NULL with no exception set when exhausted or
+ * when the iterator is invalid, terminating the for-loop.
+ */
+static PyObject *
+npyiter_next(NewNpyArrayIterObject *self)
+{
+    if (self->iter == NULL || self->iternext == NULL || self->finished) {
+        return NULL;
+    }
+
+    /*
+     * Use the started flag for the Python iteration protocol to work
+     * when buffering is enabled.
+     */
+    if (self->started) {
+        if (!self->iternext(self->iter)) {
+            self->finished = 1;
+            return NULL;
+        }
+
+        /* If there is nesting, the nested iterators should be reset */
+        if (npyiter_resetbasepointers(self) != NPY_SUCCEED) {
+            return NULL;
+        }
+    }
+    self->started = 1;
+
+    return npyiter_value_get(self);
+}
+/* Removed stray ';' after the function body — an empty declaration at
+ * file scope is not valid strict ISO C90. */
+
+/*
+ * Getter for 'shape': the broadcast shape of the iteration, as a tuple.
+ * Returns NULL (with an exception set by the failing call) if the shape
+ * cannot be retrieved or the tuple cannot be allocated.
+ */
+static PyObject *npyiter_shape_get(NewNpyArrayIterObject *self)
+{
+    PyObject *ret;
+    npy_intp idim, ndim, shape[NPY_MAXDIMS];
+
+    if (self->iter == NULL || self->finished) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is past the end");
+        return NULL;
+    }
+
+    if (NpyIter_GetShape(self->iter, shape) == NPY_SUCCEED) {
+        ndim = NpyIter_GetNDim(self->iter);
+        ret = PyTuple_New(ndim);
+        if (ret != NULL) {
+            for (idim = 0; idim < ndim; ++idim) {
+                PyTuple_SET_ITEM(ret, idim,
+                        PyInt_FromLong(shape[idim]));
+            }
+            return ret;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Getter for 'coords': the current coordinates as a tuple.  Raises
+ * ValueError when the iterator does not track coordinates, when delayed
+ * buffer allocation has not yet been resolved by a reset, or when the
+ * iterator is past the end.
+ */
+static PyObject *npyiter_coords_get(NewNpyArrayIterObject *self)
+{
+    PyObject *ret;
+    npy_intp idim, ndim, coords[NPY_MAXDIMS];
+
+    if (self->iter == NULL || self->finished) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is past the end");
+        return NULL;
+    }
+
+    if (self->getcoords != NULL) {
+        ndim = NpyIter_GetNDim(self->iter);
+        self->getcoords(self->iter, coords);
+        ret = PyTuple_New(ndim);
+        /* Previously missing: a failed allocation would have been
+         * passed straight to PyTuple_SET_ITEM below. */
+        if (ret == NULL) {
+            return NULL;
+        }
+        for (idim = 0; idim < ndim; ++idim) {
+            PyTuple_SET_ITEM(ret, idim,
+                    PyInt_FromLong(coords[idim]));
+        }
+        return ret;
+    }
+    else {
+        /* getcoords is NULL — diagnose why coordinates are unavailable */
+        if (!NpyIter_HasCoords(self->iter)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Iterator does not have coordinates");
+            return NULL;
+        }
+        else if (NpyIter_HasDelayedBufAlloc(self->iter)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Iterator construction used delayed buffer allocation, "
+                    "and no reset has been done yet");
+            return NULL;
+        }
+        else {
+            PyErr_SetString(PyExc_ValueError,
+                    "Iterator is in an invalid state");
+            return NULL;
+        }
+    }
+}
+
+/*
+ * Setter for 'coords': jumps the iterator to the given coordinates
+ * (a sequence of ndim integers) and clears the started/finished flags.
+ * Returns 0 on success, -1 with an exception set on failure.
+ */
+static int npyiter_coords_set(NewNpyArrayIterObject *self, PyObject *value)
+{
+    npy_intp idim, ndim, coords[NPY_MAXDIMS];
+
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return -1;
+    }
+
+    if (value == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot delete coordinates");
+        return -1;
+    }
+
+    if (NpyIter_HasCoords(self->iter)) {
+        ndim = NpyIter_GetNDim(self->iter);
+        if (!PySequence_Check(value)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Coordinates must be set with a sequence");
+            return -1;
+        }
+        if (PySequence_Size(value) != ndim) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Wrong number of coordinates");
+            return -1;
+        }
+        for (idim = 0; idim < ndim; ++idim) {
+            /* PySequence_GetItem returns a new reference which was
+             * previously leaked here, and never checked for NULL. */
+            PyObject *v = PySequence_GetItem(value, idim);
+            if (v == NULL) {
+                return -1;
+            }
+            coords[idim] = PyInt_AsLong(v);
+            Py_DECREF(v);
+            if (coords[idim]==-1 && PyErr_Occurred()) {
+                return -1;
+            }
+        }
+        if (NpyIter_GotoCoords(self->iter, coords) != NPY_SUCCEED) {
+            return -1;
+        }
+        self->started = 0;
+        self->finished = 0;
+
+        /* If there is nesting, the nested iterators should be reset */
+        if (npyiter_resetbasepointers(self) != NPY_SUCCEED) {
+            return -1;
+        }
+
+        return 0;
+    }
+    else {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator does not have coordinates");
+        return -1;
+    }
+}
+
+/*
+ * Getter for 'index': the current C-order or F-order index, available
+ * only when the iterator was constructed with index tracking.
+ */
+static PyObject *npyiter_index_get(NewNpyArrayIterObject *self)
+{
+    if (self->iter == NULL || self->finished) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is past the end");
+        return NULL;
+    }
+
+    if (NpyIter_HasIndex(self->iter)) {
+        npy_intp index = *NpyIter_GetIndexPtr(self->iter);
+        return PyInt_FromLong(index);
+    }
+    else {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator does not have an index");
+        return NULL;
+    }
+}
+
+/*
+ * Setter for 'index': jumps the iterator to the position with the given
+ * index and clears the started/finished flags.  Returns 0 on success,
+ * -1 with an exception set on failure.
+ */
+static int npyiter_index_set(NewNpyArrayIterObject *self, PyObject *value)
+{
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return -1;
+    }
+
+    if (value == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot delete index");
+        return -1;
+    }
+
+    if (NpyIter_HasIndex(self->iter)) {
+        npy_intp index;
+        index = PyInt_AsLong(value);
+        /* -1 may be a legitimate result, so check for a real error */
+        if (index==-1 && PyErr_Occurred()) {
+            return -1;
+        }
+        if (NpyIter_GotoIndex(self->iter, index) != NPY_SUCCEED) {
+            return -1;
+        }
+        self->started = 0;
+        self->finished = 0;
+
+        /* If there is nesting, the nested iterators should be reset */
+        if (npyiter_resetbasepointers(self) != NPY_SUCCEED) {
+            return -1;
+        }
+
+        return 0;
+    }
+    else {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator does not have an index");
+        return -1;
+    }
+}
+
+/*
+ * Getter for 'iterindex': the flat position within the iteration order.
+ */
+static PyObject *npyiter_iterindex_get(NewNpyArrayIterObject *self)
+{
+    if (self->iter != NULL && !self->finished) {
+        return PyInt_FromLong(NpyIter_GetIterIndex(self->iter));
+    }
+
+    PyErr_SetString(PyExc_ValueError,
+            "Iterator is past the end");
+    return NULL;
+}
+
+/*
+ * Setter for 'iterindex': jumps the iterator to the given flat position
+ * and clears the started/finished flags.  Returns 0 on success, -1 with
+ * an exception set on failure.
+ */
+static int npyiter_iterindex_set(NewNpyArrayIterObject *self, PyObject *value)
+{
+    npy_intp iterindex;
+
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return -1;
+    }
+
+    if (value == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot delete iterindex");
+        return -1;
+    }
+
+    iterindex = PyInt_AsLong(value);
+    /* -1 may be a legitimate result, so check for a real error */
+    if (iterindex==-1 && PyErr_Occurred()) {
+        return -1;
+    }
+    if (NpyIter_GotoIterIndex(self->iter, iterindex) != NPY_SUCCEED) {
+        return -1;
+    }
+    self->started = 0;
+    self->finished = 0;
+
+    /* If there is nesting, the nested iterators should be reset */
+    if (npyiter_resetbasepointers(self) != NPY_SUCCEED) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Getter for 'iterrange': the (start, end) iterindex range this
+ * iterator covers, as a 2-tuple.
+ */
+static PyObject *npyiter_iterrange_get(NewNpyArrayIterObject *self)
+{
+    npy_intp istart = 0, iend = 0;
+    PyObject *ret;
+
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return NULL;
+    }
+
+    NpyIter_GetIterIndexRange(self->iter, &istart, &iend);
+
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+
+    /* NOTE(review): PyInt_FromLong failures are not checked here,
+     * matching the rest of this file's tuple-filling style. */
+    PyTuple_SET_ITEM(ret, 0, PyInt_FromLong(istart));
+    PyTuple_SET_ITEM(ret, 1, PyInt_FromLong(iend));
+
+    return ret;
+}
+
+/*
+ * Setter for 'iterrange': restricts the iterator to the (start, end)
+ * iterindex range given as a 2-tuple, resetting it to the range start.
+ * An empty range marks the iterator started and finished immediately.
+ * Returns 0 on success, -1 with an exception set on failure.
+ */
+static int npyiter_iterrange_set(NewNpyArrayIterObject *self, PyObject *value)
+{
+    npy_intp istart = 0, iend = 0;
+
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return -1;
+    }
+
+    if (value == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot delete iterrange");
+        return -1;
+    }
+
+    if (!PyArg_ParseTuple(value, "nn", &istart, &iend)) {
+        return -1;
+    }
+
+    if (NpyIter_ResetToIterIndexRange(self->iter, istart, iend, NULL)
+                                                    != NPY_SUCCEED) {
+        return -1;
+    }
+    if (istart < iend) {
+        self->started = self->finished = 0;
+    }
+    else {
+        /* Empty range: nothing to iterate */
+        self->started = self->finished = 1;
+    }
+
+    /* The reset may make getcoords available for the first time
+     * (e.g. after delayed buffer allocation) */
+    if (self->getcoords == NULL && NpyIter_HasCoords(self->iter)) {
+        self->getcoords = NpyIter_GetGetCoords(self->iter, NULL);
+    }
+
+    /* If there is nesting, the nested iterators should be reset */
+    if (npyiter_resetbasepointers(self) != NPY_SUCCEED) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Getter for 'hasdelayedbufalloc': whether buffer allocation was
+ * deferred until the first reset.
+ */
+static PyObject *npyiter_hasdelayedbufalloc_get(NewNpyArrayIterObject *self)
+{
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return NULL;
+    }
+
+    if (!NpyIter_HasDelayedBufAlloc(self->iter)) {
+        Py_RETURN_FALSE;
+    }
+    Py_RETURN_TRUE;
+}
+
+/*
+ * Getter for 'iterationneedsapi': whether stepping this iterator may
+ * call into the Python API (and thus requires the GIL).
+ */
+static PyObject *npyiter_iterationneedsapi_get(NewNpyArrayIterObject *self)
+{
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return NULL;
+    }
+
+    if (!NpyIter_IterationNeedsAPI(self->iter)) {
+        Py_RETURN_FALSE;
+    }
+    Py_RETURN_TRUE;
+}
+
+/*
+ * Getter for 'hascoords': whether the iterator tracks coordinates.
+ */
+static PyObject *npyiter_hascoords_get(NewNpyArrayIterObject *self)
+{
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return NULL;
+    }
+
+    if (!NpyIter_HasCoords(self->iter)) {
+        Py_RETURN_FALSE;
+    }
+    Py_RETURN_TRUE;
+}
+
+/*
+ * Getter for 'hasindex': whether the iterator tracks a flat index.
+ */
+static PyObject *npyiter_hasindex_get(NewNpyArrayIterObject *self)
+{
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return NULL;
+    }
+
+    if (!NpyIter_HasIndex(self->iter)) {
+        Py_RETURN_FALSE;
+    }
+    Py_RETURN_TRUE;
+}
+
+/*
+ * Getter for 'dtypes': a tuple holding a new reference to the dtype of
+ * each operand as the iterator presents it.
+ */
+static PyObject *npyiter_dtypes_get(NewNpyArrayIterObject *self)
+{
+    PyObject *ret;
+
+    npy_intp iiter, niter;
+    PyArray_Descr **dtypes;
+
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return NULL;
+    }
+
+    niter = NpyIter_GetNIter(self->iter);
+
+    ret = PyTuple_New(niter);
+    if (ret == NULL) {
+        return NULL;
+    }
+    dtypes = self->dtypes;
+    for (iiter = 0; iiter < niter; ++iiter) {
+        PyArray_Descr *dtype = dtypes[iiter];
+
+        /* SET_ITEM steals a reference, so take one first */
+        Py_INCREF(dtype);
+        PyTuple_SET_ITEM(ret, iiter, (PyObject *)dtype);
+    }
+
+    return ret;
+}
+
+/*
+ * Getter for 'ndim': the number of dimensions being iterated.
+ */
+static PyObject *npyiter_ndim_get(NewNpyArrayIterObject *self)
+{
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return NULL;
+    }
+    return PyInt_FromLong((long)NpyIter_GetNDim(self->iter));
+}
+
+/*
+ * Getter for 'niter': the number of operands in the iterator.
+ */
+static PyObject *npyiter_niter_get(NewNpyArrayIterObject *self)
+{
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return NULL;
+    }
+    return PyInt_FromLong((long)NpyIter_GetNIter(self->iter));
+}
+
+/*
+ * Getter for 'itersize': the total number of iteration steps.
+ */
+static PyObject *npyiter_itersize_get(NewNpyArrayIterObject *self)
+{
+    if (self->iter == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is invalid");
+        return NULL;
+    }
+    return PyInt_FromLong((long)NpyIter_GetIterSize(self->iter));
+}
+
+/*
+ * Getter for 'finished': True only when a valid iterator has been
+ * exhausted.  Note the condition is inverted relative to the other
+ * getters: an invalidated iterator (iter == NULL) reports False rather
+ * than raising.
+ */
+static PyObject *npyiter_finished_get(NewNpyArrayIterObject *self)
+{
+    if (self->iter == NULL || !self->finished) {
+        Py_RETURN_FALSE;
+    }
+    else {
+        Py_RETURN_TRUE;
+    }
+}
+
+/*
+ * sq_length: the sequence length of the iterator is its operand count;
+ * an invalidated iterator reports zero.
+ */
+NPY_NO_EXPORT Py_ssize_t
+npyiter_seq_length(NewNpyArrayIterObject *self)
+{
+    return (self->iter == NULL) ? 0 : NpyIter_GetNIter(self->iter);
+}
+
+/*
+ * sq_item: it[i] returns operand i's current element as an array whose
+ * base is the iterator (a 0-d scalar array when there is no exposed
+ * inner loop, a 1-d view of the inner loop otherwise).
+ */
+NPY_NO_EXPORT PyObject *
+npyiter_seq_item(NewNpyArrayIterObject *self, Py_ssize_t i)
+{
+    PyObject *ret;
+
+    npy_intp ret_ndim;
+    npy_intp niter, innerloopsize, innerstride;
+    char *dataptr;
+    PyArray_Descr *dtype;
+
+    if (self->iter == NULL || self->finished) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is past the end");
+        return NULL;
+    }
+
+    if (NpyIter_HasDelayedBufAlloc(self->iter)) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator construction used delayed buffer allocation, "
+                "and no reset has been done yet");
+        return NULL;
+    }
+
+    niter = NpyIter_GetNIter(self->iter);
+    if (i < 0 || i >= niter) {
+        PyErr_Format(PyExc_IndexError,
+                "Iterator operand index %d is out of bounds", (int)i);
+        return NULL;
+    }
+
+#if 0
+    /*
+     * This check is disabled because it prevents things like
+     * np.add(it[0], it[1], it[2]), where it[2] is a write-only
+     * parameter.  When write-only, the value of it[i] is
+     * likely random junk, as if it were allocated with an
+     * np.empty(...) call.
+     */
+    if (!self->readflags[i]) {
+        PyErr_Format(PyExc_RuntimeError,
+                "Iterator operand %d is write-only", (int)i);
+        return NULL;
+    }
+#endif
+
+    dataptr = self->dataptrs[i];
+    dtype = self->dtypes[i];
+
+    if (NpyIter_HasInnerLoop(self->iter)) {
+        innerloopsize = 1;
+        innerstride = 0;
+        /* If the iterator is going over every element, return array scalars */
+        ret_ndim = 0;
+    }
+    else {
+        innerloopsize = *self->innerloopsizeptr;
+        innerstride = self->innerstrides[i];
+        ret_ndim = 1;
+    }
+
+    Py_INCREF(dtype);
+    ret = (PyObject *)PyArray_NewFromDescr(&PyArray_Type, dtype,
+                            ret_ndim, &innerloopsize,
+                            &innerstride, dataptr,
+                            self->writeflags[i] ? NPY_WRITEABLE : 0, NULL);
+    /* Previously missing: a failed allocation was dereferenced when
+     * assigning ->base below. */
+    if (ret == NULL) {
+        return NULL;
+    }
+    /* The iterator keeps the view's memory alive */
+    Py_INCREF(self);
+    ((PyArrayObject *)ret)->base = (PyObject *)self;
+
+    PyArray_UpdateFlags((PyArrayObject *)ret, NPY_UPDATE_ALL);
+
+    return ret;
+}
+
+/*
+ * sq_slice: it[ilow:ihigh] returns a tuple of the selected operands'
+ * current elements, with Python slice clamping semantics.
+ */
+NPY_NO_EXPORT PyObject *
+npyiter_seq_slice(NewNpyArrayIterObject *self,
+                    Py_ssize_t ilow, Py_ssize_t ihigh)
+{
+    PyObject *ret;
+    npy_intp niter;
+    Py_ssize_t i;
+
+    if (self->iter == NULL || self->finished) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is past the end");
+        return NULL;
+    }
+
+    if (NpyIter_HasDelayedBufAlloc(self->iter)) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator construction used delayed buffer allocation, "
+                "and no reset has been done yet");
+        return NULL;
+    }
+
+    niter = NpyIter_GetNIter(self->iter);
+    if (ilow < 0) {
+        ilow = 0;
+    }
+    else if (ilow > niter) {
+        /* Clamp to niter, not niter-1: a fully out-of-range slice must
+         * produce an empty tuple, matching Python slice semantics.  The
+         * old clamp returned the last element instead. */
+        ilow = niter;
+    }
+    if (ihigh < ilow) {
+        ihigh = ilow;
+    }
+    else if (ihigh > niter) {
+        ihigh = niter;
+    }
+
+    ret = PyTuple_New(ihigh-ilow);
+    if (ret == NULL) {
+        return NULL;
+    }
+    for (i = ilow; i < ihigh ; ++i) {
+        PyObject *item = npyiter_seq_item(self, i);
+        if (item == NULL) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+        PyTuple_SET_ITEM(ret, i-ilow, item);
+    }
+    return ret;
+}
+
+/*
+ * sq_ass_item: it[i] = v assigns v into operand i's current element(s)
+ * by wrapping the element memory in a temporary 1-d array and copying
+ * via PyArray_CopyObject.  Returns 0 on success, -1 on error.
+ */
+NPY_NO_EXPORT int
+npyiter_seq_ass_item(NewNpyArrayIterObject *self, Py_ssize_t i, PyObject *v)
+{
+
+    npy_intp niter, innerloopsize, innerstride;
+    char *dataptr;
+    PyArray_Descr *dtype;
+    PyArrayObject *tmp;
+    int ret;
+
+    if (v == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "can't delete iterator operands");
+        return -1;
+    }
+
+    if (self->iter == NULL || self->finished) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator is past the end");
+        return -1;
+    }
+
+    if (NpyIter_HasDelayedBufAlloc(self->iter)) {
+        PyErr_SetString(PyExc_ValueError,
+                "Iterator construction used delayed buffer allocation, "
+                "and no reset has been done yet");
+        return -1;
+    }
+
+    niter = NpyIter_GetNIter(self->iter);
+    if (i < 0 || i >= niter) {
+        PyErr_Format(PyExc_IndexError,
+                "Iterator operand index %d is out of bounds", (int)i);
+        return -1;
+    }
+    if (!self->writeflags[i]) {
+        PyErr_Format(PyExc_RuntimeError,
+                "Iterator operand %d is not writeable", (int)i);
+        return -1;
+    }
+
+    dataptr = self->dataptrs[i];
+    dtype = self->dtypes[i];
+
+    /* A single element when there is an inner loop, else the whole loop */
+    if (NpyIter_HasInnerLoop(self->iter)) {
+        innerloopsize = 1;
+        innerstride = 0;
+    }
+    else {
+        innerloopsize = *self->innerloopsizeptr;
+        innerstride = self->innerstrides[i];
+    }
+
+    /* TODO - there should be a better way than this... */
+    Py_INCREF(dtype);
+    tmp = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, dtype,
+                                1, &innerloopsize,
+                                &innerstride, dataptr,
+                                NPY_WRITEABLE, NULL);
+    if (tmp == NULL) {
+        return -1;
+    }
+    PyArray_UpdateFlags(tmp, NPY_UPDATE_ALL);
+    ret = PyArray_CopyObject(tmp, v);
+    Py_DECREF(tmp);
+    return ret;
+}
+
+/* Method table for the Python iterator wrapper (no docstrings yet). */
+static PyMethodDef npyiter_methods[] = {
+    {"reset", (PyCFunction)npyiter_reset, METH_NOARGS, NULL},
+    {"copy", (PyCFunction)npyiter_copy, METH_NOARGS, NULL},
+    {"__copy__", (PyCFunction)npyiter_copy, METH_NOARGS, NULL},
+    {"iternext", (PyCFunction)npyiter_iternext, METH_NOARGS, NULL},
+    {"remove_axis", (PyCFunction)npyiter_remove_axis, METH_VARARGS, NULL},
+    {"remove_coords", (PyCFunction)npyiter_remove_coords, METH_NOARGS, NULL},
+    {"remove_inner_loop", (PyCFunction)npyiter_remove_inner_loop,
+                METH_NOARGS, NULL},
+    {"debug_print", (PyCFunction)npyiter_debug_print, METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL},
+};
+
+/* No directly-exposed struct members; all attributes go through getsets. */
+static PyMemberDef npyiter_members[] = {
+    {NULL, 0, 0, 0, NULL},
+};
+
+/*
+ * Attribute table.  coords/index/iterindex/iterrange are writable
+ * (assignment repositions the iterator); the rest are read-only.
+ */
+static PyGetSetDef npyiter_getsets[] = {
+    {"value",
+        (getter)npyiter_value_get,
+        NULL, NULL, NULL},
+    {"shape",
+        (getter)npyiter_shape_get,
+        NULL, NULL, NULL},
+    {"coords",
+        (getter)npyiter_coords_get,
+        (setter)npyiter_coords_set,
+        NULL, NULL},
+    {"index",
+        (getter)npyiter_index_get,
+        (setter)npyiter_index_set,
+        NULL, NULL},
+    {"iterindex",
+        (getter)npyiter_iterindex_get,
+        (setter)npyiter_iterindex_set,
+        NULL, NULL},
+    {"iterrange",
+        (getter)npyiter_iterrange_get,
+        (setter)npyiter_iterrange_set,
+        NULL, NULL},
+    {"operands",
+        (getter)npyiter_operands_get,
+        NULL, NULL, NULL},
+    {"itviews",
+        (getter)npyiter_itviews_get,
+        NULL, NULL, NULL},
+    {"hasdelayedbufalloc",
+        (getter)npyiter_hasdelayedbufalloc_get,
+        NULL, NULL, NULL},
+    {"iterationneedsapi",
+        (getter)npyiter_iterationneedsapi_get,
+        NULL, NULL, NULL},
+    {"hascoords",
+        (getter)npyiter_hascoords_get,
+        NULL, NULL, NULL},
+    {"hasindex",
+        (getter)npyiter_hasindex_get,
+        NULL, NULL, NULL},
+    {"dtypes",
+        (getter)npyiter_dtypes_get,
+        NULL, NULL, NULL},
+    {"ndim",
+        (getter)npyiter_ndim_get,
+        NULL, NULL, NULL},
+    {"niter",
+        (getter)npyiter_niter_get,
+        NULL, NULL, NULL},
+    {"itersize",
+        (getter)npyiter_itersize_get,
+        NULL, NULL, NULL},
+    {"finished",
+        (getter)npyiter_finished_get,
+        NULL, NULL, NULL},
+
+    {NULL, NULL, NULL, NULL, NULL},
+};
+
+/*
+ * Sequence protocol: indexing/slicing an iterator reads its operands'
+ * current elements; item assignment writes them.  The two branches
+ * differ only in the pre-2.5 slot function typedef names.
+ */
+NPY_NO_EXPORT PySequenceMethods npyiter_as_sequence = {
+#if PY_VERSION_HEX >= 0x02050000
+    (lenfunc)npyiter_seq_length,            /*sq_length*/
+    (binaryfunc)NULL,                       /*sq_concat*/
+    (ssizeargfunc)NULL,                     /*sq_repeat*/
+    (ssizeargfunc)npyiter_seq_item,         /*sq_item*/
+    (ssizessizeargfunc)npyiter_seq_slice,   /*sq_slice*/
+    (ssizeobjargproc)npyiter_seq_ass_item,  /*sq_ass_item*/
+    (ssizessizeobjargproc)NULL,             /*sq_ass_slice*/
+    (objobjproc)NULL,                       /*sq_contains */
+    (binaryfunc)NULL,                       /*sq_inplace_concat */
+    (ssizeargfunc)NULL,                     /*sq_inplace_repeat */
+#else
+    (inquiry)npyiter_seq_length,            /*sq_length*/
+    (binaryfunc)NULL,                       /*sq_concat is handled by nb_add*/
+    (intargfunc)NULL,                       /*sq_repeat is handled nb_multiply*/
+    (intargfunc)npyiter_seq_item,           /*sq_item*/
+    (intintargfunc)npyiter_seq_slice,       /*sq_slice*/
+    (intobjargproc)npyiter_seq_ass_item,    /*sq_ass_item*/
+    (intintobjargproc)NULL,                 /*sq_ass_slice*/
+    (objobjproc)NULL,                       /*sq_contains */
+    (binaryfunc)NULL,                       /*sg_inplace_concat */
+    (intargfunc)NULL                        /*sg_inplace_repeat */
+#endif
+};
+
+/*
+ * Type object for numpy.newiter.  Supports the sequence protocol (operand
+ * access), the iteration protocol via tp_iternext, and construction via
+ * tp_new/tp_init.  tp_iter is 0: iter() falls back to the sequence
+ * protocol behavior provided by the default machinery.
+ */
+NPY_NO_EXPORT PyTypeObject NpyIter_Type = {
+#if defined(NPY_PY3K)
+    PyVarObject_HEAD_INIT(NULL, 0)
+#else
+    PyObject_HEAD_INIT(NULL)
+    0,                                          /* ob_size */
+#endif
+    "numpy.newiter",                            /* tp_name */
+    sizeof(NewNpyArrayIterObject),              /* tp_basicsize */
+    0,                                          /* tp_itemsize */
+    /* methods */
+    (destructor)npyiter_dealloc,                /* tp_dealloc */
+    0,                                          /* tp_print */
+    0,                                          /* tp_getattr */
+    0,                                          /* tp_setattr */
+#if defined(NPY_PY3K)
+    0,                                          /* tp_reserved */
+#else
+    0,                                          /* tp_compare */
+#endif
+    0,                                          /* tp_repr */
+    0,                                          /* tp_as_number */
+    &npyiter_as_sequence,                       /* tp_as_sequence */
+    0,                                          /* tp_as_mapping */
+    0,                                          /* tp_hash */
+    0,                                          /* tp_call */
+    0,                                          /* tp_str */
+    0,                                          /* tp_getattro */
+    0,                                          /* tp_setattro */
+    0,                                          /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT,                         /* tp_flags */
+    0,                                          /* tp_doc */
+    0,                                          /* tp_traverse */
+    0,                                          /* tp_clear */
+    0,                                          /* tp_richcompare */
+    0,                                          /* tp_weaklistoffset */
+    0,                                          /* tp_iter */
+    (iternextfunc)npyiter_next,                 /* tp_iternext */
+    npyiter_methods,                            /* tp_methods */
+    npyiter_members,                            /* tp_members */
+    npyiter_getsets,                            /* tp_getset */
+    0,                                          /* tp_base */
+    0,                                          /* tp_dict */
+    0,                                          /* tp_descr_get */
+    0,                                          /* tp_descr_set */
+    0,                                          /* tp_dictoffset */
+    (initproc)npyiter_init,                     /* tp_init */
+    0,                                          /* tp_alloc */
+    npyiter_new,                                /* tp_new */
+    0,                                          /* tp_free */
+    0,                                          /* tp_is_gc */
+    0,                                          /* tp_bases */
+    0,                                          /* tp_mro */
+    0,                                          /* tp_cache */
+    0,                                          /* tp_subclasses */
+    0,                                          /* tp_weaklist */
+    0,                                          /* tp_del */
+#if PY_VERSION_HEX >= 0x02060000
+    0,                                          /* tp_version_tag */
+#endif
+};
+
diff --git a/numpy/core/src/multiarray/new_iterator_pywrap.h b/numpy/core/src/multiarray/new_iterator_pywrap.h
new file mode 100644
index 000000000..35e322541
--- /dev/null
+++ b/numpy/core/src/multiarray/new_iterator_pywrap.h
@@ -0,0 +1,8 @@
+#ifndef __NEW_ITERATOR_PYWRAP_H
+#define __NEW_ITERATOR_PYWRAP_H
+
+NPY_NO_EXPORT PyObject *
+NpyIter_NestedIters(PyObject *NPY_UNUSED(self),
+ PyObject *args, PyObject *kwds);
+
+#endif
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index c66337602..eff71d12d 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -3277,32 +3277,69 @@ NPY_NO_EXPORT PyTypeObject Py@NAME@ArrType_Type = {
*/
NPY_NO_EXPORT char
_npy_scalar_kinds_table[NPY_NTYPES];
+
+/*
+ * This table maps a scalar kind (excluding NPY_NOSCALAR)
+ * to the smallest type number of that kind.
+ */
+NPY_NO_EXPORT char
+_npy_smallest_type_of_kind_table[NPY_NSCALARKINDS];
+
+/*
+ * This table gives the type of the same kind, but next in the sequence
+ * of sizes.
+ */
+NPY_NO_EXPORT char
+_npy_next_larger_type_table[NPY_NTYPES];
+
/*
* This table describes safe casting for small type numbers,
* and is used by PyArray_CanCastSafely.
*/
NPY_NO_EXPORT unsigned char
_npy_can_cast_safely_table[NPY_NTYPES][NPY_NTYPES];
+
+/*
+ * This table gives the smallest-size and smallest-kind type to which
+ * the input types may be safely cast, according to _npy_can_cast_safely.
+ */
+NPY_NO_EXPORT char
+_npy_type_promotion_table[NPY_NTYPES][NPY_NTYPES];
#endif
NPY_NO_EXPORT void
initialize_casting_tables(void)
{
- int i;
+ int i, j;
+
+ _npy_smallest_type_of_kind_table[NPY_BOOL_SCALAR] = NPY_BOOL;
+ _npy_smallest_type_of_kind_table[NPY_INTPOS_SCALAR] = NPY_UBYTE;
+ _npy_smallest_type_of_kind_table[NPY_INTNEG_SCALAR] = NPY_BYTE;
+ _npy_smallest_type_of_kind_table[NPY_FLOAT_SCALAR] = NPY_HALF;
+ _npy_smallest_type_of_kind_table[NPY_COMPLEX_SCALAR] = NPY_CFLOAT;
+ _npy_smallest_type_of_kind_table[NPY_OBJECT_SCALAR] = NPY_OBJECT;
/* Default for built-in types is object scalar */
memset(_npy_scalar_kinds_table, PyArray_OBJECT_SCALAR,
sizeof(_npy_scalar_kinds_table));
+ /* Default for next largest type is -1, signalling no bigger */
+ memset(_npy_next_larger_type_table, -1,
+ sizeof(_npy_next_larger_type_table));
/* Compile-time loop of scalar kinds */
/**begin repeat
* #NAME = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
* LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE, LONGDOUBLE,
* CFLOAT, CDOUBLE, CLONGDOUBLE#
+ * #BIGGERTYPE = -1, NPY_SHORT, NPY_USHORT, NPY_INT, NPY_UINT,
+ * NPY_LONG, NPY_ULONG, NPY_LONGLONG, NPY_ULONGLONG,
+ * -1, -1, NPY_FLOAT, NPY_DOUBLE, NPY_LONGDOUBLE, -1,
+ * NPY_CDOUBLE, NPY_CLONGDOUBLE, -1#
* #SCKIND = BOOL, (INTNEG, INTPOS)*5, FLOAT*4,
* COMPLEX*3#
*/
- _npy_scalar_kinds_table[PyArray_@NAME@] = PyArray_@SCKIND@_SCALAR;
+ _npy_scalar_kinds_table[NPY_@NAME@] = PyArray_@SCKIND@_SCALAR;
+ _npy_next_larger_type_table[NPY_@NAME@] = @BIGGERTYPE@;
/**end repeat**/
memset(_npy_can_cast_safely_table, 0, sizeof(_npy_can_cast_safely_table));
@@ -3311,17 +3348,17 @@ initialize_casting_tables(void)
/* Identity */
_npy_can_cast_safely_table[i][i] = 1;
/* Bool -> <Anything> */
- _npy_can_cast_safely_table[PyArray_BOOL][i] = 1;
+ _npy_can_cast_safely_table[NPY_BOOL][i] = 1;
/* DateTime sits out for these... */
if (i != PyArray_DATETIME && i != PyArray_TIMEDELTA) {
/* <Anything> -> Object */
- _npy_can_cast_safely_table[i][PyArray_OBJECT] = 1;
+ _npy_can_cast_safely_table[i][NPY_OBJECT] = 1;
/* <Anything> -> Void */
- _npy_can_cast_safely_table[i][PyArray_VOID] = 1;
+ _npy_can_cast_safely_table[i][NPY_VOID] = 1;
}
}
- _npy_can_cast_safely_table[PyArray_STRING][PyArray_UNICODE] = 1;
+ _npy_can_cast_safely_table[NPY_STRING][NPY_UNICODE] = 1;
#ifndef NPY_SIZEOF_BYTE
#define NPY_SIZEOF_BYTE 1
@@ -3349,7 +3386,7 @@ initialize_casting_tables(void)
* 1, 1, 1#
*/
#define _FROM_BSIZE NPY_SIZEOF_@FROM_BASENAME@
-#define _FROM_NUM (PyArray_@FROM_NAME@)
+#define _FROM_NUM (NPY_@FROM_NAME@)
_npy_can_cast_safely_table[_FROM_NUM][PyArray_STRING] = 1;
_npy_can_cast_safely_table[_FROM_NUM][PyArray_UNICODE] = 1;
@@ -3375,7 +3412,7 @@ initialize_casting_tables(void)
* 1, 1, 1#
*/
#define _TO_BSIZE NPY_SIZEOF_@TO_BASENAME@
-#define _TO_NUM (PyArray_@TO_NAME@)
+#define _TO_NUM (NPY_@TO_NAME@)
/*
* NOTE: _FROM_BSIZE and _TO_BSIZE are the sizes of the "base type"
@@ -3454,6 +3491,89 @@ initialize_casting_tables(void)
/**end repeat**/
+ /*
+ * Now that the _can_cast_safely table is finished, we can
+ * use it to build the _type_promotion table
+ */
+ for (i = 0; i < NPY_NTYPES; ++i) {
+ _npy_type_promotion_table[i][i] = i;
+ /* Don't let number promote to string/unicode/void */
+ if (i == NPY_STRING || i == NPY_UNICODE || i == NPY_VOID) {
+ /* Promoting these types requires examining their contents */
+ _npy_type_promotion_table[i][i] = -1;
+ for (j = i+1; j < NPY_NTYPES; ++j) {
+ _npy_type_promotion_table[i][j] = -1;
+ _npy_type_promotion_table[j][i] = -1;
+ }
+ /* Except they can convert to OBJECT */
+ _npy_type_promotion_table[i][NPY_OBJECT] = NPY_OBJECT;
+ _npy_type_promotion_table[NPY_OBJECT][i] = NPY_OBJECT;
+ }
+ else {
+ for (j = i+1; j < NPY_NTYPES; ++j) {
+ /* Don't let number promote to string/unicode/void */
+ if (j == NPY_STRING || j == NPY_UNICODE || j == NPY_VOID) {
+ _npy_type_promotion_table[i][j] = -1;
+ _npy_type_promotion_table[j][i] = -1;
+ }
+ else if (_npy_can_cast_safely_table[i][j]) {
+ _npy_type_promotion_table[i][j] = j;
+ _npy_type_promotion_table[j][i] = j;
+ }
+ else if (_npy_can_cast_safely_table[j][i]) {
+ _npy_type_promotion_table[i][j] = i;
+ _npy_type_promotion_table[j][i] = i;
+ }
+ else {
+ int k, iskind, jskind, skind;
+ iskind = _npy_scalar_kinds_table[i];
+ jskind = _npy_scalar_kinds_table[j];
+ /* If there's no kind (void/string/etc) */
+ if (iskind == NPY_NOSCALAR || jskind == NPY_NOSCALAR) {
+ k = -1;
+ }
+ else {
+ /* Start with the type of larger kind */
+ if (iskind > jskind) {
+ skind = iskind;
+ k = i;
+ }
+ else {
+ skind = jskind;
+ k = j;
+ }
+ for (;;) {
+ /* Try the next larger type of this kind */
+ k = _npy_next_larger_type_table[k];
+
+ /* If there is no larger, try a larger kind */
+ if (k < 0) {
+ ++skind;
+ /* Use -1 to signal no promoted type found */
+ if (skind < NPY_NSCALARKINDS) {
+ k = _npy_smallest_type_of_kind_table[skind];
+ }
+ else {
+ k = -1;
+ break;
+ }
+ }
+
+ if (_npy_can_cast_safely_table[i][k] &&
+ _npy_can_cast_safely_table[j][k]) {
+ break;
+ }
+ }
+ }
+ _npy_type_promotion_table[i][j] = k;
+ _npy_type_promotion_table[j][i] = k;
+ }
+ }
+ }
+ }
+ /* Special case date-time */
+ _npy_type_promotion_table[NPY_DATETIME][NPY_TIMEDELTA] = NPY_DATETIME;
+ _npy_type_promotion_table[NPY_TIMEDELTA][NPY_DATETIME] = NPY_DATETIME;
}
@@ -3634,19 +3754,19 @@ static PyTypeObject *typeobjects[] = {
&PyULongArrType_Type,
&PyLongLongArrType_Type,
&PyULongLongArrType_Type,
- &PyHalfArrType_Type,
&PyFloatArrType_Type,
&PyDoubleArrType_Type,
&PyLongDoubleArrType_Type,
&PyCFloatArrType_Type,
&PyCDoubleArrType_Type,
&PyCLongDoubleArrType_Type,
- &PyDatetimeArrType_Type,
- &PyTimedeltaArrType_Type,
&PyObjectArrType_Type,
&PyStringArrType_Type,
&PyUnicodeArrType_Type,
- &PyVoidArrType_Type
+ &PyVoidArrType_Type,
+ &PyDatetimeArrType_Type,
+ &PyTimedeltaArrType_Type,
+ &PyHalfArrType_Type
};
NPY_NO_EXPORT int
diff --git a/numpy/core/src/multiarray/scalartypes.h b/numpy/core/src/multiarray/scalartypes.h
index c60f61dfb..53850947a 100644
--- a/numpy/core/src/multiarray/scalartypes.h
+++ b/numpy/core/src/multiarray/scalartypes.h
@@ -7,11 +7,23 @@ extern NPY_NO_EXPORT unsigned char
_npy_can_cast_safely_table[NPY_NTYPES][NPY_NTYPES];
extern NPY_NO_EXPORT char
_npy_scalar_kinds_table[NPY_NTYPES];
+extern NPY_NO_EXPORT char
+_npy_type_promotion_table[NPY_NTYPES][NPY_NTYPES];
+extern NPY_NO_EXPORT char
+_npy_smallest_type_of_kind_table[NPY_NSCALARKINDS];
+extern NPY_NO_EXPORT char
+_npy_next_larger_type_table[NPY_NTYPES];
#else
NPY_NO_EXPORT unsigned char
_npy_can_cast_safely_table[NPY_NTYPES][NPY_NTYPES];
NPY_NO_EXPORT char
_npy_scalar_kinds_table[NPY_NTYPES];
+NPY_NO_EXPORT char
+_npy_type_promotion_table[NPY_NTYPES][NPY_NTYPES];
+NPY_NO_EXPORT char
+_npy_smallest_type_of_kind_table[NPY_NSCALARKINDS];
+NPY_NO_EXPORT char
+_npy_next_larger_type_table[NPY_NTYPES];
#endif
NPY_NO_EXPORT void
diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c
index 671dc1538..f42860f78 100644
--- a/numpy/core/src/multiarray/shape.c
+++ b/numpy/core/src/multiarray/shape.c
@@ -174,7 +174,7 @@ PyArray_Resize(PyArrayObject *self, PyArray_Dims *newshape, int refcheck,
*/
NPY_NO_EXPORT PyObject *
PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
- NPY_ORDER fortran)
+ NPY_ORDER order)
{
intp i;
intp *dimensions = newdims->ptr;
@@ -185,8 +185,8 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
intp newstrides[MAX_DIMS];
int flags;
- if (fortran == PyArray_ANYORDER) {
- fortran = PyArray_ISFORTRAN(self);
+ if (order == PyArray_ANYORDER) {
+ order = PyArray_ISFORTRAN(self);
}
/* Quick check to make sure anything actually needs to be done */
if (n == self->nd) {
@@ -233,12 +233,12 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
*/
if (!(PyArray_ISONESEGMENT(self)) ||
(((PyArray_CHKFLAGS(self, NPY_CONTIGUOUS) &&
- fortran == NPY_FORTRANORDER) ||
+ order == NPY_FORTRANORDER) ||
(PyArray_CHKFLAGS(self, NPY_FORTRAN) &&
- fortran == NPY_CORDER)) && (self->nd > 1))) {
+ order == NPY_CORDER)) && (self->nd > 1))) {
int success = 0;
success = _attempt_nocopy_reshape(self,n,dimensions,
- newstrides,fortran);
+ newstrides,order);
if (success) {
/* no need to copy the array after all */
strides = newstrides;
@@ -246,7 +246,7 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
}
else {
PyObject *new;
- new = PyArray_NewCopy(self, fortran);
+ new = PyArray_NewCopy(self, order);
if (new == NULL) {
return NULL;
}
@@ -260,7 +260,7 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
/* Make sure the flags argument is set. */
if (n > 1) {
- if (fortran == NPY_FORTRANORDER) {
+ if (order == NPY_FORTRANORDER) {
flags &= ~NPY_CONTIGUOUS;
flags |= NPY_FORTRAN;
}
@@ -275,7 +275,7 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
* replace any 0-valued strides with
* appropriate value to preserve contiguousness
*/
- if (fortran == PyArray_FORTRANORDER) {
+ if (order == NPY_FORTRANORDER) {
if (strides[0] == 0) {
strides[0] = self->descr->elsize;
}
@@ -760,29 +760,137 @@ PyArray_Transpose(PyArrayObject *ap, PyArray_Dims *permute)
return (PyObject *)ret;
}
+/*
+ * Sorts items so stride is descending, because C-order
+ * is the default in the face of ambiguity.
+ */
+int _npy_stride_sort_item_comparator(const void *a, const void *b)
+{
+ npy_intp astride = ((_npy_stride_sort_item *)a)->stride,
+ bstride = ((_npy_stride_sort_item *)b)->stride;
+
+ /* Sort the absolute value of the strides */
+ if (astride < 0) {
+ astride = -astride;
+ }
+ if (bstride < 0) {
+ bstride = -bstride;
+ }
+
+ if (astride > bstride) {
+ return -1;
+ }
+ else if (astride == bstride) {
+ /*
+ * Make the qsort stable by next comparing the perm order.
+ * (Note that two perm entries will never be equal)
+ */
+ npy_intp aperm = ((_npy_stride_sort_item *)a)->perm,
+ bperm = ((_npy_stride_sort_item *)b)->perm;
+ return (aperm < bperm) ? -1 : 1;
+ }
+ else {
+ return 1;
+ }
+}
+
+/*
+ * This function populates the first PyArray_NDIM(arr) elements
+ * of strideperm with sorted descending by their absolute values.
+ * For example, the stride array (4, -2, 12) becomes
+ * [(2, 12), (0, 4), (1, -2)].
+ */
+NPY_NO_EXPORT void
+PyArray_CreateSortedStridePerm(PyArrayObject *arr,
+ _npy_stride_sort_item *strideperm)
+{
+ int i, ndim = PyArray_NDIM(arr);
+
+ /* Set up the strideperm values */
+ for (i = 0; i < ndim; ++i) {
+ strideperm[i].perm = i;
+ strideperm[i].stride = PyArray_STRIDE(arr, i);
+ }
+
+ /* Sort them */
+ qsort(strideperm, ndim, sizeof(_npy_stride_sort_item),
+ &_npy_stride_sort_item_comparator);
+}
+
/*NUMPY_API
* Ravel
* Returns a contiguous array
*/
NPY_NO_EXPORT PyObject *
-PyArray_Ravel(PyArrayObject *a, NPY_ORDER fortran)
+PyArray_Ravel(PyArrayObject *a, NPY_ORDER order)
{
PyArray_Dims newdim = {NULL,1};
intp val[1] = {-1};
- if (fortran == PyArray_ANYORDER) {
- fortran = PyArray_ISFORTRAN(a);
- }
newdim.ptr = val;
- if (!fortran && PyArray_ISCONTIGUOUS(a)) {
- return PyArray_Newshape(a, &newdim, PyArray_CORDER);
+
+ if (order == NPY_ANYORDER) {
+ order = PyArray_ISFORTRAN(a) ? NPY_FORTRANORDER : NPY_CORDER;
+ }
+ else if (order == NPY_KEEPORDER) {
+ if (PyArray_IS_C_CONTIGUOUS(a)) {
+ order = NPY_CORDER;
+ }
+ else if (PyArray_IS_F_CONTIGUOUS(a)) {
+ order = NPY_FORTRANORDER;
+ }
}
- else if (fortran && PyArray_ISFORTRAN(a)) {
- return PyArray_Newshape(a, &newdim, PyArray_FORTRANORDER);
+
+ if (order == NPY_CORDER && PyArray_ISCONTIGUOUS(a)) {
+ return PyArray_Newshape(a, &newdim, NPY_CORDER);
}
- else {
- return PyArray_Flatten(a, fortran);
+ else if (order == NPY_FORTRANORDER && PyArray_ISFORTRAN(a)) {
+ return PyArray_Newshape(a, &newdim, NPY_FORTRANORDER);
}
+ /* For KEEPORDER, check if we can make a flattened view */
+ else if (order == NPY_KEEPORDER) {
+ _npy_stride_sort_item strideperm[NPY_MAXDIMS];
+ npy_intp stride;
+ int i, ndim = PyArray_NDIM(a);
+
+ PyArray_CreateSortedStridePerm(a, strideperm);
+
+ stride = PyArray_DESCR(a)->elsize;
+ for (i = ndim-1; i >= 0; --i) {
+ if (strideperm[i].stride != stride) {
+ break;
+ }
+ stride *= PyArray_DIM(a, strideperm[i].perm);
+ }
+
+ /* If all the strides matched a contiguous layout, return a view */
+ if (i < 0) {
+ PyObject *ret;
+ npy_intp stride = PyArray_DESCR(a)->elsize;
+
+ val[0] = PyArray_SIZE(a);
+
+ Py_INCREF(PyArray_DESCR(a));
+ ret = PyArray_NewFromDescr(Py_TYPE(a),
+ PyArray_DESCR(a),
+ 1, val,
+ &stride,
+ PyArray_BYTES(a),
+ PyArray_FLAGS(a),
+ (PyObject *)a);
+
+ if (ret != NULL) {
+ PyArray_UpdateFlags((PyArrayObject *)ret,
+ NPY_CONTIGUOUS|NPY_FORTRAN);
+ Py_INCREF(a);
+ PyArray_BASE(ret) = (PyObject *)a;
+ }
+ return ret;
+ }
+
+ }
+
+ return PyArray_Flatten(a, order);
}
/*NUMPY_API
@@ -791,15 +899,16 @@ PyArray_Ravel(PyArrayObject *a, NPY_ORDER fortran)
NPY_NO_EXPORT PyObject *
PyArray_Flatten(PyArrayObject *a, NPY_ORDER order)
{
- PyObject *ret;
+ PyArrayObject *ret;
intp size;
- if (order == PyArray_ANYORDER) {
- order = PyArray_ISFORTRAN(a);
+ if (order == NPY_ANYORDER) {
+ order = PyArray_ISFORTRAN(a) ? NPY_FORTRANORDER : NPY_CORDER;
}
+
size = PyArray_SIZE(a);
Py_INCREF(a->descr);
- ret = PyArray_NewFromDescr(Py_TYPE(a),
+ ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(a),
a->descr,
1, &size,
NULL,
@@ -809,11 +918,11 @@ PyArray_Flatten(PyArrayObject *a, NPY_ORDER order)
if (ret == NULL) {
return NULL;
}
- if (_flat_copyinto(ret, (PyObject *)a, order) < 0) {
+ if (PyArray_CopyAnyIntoOrdered(ret, a, order) < 0) {
Py_DECREF(ret);
return NULL;
}
- return ret;
+ return (PyObject *)ret;
}
diff --git a/numpy/core/src/multiarray/shape.h b/numpy/core/src/multiarray/shape.h
index 1a5991a50..8038a9f25 100644
--- a/numpy/core/src/multiarray/shape.h
+++ b/numpy/core/src/multiarray/shape.h
@@ -1,4 +1,18 @@
#ifndef _NPY_ARRAY_SHAPE_H_
#define _NPY_ARRAY_SHAPE_H_
+typedef struct {
+ npy_intp perm, stride;
+} _npy_stride_sort_item;
+
+/*
+ * This function populates the first PyArray_NDIM(arr) elements
+ * of strideperm with sorted descending by their absolute values.
+ * For example, the stride array (4, -2, 12) becomes
+ * [(2, 12), (0, 4), (1, -2)].
+ */
+NPY_NO_EXPORT void
+PyArray_CreateSortedStridePerm(PyArrayObject *arr,
+ _npy_stride_sort_item *strideperm);
+
#endif
diff --git a/numpy/core/src/multiarray/usertypes.c b/numpy/core/src/multiarray/usertypes.c
index 594722695..618ffb776 100644
--- a/numpy/core/src/multiarray/usertypes.c
+++ b/numpy/core/src/multiarray/usertypes.c
@@ -94,7 +94,7 @@ PyArray_InitArrFuncs(PyArray_ArrFuncs *f)
{
int i;
- for(i = 0; i < PyArray_NTYPES; i++) {
+ for(i = 0; i < NPY_NTYPES_ABI_COMPATIBLE; i++) {
f->cast[i] = NULL;
}
f->getitem = NULL;
@@ -188,11 +188,11 @@ PyArray_RegisterCastFunc(PyArray_Descr *descr, int totype,
PyObject *cobj, *key;
int ret;
- if (totype < PyArray_NTYPES) {
+ if (totype < NPY_NTYPES_ABI_COMPATIBLE) {
descr->f->cast[totype] = castfunc;
return 0;
}
- if (!PyTypeNum_ISUSERDEF(totype)) {
+ if (totype >= NPY_NTYPES && !PyTypeNum_ISUSERDEF(totype)) {
PyErr_SetString(PyExc_TypeError, "invalid type number.");
return -1;
}
diff --git a/numpy/core/src/npymath/halffloat.c b/numpy/core/src/npymath/halffloat.c
index 79f77197b..38f52b86d 100644
--- a/numpy/core/src/npymath/halffloat.c
+++ b/numpy/core/src/npymath/halffloat.c
@@ -352,14 +352,14 @@ npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d)
npy_uint64 d_exp, d_sig;
npy_uint16 h_sgn, h_exp, h_sig;
- h_sgn = (d&0x8000000000000000u) >> 48;
- d_exp = (d&0x7ff0000000000000u);
+ h_sgn = (d&0x8000000000000000ULL) >> 48;
+ d_exp = (d&0x7ff0000000000000ULL);
/* Exponent overflow/NaN converts to signed inf/NaN */
- if (d_exp >= 0x40f0000000000000u) {
- if (d_exp == 0x7ff0000000000000u) {
+ if (d_exp >= 0x40f0000000000000ULL) {
+ if (d_exp == 0x7ff0000000000000ULL) {
/* Inf or NaN */
- d_sig = (d&0x000fffffffffffffu);
+ d_sig = (d&0x000fffffffffffffULL);
if (d_sig != 0) {
/* NaN - propagate the flag in the significand... */
npy_uint16 ret = (npy_uint16) (0x7c00u + (d_sig >> 42));
@@ -382,15 +382,15 @@ npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d)
}
/* Exponent underflow converts to subnormal half or signed zero */
- if (d_exp <= 0x3f00000000000000u) {
+ if (d_exp <= 0x3f00000000000000ULL) {
/*
* Signed zeros, subnormal floats, and floats with small
* exponents all convert to signed zero halfs.
*/
- if (d_exp < 0x3e60000000000000u) {
+ if (d_exp < 0x3e60000000000000ULL) {
#if NPY_HALF_GENERATE_UNDERFLOW
/* If d != 0, it underflowed to 0 */
- if ((d&0x7fffffffffffffffu) != 0) {
+ if ((d&0x7fffffffffffffffULL) != 0) {
npy_set_floatstatus_underflow();
}
#endif
@@ -398,7 +398,7 @@ npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d)
}
/* Make the subnormal significand */
d_exp >>= 52;
- d_sig = (0x0010000000000000u + (d&0x000fffffffffffffu));
+ d_sig = (0x0010000000000000ULL + (d&0x000fffffffffffffULL));
#if NPY_HALF_GENERATE_UNDERFLOW
/* If it's not exactly represented, it underflowed */
if ((d_sig&(((npy_uint64)1 << (1051 - d_exp)) - 1)) != 0) {
@@ -413,11 +413,11 @@ npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d)
* the remaining bit pattern is 1000...0, then we do not add one
* to the bit after the half significand. In all other cases, we do.
*/
- if ((d_sig&0x000007ffffffffffu) != 0x0000020000000000u) {
- d_sig += 0x0000020000000000u;
+ if ((d_sig&0x000007ffffffffffULL) != 0x0000020000000000ULL) {
+ d_sig += 0x0000020000000000ULL;
}
#else
- d_sig += 0x0000020000000000u;
+ d_sig += 0x0000020000000000ULL;
#endif
h_sig = (npy_uint16) (d_sig >> 42);
/*
@@ -429,20 +429,20 @@ npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d)
}
/* Regular case with no overflow or underflow */
- h_exp = (npy_uint16) ((d_exp - 0x3f00000000000000u) >> 42);
+ h_exp = (npy_uint16) ((d_exp - 0x3f00000000000000ULL) >> 42);
/* Handle rounding by adding 1 to the bit beyond half precision */
- d_sig = (d&0x000fffffffffffffu);
+ d_sig = (d&0x000fffffffffffffULL);
#if NPY_HALF_ROUND_TIES_TO_EVEN
/*
* If the last bit in the half significand is 0 (already even), and
* the remaining bit pattern is 1000...0, then we do not add one
* to the bit after the half significand. In all other cases, we do.
*/
- if ((d_sig&0x000007ffffffffffu) != 0x0000020000000000u) {
- d_sig += 0x0000020000000000u;
+ if ((d_sig&0x000007ffffffffffULL) != 0x0000020000000000ULL) {
+ d_sig += 0x0000020000000000ULL;
}
#else
- d_sig += 0x0000020000000000u;
+ d_sig += 0x0000020000000000ULL;
#endif
h_sig = (npy_uint16) (d_sig >> 42);
@@ -520,7 +520,7 @@ npy_uint64 npy_halfbits_to_doublebits(npy_uint16 h)
return d_sgn + d_exp + d_sig;
case 0x7c00u: /* inf or NaN */
/* All-ones exponent and a copy of the significand */
- return d_sgn + 0x7ff0000000000000u +
+ return d_sgn + 0x7ff0000000000000ULL +
(((npy_uint64)(h&0x03ffu)) << 42);
default: /* normalized */
/* Just need to adjust the exponent and shift */
diff --git a/numpy/core/src/scalarmathmodule.c.src b/numpy/core/src/scalarmathmodule.c.src
index d1a66e101..418e81531 100644
--- a/numpy/core/src/scalarmathmodule.c.src
+++ b/numpy/core/src/scalarmathmodule.c.src
@@ -704,10 +704,12 @@ static PyObject *
PyObject *ret;
npy_@name@ arg1, arg2;
/*
- * NOTE: In gcc >= 4.1, the compiler will reorder floating point operations and
- * floating point error state checks. In particular, the arithmetic operations
- * were being reordered so that the errors weren't caught. Declaring this output
- * variable volatile was the minimal fix for the issue. (Ticket #1671)
+ * NOTE: In gcc >= 4.1, the compiler will reorder floating point
+ * operations and floating point error state checks. In
+ * particular, the arithmetic operations were being reordered
+ * so that the errors weren't caught. Declaring this output
+ * variable volatile was the minimal fix for the issue.
+ * (Ticket #1671)
*/
volatile npy_@otyp@ out;
#if @twoout@
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 8c98db6b7..26cee7fa0 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -388,8 +388,8 @@ PyUFunc_O_O(char **args, intp *dimensions, intp *steps, void *func)
UNARY_LOOP {
PyObject *in1 = *(PyObject **)ip1;
PyObject **out = (PyObject **)op1;
- PyObject *ret = f(in1);
- if ((ret == NULL) || PyErr_Occurred()) {
+ PyObject *ret = f(in1 ? in1 : Py_None);
+ if (ret == NULL) {
return;
}
Py_XDECREF(*out);
@@ -405,7 +405,7 @@ PyUFunc_O_O_method(char **args, intp *dimensions, intp *steps, void *func)
UNARY_LOOP {
PyObject *in1 = *(PyObject **)ip1;
PyObject **out = (PyObject **)op1;
- PyObject *ret = PyObject_CallMethod(in1, meth, NULL);
+ PyObject *ret = PyObject_CallMethod(in1 ? in1 : Py_None, meth, NULL);
if (ret == NULL) {
return;
}
@@ -423,8 +423,8 @@ PyUFunc_OO_O(char **args, intp *dimensions, intp *steps, void *func)
PyObject *in1 = *(PyObject **)ip1;
PyObject *in2 = *(PyObject **)ip2;
PyObject **out = (PyObject **)op1;
- PyObject *ret = f(in1, in2);
- if (PyErr_Occurred()) {
+ PyObject *ret = f(in1 ? in1 : Py_None, in2 ? in2 : Py_None);
+ if (ret == NULL) {
return;
}
Py_XDECREF(*out);
@@ -441,7 +441,8 @@ PyUFunc_OO_O_method(char **args, intp *dimensions, intp *steps, void *func)
PyObject *in1 = *(PyObject **)ip1;
PyObject *in2 = *(PyObject **)ip2;
PyObject **out = (PyObject **)op1;
- PyObject *ret = PyObject_CallMethod(in1, meth, "(O)", in2);
+ PyObject *ret = PyObject_CallMethod(in1 ? in1 : Py_None,
+ meth, "(O)", in2);
if (ret == NULL) {
return;
}
@@ -482,8 +483,7 @@ PyUFunc_On_Om(char **args, intp *dimensions, intp *steps, void *func)
for(j = 0; j < nin; j++) {
in = *((PyObject **)ptrs[j]);
if (in == NULL) {
- Py_DECREF(arglist);
- return;
+ in = Py_None;
}
PyTuple_SET_ITEM(arglist, j, in);
Py_INCREF(in);
@@ -2037,7 +2037,9 @@ OBJECT_@kind@(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func)
BINARY_LOOP {
PyObject *in1 = *(PyObject **)ip1;
PyObject *in2 = *(PyObject **)ip2;
- int ret = PyObject_RichCompareBool(in1, in2, Py_@OP@);
+ int ret = PyObject_RichCompareBool(
+ in1 ? in1 : Py_None,
+ in2 ? in2 : Py_None, Py_@OP@);
if (ret == -1) {
return;
}
@@ -2056,7 +2058,7 @@ OBJECT_sign(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func))
PyObject **out = (PyObject **)op1;
int v;
PyObject *ret;
- PyObject_Cmp(in1, zero, &v);
+ PyObject_Cmp(in1 ? in1 : Py_None, zero, &v);
ret = PyLong_FromLong(v);
if (PyErr_Occurred()) {
return;
@@ -2070,7 +2072,8 @@ OBJECT_sign(char **args, intp *dimensions, intp *steps, void *NPY_UNUSED(func))
UNARY_LOOP {
PyObject *in1 = *(PyObject **)ip1;
PyObject **out = (PyObject **)op1;
- PyObject *ret = PyInt_FromLong(PyObject_Compare(in1, zero));
+ PyObject *ret = PyInt_FromLong(
+ PyObject_Compare(in1 ? in1 : Py_None, zero));
if (PyErr_Occurred()) {
return;
}
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 39b04db73..514575bf0 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -37,10 +37,25 @@
#include "numpy/noprefix.h"
#include "numpy/ufuncobject.h"
+#include "lowlevel_strided_loops.h"
#include "ufunc_object.h"
+/********** PRINTF DEBUG TRACING **************/
+#define NPY_UF_DBG_TRACING 0
+
+#if NPY_UF_DBG_TRACING
+#define NPY_UF_DBG_PRINTF(...) printf(__VA_ARGS__)
+#else
+#define NPY_UF_DBG_PRINTF(...)
+#endif
+/**********************************************/
+
+
+/********************/
#define USE_USE_DEFAULTS 1
+#define USE_NEW_ITERATOR_GENFUNC 1
+/********************/
/* ---------------------------------------------------------------- */
@@ -206,65 +221,31 @@ PyUFunc_clearfperr()
#define SIGNATURE_NOBUFFER_UFUNCLOOP 4
-static char
-_lowest_type(char intype)
-{
- switch(intype) {
- /* case PyArray_BYTE */
- case PyArray_SHORT:
- case PyArray_INT:
- case PyArray_LONG:
- case PyArray_LONGLONG:
- case PyArray_DATETIME:
- case PyArray_TIMEDELTA:
- return PyArray_BYTE;
- /* case PyArray_UBYTE */
- case PyArray_USHORT:
- case PyArray_UINT:
- case PyArray_ULONG:
- case PyArray_ULONGLONG:
- return PyArray_UBYTE;
- /* case PyArray_HALF: */
- case PyArray_FLOAT:
- case PyArray_DOUBLE:
- case PyArray_LONGDOUBLE:
- return PyArray_HALF;
- /* case PyArray_CFLOAT:*/
- case PyArray_CDOUBLE:
- case PyArray_CLONGDOUBLE:
- return PyArray_CFLOAT;
- default:
- return intype;
- }
-}
-
-static char *_types_msg = "function not supported for these types, " \
- "and can't coerce safely to supported types";
-
/*
* This function analyzes the input arguments
* and determines an appropriate __array_prepare__ function to call
* for the outputs.
*
- * If an output argument is provided, then it is wrapped
+ * If an output argument is provided, then it is prepped
* with its own __array_prepare__ not with the one determined by
* the input arguments.
*
* if the provided output argument is already an ndarray,
- * the wrapping function is None (which means no wrapping will
+ * the prepping function is None (which means no prepping will
* be done --- not even PyArray_Return).
*
- * A NULL is placed in output_wrap for outputs that
+ * A NULL is placed in output_prep for outputs that
* should just have PyArray_Return called.
*/
static void
-_find_array_prepare(PyObject *args, PyObject **output_wrap, int nin, int nout)
+_find_array_prepare(PyObject *args, PyObject *kwds,
+ PyObject **output_prep, int nin, int nout)
{
Py_ssize_t nargs;
int i;
int np = 0;
- PyObject *with_wrap[NPY_MAXARGS], *wraps[NPY_MAXARGS];
- PyObject *obj, *wrap = NULL;
+ PyObject *with_prep[NPY_MAXARGS], *preps[NPY_MAXARGS];
+ PyObject *obj, *prep = NULL;
nargs = PyTuple_GET_SIZE(args);
for (i = 0; i < nin; i++) {
@@ -272,16 +253,16 @@ _find_array_prepare(PyObject *args, PyObject **output_wrap, int nin, int nout)
if (PyArray_CheckExact(obj) || PyArray_IsAnyScalar(obj)) {
continue;
}
- wrap = PyObject_GetAttrString(obj, "__array_prepare__");
- if (wrap) {
- if (PyCallable_Check(wrap)) {
- with_wrap[np] = obj;
- wraps[np] = wrap;
+ prep = PyObject_GetAttrString(obj, "__array_prepare__");
+ if (prep) {
+ if (PyCallable_Check(prep)) {
+ with_prep[np] = obj;
+ preps[np] = prep;
++np;
}
else {
- Py_DECREF(wrap);
- wrap = NULL;
+ Py_DECREF(prep);
+ prep = NULL;
}
}
else {
@@ -289,33 +270,33 @@ _find_array_prepare(PyObject *args, PyObject **output_wrap, int nin, int nout)
}
}
if (np > 0) {
- /* If we have some wraps defined, find the one of highest priority */
- wrap = wraps[0];
+ /* If we have some preps defined, find the one of highest priority */
+ prep = preps[0];
if (np > 1) {
- double maxpriority = PyArray_GetPriority(with_wrap[0],
+ double maxpriority = PyArray_GetPriority(with_prep[0],
PyArray_SUBTYPE_PRIORITY);
for (i = 1; i < np; ++i) {
- double priority = PyArray_GetPriority(with_wrap[i],
+ double priority = PyArray_GetPriority(with_prep[i],
PyArray_SUBTYPE_PRIORITY);
if (priority > maxpriority) {
maxpriority = priority;
- Py_DECREF(wrap);
- wrap = wraps[i];
+ Py_DECREF(prep);
+ prep = preps[i];
}
else {
- Py_DECREF(wraps[i]);
+ Py_DECREF(preps[i]);
}
}
}
}
/*
- * Here wrap is the wrapping function determined from the
+ * Here prep is the prepping function determined from the
* input arrays (could be NULL).
*
* For all the output arrays decide what to do.
*
- * 1) Use the wrap function determined from the input arrays
+ * 1) Use the prep function determined from the input arrays
* This is the default if the output array is not
* passed in.
*
@@ -327,381 +308,62 @@ _find_array_prepare(PyObject *args, PyObject **output_wrap, int nin, int nout)
for (i = 0; i < nout; i++) {
int j = nin + i;
int incref = 1;
- output_wrap[i] = wrap;
+ output_prep[i] = prep;
+ obj = NULL;
if (j < nargs) {
obj = PyTuple_GET_ITEM(args, j);
- if (obj == Py_None) {
- continue;
+ /* Output argument one may also be in a keyword argument */
+ if (i == 0 && obj == Py_None && kwds != NULL) {
+ obj = PyDict_GetItemString(kwds, "out");
}
+ }
+ /* Output argument one may also be in a keyword argument */
+ else if (i == 0 && kwds != NULL) {
+ obj = PyDict_GetItemString(kwds, "out");
+ }
+
+ if (obj != Py_None && obj != NULL) {
if (PyArray_CheckExact(obj)) {
- output_wrap[i] = Py_None;
+ /* None signals to not call any wrapping */
+ output_prep[i] = Py_None;
}
else {
- PyObject *owrap = PyObject_GetAttrString(obj,
+ PyObject *oprep = PyObject_GetAttrString(obj,
"__array_prepare__");
incref = 0;
- if (!(owrap) || !(PyCallable_Check(owrap))) {
- Py_XDECREF(owrap);
- owrap = wrap;
+ if (!(oprep) || !(PyCallable_Check(oprep))) {
+ Py_XDECREF(oprep);
+ oprep = prep;
incref = 1;
PyErr_Clear();
}
- output_wrap[i] = owrap;
+ output_prep[i] = oprep;
}
}
+
if (incref) {
- Py_XINCREF(output_wrap[i]);
+ Py_XINCREF(output_prep[i]);
}
}
- Py_XDECREF(wrap);
+ Py_XDECREF(prep);
return;
}
-/*
- * Called for non-NULL user-defined functions.
- * The object should be a CObject pointing to a linked-list of functions
- * storing the function, data, and signature of all user-defined functions.
- * There must be a match with the input argument types or an error
- * will occur.
- */
-static int
-_find_matching_userloop(PyObject *obj, int *arg_types,
- PyArray_SCALARKIND *scalars,
- PyUFuncGenericFunction *function, void **data,
- int nargs, int nin)
-{
- PyUFunc_Loop1d *funcdata;
- int i;
-
- funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj);
- while (funcdata != NULL) {
- for (i = 0; i < nin; i++) {
- if (!PyArray_CanCoerceScalar(arg_types[i],
- funcdata->arg_types[i],
- scalars[i]))
- break;
- }
- if (i == nin) {
- /* match found */
- *function = funcdata->func;
- *data = funcdata->data;
- /* Make sure actual arg_types supported by the loop are used */
- for (i = 0; i < nargs; i++) {
- arg_types[i] = funcdata->arg_types[i];
- }
- return 0;
- }
- funcdata = funcdata->next;
- }
- return -1;
-}
-
-/*
- * if only one type is specified then it is the "first" output data-type
- * and the first signature matching this output data-type is returned.
- *
- * if a tuple of types is specified then an exact match to the signature
- * is searched and it much match exactly or an error occurs
- */
-static int
-extract_specified_loop(PyUFuncObject *self, int *arg_types,
- PyUFuncGenericFunction *function, void **data,
- PyObject *type_tup, int userdef)
-{
- Py_ssize_t n = 1;
- int *rtypenums;
- static char msg[] = "loop written to specified type(s) not found";
- PyArray_Descr *dtype;
- int nargs;
- int i, j;
- int strtype = 0;
-
- nargs = self->nargs;
- if (PyTuple_Check(type_tup)) {
- n = PyTuple_GET_SIZE(type_tup);
- if (n != 1 && n != nargs) {
- PyErr_Format(PyExc_ValueError,
- "a type-tuple must be specified " \
- "of length 1 or %d for %s", nargs,
- self->name ? self->name : "(unknown)");
- return -1;
- }
- }
- else if (PyString_Check(type_tup)) {
- Py_ssize_t slen;
- char *thestr;
-
- slen = PyString_GET_SIZE(type_tup);
- thestr = PyString_AS_STRING(type_tup);
- for (i = 0; i < slen - 2; i++) {
- if (thestr[i] == '-' && thestr[i+1] == '>') {
- break;
- }
- }
- if (i < slen-2) {
- strtype = 1;
- n = slen - 2;
- if (i != self->nin
- || slen - 2 - i != self->nout) {
- PyErr_Format(PyExc_ValueError,
- "a type-string for %s, " \
- "requires %d typecode(s) before " \
- "and %d after the -> sign",
- self->name ? self->name : "(unknown)",
- self->nin, self->nout);
- return -1;
- }
- }
- }
- rtypenums = (int *)_pya_malloc(n*sizeof(int));
- if (rtypenums == NULL) {
- PyErr_NoMemory();
- return -1;
- }
-
- if (strtype) {
- char *ptr;
- ptr = PyString_AS_STRING(type_tup);
- i = 0;
- while (i < n) {
- if (*ptr == '-' || *ptr == '>') {
- ptr++;
- continue;
- }
- dtype = PyArray_DescrFromType((int) *ptr);
- if (dtype == NULL) {
- goto fail;
- }
- rtypenums[i] = dtype->type_num;
- Py_DECREF(dtype);
- ptr++;
- i++;
- }
- }
- else if (PyTuple_Check(type_tup)) {
- for (i = 0; i < n; i++) {
- if (PyArray_DescrConverter(PyTuple_GET_ITEM(type_tup, i),
- &dtype) == NPY_FAIL) {
- goto fail;
- }
- rtypenums[i] = dtype->type_num;
- Py_DECREF(dtype);
- }
- }
- else {
- if (PyArray_DescrConverter(type_tup, &dtype) == NPY_FAIL) {
- goto fail;
- }
- rtypenums[0] = dtype->type_num;
- Py_DECREF(dtype);
- }
-
- if (userdef > 0) {
- /* search in the user-defined functions */
- PyObject *key, *obj;
- PyUFunc_Loop1d *funcdata;
-
- obj = NULL;
- key = PyInt_FromLong((long) userdef);
- if (key == NULL) {
- goto fail;
- }
- obj = PyDict_GetItem(self->userloops, key);
- Py_DECREF(key);
- if (obj == NULL) {
- PyErr_SetString(PyExc_TypeError,
- "user-defined type used in ufunc" \
- " with no registered loops");
- goto fail;
- }
- /*
- * extract the correct function
- * data and argtypes
- */
- funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj);
- while (funcdata != NULL) {
- if (n != 1) {
- for (i = 0; i < nargs; i++) {
- if (rtypenums[i] != funcdata->arg_types[i]) {
- break;
- }
- }
- }
- else if (rtypenums[0] == funcdata->arg_types[self->nin]) {
- i = nargs;
- }
- else {
- i = -1;
- }
- if (i == nargs) {
- *function = funcdata->func;
- *data = funcdata->data;
- for(i = 0; i < nargs; i++) {
- arg_types[i] = funcdata->arg_types[i];
- }
- Py_DECREF(obj);
- goto finish;
- }
- funcdata = funcdata->next;
- }
- PyErr_SetString(PyExc_TypeError, msg);
- goto fail;
- }
-
- /* look for match in self->functions */
- for (j = 0; j < self->ntypes; j++) {
- if (n != 1) {
- for(i = 0; i < nargs; i++) {
- if (rtypenums[i] != self->types[j*nargs + i]) {
- break;
- }
- }
- }
- else if (rtypenums[0] == self->types[j*nargs+self->nin]) {
- i = nargs;
- }
- else {
- i = -1;
- }
- if (i == nargs) {
- *function = self->functions[j];
- *data = self->data[j];
- for (i = 0; i < nargs; i++) {
- arg_types[i] = self->types[j*nargs+i];
- }
- goto finish;
- }
- }
- PyErr_SetString(PyExc_TypeError, msg);
-
- fail:
- _pya_free(rtypenums);
- return -1;
-
- finish:
- _pya_free(rtypenums);
- return 0;
-}
-
-
-/*
- * Called to determine coercion
- * Can change arg_types.
- */
-static int
-select_types(PyUFuncObject *self, int *arg_types,
- PyUFuncGenericFunction *function, void **data,
- PyArray_SCALARKIND *scalars,
- PyObject *typetup)
-{
- int i, j;
- char start_type;
- int userdef = -1;
- int userdef_ind = -1;
-
- if (self->userloops) {
- for(i = 0; i < self->nin; i++) {
- if (PyTypeNum_ISUSERDEF(arg_types[i])) {
- userdef = arg_types[i];
- userdef_ind = i;
- break;
- }
- }
- }
-
- if (typetup != NULL)
- return extract_specified_loop(self, arg_types, function, data,
- typetup, userdef);
-
- if (userdef > 0) {
- PyObject *key, *obj;
- int ret = -1;
- obj = NULL;
-
- /*
- * Look through all the registered loops for all the user-defined
- * types to find a match.
- */
- while (ret == -1) {
- if (userdef_ind >= self->nin) {
- break;
- }
- userdef = arg_types[userdef_ind++];
- if (!(PyTypeNum_ISUSERDEF(userdef))) {
- continue;
- }
- key = PyInt_FromLong((long) userdef);
- if (key == NULL) {
- return -1;
- }
- obj = PyDict_GetItem(self->userloops, key);
- Py_DECREF(key);
- if (obj == NULL) {
- continue;
- }
- /*
- * extract the correct function
- * data and argtypes for this user-defined type.
- */
- ret = _find_matching_userloop(obj, arg_types, scalars,
- function, data, self->nargs,
- self->nin);
- }
- if (ret == 0) {
- return ret;
- }
- PyErr_SetString(PyExc_TypeError, _types_msg);
- return ret;
- }
-
- start_type = arg_types[0];
- /*
- * If the first argument is a scalar we need to place
- * the start type as the lowest type in the class
- */
- if (scalars[0] != PyArray_NOSCALAR) {
- start_type = _lowest_type(start_type);
- }
-
- i = 0;
- while (i < self->ntypes && start_type > self->types[i*self->nargs]) {
- i++;
- }
- for (; i < self->ntypes; i++) {
- for (j = 0; j < self->nin; j++) {
- if (!PyArray_CanCoerceScalar(arg_types[j],
- self->types[i*self->nargs + j],
- scalars[j]))
- break;
- }
- if (j == self->nin) {
- break;
- }
- }
- if (i >= self->ntypes) {
- PyErr_SetString(PyExc_TypeError, _types_msg);
- return -1;
- }
- for (j = 0; j < self->nargs; j++) {
- arg_types[j] = self->types[i*self->nargs+j];
- }
- if (self->data) {
- *data = self->data[i];
- }
- else {
- *data = NULL;
- }
- *function = self->functions[i];
-
- return 0;
-}
-
#if USE_USE_DEFAULTS==1
static int PyUFunc_NUM_NODEFAULTS = 0;
#endif
static PyObject *PyUFunc_PYVALS_NAME = NULL;
+/*
+ * Extracts some values from the global pyvals tuple.
+ * ref - should hold the global tuple
+ * name - is the name of the ufunc (ufuncobj->name)
+ * bufsize - receives the buffer size to use
+ * errmask - receives the bitmask for error handling
+ * errobj - receives the python object to call with the error,
+ * if an error handling method is 'call'
+ */
static int
_extract_pyvals(PyObject *ref, char *name, int *bufsize,
int *errmask, PyObject **errobj)
@@ -794,53 +456,6 @@ PyUFunc_GetPyValues(char *name, int *bufsize, int *errmask, PyObject **errobj)
return _extract_pyvals(ref, name, bufsize, errmask, errobj);
}
-/*
- * Create copies for any arrays that are less than loop->bufsize
- * in total size (or core_enabled) and are mis-behaved or in need
- * of casting.
- */
-static int
-_create_copies(PyUFuncLoopObject *loop, int *arg_types, PyArrayObject **mps)
-{
- int nin = loop->ufunc->nin;
- int i;
- intp size;
- PyObject *new;
- PyArray_Descr *ntype;
- PyArray_Descr *atype;
-
- for (i = 0; i < nin; i++) {
- size = PyArray_SIZE(mps[i]);
- /*
- * if the type of mps[i] is equivalent to arg_types[i]
- * then set arg_types[i] equal to type of mps[i] for later checking....
- */
- if (PyArray_TYPE(mps[i]) != arg_types[i]) {
- ntype = mps[i]->descr;
- atype = PyArray_DescrFromType(arg_types[i]);
- if (PyArray_EquivTypes(atype, ntype)) {
- arg_types[i] = ntype->type_num;
- }
- Py_DECREF(atype);
- }
- if (size < loop->bufsize || loop->ufunc->core_enabled) {
- if (!(PyArray_ISBEHAVED_RO(mps[i]))
- || PyArray_TYPE(mps[i]) != arg_types[i]) {
- ntype = PyArray_DescrFromType(arg_types[i]);
- new = PyArray_FromAny((PyObject *)mps[i],
- ntype, 0, 0,
- FORCECAST | ALIGNED, NULL);
- if (new == NULL) {
- return -1;
- }
- Py_DECREF(mps[i]);
- mps[i] = (PyArrayObject *)new;
- }
- }
- }
- return 0;
-}
-
#define _GETATTR_(str, rstr) do {if (strcmp(name, #str) == 0) \
return PyObject_HasAttrString(op, "__" #rstr "__");} while (0);
@@ -1068,1964 +683,2673 @@ fail:
return -1;
}
-/*
- * Concatenate the loop and core dimensions of
- * PyArrayMultiIterObject's iarg-th argument, to recover a full
- * dimension array (used for output arguments).
- */
-static npy_intp*
-_compute_output_dims(PyUFuncLoopObject *loop, int iarg,
- int *out_nd, npy_intp *tmp_dims)
-{
- int i;
- PyUFuncObject *ufunc = loop->ufunc;
- if (ufunc->core_enabled == 0) {
- /* case of ufunc with trivial core-signature */
- *out_nd = loop->nd;
- return loop->dimensions;
- }
-
- *out_nd = loop->nd + ufunc->core_num_dims[iarg];
- if (*out_nd > NPY_MAXARGS) {
- PyErr_SetString(PyExc_ValueError,
- "dimension of output variable exceeds limit");
- return NULL;
- }
-
- /* copy loop dimensions */
- memcpy(tmp_dims, loop->dimensions, sizeof(npy_intp) * loop->nd);
-
- /* copy core dimension */
- for (i = 0; i < ufunc->core_num_dims[iarg]; i++) {
- tmp_dims[loop->nd + i] = loop->core_dim_sizes[1 +
- ufunc->core_dim_ixs[ufunc->core_offsets[iarg] + i]];
- }
- return tmp_dims;
-}
-/* Check and set core_dim_sizes and core_strides for the i-th argument. */
-static int
-_compute_dimension_size(PyUFuncLoopObject *loop, PyArrayObject **mps, int i)
-{
- PyUFuncObject *ufunc = loop->ufunc;
- int j = ufunc->core_offsets[i];
- int k = PyArray_NDIM(mps[i]) - ufunc->core_num_dims[i];
- int ind;
- for (ind = 0; ind < ufunc->core_num_dims[i]; ind++, j++, k++) {
- npy_intp dim = k < 0 ? 1 : PyArray_DIM(mps[i], k);
- /* First element of core_dim_sizes will be used for looping */
- int dim_ix = ufunc->core_dim_ixs[j] + 1;
- if (loop->core_dim_sizes[dim_ix] == 1) {
- /* broadcast core dimension */
- loop->core_dim_sizes[dim_ix] = dim;
- }
- else if (dim != 1 && dim != loop->core_dim_sizes[dim_ix]) {
- PyErr_SetString(PyExc_ValueError, "core dimensions mismatch");
- return -1;
- }
- /* First ufunc->nargs elements will be used for looping */
- loop->core_strides[ufunc->nargs + j] =
- dim == 1 ? 0 : PyArray_STRIDE(mps[i], k);
- }
- return 0;
-}
+/********* GENERIC UFUNC USING ITERATOR *********/
-/* Return a view of array "ap" with "core_nd" dimensions cut from tail. */
-static PyArrayObject *
-_trunc_coredim(PyArrayObject *ap, int core_nd)
+/*
+ * Parses the positional and keyword arguments for a generic ufunc call.
+ *
+ * Note that if an error is returned, the caller must free the
+ * non-zero references in out_op. This
+ * function does not do its own clean-up.
+ */
+static int get_ufunc_arguments(PyUFuncObject *self,
+ PyObject *args, PyObject *kwds,
+ PyArrayObject **out_op,
+ NPY_ORDER *out_order,
+ NPY_CASTING *out_casting,
+ PyObject **out_extobj,
+ PyObject **out_typetup,
+ int *out_any_object)
{
- PyArrayObject *ret;
- int nd = ap->nd - core_nd;
+ npy_intp i, nargs, nin = self->nin;
+ PyObject *obj, *context;
+ char *ufunc_name;
- if (nd < 0) {
- nd = 0;
- }
- /* The following code is basically taken from PyArray_Transpose */
- /* NewFromDescr will steal this reference */
- Py_INCREF(ap->descr);
- ret = (PyArrayObject *)
- PyArray_NewFromDescr(Py_TYPE(ap), ap->descr,
- nd, ap->dimensions,
- ap->strides, ap->data, ap->flags,
- (PyObject *)ap);
- if (ret == NULL) {
- return NULL;
- }
- /* point at true owner of memory: */
- ret->base = (PyObject *)ap;
- Py_INCREF(ap);
- PyArray_UpdateFlags(ret, CONTIGUOUS | FORTRAN);
- return ret;
-}
+ int any_flexible = 0, any_object = 0;
-static Py_ssize_t
-construct_arrays(PyUFuncLoopObject *loop, PyObject *args, PyArrayObject **mps,
- PyObject *typetup)
-{
- Py_ssize_t nargs;
- int i;
- int arg_types[NPY_MAXARGS];
- PyArray_SCALARKIND scalars[NPY_MAXARGS];
- PyArray_SCALARKIND maxarrkind, maxsckind, new;
- PyUFuncObject *self = loop->ufunc;
- Bool allscalars = TRUE;
- PyTypeObject *subtype = &PyArray_Type;
- PyObject *context = NULL;
- PyObject *obj;
- int flexible = 0;
- int object = 0;
-
- npy_intp temp_dims[NPY_MAXDIMS];
- npy_intp *out_dims;
- int out_nd;
- PyObject *wraparr[NPY_MAXARGS];
+ ufunc_name = self->name ? self->name : "<unnamed ufunc>";
/* Check number of arguments */
nargs = PyTuple_Size(args);
- if ((nargs < self->nin) || (nargs > self->nargs)) {
+ if ((nargs < nin) || (nargs > self->nargs)) {
PyErr_SetString(PyExc_ValueError, "invalid number of arguments");
return -1;
}
- /* Get each input argument */
- maxarrkind = PyArray_NOSCALAR;
- maxsckind = PyArray_NOSCALAR;
- for(i = 0; i < self->nin; i++) {
- obj = PyTuple_GET_ITEM(args,i);
+ /* Get input arguments */
+ for(i = 0; i < nin; ++i) {
+ obj = PyTuple_GET_ITEM(args, i);
if (!PyArray_Check(obj) && !PyArray_IsScalar(obj, Generic)) {
+ /*
+ * TODO: There should be a comment here explaining what
+ * context does.
+ */
context = Py_BuildValue("OOi", self, args, i);
+ if (context == NULL) {
+ return -1;
+ }
}
else {
context = NULL;
}
- mps[i] = (PyArrayObject *)PyArray_FromAny(obj, NULL, 0, 0, 0, context);
+ out_op[i] = (PyArrayObject *)PyArray_FromAny(obj,
+ NULL, 0, 0, 0, context);
Py_XDECREF(context);
- if (mps[i] == NULL) {
+ if (out_op[i] == NULL) {
return -1;
}
- arg_types[i] = PyArray_TYPE(mps[i]);
- if (!flexible && PyTypeNum_ISFLEXIBLE(arg_types[i])) {
- flexible = 1;
+ if (!any_flexible &&
+ PyTypeNum_ISFLEXIBLE(PyArray_DESCR(out_op[i])->type_num)) {
+ any_flexible = 1;
}
- if (!object && PyTypeNum_ISOBJECT(arg_types[i])) {
- object = 1;
+ if (!any_object &&
+ PyTypeNum_ISOBJECT(PyArray_DESCR(out_op[i])->type_num)) {
+ any_object = 1;
}
- /*
- * debug
- * fprintf(stderr, "array %d has reference %d\n", i,
- * (mps[i])->ob_refcnt);
- */
+ }
- /*
- * Scalars are 0-dimensional arrays at this point
- */
+ /*
+ * Indicate not implemented if there are flexible objects (structured
+ * type or string) but no object types.
+ *
+ * Not sure - adding this increased to 246 errors, 150 failures.
+ */
+ if (any_flexible && !any_object) {
+ return -2;
- /*
- * We need to keep track of whether or not scalars
- * are mixed with arrays of different kinds.
- */
+ }
- if (mps[i]->nd > 0) {
- scalars[i] = PyArray_NOSCALAR;
- allscalars = FALSE;
- new = PyArray_ScalarKind(arg_types[i], NULL);
- maxarrkind = NPY_MAX(new, maxarrkind);
+ /* Get positional output arguments */
+ for (i = nin; i < nargs; ++i) {
+ obj = PyTuple_GET_ITEM(args, i);
+ /* Translate None to NULL */
+ if (obj == Py_None) {
+ continue;
+ }
+ /* If it's an array, can use it */
+ if (PyArray_Check(obj)) {
+ if (!PyArray_ISWRITEABLE(obj)) {
+ PyErr_SetString(PyExc_ValueError,
+ "return array is not writeable");
+ return -1;
+ }
+ Py_INCREF(obj);
+ out_op[i] = (PyArrayObject *)obj;
}
else {
- scalars[i] = PyArray_ScalarKind(arg_types[i], &(mps[i]));
- maxsckind = NPY_MAX(scalars[i], maxsckind);
+ PyErr_SetString(PyExc_TypeError,
+ "return arrays must be "
+ "of ArrayType");
+ return -1;
}
}
- /* We don't do strings */
- if (flexible && !object) {
- loop->notimplemented = 1;
- return nargs;
- }
-
/*
- * If everything is a scalar, or scalars mixed with arrays of
- * different kinds of lesser kinds then use normal coercion rules
+ * Get keyword output and other arguments.
+ * Raise an error if anything else is present in the
+ * keyword dictionary.
*/
- if (allscalars || (maxsckind > maxarrkind)) {
- for (i = 0; i < self->nin; i++) {
- scalars[i] = PyArray_NOSCALAR;
+ if (kwds != NULL) {
+ PyObject *key, *value;
+ Py_ssize_t pos = 0;
+ while (PyDict_Next(kwds, &pos, &key, &value)) {
+ Py_ssize_t length = 0;
+ char *str = NULL;
+ int bad_arg = 1;
+
+ if (PyString_AsStringAndSize(key, &str, &length) == -1) {
+ PyErr_SetString(PyExc_TypeError, "invalid keyword argument");
+ return -1;
+ }
+
+ switch (str[0]) {
+ case 'c':
+ /* Provides a policy for allowed casting */
+ if (strncmp(str,"casting",7) == 0) {
+ if (!PyArray_CastingConverter(value, out_casting)) {
+ return -1;
+ }
+ bad_arg = 0;
+ }
+ break;
+ case 'e':
+ /*
+ * Overrides the global parameters buffer size,
+ * error mask, and error object
+ */
+ if (strncmp(str,"extobj",6) == 0) {
+ *out_extobj = value;
+ bad_arg = 0;
+ }
+ break;
+ case 'o':
+ /* First output may be specified as a keyword parameter */
+ if (strncmp(str,"out",3) == 0) {
+ if (out_op[nin] != NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "cannot specify 'out' as both a "
+ "positional and keyword argument");
+ return -1;
+ }
+
+ if (PyArray_Check(value)) {
+ if (!PyArray_ISWRITEABLE(value)) {
+ PyErr_SetString(PyExc_ValueError,
+ "return array is not writeable");
+ return -1;
+ }
+ Py_INCREF(value);
+ out_op[nin] = (PyArrayObject *)value;
+ }
+ else {
+ PyErr_SetString(PyExc_TypeError,
+ "return arrays must be "
+ "of ArrayType");
+ return -1;
+ }
+ bad_arg = 0;
+ }
+ /* Allows the default output layout to be overridden */
+ else if (strncmp(str,"order",5) == 0) {
+ if (!PyArray_OrderConverter(value, out_order)) {
+ return -1;
+ }
+ bad_arg = 0;
+ }
+ break;
+ case 's':
+ /* Allows a specific function inner loop to be selected */
+ if (strncmp(str,"sig",3) == 0) {
+ if (*out_typetup != NULL) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "cannot specify both 'sig' and 'dtype'");
+ return -1;
+ }
+ *out_typetup = value;
+ Py_INCREF(value);
+ bad_arg = 0;
+ }
+ break;
+ case 'd':
+ /* Another way to specify 'sig' */
+ if (strncmp(str,"dtype",5) == 0) {
+ /* Allow this parameter to be None */
+ PyArray_Descr *dtype;
+ if (!PyArray_DescrConverter2(value, &dtype)) {
+ return -1;
+ }
+ if (dtype != NULL) {
+ if (*out_typetup != NULL) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "cannot specify both 'sig' and 'dtype'");
+ return -1;
+ }
+ *out_typetup = Py_BuildValue("(N)", dtype);
+ }
+ bad_arg = 0;
+ }
+ }
+
+ if (bad_arg) {
+ char *format = "'%s' is an invalid keyword to ufunc '%s'";
+ PyErr_Format(PyExc_TypeError, format, str, ufunc_name);
+ return -1;
+ }
}
}
- /* Select an appropriate function for these argument types. */
- if (select_types(loop->ufunc, arg_types, &(loop->function),
- &(loop->funcdata), scalars, typetup) == -1) {
- return -1;
+ *out_any_object = any_object;
+
+ return 0;
+}
+
+static const char *
+_casting_to_string(NPY_CASTING casting)
+{
+ switch (casting) {
+ case NPY_NO_CASTING:
+ return "no";
+ case NPY_EQUIV_CASTING:
+ return "equiv";
+ case NPY_SAFE_CASTING:
+ return "safe";
+ case NPY_SAME_KIND_CASTING:
+ return "same_kind";
+ case NPY_UNSAFE_CASTING:
+ return "unsafe";
+ default:
+ return "<unknown>";
}
+}
+
+
+static int
+ufunc_loop_matches(PyUFuncObject *self,
+ PyArrayObject **op,
+ NPY_CASTING input_casting,
+ NPY_CASTING output_casting,
+ int any_object,
+ int all_inputs_scalar,
+ int *types,
+ int *out_no_castable_output,
+ char *out_err_src_typecode,
+ char *out_err_dst_typecode)
+{
+ npy_intp i, nin = self->nin, niter = nin + self->nout;
+
/*
- * FAIL with NotImplemented if the other object has
- * the __r<op>__ method and has __array_priority__ as
- * an attribute (signalling it can handle ndarray's)
- * and is not already an ndarray or a subtype of the same type.
+ * First check if all the inputs can be safely cast
+ * to the types for this function
*/
- if ((arg_types[1] == PyArray_OBJECT)
- && (loop->ufunc->nin==2) && (loop->ufunc->nout == 1)) {
- PyObject *_obj = PyTuple_GET_ITEM(args, 1);
- if (!PyArray_CheckExact(_obj)
- /* If both are same subtype of object arrays, then proceed */
- && !(Py_TYPE(_obj) == Py_TYPE(PyTuple_GET_ITEM(args, 0)))
- && PyObject_HasAttrString(_obj, "__array_priority__")
- && _has_reflected_op(_obj, loop->ufunc->name)) {
- loop->notimplemented = 1;
- return nargs;
+ for (i = 0; i < nin; ++i) {
+ PyArray_Descr *tmp = PyArray_DescrFromType(types[i]);
+ if (tmp == NULL) {
+ return -1;
}
- }
- /*
- * Create copies for some of the arrays if they are small
- * enough and not already contiguous
- */
- if (_create_copies(loop, arg_types, mps) < 0) {
- return -1;
+ /*
+ * If no inputs are objects and there are more than one
+ * loop, don't allow conversion to object. The rationale
+ * behind this is mostly performance. Except for custom
+ * ufuncs built with just one object-parametered inner loop,
+ * only the types that are supported are implemented. Trying
+ * the object version of logical_or on float arguments doesn't
+ * seem right.
+ */
+ if (types[i] == NPY_OBJECT && !any_object && self->ntypes > 1) {
+ return 0;
+ }
+#if NPY_UF_DBG_TRACING
+ printf("Checking type for op %d, type %d: ", (int)i, (int)types[i]);
+ PyObject_Print((PyObject *)tmp, stdout, 0);
+ printf(", operand type: ");
+ PyObject_Print((PyObject *)PyArray_DESCR(op[i]), stdout, 0);
+ printf("\n");
+#endif
+ /*
+ * If all the inputs are scalars, use the regular
+ * promotion rules, not the special value-checking ones.
+ */
+ if (all_inputs_scalar) {
+ if (!PyArray_CanCastTypeTo(PyArray_DESCR(op[i]), tmp,
+ input_casting)) {
+ Py_DECREF(tmp);
+ return 0;
+ }
+ }
+ else {
+ if (!PyArray_CanCastArrayTo(op[i], tmp, input_casting)) {
+ Py_DECREF(tmp);
+ return 0;
+ }
+ }
+ Py_DECREF(tmp);
}
+ NPY_UF_DBG_PRINTF("The inputs all worked\n");
/*
- * Only use loop dimensions when constructing Iterator:
- * temporarily replace mps[i] (will be recovered below).
+ * If all the inputs were ok, then check casting back to the
+ * outputs.
*/
- if (self->core_enabled) {
- for (i = 0; i < self->nin; i++) {
- PyArrayObject *ao;
-
- if (_compute_dimension_size(loop, mps, i) < 0) {
+ for (i = nin; i < niter; ++i) {
+ if (op[i] != NULL) {
+ PyArray_Descr *tmp = PyArray_DescrFromType(types[i]);
+ if (tmp == NULL) {
return -1;
}
- ao = _trunc_coredim(mps[i], self->core_num_dims[i]);
- if (ao == NULL) {
- return -1;
+ if (!PyArray_CanCastTypeTo(tmp, PyArray_DESCR(op[i]),
+ output_casting)) {
+ Py_DECREF(tmp);
+ if (!(*out_no_castable_output)) {
+ *out_no_castable_output = 1;
+ *out_err_src_typecode = tmp->type;
+ *out_err_dst_typecode = PyArray_DESCR(op[i])->type;
+ }
+ return 0;
}
- mps[i] = ao;
+ Py_DECREF(tmp);
}
}
+ NPY_UF_DBG_PRINTF("The outputs all worked\n");
- /* Create Iterators for the Inputs */
- for (i = 0; i < self->nin; i++) {
- loop->iters[i] = (PyArrayIterObject *)
- PyArray_IterNew((PyObject *)mps[i]);
- if (loop->iters[i] == NULL) {
- return -1;
- }
- }
-
- /* Recover mps[i]. */
- if (self->core_enabled) {
- for (i = 0; i < self->nin; i++) {
- PyArrayObject *ao = mps[i];
- mps[i] = (PyArrayObject *)mps[i]->base;
- Py_DECREF(ao);
- }
- }
+ return 1;
+}
- /* Broadcast the result */
- loop->numiter = self->nin;
- if (PyArray_Broadcast((PyArrayMultiIterObject *)loop) < 0) {
- return -1;
- }
+static int
+set_ufunc_loop_data_types(PyUFuncObject *self, PyArrayObject **op,
+ PyArray_Descr **out_dtype,
+ int *types,
+ npy_intp buffersize, int *out_trivial_loop_ok)
+{
+ npy_intp i, nin = self->nin, niter = nin + self->nout;
- /* Get any return arguments */
- for (i = self->nin; i < nargs; i++) {
- mps[i] = (PyArrayObject *)PyTuple_GET_ITEM(args, i);
- if (((PyObject *)mps[i])==Py_None) {
- mps[i] = NULL;
- continue;
+ *out_trivial_loop_ok = 1;
+ /* Fill the dtypes array */
+ for (i = 0; i < niter; ++i) {
+ out_dtype[i] = PyArray_DescrFromType(types[i]);
+ if (out_dtype[i] == NULL) {
+ return -1;
}
- Py_INCREF(mps[i]);
- if (!PyArray_Check((PyObject *)mps[i])) {
- PyObject *new;
- if (PyArrayIter_Check(mps[i])) {
- new = PyObject_CallMethod((PyObject *)mps[i],
- "__array__", NULL);
- Py_DECREF(mps[i]);
- mps[i] = (PyArrayObject *)new;
+ /*
+ * If the dtype doesn't match, or the array isn't aligned,
+ * indicate that the trivial loop can't be done.
+ */
+ if (*out_trivial_loop_ok && op[i] != NULL &&
+ (!PyArray_ISALIGNED(op[i]) ||
+ !PyArray_EquivTypes(out_dtype[i], PyArray_DESCR(op[i]))
+ )) {
+ /*
+ * If op[j] is a scalar or small one dimensional
+ * array input, make a copy to keep the opportunity
+ * for a trivial loop.
+ */
+ if (i < nin && (PyArray_NDIM(op[i]) == 0 ||
+ (PyArray_NDIM(op[i]) == 1 &&
+ PyArray_DIM(op[i],0) <= buffersize))) {
+ PyArrayObject *tmp;
+ Py_INCREF(out_dtype[i]);
+ tmp = (PyArrayObject *)
+ PyArray_CastToType(op[i], out_dtype[i], 0);
+ if (tmp == NULL) {
+ return -1;
+ }
+ Py_DECREF(op[i]);
+ op[i] = tmp;
}
else {
- PyErr_SetString(PyExc_TypeError,
- "return arrays must be "\
- "of ArrayType");
- Py_DECREF(mps[i]);
- mps[i] = NULL;
- return -1;
+ *out_trivial_loop_ok = 0;
}
}
+ }
+
+ return 0;
+}
+
+/*
+ * Does a search through the arguments and the loops
+ */
+static int
+find_ufunc_matching_userloop(PyUFuncObject *self,
+ PyArrayObject **op,
+ NPY_CASTING input_casting,
+ NPY_CASTING output_casting,
+ npy_intp buffersize,
+ int any_object,
+ int all_inputs_scalar,
+ PyArray_Descr **out_dtype,
+ PyUFuncGenericFunction *out_innerloop,
+ void **out_innerloopdata,
+ int *out_trivial_loop_ok,
+ int *out_no_castable_output,
+ char *out_err_src_typecode,
+ char *out_err_dst_typecode)
+{
+ npy_intp i, nin = self->nin;
+ PyUFunc_Loop1d *funcdata;
- if (self->core_enabled) {
- if (_compute_dimension_size(loop, mps, i) < 0) {
+ /* Use this to try to avoid repeating the same userdef loop search */
+ int last_userdef = -1;
+
+ for (i = 0; i < nin; ++i) {
+ int type_num = PyArray_DESCR(op[i])->type_num;
+ if (type_num != last_userdef && PyTypeNum_ISUSERDEF(type_num)) {
+ PyObject *key, *obj;
+
+ last_userdef = type_num;
+
+ key = PyInt_FromLong(type_num);
+ if (key == NULL) {
return -1;
}
- }
- out_dims = _compute_output_dims(loop, i, &out_nd, temp_dims);
- if (!out_dims) {
- return -1;
- }
- if (mps[i]->nd != out_nd
- || !PyArray_CompareLists(mps[i]->dimensions, out_dims, out_nd)) {
- PyErr_SetString(PyExc_ValueError, "invalid return array shape");
- Py_DECREF(mps[i]);
- mps[i] = NULL;
- return -1;
- }
- if (!PyArray_ISWRITEABLE(mps[i])) {
- PyErr_SetString(PyExc_ValueError, "return array is not writeable");
- Py_DECREF(mps[i]);
- mps[i] = NULL;
- return -1;
+ obj = PyDict_GetItem(self->userloops, key);
+ Py_DECREF(key);
+ if (obj == NULL) {
+ continue;
+ }
+ funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj);
+ while (funcdata != NULL) {
+ int *types = funcdata->arg_types;
+ switch (ufunc_loop_matches(self, op,
+ input_casting, output_casting,
+ any_object, all_inputs_scalar,
+ types,
+ out_no_castable_output, out_err_src_typecode,
+ out_err_dst_typecode)) {
+ /* Error */
+ case -1:
+ return -1;
+ /* Found a match */
+ case 1:
+ set_ufunc_loop_data_types(self, op, out_dtype, types,
+ buffersize, out_trivial_loop_ok);
+
+ /* Save the inner loop and its data */
+ *out_innerloop = funcdata->func;
+ *out_innerloopdata = funcdata->data;
+
+ NPY_UF_DBG_PRINTF("Returning userdef inner "
+ "loop successfully\n");
+
+ return 0;
+ }
+
+ funcdata = funcdata->next;
+ }
}
}
- /* construct any missing return arrays and make output iterators */
- for(i = self->nin; i < self->nargs; i++) {
- PyArray_Descr *ntype;
+ /* Didn't find a match */
+ return 0;
+}
+
+/*
+ * Does a search through the arguments and the loops
+ */
+static int
+find_ufunc_specified_userloop(PyUFuncObject *self,
+ int n_specified,
+ int *specified_types,
+ PyArrayObject **op,
+ NPY_CASTING casting,
+ npy_intp buffersize,
+ int any_object,
+ int all_inputs_scalar,
+ PyArray_Descr **out_dtype,
+ PyUFuncGenericFunction *out_innerloop,
+ void **out_innerloopdata,
+ int *out_trivial_loop_ok)
+{
+ npy_intp i, j, nin = self->nin, niter = nin + self->nout;
+ PyUFunc_Loop1d *funcdata;
+
+ /* Use this to try to avoid repeating the same userdef loop search */
+ int last_userdef = -1;
+
+ int no_castable_output = 0;
+ char err_src_typecode = '-', err_dst_typecode = '-';
+
+ for (i = 0; i < nin; ++i) {
+ int type_num = PyArray_DESCR(op[i])->type_num;
+ if (type_num != last_userdef && PyTypeNum_ISUSERDEF(type_num)) {
+ PyObject *key, *obj;
- if (mps[i] == NULL) {
- out_dims = _compute_output_dims(loop, i, &out_nd, temp_dims);
- if (!out_dims) {
+ last_userdef = type_num;
+
+ key = PyInt_FromLong(type_num);
+ if (key == NULL) {
return -1;
}
- mps[i] = (PyArrayObject *)PyArray_New(subtype,
- out_nd,
- out_dims,
- arg_types[i],
- NULL, NULL,
- 0, 0, NULL);
- if (mps[i] == NULL) {
- return -1;
+ obj = PyDict_GetItem(self->userloops, key);
+ Py_DECREF(key);
+ if (obj == NULL) {
+ continue;
}
- }
-
- /*
- * reset types for outputs that are equivalent
- * -- no sense casting uselessly
- */
- else {
- if (mps[i]->descr->type_num != arg_types[i]) {
- PyArray_Descr *atype;
- ntype = mps[i]->descr;
- atype = PyArray_DescrFromType(arg_types[i]);
- if (PyArray_EquivTypes(atype, ntype)) {
- arg_types[i] = ntype->type_num;
+ funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj);
+ while (funcdata != NULL) {
+ int *types = funcdata->arg_types;
+ int matched = 1;
+
+ if (n_specified == niter) {
+ for (j = 0; j < niter; ++j) {
+ if (types[j] != specified_types[j]) {
+ matched = 0;
+ break;
+ }
+ }
+ } else {
+ if (types[nin] != specified_types[0]) {
+ matched = 0;
+ }
+ }
+ if (!matched) {
+ continue;
}
- Py_DECREF(atype);
- }
- /* still not the same -- or will we have to use buffers?*/
- if (mps[i]->descr->type_num != arg_types[i]
- || !PyArray_ISBEHAVED_RO(mps[i])) {
- if (loop->size < loop->bufsize || self->core_enabled) {
- PyObject *new;
- /*
- * Copy the array to a temporary copy
- * and set the UPDATEIFCOPY flag
- */
- ntype = PyArray_DescrFromType(arg_types[i]);
- new = PyArray_FromAny((PyObject *)mps[i],
- ntype, 0, 0,
- FORCECAST | ALIGNED |
- UPDATEIFCOPY, NULL);
- if (new == NULL) {
+ switch (ufunc_loop_matches(self, op,
+ casting, casting,
+ any_object, all_inputs_scalar,
+ types,
+ &no_castable_output, &err_src_typecode,
+ &err_dst_typecode)) {
+ /* It works */
+ case 1:
+ set_ufunc_loop_data_types(self, op, out_dtype, types,
+ buffersize, out_trivial_loop_ok);
+
+ /* Save the inner loop and its data */
+ *out_innerloop = funcdata->func;
+ *out_innerloopdata = funcdata->data;
+
+ NPY_UF_DBG_PRINTF("Returning userdef inner "
+ "loop successfully\n");
+
+ return 0;
+ /* Didn't match */
+ case 0:
+ PyErr_Format(PyExc_TypeError,
+ "found a user loop for ufunc '%s' "
+ "matching the type-tuple, "
+ "but the inputs and/or outputs could not be "
+ "cast according to the casting rule",
+ self->name ? self->name : "(unknown)");
+ return -1;
+ /* Error */
+ case -1:
return -1;
- }
- Py_DECREF(mps[i]);
- mps[i] = (PyArrayObject *)new;
}
+
+ funcdata = funcdata->next;
}
}
+ }
- if (self->core_enabled) {
- PyArrayObject *ao;
+ /* Didn't find a match */
+ return 0;
+}
- /* computer for all output arguments, and set strides in "loop" */
- if (_compute_dimension_size(loop, mps, i) < 0) {
- return -1;
- }
- ao = _trunc_coredim(mps[i], self->core_num_dims[i]);
- if (ao == NULL) {
- return -1;
- }
- /* Temporarily modify mps[i] for constructing iterator. */
- mps[i] = ao;
- }
+/*
+ * Does a linear search for the best inner loop of the ufunc.
+ * When op[i] is a scalar or a one dimensional array smaller than
+ * the buffersize, and needs a dtype conversion, this function
+ * may substitute op[i] with a version cast to the correct type. This way,
+ * the later trivial loop detection has a higher chance of being triggered.
+ *
+ * Note that if an error is returned, the caller must free the non-zero
+ * references in out_dtype. This function does not do its own clean-up.
+ */
+static int
+find_best_ufunc_inner_loop(PyUFuncObject *self,
+ PyArrayObject **op,
+ NPY_CASTING input_casting,
+ NPY_CASTING output_casting,
+ npy_intp buffersize,
+ int any_object,
+ PyArray_Descr **out_dtype,
+ PyUFuncGenericFunction *out_innerloop,
+ void **out_innerloopdata,
+ int *out_trivial_loop_ok)
+{
+ npy_intp i, j, nin = self->nin, niter = nin + self->nout;
+ int types[NPY_MAXARGS];
+ char *ufunc_name;
+ int no_castable_output, all_inputs_scalar;
- loop->iters[i] = (PyArrayIterObject *)
- PyArray_IterNew((PyObject *)mps[i]);
- if (loop->iters[i] == NULL) {
- return -1;
- }
+ /* For making a better error message on coercion error */
+ char err_dst_typecode = '-', err_src_typecode = '-';
- /* Recover mps[i]. */
- if (self->core_enabled) {
- PyArrayObject *ao = mps[i];
- mps[i] = (PyArrayObject *)mps[i]->base;
- Py_DECREF(ao);
+ ufunc_name = self->name ? self->name : "(unknown)";
+
+ /* Check whether all the inputs are scalar */
+ all_inputs_scalar = 1;
+ for(i = 0; i < nin; ++i) {
+ if (PyArray_NDIM(op[i]) > 0) {
+ all_inputs_scalar = 0;
}
+ }
+ /* If the ufunc has userloops, search for them. */
+ if (self->userloops) {
+ switch (find_ufunc_matching_userloop(self, op,
+ input_casting, output_casting,
+ buffersize, any_object, all_inputs_scalar,
+ out_dtype, out_innerloop, out_innerloopdata,
+ out_trivial_loop_ok,
+ &no_castable_output, &err_src_typecode,
+ &err_dst_typecode)) {
+ /* Error */
+ case -1:
+ return -1;
+ /* A loop was found */
+ case 1:
+ return 0;
+ }
}
/*
- * Use __array_prepare__ on all outputs
- * if present on one of the input arguments.
- * If present for multiple inputs:
- * use __array_prepare__ of input object with largest
- * __array_priority__ (default = 0.0)
+ * Determine the UFunc loop. This could in general be *much* faster,
+ * and a better way to implement it might be for the ufunc to
+ * provide a function which gives back the result type and inner
+ * loop function.
*
- * Exception: we should not wrap outputs for items already
- * passed in as output-arguments. These items should either
- * be left unwrapped or wrapped by calling their own __array_prepare__
- * routine.
+ * A default fast mechanism could be provided for functions which
+ * follow the most typical pattern, when all functions have signatures
+ * "xx...x -> x" for some built-in data type x, as follows.
+ * - Use PyArray_ResultType to get the output type
+ * - Look up the inner loop in a table based on the output type_num
*
- * For each output argument, wrap will be either
- * NULL --- call PyArray_Return() -- default if no output arguments given
- * None --- array-object passed in don't call PyArray_Return
- * method --- the __array_prepare__ method to call.
+ * The method for finding the loop in the previous code did not
+ * appear consistent (as noted by some asymmetry in the generated
+ * coercion tables for np.add).
*/
- _find_array_prepare(args, wraparr, loop->ufunc->nin, loop->ufunc->nout);
-
- /* wrap outputs */
- for (i = 0; i < loop->ufunc->nout; i++) {
- int j = loop->ufunc->nin+i;
- PyObject *wrap;
- PyObject *res;
- wrap = wraparr[i];
- if (wrap != NULL) {
- if (wrap == Py_None) {
- Py_DECREF(wrap);
- continue;
- }
- res = PyObject_CallFunction(wrap, "O(OOi)",
- mps[j], loop->ufunc, args, i);
- Py_DECREF(wrap);
- if ((res == NULL) || (res == Py_None)) {
- if (!PyErr_Occurred()){
- PyErr_SetString(PyExc_TypeError,
- "__array_prepare__ must return an ndarray or subclass thereof");
- }
+ no_castable_output = 0;
+ for (i = 0; i < self->ntypes; ++i) {
+ char *orig_types = self->types + i*self->nargs;
+
+ /* Copy the types into an int array for matching */
+ for (j = 0; j < niter; ++j) {
+ types[j] = orig_types[j];
+ }
+
+ NPY_UF_DBG_PRINTF("Trying function loop %d\n", (int)i);
+ switch (ufunc_loop_matches(self, op,
+ input_casting, output_casting,
+ any_object, all_inputs_scalar,
+ types,
+ &no_castable_output, &err_src_typecode,
+ &err_dst_typecode)) {
+ /* Error */
+ case -1:
return -1;
- }
- Py_DECREF(mps[j]);
- mps[j] = (PyArrayObject *)res;
+ /* Found a match */
+ case 1:
+ set_ufunc_loop_data_types(self, op, out_dtype, types,
+ buffersize, out_trivial_loop_ok);
+
+ /* Save the inner loop and its data */
+ *out_innerloop = self->functions[i];
+ *out_innerloopdata = self->data[i];
+
+ NPY_UF_DBG_PRINTF("Returning inner loop successfully\n");
+
+ return 0;
}
+
}
- /*
- * If any of different type, or misaligned or swapped
- * then must use buffers
- */
- loop->bufcnt = 0;
- loop->obj = 0;
- /* Determine looping method needed */
- loop->meth = NO_UFUNCLOOP;
- if (loop->size == 0) {
- return nargs;
+ /* If no function was found, throw an error */
+ NPY_UF_DBG_PRINTF("No loop was found\n");
+ if (no_castable_output) {
+ PyErr_Format(PyExc_TypeError,
+ "ufunc '%s' output (typecode '%c') could not be coerced to "
+ "provided output parameter (typecode '%c') according "
+ "to the casting rule '%s'",
+ ufunc_name, err_src_typecode, err_dst_typecode,
+ _casting_to_string(output_casting));
}
- if (self->core_enabled) {
- loop->meth = SIGNATURE_NOBUFFER_UFUNCLOOP;
- }
- for (i = 0; i < self->nargs; i++) {
- loop->needbuffer[i] = 0;
- if (arg_types[i] != mps[i]->descr->type_num
- || !PyArray_ISBEHAVED_RO(mps[i])) {
- if (self->core_enabled) {
- PyErr_SetString(PyExc_RuntimeError,
- "never reached; copy should have been made");
+ else {
+ /*
+ * TODO: We should try again if the casting rule is same_kind
+ * or unsafe, and look for a function more liberally.
+ */
+ PyErr_Format(PyExc_TypeError,
+ "ufunc '%s' not supported for the input types, and the "
+ "inputs could not be safely coerced to any supported "
+ "types according to the casting rule '%s'",
+ ufunc_name,
+ _casting_to_string(input_casting));
+ }
+
+ return -1;
+}
+
+/*
+ * Does a linear search for the inner loop of the ufunc specified by type_tup.
+ * When op[i] is a scalar or a one dimensional array smaller than
+ * the buffersize, and needs a dtype conversion, this function
+ * may substitute op[i] with a version cast to the correct type. This way,
+ * the later trivial loop detection has a higher chance of being triggered.
+ *
+ * Note that if an error is returned, the caller must free the non-zero
+ * references in out_dtype. This function does not do its own clean-up.
+ */
+static int
+find_specified_ufunc_inner_loop(PyUFuncObject *self,
+ PyObject *type_tup,
+ PyArrayObject **op,
+ NPY_CASTING casting,
+ npy_intp buffersize,
+ int any_object,
+ PyArray_Descr **out_dtype,
+ PyUFuncGenericFunction *out_innerloop,
+ void **out_innerloopdata,
+ int *out_trivial_loop_ok)
+{
+ npy_intp i, j, n, nin = self->nin, niter = nin + self->nout;
+ int n_specified = 0;
+ int specified_types[NPY_MAXARGS], types[NPY_MAXARGS];
+ char *ufunc_name;
+ int no_castable_output, all_inputs_scalar;
+
+ /* For making a better error message on coercion error */
+ char err_dst_typecode = '-', err_src_typecode = '-';
+
+ ufunc_name = self->name ? self->name : "(unknown)";
+
+ /* Check whether all the inputs are scalar */
+ all_inputs_scalar = 1;
+ for(i = 0; i < nin; ++i) {
+ if (PyArray_NDIM(op[i]) > 0) {
+ all_inputs_scalar = 0;
+ }
+ }
+
+ /* Fill in specified_types from the tuple or string */
+ if (PyTuple_Check(type_tup)) {
+ n = PyTuple_GET_SIZE(type_tup);
+ if (n != 1 && n != niter) {
+ PyErr_Format(PyExc_ValueError,
+ "a type-tuple must be specified " \
+ "of length 1 or %d for ufunc '%s'", (int)niter,
+ self->name ? self->name : "(unknown)");
+ return -1;
+ }
+
+ for (i = 0; i < n; ++i) {
+ PyArray_Descr *dtype = NULL;
+ if (!PyArray_DescrConverter(PyTuple_GET_ITEM(type_tup, i),
+ &dtype)) {
return -1;
}
- loop->meth = BUFFER_UFUNCLOOP;
- loop->needbuffer[i] = 1;
+ specified_types[i] = dtype->type_num;
+ Py_DECREF(dtype);
+ }
+
+ n_specified = n;
+ }
+ else if (PyString_Check(type_tup)) {
+ Py_ssize_t length;
+ char *str;
+
+ if (!PyString_AsStringAndSize(type_tup, &str, &length) < 0) {
+ return -1;
+ }
+ if (length != 1 && (length != niter + 2 ||
+ str[nin] != '-' || str[nin+1] != '>')) {
+ PyErr_Format(PyExc_ValueError,
+ "a type-string for %s, " \
+ "requires 1 typecode, or "
+ "%d typecode(s) before " \
+ "and %d after the -> sign",
+ self->name ? self->name : "(unknown)",
+ self->nin, self->nout);
+ return -1;
}
- if (!(loop->obj & UFUNC_OBJ_ISOBJECT)
- && ((mps[i]->descr->type_num == PyArray_OBJECT)
- || (arg_types[i] == PyArray_OBJECT))) {
- loop->obj = UFUNC_OBJ_ISOBJECT|UFUNC_OBJ_NEEDS_API;
+ if (length == 1) {
+ PyArray_Descr *dtype;
+ n_specified = 1;
+ dtype = PyArray_DescrFromType(str[0]);
+ if (dtype == NULL) {
+ return -1;
+ }
+ NPY_UF_DBG_PRINTF("signature character '%c', type num %d\n",
+ str[0], dtype->type_num);
+ specified_types[0] = dtype->type_num;
+ Py_DECREF(dtype);
}
- if (!(loop->obj & UFUNC_OBJ_NEEDS_API)
- && ((mps[i]->descr->type_num == PyArray_DATETIME)
- || (mps[i]->descr->type_num == PyArray_TIMEDELTA)
- || (arg_types[i] == PyArray_DATETIME)
- || (arg_types[i] == PyArray_TIMEDELTA))) {
- loop->obj = UFUNC_OBJ_NEEDS_API;
+ else {
+ PyArray_Descr *dtype;
+ n_specified = (int)niter;
+
+ for (i = 0; i < niter; ++i) {
+ npy_intp istr = i < nin ? i : i+2;
+
+ dtype = PyArray_DescrFromType(str[istr]);
+ if (dtype == NULL) {
+ return -1;
+ }
+ NPY_UF_DBG_PRINTF("signature character '%c', type num %d\n",
+ str[istr], dtype->type_num);
+ specified_types[i] = dtype->type_num;
+ Py_DECREF(dtype);
+ }
}
}
- if (self->core_enabled && (loop->obj & UFUNC_OBJ_ISOBJECT)) {
- PyErr_SetString(PyExc_TypeError,
- "Object type not allowed in ufunc with signature");
- return -1;
+ /* If the ufunc has userloops, search for them. */
+ if (self->userloops) {
+ NPY_UF_DBG_PRINTF("Searching user loops for specified sig\n");
+ switch (find_ufunc_specified_userloop(self,
+ n_specified, specified_types,
+ op, casting,
+ buffersize, any_object, all_inputs_scalar,
+ out_dtype, out_innerloop, out_innerloopdata,
+ out_trivial_loop_ok)) {
+ /* Error */
+ case -1:
+ return -1;
+ /* Found matching loop */
+ case 1:
+ return 0;
+ }
}
- if (loop->meth == NO_UFUNCLOOP) {
- loop->meth = ONE_UFUNCLOOP;
+
+ NPY_UF_DBG_PRINTF("Searching loops for specified sig\n");
+ for (i = 0; i < self->ntypes; ++i) {
+ char *orig_types = self->types + i*self->nargs;
+ int matched = 1;
- /* All correct type and BEHAVED */
- /* Check for non-uniform stridedness */
- for (i = 0; i < self->nargs; i++) {
- if (!(loop->iters[i]->contiguous)) {
- /*
- * May still have uniform stride
- * if (broadcast result) <= 1-d
- */
- if (mps[i]->nd != 0 && \
- (loop->iters[i]->nd_m1 > 0)) {
- loop->meth = NOBUFFER_UFUNCLOOP;
+ NPY_UF_DBG_PRINTF("Trying function loop %d\n", (int)i);
+
+ /* Copy the types into an int array for matching */
+ for (j = 0; j < niter; ++j) {
+ types[j] = orig_types[j];
+ }
+
+ if (n_specified == niter) {
+ for (j = 0; j < niter; ++j) {
+ if (types[j] != specified_types[j]) {
+ matched = 0;
break;
}
}
- }
- if (loop->meth == ONE_UFUNCLOOP) {
- for (i = 0; i < self->nargs; i++) {
- loop->bufptr[i] = mps[i]->data;
+ } else {
+ NPY_UF_DBG_PRINTF("Specified type: %d, first output type: %d\n",
+ specified_types[0], types[nin]);
+ if (types[nin] != specified_types[0]) {
+ matched = 0;
}
}
+ if (!matched) {
+ continue;
+ }
+
+ NPY_UF_DBG_PRINTF("It matches, confirming type casting\n");
+ switch (ufunc_loop_matches(self, op,
+ casting, casting,
+ any_object, all_inputs_scalar,
+ types,
+ &no_castable_output, &err_src_typecode,
+ &err_dst_typecode)) {
+ /* Error */
+ case -1:
+ return -1;
+ /* It worked */
+ case 1:
+ set_ufunc_loop_data_types(self, op, out_dtype, types,
+ buffersize, out_trivial_loop_ok);
+
+ /* Save the inner loop and its data */
+ *out_innerloop = self->functions[i];
+ *out_innerloopdata = self->data[i];
+
+ NPY_UF_DBG_PRINTF("Returning specified inner loop successfully\n");
+
+ return 0;
+ /* Didn't work */
+ case 0:
+ PyErr_Format(PyExc_TypeError,
+ "found a loop for ufunc '%s' "
+ "matching the type-tuple, "
+ "but the inputs and/or outputs could not be "
+ "cast according to the casting rule",
+ ufunc_name);
+ return -1;
+ }
+
+ }
+
+ /* If no function was found, throw an error */
+ NPY_UF_DBG_PRINTF("No specified loop was found\n");
+
+ PyErr_Format(PyExc_TypeError,
+ "No loop matching the specified signature was found "
+ "for ufunc %s", ufunc_name);
+
+ return -1;
+}
+
+static void
+trivial_two_operand_loop(PyArrayObject **op,
+ PyUFuncGenericFunction innerloop,
+ void *innerloopdata)
+{
+ char *data[2];
+ npy_intp count[2], stride[2];
+ int needs_api;
+ NPY_BEGIN_THREADS_DEF;
+
+ needs_api = PyDataType_REFCHK(PyArray_DESCR(op[0])) ||
+ PyDataType_REFCHK(PyArray_DESCR(op[1]));
+
+ PyArray_PREPARE_TRIVIAL_PAIR_ITERATION(op[0], op[1],
+ count[0],
+ data[0], data[1],
+ stride[0], stride[1]);
+ count[1] = count[0];
+ NPY_UF_DBG_PRINTF("two operand loop count %d\n", (int)count[0]);
+
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
}
- loop->numiter = self->nargs;
+ innerloop(data, count, stride, innerloopdata);
- /* Fill in steps */
- if (loop->meth == SIGNATURE_NOBUFFER_UFUNCLOOP && loop->nd == 0) {
- /* Use default core_strides */
+ if (!needs_api) {
+ NPY_END_THREADS;
}
- else if (loop->meth != ONE_UFUNCLOOP) {
- int ldim;
- intp minsum;
- intp maxdim;
- PyArrayIterObject *it;
- intp stride_sum[NPY_MAXDIMS];
- int j;
+}
- /* Fix iterators */
+static void
+trivial_three_operand_loop(PyArrayObject **op,
+ PyUFuncGenericFunction innerloop,
+ void *innerloopdata)
+{
+ char *data[3];
+ npy_intp count[3], stride[3];
+ int needs_api;
+ NPY_BEGIN_THREADS_DEF;
- /*
- * Optimize axis the iteration takes place over
- *
- * The first thought was to have the loop go
- * over the largest dimension to minimize the number of loops
- *
- * However, on processors with slow memory bus and cache,
- * the slowest loops occur when the memory access occurs for
- * large strides.
- *
- * Thus, choose the axis for which strides of the last iterator is
- * smallest but non-zero.
- */
- for (i = 0; i < loop->nd; i++) {
- stride_sum[i] = 0;
- for (j = 0; j < loop->numiter; j++) {
- stride_sum[i] += loop->iters[j]->strides[i];
+ needs_api = PyDataType_REFCHK(PyArray_DESCR(op[0])) ||
+ PyDataType_REFCHK(PyArray_DESCR(op[1])) ||
+ PyDataType_REFCHK(PyArray_DESCR(op[2]));
+
+ PyArray_PREPARE_TRIVIAL_TRIPLE_ITERATION(op[0], op[1], op[2],
+ count[0],
+ data[0], data[1], data[2],
+ stride[0], stride[1], stride[2]);
+ count[1] = count[0];
+ count[2] = count[0];
+ NPY_UF_DBG_PRINTF("three operand loop count %d\n", (int)count[0]);
+
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
+ }
+
+ innerloop(data, count, stride, innerloopdata);
+
+ if (!needs_api) {
+ NPY_END_THREADS;
+ }
+}
+
+/*
+ * Calls the given __array_prepare__ function on the operand *op,
+ * substituting it in place if a new array is returned and matches
+ * the old one.
+ *
+ * This requires that the dimensions, strides and data type remain
+ * exactly the same, which may be more strict than before.
+ */
+static int
+prepare_ufunc_output(PyUFuncObject *self,
+ PyArrayObject **op,
+ PyObject *arr_prep,
+ PyObject *arr_prep_args,
+ int i)
+{
+ if (arr_prep != NULL && arr_prep != Py_None) {
+ PyObject *res;
+
+ res = PyObject_CallFunction(arr_prep, "O(OOi)",
+ *op, self, arr_prep_args, i);
+ if ((res == NULL) || (res == Py_None) || !PyArray_Check(res)) {
+ if (!PyErr_Occurred()){
+ PyErr_SetString(PyExc_TypeError,
+ "__array_prepare__ must return an "
+ "ndarray or subclass thereof");
}
+ Py_XDECREF(res);
+ return -1;
}
- ldim = loop->nd - 1;
- minsum = stride_sum[loop->nd - 1];
- for (i = loop->nd - 2; i >= 0; i--) {
- if (stride_sum[i] < minsum ) {
- ldim = i;
- minsum = stride_sum[i];
- }
+ /* If the same object was returned, nothing to do */
+ if (res == (PyObject *)*op) {
+ Py_DECREF(res);
+ }
+ /* If the result doesn't match, throw an error */
+ else if (PyArray_NDIM(res) != PyArray_NDIM(*op) ||
+ !PyArray_CompareLists(PyArray_DIMS(res),
+ PyArray_DIMS(*op),
+ PyArray_NDIM(res)) ||
+ !PyArray_CompareLists(PyArray_STRIDES(res),
+ PyArray_STRIDES(*op),
+ PyArray_NDIM(res)) ||
+ !PyArray_EquivTypes(PyArray_DESCR(res),
+ PyArray_DESCR(*op))) {
+ PyErr_SetString(PyExc_TypeError,
+ "__array_prepare__ must return an "
+ "ndarray or subclass thereof which is "
+ "otherwise identical to its input");
+ Py_DECREF(res);
+ return -1;
+ }
+ /* Replace the op value */
+ else {
+ Py_DECREF(*op);
+ *op = (PyArrayObject *)res;
+ }
+ }
+
+ return 0;
+}
+
+static int
+iterator_loop(PyUFuncObject *self,
+ PyArrayObject **op,
+ PyArray_Descr **dtype,
+ NPY_ORDER order,
+ npy_intp buffersize,
+ PyObject **arr_prep,
+ PyObject *arr_prep_args,
+ PyUFuncGenericFunction innerloop,
+ void *innerloopdata)
+{
+ npy_intp i, nin = self->nin, nout = self->nout;
+ npy_intp niter = nin + nout;
+ npy_uint32 op_flags[NPY_MAXARGS];
+ NpyIter *iter;
+ char *baseptrs[NPY_MAXARGS];
+ int needs_api;
+ NPY_BEGIN_THREADS_DEF;
+
+ NpyIter_IterNext_Fn iternext;
+ char **dataptr;
+ npy_intp *stride;
+ npy_intp *count_ptr;
+
+ PyArrayObject **op_it;
+
+ /* Set up the flags */
+ for (i = 0; i < nin; ++i) {
+ op_flags[i] = NPY_ITER_READONLY|
+ NPY_ITER_ALIGNED;
+ }
+ for (i = nin; i < niter; ++i) {
+ op_flags[i] = NPY_ITER_WRITEONLY|
+ NPY_ITER_ALIGNED|
+ NPY_ITER_ALLOCATE|
+ NPY_ITER_NO_BROADCAST|
+ NPY_ITER_NO_SUBTYPE;
+ }
+
+ /*
+ * Allocate the iterator. Because the types of the inputs
+ * were already checked, we use the casting rule 'unsafe' which
+ * is faster to calculate.
+ */
+ iter = NpyIter_MultiNew(niter, op,
+ NPY_ITER_NO_INNER_ITERATION|
+ NPY_ITER_REFS_OK|
+ NPY_ITER_ZEROSIZE_OK|
+ NPY_ITER_BUFFERED|
+ NPY_ITER_GROWINNER|
+ NPY_ITER_DELAY_BUFALLOC,
+ order, NPY_UNSAFE_CASTING,
+ op_flags, dtype,
+ 0, NULL, buffersize);
+ if (iter == NULL) {
+ return -1;
+ }
+
+ needs_api = NpyIter_IterationNeedsAPI(iter);
+
+ /* Copy any allocated outputs */
+ op_it = NpyIter_GetOperandArray(iter);
+ for (i = nin; i < niter; ++i) {
+ if (op[i] == NULL) {
+ op[i] = op_it[i];
+ Py_INCREF(op[i]);
}
- maxdim = loop->dimensions[ldim];
- loop->size /= maxdim;
- loop->bufcnt = maxdim;
- loop->lastdim = ldim;
+ }
- /*
- * Fix the iterators so the inner loop occurs over the
- * largest dimensions -- This can be done by
- * setting the size to 1 in that dimension
- * (just in the iterators)
- */
- for (i = 0; i < loop->numiter; i++) {
- it = loop->iters[i];
- it->contiguous = 0;
- it->size /= (it->dims_m1[ldim] + 1);
- it->dims_m1[ldim] = 0;
- it->backstrides[ldim] = 0;
+ /* Call the __array_prepare__ functions where necessary */
+ for (i = 0; i < nout; ++i) {
+ if (prepare_ufunc_output(self, &op[nin+i],
+ arr_prep[i], arr_prep_args, i) < 0) {
+ NpyIter_Deallocate(iter);
+ return -1;
+ }
+ }
- /*
- * (won't fix factors because we
- * don't use PyArray_ITER_GOTO1D
- * so don't change them)
- *
- * Set the steps to the strides in that dimension
- */
- loop->steps[i] = it->strides[ldim];
+ /* Only do the loop if the iteration size is non-zero */
+ if (NpyIter_GetIterSize(iter) != 0) {
+
+ /* Reset the iterator with the base pointers from the wrapped outputs */
+ for (i = 0; i < niter; ++i) {
+ baseptrs[i] = PyArray_BYTES(op[i]);
+ }
+ if (NpyIter_ResetBasePointers(iter, baseptrs, NULL) != NPY_SUCCEED) {
+ NpyIter_Deallocate(iter);
+ return -1;
}
- /*
- * Set looping part of core_dim_sizes and core_strides.
- */
- if (loop->meth == SIGNATURE_NOBUFFER_UFUNCLOOP) {
- loop->core_dim_sizes[0] = maxdim;
- for (i = 0; i < self->nargs; i++) {
- loop->core_strides[i] = loop->steps[i];
- }
+ /* Get the variables needed for the loop */
+ iternext = NpyIter_GetIterNext(iter, NULL);
+ if (iternext == NULL) {
+ NpyIter_Deallocate(iter);
+ return -1;
}
+ dataptr = NpyIter_GetDataPtrArray(iter);
+ stride = NpyIter_GetInnerStrideArray(iter);
+ count_ptr = NpyIter_GetInnerLoopSizePtr(iter);
- /*
- * fix up steps where we will be copying data to
- * buffers and calculate the ninnerloops and leftover
- * values -- if step size is already zero that is not changed...
- */
- if (loop->meth == BUFFER_UFUNCLOOP) {
- loop->leftover = maxdim % loop->bufsize;
- loop->ninnerloops = (maxdim / loop->bufsize) + 1;
- for (i = 0; i < self->nargs; i++) {
- if (loop->needbuffer[i] && loop->steps[i]) {
- loop->steps[i] = mps[i]->descr->elsize;
- }
- /* These are changed later if casting is needed */
- }
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
}
- }
- else if (loop->meth == ONE_UFUNCLOOP) {
- /* uniformly-strided case */
- for (i = 0; i < self->nargs; i++) {
- if (PyArray_SIZE(mps[i]) == 1) {
- loop->steps[i] = 0;
- }
- else {
- loop->steps[i] = mps[i]->strides[mps[i]->nd - 1];
- }
+
+ /* Execute the loop */
+ do {
+ NPY_UF_DBG_PRINTF("iterator loop count %d\n", (int)*count_ptr);
+ innerloop(dataptr, count_ptr, stride, innerloopdata);
+ } while (iternext(iter));
+
+ if (!needs_api) {
+ NPY_END_THREADS;
}
}
+ NpyIter_Deallocate(iter);
+ return 0;
+}
- /* Finally, create memory for buffers if we need them */
+/*
+ * trivial_loop_ok - 1 if no alignment, data conversion, etc required
+ * nin - number of inputs
+ * nout - number of outputs
+ * op - the operands (nin + nout of them)
+ * order - the loop execution order/output memory order
+ * buffersize - how big of a buffer to use
+ * arr_prep - the __array_prepare__ functions for the outputs
+ * innerloop - the inner loop function
+ * innerloopdata - data to pass to the inner loop
+ */
+static int
+execute_ufunc_loop(PyUFuncObject *self,
+ int trivial_loop_ok,
+ PyArrayObject **op,
+ PyArray_Descr **dtype,
+ NPY_ORDER order,
+ npy_intp buffersize,
+ PyObject **arr_prep,
+ PyObject *arr_prep_args,
+ PyUFuncGenericFunction innerloop,
+ void *innerloopdata)
+{
+ npy_intp nin = self->nin, nout = self->nout;
+
+ /* First check for the trivial cases that don't need an iterator */
+ if (trivial_loop_ok) {
+ if (nin == 1 && nout == 1) {
+ if (op[1] == NULL &&
+ (order == NPY_ANYORDER || order == NPY_KEEPORDER) &&
+ PyArray_TRIVIALLY_ITERABLE(op[0])) {
+ Py_INCREF(dtype[1]);
+ op[1] = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type,
+ dtype[1],
+ PyArray_NDIM(op[0]),
+ PyArray_DIMS(op[0]),
+ NULL, NULL,
+ PyArray_ISFORTRAN(op[0]) ? NPY_F_CONTIGUOUS : 0,
+ NULL);
+
+ /* Call the __prepare_array__ if necessary */
+ if (prepare_ufunc_output(self, &op[1],
+ arr_prep[0], arr_prep_args, 0) < 0) {
+ return -1;
+ }
- /*
- * Buffers for scalars are specially made small -- scalars are
- * not copied multiple times
- */
- if (loop->meth == BUFFER_UFUNCLOOP) {
- int cnt = 0, cntcast = 0;
- int scnt = 0, scntcast = 0;
- char *castptr;
- char *bufptr;
- int last_was_scalar = 0;
- int last_cast_was_scalar = 0;
- int oldbufsize = 0;
- int oldsize = 0;
- int scbufsize = 4*sizeof(double);
- int memsize;
- PyArray_Descr *descr;
-
- /* compute the element size */
- for (i = 0; i < self->nargs; i++) {
- if (!loop->needbuffer[i]) {
- continue;
+ NPY_UF_DBG_PRINTF("trivial 1 input with allocated output\n");
+ trivial_two_operand_loop(op, innerloop, innerloopdata);
+
+ return 0;
}
- if (arg_types[i] != mps[i]->descr->type_num) {
- descr = PyArray_DescrFromType(arg_types[i]);
- if (loop->steps[i]) {
- cntcast += descr->elsize;
- }
- else {
- scntcast += descr->elsize;
+ else if (op[1] != NULL &&
+ PyArray_NDIM(op[1]) >= PyArray_NDIM(op[0]) &&
+ PyArray_TRIVIALLY_ITERABLE_PAIR(op[0], op[1])) {
+
+ /* Call the __prepare_array__ if necessary */
+ if (prepare_ufunc_output(self, &op[1],
+ arr_prep[0], arr_prep_args, 0) < 0) {
+ return -1;
}
- if (i < self->nin) {
- loop->cast[i] = PyArray_GetCastFunc(mps[i]->descr,
- arg_types[i]);
+
+ NPY_UF_DBG_PRINTF("trivial 1 input\n");
+ trivial_two_operand_loop(op, innerloop, innerloopdata);
+
+ return 0;
+ }
+ }
+ else if (nin == 2 && nout == 1) {
+ if (op[2] == NULL &&
+ (order == NPY_ANYORDER || order == NPY_KEEPORDER) &&
+ PyArray_TRIVIALLY_ITERABLE_PAIR(op[0], op[1])) {
+ PyArrayObject *tmp;
+ /*
+ * Have to choose the input with more dimensions to clone, as
+ * one of them could be a scalar.
+ */
+ if (PyArray_NDIM(op[0]) >= PyArray_NDIM(op[1])) {
+ tmp = op[0];
}
else {
- loop->cast[i] = PyArray_GetCastFunc \
- (descr, mps[i]->descr->type_num);
+ tmp = op[1];
}
- Py_DECREF(descr);
- if (!loop->cast[i]) {
+ Py_INCREF(dtype[2]);
+ op[2] = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type,
+ dtype[2],
+ PyArray_NDIM(tmp),
+ PyArray_DIMS(tmp),
+ NULL, NULL,
+ PyArray_ISFORTRAN(tmp) ? NPY_F_CONTIGUOUS : 0,
+ NULL);
+
+ /* Call the __prepare_array__ if necessary */
+ if (prepare_ufunc_output(self, &op[2],
+ arr_prep[0], arr_prep_args, 0) < 0) {
return -1;
}
- }
- loop->swap[i] = !(PyArray_ISNOTSWAPPED(mps[i]));
- if (loop->steps[i]) {
- cnt += mps[i]->descr->elsize;
- }
- else {
- scnt += mps[i]->descr->elsize;
- }
- }
- memsize = loop->bufsize*(cnt+cntcast) + scbufsize*(scnt+scntcast);
- loop->buffer[0] = PyDataMem_NEW(memsize);
- /*
- * debug
- * fprintf(stderr, "Allocated buffer at %p of size %d, cnt=%d, cntcast=%d\n",
- * loop->buffer[0], loop->bufsize * (cnt + cntcast), cnt, cntcast);
- */
- if (loop->buffer[0] == NULL) {
- PyErr_NoMemory();
- return -1;
- }
- if (loop->obj & UFUNC_OBJ_ISOBJECT) {
- memset(loop->buffer[0], 0, memsize);
- }
- castptr = loop->buffer[0] + loop->bufsize*cnt + scbufsize*scnt;
- bufptr = loop->buffer[0];
- loop->objfunc = 0;
- for (i = 0; i < self->nargs; i++) {
- if (!loop->needbuffer[i]) {
- continue;
- }
- loop->buffer[i] = bufptr + (last_was_scalar ? scbufsize :
- loop->bufsize)*oldbufsize;
- last_was_scalar = (loop->steps[i] == 0);
- bufptr = loop->buffer[i];
- oldbufsize = mps[i]->descr->elsize;
- /* fprintf(stderr, "buffer[%d] = %p\n", i, loop->buffer[i]); */
- if (loop->cast[i]) {
- PyArray_Descr *descr;
- loop->castbuf[i] = castptr + (last_cast_was_scalar ? scbufsize :
- loop->bufsize)*oldsize;
- last_cast_was_scalar = last_was_scalar;
- /* fprintf(stderr, "castbuf[%d] = %p\n", i, loop->castbuf[i]); */
- descr = PyArray_DescrFromType(arg_types[i]);
- oldsize = descr->elsize;
- Py_DECREF(descr);
- loop->bufptr[i] = loop->castbuf[i];
- castptr = loop->castbuf[i];
- if (loop->steps[i]) {
- loop->steps[i] = oldsize;
- }
- }
- else {
- loop->bufptr[i] = loop->buffer[i];
+ NPY_UF_DBG_PRINTF("trivial 2 input with allocated output\n");
+ trivial_three_operand_loop(op, innerloop, innerloopdata);
+
+ return 0;
}
- if (!loop->objfunc && (loop->obj & UFUNC_OBJ_ISOBJECT)) {
- if (arg_types[i] == PyArray_OBJECT) {
- loop->objfunc = 1;
+ else if (op[2] != NULL &&
+ PyArray_NDIM(op[2]) >= PyArray_NDIM(op[0]) &&
+ PyArray_NDIM(op[2]) >= PyArray_NDIM(op[1]) &&
+ PyArray_TRIVIALLY_ITERABLE_TRIPLE(op[0], op[1], op[2])) {
+
+ /* Call the __prepare_array__ if necessary */
+ if (prepare_ufunc_output(self, &op[2],
+ arr_prep[0], arr_prep_args, 0) < 0) {
+ return -1;
}
+
+ NPY_UF_DBG_PRINTF("trivial 2 input\n");
+ trivial_three_operand_loop(op, innerloop, innerloopdata);
+
+ return 0;
}
}
}
- if (_does_loop_use_arrays(loop->funcdata)) {
- loop->funcdata = (void*)mps;
- }
-
- return nargs;
-}
+ /*
+ * If no trivial loop matched, an iterator is required to
+ * resolve broadcasting, etc
+ */
-static void
-ufuncreduce_dealloc(PyUFuncReduceObject *self)
-{
- if (self->ufunc) {
- Py_XDECREF(self->it);
- Py_XDECREF(self->rit);
- Py_XDECREF(self->ret);
- Py_XDECREF(self->errobj);
- Py_XDECREF(self->decref);
- if (self->buffer) {
- PyDataMem_FREE(self->buffer);
- }
- Py_DECREF(self->ufunc);
+ NPY_UF_DBG_PRINTF("iterator loop\n");
+ if (iterator_loop(self, op, dtype, order,
+ buffersize, arr_prep, arr_prep_args,
+ innerloop, innerloopdata) < 0) {
+ return -1;
}
- _pya_free(self);
+
+ return 0;
}
-static void
-ufuncloop_dealloc(PyUFuncLoopObject *self)
+static PyObject *
+make_arr_prep_args(npy_intp nin, PyObject *args, PyObject *kwds)
{
- int i;
+ PyObject *out = kwds ? PyDict_GetItemString(kwds, "out") : NULL;
+ PyObject *arr_prep_args;
- if (self->ufunc != NULL) {
- if (self->core_dim_sizes) {
- _pya_free(self->core_dim_sizes);
+ if (out == NULL) {
+ Py_INCREF(args);
+ return args;
+ }
+ else {
+ npy_intp i, nargs = PyTuple_GET_SIZE(args), n;
+ n = nargs;
+ if (n < nin + 1) {
+ n = nin + 1;
}
- if (self->core_strides) {
- _pya_free(self->core_strides);
+ arr_prep_args = PyTuple_New(n);
+ if (arr_prep_args == NULL) {
+ return NULL;
}
- for (i = 0; i < self->ufunc->nargs; i++) {
- Py_XDECREF(self->iters[i]);
+ /* Copy the tuple, but set the nin-th item to the keyword arg */
+ for (i = 0; i < nin; ++i) {
+ PyObject *item = PyTuple_GET_ITEM(args, i);
+ Py_INCREF(item);
+ PyTuple_SET_ITEM(arr_prep_args, i, item);
}
- if (self->buffer[0]) {
- PyDataMem_FREE(self->buffer[0]);
+ Py_INCREF(out);
+ PyTuple_SET_ITEM(arr_prep_args, nin, out);
+ for (i = nin+1; i < n; ++i) {
+ PyObject *item = PyTuple_GET_ITEM(args, i);
+ Py_INCREF(item);
+ PyTuple_SET_ITEM(arr_prep_args, i, item);
}
- Py_XDECREF(self->errobj);
- Py_DECREF(self->ufunc);
+
+ return arr_prep_args;
}
- _pya_free(self);
}
-static PyUFuncLoopObject *
-construct_loop(PyUFuncObject *self, PyObject *args, PyObject *kwds, PyArrayObject **mps)
+static int
+PyUFunc_GeneralizedFunction(PyUFuncObject *self,
+ PyObject *args, PyObject *kwds,
+ PyArrayObject **op)
{
- PyUFuncLoopObject *loop;
- int i;
- PyObject *typetup = NULL;
- PyObject *extobj = NULL;
- char *name;
+ npy_intp nin, nout;
+ npy_intp i, idim, niter;
+ char *ufunc_name;
+ int retval = -1, any_object = 0;
+ NPY_CASTING input_casting;
+
+ PyArray_Descr *dtype[NPY_MAXARGS];
+
+ /* Use remapped axes for generalized ufunc */
+ npy_intp broadcast_ndim, op_ndim;
+ npy_intp op_axes_arrays[NPY_MAXARGS][NPY_MAXDIMS];
+ npy_intp *op_axes[NPY_MAXARGS];
+
+ npy_uint32 op_flags[NPY_MAXARGS];
+
+ NpyIter *iter = NULL;
+
+ /* These parameters come from extobj= or from a TLS global */
+ int buffersize = 0, errormask = 0;
+ PyObject *errobj = NULL;
+ int first_error = 1;
+
+ /* The selected inner loop */
+ PyUFuncGenericFunction innerloop = NULL;
+ void *innerloopdata = NULL;
+ /* The dimensions which get passed to the inner loop */
+ npy_intp inner_dimensions[NPY_MAXDIMS+1];
+ /* The strides which get passed to the inner loop */
+ npy_intp *inner_strides = NULL;
+
+ npy_intp *inner_strides_tmp, *ax_strides_tmp[NPY_MAXDIMS];
+ int core_dim_ixs_size, *core_dim_ixs;
+
+ /* The __array_prepare__ function to call for each output */
+ PyObject *arr_prep[NPY_MAXARGS];
+ /*
+ * This is either args, or args with the out= parameter from
+ * kwds added appropriately.
+ */
+ PyObject *arr_prep_args = NULL;
+
+ int trivial_loop_ok = 0;
+
+ /* TODO: For 1.6, the default should probably be NPY_CORDER */
+ NPY_ORDER order = NPY_KEEPORDER;
+ /*
+ * Many things in NumPy do unsafe casting (doing int += float, etc).
+ * The strictness should probably become a state parameter, similar
+ * to the seterr/geterr.
+ */
+ NPY_CASTING casting = NPY_UNSAFE_CASTING;
+ /* When provided, extobj and typetup contain borrowed references */
+ PyObject *extobj = NULL, *type_tup = NULL;
if (self == NULL) {
PyErr_SetString(PyExc_ValueError, "function not supported");
- return NULL;
+ return -1;
}
- if ((loop = _pya_malloc(sizeof(PyUFuncLoopObject))) == NULL) {
- PyErr_NoMemory();
- return loop;
+
+ nin = self->nin;
+ nout = self->nout;
+ niter = nin + nout;
+
+ ufunc_name = self->name ? self->name : "<unnamed ufunc>";
+
+ NPY_UF_DBG_PRINTF("\nEvaluating ufunc %s\n", ufunc_name);
+
+ /* Initialize all the operands and dtypes to NULL */
+ for (i = 0; i < niter; ++i) {
+ op[i] = NULL;
+ dtype[i] = NULL;
+ arr_prep[i] = NULL;
}
- loop->index = 0;
- loop->ufunc = self;
- Py_INCREF(self);
- loop->buffer[0] = NULL;
- for (i = 0; i < self->nargs; i++) {
- loop->iters[i] = NULL;
- loop->cast[i] = NULL;
+ NPY_UF_DBG_PRINTF("Getting arguments\n");
+
+ /* Get all the arguments */
+ retval = get_ufunc_arguments(self, args, kwds,
+ op, &order, &casting, &extobj, &type_tup, &any_object);
+ if (retval < 0) {
+ goto fail;
}
- loop->errobj = NULL;
- loop->notimplemented = 0;
- loop->first = 1;
- loop->core_dim_sizes = NULL;
- loop->core_strides = NULL;
- if (self->core_enabled) {
- int num_dim_ix = 1 + self->core_num_dim_ix;
- int nstrides = self->nargs + self->core_offsets[self->nargs - 1]
- + self->core_num_dims[self->nargs - 1];
- loop->core_dim_sizes = _pya_malloc(sizeof(npy_intp)*num_dim_ix);
- loop->core_strides = _pya_malloc(sizeof(npy_intp)*nstrides);
- if (loop->core_dim_sizes == NULL || loop->core_strides == NULL) {
- PyErr_NoMemory();
- goto fail;
- }
- memset(loop->core_strides, 0, sizeof(npy_intp) * nstrides);
- for (i = 0; i < num_dim_ix; i++) {
- loop->core_dim_sizes[i] = 1;
+ /* Figure out the number of dimensions needed by the iterator */
+ broadcast_ndim = 0;
+ for (i = 0; i < nin; ++i) {
+ npy_intp n = PyArray_NDIM(op[i]) - self->core_num_dims[i];
+ if (n > broadcast_ndim) {
+ broadcast_ndim = n;
}
}
- name = self->name ? self->name : "";
-
- /*
- * Extract sig= keyword and extobj= keyword if present.
- * Raise an error if anything else is present in the
- * keyword dictionary
- */
- if (kwds != NULL) {
- PyObject *key, *value;
- Py_ssize_t pos = 0;
- while (PyDict_Next(kwds, &pos, &key, &value)) {
- char *keystring = PyString_AsString(key);
+ op_ndim = broadcast_ndim + self->core_num_dim_ix;
+ if (op_ndim > NPY_MAXDIMS) {
+ PyErr_Format(PyExc_ValueError,
+ "too many dimensions for generalized ufunc %s",
+ ufunc_name);
+ retval = -1;
+ goto fail;
+ }
- if (keystring == NULL) {
- PyErr_Clear();
- PyErr_SetString(PyExc_TypeError, "invalid keyword");
- goto fail;
+ /* Fill in op_axes for all the operands */
+ core_dim_ixs_size = 0;
+ core_dim_ixs = self->core_dim_ixs;
+ for (i = 0; i < niter; ++i) {
+ npy_intp n;
+ if (op[i]) {
+ /*
+ * Note that n may be negative if broadcasting
+ * extends into the core dimensions.
+ */
+ n = PyArray_NDIM(op[i]) - self->core_num_dims[i];
+ }
+ else {
+ n = broadcast_ndim;
+ }
+ /* Broadcast all the unspecified dimensions normally */
+ for (idim = 0; idim < broadcast_ndim; ++idim) {
+ if (idim >= broadcast_ndim - n) {
+ op_axes_arrays[i][idim] = idim - (broadcast_ndim - n);
}
- if (strncmp(keystring,"extobj",6) == 0) {
- extobj = value;
+ else {
+ op_axes_arrays[i][idim] = -1;
}
- else if (strncmp(keystring,"sig",3) == 0) {
- typetup = value;
+ }
+ /* Use the signature information for the rest */
+ for (idim = broadcast_ndim; idim < op_ndim; ++idim) {
+ op_axes_arrays[i][idim] = -1;
+ }
+ for (idim = 0; idim < self->core_num_dims[i]; ++idim) {
+ if (n + idim >= 0) {
+ op_axes_arrays[i][broadcast_ndim + core_dim_ixs[idim]] =
+ n + idim;
}
else {
- char *format = "'%s' is an invalid keyword to %s";
- PyErr_Format(PyExc_TypeError,format,keystring, name);
- goto fail;
+ op_axes_arrays[i][broadcast_ndim + core_dim_ixs[idim]] = -1;
}
}
+ core_dim_ixs_size += self->core_num_dims[i];
+ core_dim_ixs += self->core_num_dims[i];
+ op_axes[i] = op_axes_arrays[i];
}
+ /* Get the buffersize, errormask, and error object globals */
if (extobj == NULL) {
- if (PyUFunc_GetPyValues(name,
- &(loop->bufsize), &(loop->errormask),
- &(loop->errobj)) < 0) {
+ if (PyUFunc_GetPyValues(ufunc_name,
+ &buffersize, &errormask, &errobj) < 0) {
+ retval = -1;
goto fail;
}
}
else {
- if (_extract_pyvals(extobj, name,
- &(loop->bufsize), &(loop->errormask),
- &(loop->errobj)) < 0) {
+ if (_extract_pyvals(extobj, ufunc_name,
+ &buffersize, &errormask, &errobj) < 0) {
+ retval = -1;
goto fail;
}
}
- /* Setup the arrays */
- if (construct_arrays(loop, args, mps, typetup) < 0) {
+ NPY_UF_DBG_PRINTF("Finding inner loop\n");
+
+ /*
+ * Decide the casting rules for inputs and outputs. We want
+ * NPY_SAFE_CASTING or stricter, so that the loop selection code
+ * doesn't choose an integer loop for float inputs, for example.
+ */
+ input_casting = (casting > NPY_SAFE_CASTING) ? NPY_SAFE_CASTING : casting;
+
+ if (type_tup == NULL) {
+ /* Find the best ufunc inner loop, and fill in the dtypes */
+ retval = find_best_ufunc_inner_loop(self, op, input_casting, casting,
+ buffersize, any_object, dtype,
+ &innerloop, &innerloopdata, &trivial_loop_ok);
+ } else {
+ /* Find the specified ufunc inner loop, and fill in the dtypes */
+ retval = find_specified_ufunc_inner_loop(self, type_tup,
+ op, casting,
+ buffersize, any_object, dtype,
+ &innerloop, &innerloopdata, &trivial_loop_ok);
+ }
+ if (retval < 0) {
goto fail;
}
- PyUFunc_clearfperr();
- return loop;
-fail:
- ufuncloop_dealloc(loop);
- return NULL;
-}
+ /*
+ * FAIL with NotImplemented if the other object has
+ * the __r<op>__ method and has __array_priority__ as
+ * an attribute (signalling it can handle ndarray's)
+ * and is not already an ndarray or a subtype of the same type.
+ */
+ if (nin == 2 && nout == 1 && dtype[1]->type_num == NPY_OBJECT) {
+ PyObject *_obj = PyTuple_GET_ITEM(args, 1);
+ if (!PyArray_CheckExact(_obj)
+ /* If both are same subtype of object arrays, then proceed */
+ && !(Py_TYPE(_obj) == Py_TYPE(PyTuple_GET_ITEM(args, 0)))
+ && PyObject_HasAttrString(_obj, "__array_priority__")
+ && _has_reflected_op(_obj, ufunc_name)) {
+ retval = -2;
+ goto fail;
+ }
+ }
+#if NPY_UF_DBG_TRACING
+ printf("input types:\n");
+ for (i = 0; i < nin; ++i) {
+ PyObject_Print((PyObject *)dtype[i], stdout, 0);
+ printf(" ");
+ }
+ printf("\noutput types:\n");
+ for (i = nin; i < niter; ++i) {
+ PyObject_Print((PyObject *)dtype[i], stdout, 0);
+ printf(" ");
+ }
+ printf("\n");
+#endif
-/*
- static void
- _printbytebuf(PyUFuncLoopObject *loop, int bufnum)
- {
- int i;
-
- fprintf(stderr, "Printing byte buffer %d\n", bufnum);
- for(i=0; i<loop->bufcnt; i++) {
- fprintf(stderr, " %d\n", *(((byte *)(loop->buffer[bufnum]))+i));
- }
- }
-
- static void
- _printlongbuf(PyUFuncLoopObject *loop, int bufnum)
- {
- int i;
-
- fprintf(stderr, "Printing long buffer %d\n", bufnum);
- for(i=0; i<loop->bufcnt; i++) {
- fprintf(stderr, " %ld\n", *(((long *)(loop->buffer[bufnum]))+i));
- }
- }
-
- static void
- _printlongbufptr(PyUFuncLoopObject *loop, int bufnum)
- {
- int i;
-
- fprintf(stderr, "Printing long buffer %d\n", bufnum);
- for(i=0; i<loop->bufcnt; i++) {
- fprintf(stderr, " %ld\n", *(((long *)(loop->bufptr[bufnum]))+i));
- }
- }
-
-
-
- static void
- _printcastbuf(PyUFuncLoopObject *loop, int bufnum)
- {
- int i;
-
- fprintf(stderr, "Printing long buffer %d\n", bufnum);
- for(i=0; i<loop->bufcnt; i++) {
- fprintf(stderr, " %ld\n", *(((long *)(loop->castbuf[bufnum]))+i));
- }
- }
+ /*
+ * Get the appropriate __array_prepare__ function to call
+ * for each output
+ */
+ _find_array_prepare(args, kwds, arr_prep, nin, nout);
-*/
+ /* Set up arr_prep_args if a prep function was needed */
+ for (i = 0; i < nout; ++i) {
+ if (arr_prep[i] != NULL && arr_prep[i] != Py_None) {
+ arr_prep_args = make_arr_prep_args(nin, args, kwds);
+ break;
+ }
+ }
+ /* If the loop wants the arrays, provide them */
+ if (_does_loop_use_arrays(innerloopdata)) {
+ innerloopdata = (void*)op;
+ }
+ /*
+ * Set up the iterator per-op flags. For generalized ufuncs, we
+ * can't do buffering, so must COPY or UPDATEIFCOPY.
+ */
+ for (i = 0; i < nin; ++i) {
+ op_flags[i] = NPY_ITER_READONLY|
+ NPY_ITER_COPY|
+ NPY_ITER_ALIGNED;
+ }
+ for (i = nin; i < niter; ++i) {
+ op_flags[i] = NPY_ITER_READWRITE|
+ NPY_ITER_UPDATEIFCOPY|
+ NPY_ITER_ALIGNED|
+ NPY_ITER_ALLOCATE|
+ NPY_ITER_NO_BROADCAST;
+ }
+
+ /* Create the iterator */
+ iter = NpyIter_MultiNew(niter, op, NPY_ITER_COORDS|
+ NPY_ITER_REFS_OK|
+ NPY_ITER_REDUCE_OK,
+ order, NPY_UNSAFE_CASTING, op_flags,
+ dtype, op_ndim, op_axes, 0);
+ if (iter == NULL) {
+ retval = -1;
+ goto fail;
+ }
+ /* Fill in any allocated outputs */
+ for (i = nin; i < niter; ++i) {
+ if (op[i] == NULL) {
+ op[i] = NpyIter_GetOperandArray(iter)[i];
+ Py_INCREF(op[i]);
+ }
+ }
-/*
- * currently generic ufuncs cannot be built for use on flexible arrays.
- *
- * The cast functions in the generic loop would need to be fixed to pass
- * in something besides NULL, NULL.
- *
- * Also the underlying ufunc loops would not know the element-size unless
- * that was passed in as data (which could be arranged).
- *
- */
+ /*
+ * Set up the inner strides array. Because we're not doing
+ * buffering, the strides are fixed throughout the looping.
+ */
+ inner_strides = (npy_intp *)_pya_malloc(
+ NPY_SIZEOF_INTP * (niter+core_dim_ixs_size));
+ /* The strides after the first niter match core_dim_ixs */
+ core_dim_ixs = self->core_dim_ixs;
+ inner_strides_tmp = inner_strides + niter;
+ for (idim = 0; idim < self->core_num_dim_ix; ++idim) {
+ ax_strides_tmp[idim] = NpyIter_GetAxisStrideArray(iter,
+ broadcast_ndim+idim);
+ if (ax_strides_tmp[idim] == NULL) {
+ retval = -1;
+ goto fail;
+ }
+ }
+ for (i = 0; i < niter; ++i) {
+ for (idim = 0; idim < self->core_num_dims[i]; ++idim) {
+ inner_strides_tmp[idim] = ax_strides_tmp[core_dim_ixs[idim]][i];
+ }
-/*UFUNC_API
- *
- * This generic function is called with the ufunc object, the arguments to it,
- * and an array of (pointers to) PyArrayObjects which are NULL. The
- * arguments are parsed and placed in mps in construct_loop (construct_arrays)
- */
-NPY_NO_EXPORT int
-PyUFunc_GenericFunction(PyUFuncObject *self, PyObject *args, PyObject *kwds,
- PyArrayObject **mps)
-{
- PyUFuncLoopObject *loop;
- int i;
- NPY_BEGIN_THREADS_DEF;
+ core_dim_ixs += self->core_num_dims[i];
+ inner_strides_tmp += self->core_num_dims[i];
+ }
- if (!(loop = construct_loop(self, args, kwds, mps))) {
- return -1;
+ /* Set up the inner dimensions array */
+ if (NpyIter_GetShape(iter, inner_dimensions) != NPY_SUCCEED) {
+ retval = -1;
+ goto fail;
}
- if (loop->notimplemented) {
- ufuncloop_dealloc(loop);
- return -2;
+ /* Move the core dimensions to start at the second element */
+ memmove(&inner_dimensions[1], &inner_dimensions[broadcast_ndim],
+ NPY_SIZEOF_INTP * self->core_num_dim_ix);
+
+ /* Remove all the core dimensions from the iterator */
+ for (i = 0; i < self->core_num_dim_ix; ++i) {
+ if (NpyIter_RemoveAxis(iter, broadcast_ndim) != NPY_SUCCEED) {
+ retval = -1;
+ goto fail;
+ }
}
- if (self->core_enabled && loop->meth != SIGNATURE_NOBUFFER_UFUNCLOOP) {
- PyErr_SetString(PyExc_RuntimeError,
- "illegal loop method for ufunc with signature");
+ if (NpyIter_RemoveCoords(iter) != NPY_SUCCEED) {
+ retval = -1;
+ goto fail;
+ }
+ if (NpyIter_RemoveInnerLoop(iter) != NPY_SUCCEED) {
+ retval = -1;
goto fail;
}
- NPY_LOOP_BEGIN_THREADS;
- switch(loop->meth) {
- case ONE_UFUNCLOOP:
- /*
- * Everything is contiguous, notswapped, aligned,
- * and of the right type. -- Fastest.
- * Or if not contiguous, then a single-stride
- * increment moves through the entire array.
- */
- /*fprintf(stderr, "ONE...%d\n", loop->size);*/
- loop->function((char **)loop->bufptr, &(loop->size),
- loop->steps, loop->funcdata);
- UFUNC_CHECK_ERROR(loop);
- break;
- case NOBUFFER_UFUNCLOOP:
- /*
- * Everything is notswapped, aligned and of the
- * right type but not contiguous. -- Almost as fast.
- */
- /*fprintf(stderr, "NOBUFFER...%d\n", loop->size);*/
- while (loop->index < loop->size) {
- for (i = 0; i < self->nargs; i++) {
- loop->bufptr[i] = loop->iters[i]->dataptr;
- }
- loop->function((char **)loop->bufptr, &(loop->bufcnt),
- loop->steps, loop->funcdata);
- UFUNC_CHECK_ERROR(loop);
+ /*
+     * The first niter strides are for the inner loop (we can only
+     * copy them after removing the core axes).
+ */
+ memcpy(inner_strides, NpyIter_GetInnerStrideArray(iter),
+ NPY_SIZEOF_INTP * niter);
- /* Adjust loop pointers */
- for (i = 0; i < self->nargs; i++) {
- PyArray_ITER_NEXT(loop->iters[i]);
- }
- loop->index++;
- }
- break;
- case SIGNATURE_NOBUFFER_UFUNCLOOP:
- while (loop->index < loop->size) {
- for (i = 0; i < self->nargs; i++) {
- loop->bufptr[i] = loop->iters[i]->dataptr;
- }
- loop->function((char **)loop->bufptr, loop->core_dim_sizes,
- loop->core_strides, loop->funcdata);
- UFUNC_CHECK_ERROR(loop);
+#if 0
+ printf("strides: ");
+ for (i = 0; i < niter+core_dim_ixs_size; ++i) {
+ printf("%d ", (int)inner_strides[i]);
+ }
+ printf("\n");
+#endif
- /* Adjust loop pointers */
- for (i = 0; i < self->nargs; i++) {
- PyArray_ITER_NEXT(loop->iters[i]);
- }
- loop->index++;
- }
- break;
- case BUFFER_UFUNCLOOP: {
- /* This should be a function */
- PyArray_CopySwapNFunc *copyswapn[NPY_MAXARGS];
- PyArrayIterObject **iters=loop->iters;
- int *swap=loop->swap;
- char **dptr=loop->dptr;
- int mpselsize[NPY_MAXARGS];
- intp laststrides[NPY_MAXARGS];
- int fastmemcpy[NPY_MAXARGS];
- int *needbuffer = loop->needbuffer;
- intp index=loop->index, size=loop->size;
- int bufsize;
- intp bufcnt;
- int copysizes[NPY_MAXARGS];
- char **bufptr = loop->bufptr;
- char **buffer = loop->buffer;
- char **castbuf = loop->castbuf;
- intp *steps = loop->steps;
- char *tptr[NPY_MAXARGS];
- int ninnerloops = loop->ninnerloops;
- Bool pyobject[NPY_MAXARGS];
- int datasize[NPY_MAXARGS];
- int j, k, stopcondition;
- char *myptr1, *myptr2;
-
- for (i = 0; i <self->nargs; i++) {
- copyswapn[i] = mps[i]->descr->f->copyswapn;
- mpselsize[i] = mps[i]->descr->elsize;
- pyobject[i] = ((loop->obj & UFUNC_OBJ_ISOBJECT)
- && (mps[i]->descr->type_num == PyArray_OBJECT));
- laststrides[i] = iters[i]->strides[loop->lastdim];
- if (steps[i] && laststrides[i] != mpselsize[i]) {
- fastmemcpy[i] = 0;
- }
- else {
- fastmemcpy[i] = 1;
- }
- }
- /* Do generic buffered looping here (works for any kind of
- * arrays -- some need buffers, some don't.
- *
- *
- * New algorithm: N is the largest dimension. B is the buffer-size.
- * quotient is loop->ninnerloops-1
- * remainder is loop->leftover
- *
- * Compute N = quotient * B + remainder.
- * quotient = N / B # integer math
- * (store quotient + 1) as the number of innerloops
- * remainder = N % B # integer remainder
- *
- * On the inner-dimension we will have (quotient + 1) loops where
- * the size of the inner function is B for all but the last when the niter size is
- * remainder.
- *
- * So, the code looks very similar to NOBUFFER_LOOP except the inner-most loop is
- * replaced with...
- *
- * for(i=0; i<quotient+1; i++) {
- * if (i==quotient+1) make itersize remainder size
- * copy only needed items to buffer.
- * swap input buffers if needed
- * cast input buffers if needed
- * call loop_function()
- * cast outputs in buffers if needed
- * swap outputs in buffers if needed
- * copy only needed items back to output arrays.
- * update all data-pointers by strides*niter
- * }
- */
+ /* Start with the floating-point exception flags cleared */
+ PyUFunc_clearfperr();
+ NPY_UF_DBG_PRINTF("Executing inner loop\n");
- /*
- * fprintf(stderr, "BUFFER...%d,%d,%d\n", loop->size,
- * loop->ninnerloops, loop->leftover);
- */
- /*
- * for(i=0; i<self->nargs; i++) {
- * fprintf(stderr, "iters[%d]->dataptr = %p, %p of size %d\n", i,
- * iters[i], iters[i]->ao->data, PyArray_NBYTES(iters[i]->ao));
- * }
- */
- stopcondition = ninnerloops;
- if (loop->leftover == 0) {
- stopcondition--;
- }
- while (index < size) {
- bufsize=loop->bufsize;
- for(i = 0; i<self->nargs; i++) {
- tptr[i] = loop->iters[i]->dataptr;
- if (needbuffer[i]) {
- dptr[i] = bufptr[i];
- datasize[i] = (steps[i] ? bufsize : 1);
- copysizes[i] = datasize[i] * mpselsize[i];
- }
- else {
- dptr[i] = tptr[i];
- }
- }
+ /* Do the ufunc loop */
+ if (NpyIter_GetIterSize(iter) != 0) {
+ NpyIter_IterNext_Fn iternext;
+ char **dataptr;
+ npy_intp *count_ptr;
- /* This is the inner function over the last dimension */
- for (k = 1; k<=stopcondition; k++) {
- if (k == ninnerloops) {
- bufsize = loop->leftover;
- for (i=0; i<self->nargs;i++) {
- if (!needbuffer[i]) {
- continue;
- }
- datasize[i] = (steps[i] ? bufsize : 1);
- copysizes[i] = datasize[i] * mpselsize[i];
- }
- }
- for (i = 0; i < self->nin; i++) {
- if (!needbuffer[i]) {
- continue;
- }
- if (fastmemcpy[i]) {
- memcpy(buffer[i], tptr[i], copysizes[i]);
- }
- else {
- myptr1 = buffer[i];
- myptr2 = tptr[i];
- for (j = 0; j < bufsize; j++) {
- memcpy(myptr1, myptr2, mpselsize[i]);
- myptr1 += mpselsize[i];
- myptr2 += laststrides[i];
- }
- }
-
- /* swap the buffer if necessary */
- if (swap[i]) {
- /* fprintf(stderr, "swapping...\n");*/
- copyswapn[i](buffer[i], mpselsize[i], NULL, -1,
- (intp) datasize[i], 1,
- mps[i]);
- }
- /* cast to the other buffer if necessary */
- if (loop->cast[i]) {
- /* fprintf(stderr, "casting... %d, %p %p\n", i, buffer[i]); */
- loop->cast[i](buffer[i], castbuf[i],
- (intp) datasize[i],
- NULL, NULL);
- }
- }
-
- bufcnt = (intp) bufsize;
- loop->function((char **)dptr, &bufcnt, steps, loop->funcdata);
- UFUNC_CHECK_ERROR(loop);
+ /* Get the variables needed for the loop */
+ iternext = NpyIter_GetIterNext(iter, NULL);
+ if (iternext == NULL) {
+ NpyIter_Deallocate(iter);
+ retval = -1;
+ goto fail;
+ }
+ dataptr = NpyIter_GetDataPtrArray(iter);
+ count_ptr = NpyIter_GetInnerLoopSizePtr(iter);
- for (i = self->nin; i < self->nargs; i++) {
- if (!needbuffer[i]) {
- continue;
- }
- if (loop->cast[i]) {
- /* fprintf(stderr, "casting back... %d, %p", i, castbuf[i]); */
- loop->cast[i](castbuf[i],
- buffer[i],
- (intp) datasize[i],
- NULL, NULL);
- }
- if (swap[i]) {
- copyswapn[i](buffer[i], mpselsize[i], NULL, -1,
- (intp) datasize[i], 1,
- mps[i]);
- }
- /*
- * copy back to output arrays
- * decref what's already there for object arrays
- */
- if (pyobject[i]) {
- myptr1 = tptr[i];
- for (j = 0; j < datasize[i]; j++) {
- Py_XDECREF(*((PyObject **)myptr1));
- myptr1 += laststrides[i];
- }
- }
- if (fastmemcpy[i]) {
- memcpy(tptr[i], buffer[i], copysizes[i]);
- }
- else {
- myptr2 = buffer[i];
- myptr1 = tptr[i];
- for (j = 0; j < bufsize; j++) {
- memcpy(myptr1, myptr2, mpselsize[i]);
- myptr1 += laststrides[i];
- myptr2 += mpselsize[i];
- }
- }
- }
- if (k == stopcondition) {
- continue;
- }
- for (i = 0; i < self->nargs; i++) {
- tptr[i] += bufsize * laststrides[i];
- if (!needbuffer[i]) {
- dptr[i] = tptr[i];
- }
- }
- }
- /* end inner function over last dimension */
+ do {
+ inner_dimensions[0] = *count_ptr;
+ innerloop(dataptr, inner_dimensions, inner_strides, innerloopdata);
+ } while (iternext(iter));
+ }
- if (loop->objfunc) {
- /*
- * DECREF castbuf when underlying function used
- * object arrays and casting was needed to get
- * to object arrays
- */
- for (i = 0; i < self->nargs; i++) {
- if (loop->cast[i]) {
- if (steps[i] == 0) {
- Py_XDECREF(*((PyObject **)castbuf[i]));
- }
- else {
- int size = loop->bufsize;
-
- PyObject **objptr = (PyObject **)castbuf[i];
- /*
- * size is loop->bufsize unless there
- * was only one loop
- */
- if (ninnerloops == 1) {
- size = loop->leftover;
- }
- for (j = 0; j < size; j++) {
- Py_XDECREF(*objptr);
- *objptr = NULL;
- objptr += 1;
- }
- }
- }
- }
- }
- /* fixme -- probably not needed here*/
- UFUNC_CHECK_ERROR(loop);
+ /* Check whether any errors occurred during the loop */
+ if (PyErr_Occurred() || (errormask &&
+ PyUFunc_checkfperr(errormask, errobj, &first_error))) {
+ retval = -1;
+ goto fail;
+ }
- for (i = 0; i < self->nargs; i++) {
- PyArray_ITER_NEXT(loop->iters[i]);
- }
- index++;
- }
- } /* end of last case statement */
+ _pya_free(inner_strides);
+ NpyIter_Deallocate(iter);
+ /* The caller takes ownership of all the references in op */
+ for (i = 0; i < niter; ++i) {
+ Py_XDECREF(dtype[i]);
+ Py_XDECREF(arr_prep[i]);
}
+ Py_XDECREF(errobj);
+ Py_XDECREF(type_tup);
+ Py_XDECREF(arr_prep_args);
+
+ NPY_UF_DBG_PRINTF("Returning Success\n");
- NPY_LOOP_END_THREADS;
- ufuncloop_dealloc(loop);
return 0;
fail:
- NPY_LOOP_END_THREADS;
- if (loop) {
- ufuncloop_dealloc(loop);
+ NPY_UF_DBG_PRINTF("Returning failure code %d\n", retval);
+ if (inner_strides) {
+ _pya_free(inner_strides);
}
- return -1;
+ if (iter != NULL) {
+ NpyIter_Deallocate(iter);
+ }
+ for (i = 0; i < niter; ++i) {
+ Py_XDECREF(op[i]);
+ op[i] = NULL;
+ Py_XDECREF(dtype[i]);
+ Py_XDECREF(arr_prep[i]);
+ }
+ Py_XDECREF(errobj);
+ Py_XDECREF(type_tup);
+ Py_XDECREF(arr_prep_args);
+
+ return retval;
}
-static PyArrayObject *
-_getidentity(PyUFuncObject *self, int otype, char *str)
+/*UFUNC_API
+ *
+ * This generic function is called with the ufunc object, the arguments to it,
+ * and an array of (pointers to) PyArrayObjects which are NULL.
+ */
+NPY_NO_EXPORT int
+PyUFunc_GenericFunction(PyUFuncObject *self,
+ PyObject *args, PyObject *kwds,
+ PyArrayObject **op)
{
- PyObject *obj, *arr;
- PyArray_Descr *typecode;
+ npy_intp nin, nout;
+ npy_intp i, niter;
+ char *ufunc_name;
+ int retval = -1, any_object = 0;
+ NPY_CASTING input_casting;
- if (self->identity == PyUFunc_None) {
- PyErr_Format(PyExc_ValueError,
- "zero-size array to ufunc.%s " \
- "without identity", str);
- return NULL;
+ PyArray_Descr *dtype[NPY_MAXARGS];
+
+ /* These parameters come from extobj= or from a TLS global */
+ int buffersize = 0, errormask = 0;
+ PyObject *errobj = NULL;
+ int first_error = 1;
+
+ /* The selected inner loop */
+ PyUFuncGenericFunction innerloop = NULL;
+ void *innerloopdata = NULL;
+
+ /* The __array_prepare__ function to call for each output */
+ PyObject *arr_prep[NPY_MAXARGS];
+ /*
+ * This is either args, or args with the out= parameter from
+ * kwds added appropriately.
+ */
+ PyObject *arr_prep_args = NULL;
+
+ int trivial_loop_ok = 0;
+
+ /* TODO: For 1.6, the default should probably be NPY_CORDER */
+ NPY_ORDER order = NPY_KEEPORDER;
+ /*
+ * Many things in NumPy do unsafe casting (doing int += float, etc).
+ * The strictness should probably become a state parameter, similar
+ * to the seterr/geterr.
+ */
+ NPY_CASTING casting = NPY_UNSAFE_CASTING;
+ /* When provided, extobj and typetup contain borrowed references */
+ PyObject *extobj = NULL, *type_tup = NULL;
+
+ if (self == NULL) {
+ PyErr_SetString(PyExc_ValueError, "function not supported");
+ return -1;
}
- if (self->identity == PyUFunc_One) {
- obj = PyInt_FromLong((long) 1);
- } else {
- obj = PyInt_FromLong((long) 0);
+
+ /* TODO: support generalized ufunc */
+ if (self->core_enabled) {
+ return PyUFunc_GeneralizedFunction(self, args, kwds, op);
}
- typecode = PyArray_DescrFromType(otype);
- arr = PyArray_FromAny(obj, typecode, 0, 0, CARRAY, NULL);
- Py_DECREF(obj);
- return (PyArrayObject *)arr;
-}
+ nin = self->nin;
+ nout = self->nout;
+ niter = nin + nout;
-static int
-_create_reduce_copy(PyUFuncReduceObject *loop, PyArrayObject **arr, int rtype)
-{
- intp maxsize;
- PyObject *new;
- PyArray_Descr *ntype;
-
- maxsize = PyArray_SIZE(*arr);
-
- if (maxsize < loop->bufsize) {
- if (!(PyArray_ISBEHAVED_RO(*arr))
- || PyArray_TYPE(*arr) != rtype) {
- ntype = PyArray_DescrFromType(rtype);
- new = PyArray_FromAny((PyObject *)(*arr),
- ntype, 0, 0,
- FORCECAST | ALIGNED, NULL);
- if (new == NULL) {
- return -1;
- }
- *arr = (PyArrayObject *)new;
- loop->decref = new;
+ ufunc_name = self->name ? self->name : "<unnamed ufunc>";
+
+ NPY_UF_DBG_PRINTF("\nEvaluating ufunc %s\n", ufunc_name);
+
+ /* Initialize all the operands and dtypes to NULL */
+ for (i = 0; i < niter; ++i) {
+ op[i] = NULL;
+ dtype[i] = NULL;
+ arr_prep[i] = NULL;
+ }
+
+ NPY_UF_DBG_PRINTF("Getting arguments\n");
+
+ /* Get all the arguments */
+ retval = get_ufunc_arguments(self, args, kwds,
+ op, &order, &casting, &extobj, &type_tup, &any_object);
+ if (retval < 0) {
+ goto fail;
+ }
+
+ /* Get the buffersize, errormask, and error object globals */
+ if (extobj == NULL) {
+ if (PyUFunc_GetPyValues(ufunc_name,
+ &buffersize, &errormask, &errobj) < 0) {
+ retval = -1;
+ goto fail;
+ }
+ }
+ else {
+ if (_extract_pyvals(extobj, ufunc_name,
+ &buffersize, &errormask, &errobj) < 0) {
+ retval = -1;
+ goto fail;
}
}
+ NPY_UF_DBG_PRINTF("Finding inner loop\n");
+
/*
- * Don't decref *arr before re-assigning
- * because it was not going to be DECREF'd anyway.
- *
- * If a copy is made, then the copy will be removed
- * on deallocation of the loop structure by setting
- * loop->decref.
+ * Decide the casting rules for inputs and outputs. We want
+ * NPY_SAFE_CASTING or stricter, so that the loop selection code
+ * doesn't choose an integer loop for float inputs, for example.
*/
- return 0;
-}
+ input_casting = (casting > NPY_SAFE_CASTING) ? NPY_SAFE_CASTING : casting;
-static PyUFuncReduceObject *
-construct_reduce(PyUFuncObject *self, PyArrayObject **arr, PyArrayObject *out,
- int axis, int otype, int operation, intp ind_size, char *str)
-{
- PyUFuncReduceObject *loop;
- PyArrayObject *idarr;
- PyArrayObject *aar;
- intp loop_i[MAX_DIMS], outsize = 0;
- int arg_types[3];
- PyArray_SCALARKIND scalars[3] = {PyArray_NOSCALAR, PyArray_NOSCALAR,
- PyArray_NOSCALAR};
- int i, j, nd;
- int flags;
-
- /* Reduce type is the type requested of the input during reduction */
- if (self->core_enabled) {
- PyErr_Format(PyExc_RuntimeError,
- "construct_reduce not allowed on ufunc with signature");
- return NULL;
+ if (type_tup == NULL) {
+ /* Find the best ufunc inner loop, and fill in the dtypes */
+ retval = find_best_ufunc_inner_loop(self, op, input_casting, casting,
+ buffersize, any_object, dtype,
+ &innerloop, &innerloopdata, &trivial_loop_ok);
+ } else {
+ /* Find the specified ufunc inner loop, and fill in the dtypes */
+ retval = find_specified_ufunc_inner_loop(self, type_tup,
+ op, casting,
+ buffersize, any_object, dtype,
+ &innerloop, &innerloopdata, &trivial_loop_ok);
}
- nd = (*arr)->nd;
- arg_types[0] = otype;
- arg_types[1] = otype;
- arg_types[2] = otype;
- if ((loop = _pya_malloc(sizeof(PyUFuncReduceObject))) == NULL) {
- PyErr_NoMemory();
- return loop;
- }
-
- loop->retbase = 0;
- loop->swap = 0;
- loop->index = 0;
- loop->ufunc = self;
- Py_INCREF(self);
- loop->cast = NULL;
- loop->buffer = NULL;
- loop->ret = NULL;
- loop->it = NULL;
- loop->rit = NULL;
- loop->errobj = NULL;
- loop->first = 1;
- loop->decref = NULL;
- loop->N = (*arr)->dimensions[axis];
- loop->instrides = (*arr)->strides[axis];
- if (select_types(loop->ufunc, arg_types, &(loop->function),
- &(loop->funcdata), scalars, NULL) == -1) {
+ if (retval < 0) {
goto fail;
}
+
/*
- * output type may change -- if it does
- * reduction is forced into that type
- * and we need to select the reduction function again
- */
- if (otype != arg_types[2]) {
- otype = arg_types[2];
- arg_types[0] = otype;
- arg_types[1] = otype;
- if (select_types(loop->ufunc, arg_types, &(loop->function),
- &(loop->funcdata), scalars, NULL) == -1) {
+ * FAIL with NotImplemented if the other object has
+ * the __r<op>__ method and has __array_priority__ as
+     * an attribute (signalling it can handle ndarrays)
+ * and is not already an ndarray or a subtype of the same type.
+ */
+ if (nin == 2 && nout == 1 && dtype[1]->type_num == NPY_OBJECT) {
+ PyObject *_obj = PyTuple_GET_ITEM(args, 1);
+ if (!PyArray_CheckExact(_obj)
+ /* If both are same subtype of object arrays, then proceed */
+ && !(Py_TYPE(_obj) == Py_TYPE(PyTuple_GET_ITEM(args, 0)))
+ && PyObject_HasAttrString(_obj, "__array_priority__")
+ && _has_reflected_op(_obj, ufunc_name)) {
+ retval = -2;
goto fail;
}
}
- /* get looping parameters from Python */
- if (PyUFunc_GetPyValues(str, &(loop->bufsize), &(loop->errormask),
- &(loop->errobj)) < 0) {
- goto fail;
+#if NPY_UF_DBG_TRACING
+ printf("input types:\n");
+ for (i = 0; i < nin; ++i) {
+ PyObject_Print((PyObject *)dtype[i], stdout, 0);
+ printf(" ");
}
- /* Make copy if misbehaved or not otype for small arrays */
- if (_create_reduce_copy(loop, arr, otype) < 0) {
- goto fail;
+ printf("\noutput types:\n");
+ for (i = nin; i < niter; ++i) {
+ PyObject_Print((PyObject *)dtype[i], stdout, 0);
+ printf(" ");
}
- aar = *arr;
+ printf("\n");
+#endif
- if (loop->N == 0) {
- loop->meth = ZERO_EL_REDUCELOOP;
- }
- else if (PyArray_ISBEHAVED_RO(aar) && (otype == (aar)->descr->type_num)) {
- if (loop->N == 1) {
- loop->meth = ONE_EL_REDUCELOOP;
- }
- else {
- loop->meth = NOBUFFER_UFUNCLOOP;
- loop->steps[1] = (aar)->strides[axis];
- loop->N -= 1;
+ /*
+ * Get the appropriate __array_prepare__ function to call
+ * for each output
+ */
+ _find_array_prepare(args, kwds, arr_prep, nin, nout);
+
+ /* Set up arr_prep_args if a prep function was needed */
+ for (i = 0; i < nout; ++i) {
+ if (arr_prep[i] != NULL && arr_prep[i] != Py_None) {
+ arr_prep_args = make_arr_prep_args(nin, args, kwds);
+ break;
}
}
- else {
- loop->meth = BUFFER_UFUNCLOOP;
- loop->swap = !(PyArray_ISNOTSWAPPED(aar));
+
+ /* If the loop wants the arrays, provide them */
+ if (_does_loop_use_arrays(innerloopdata)) {
+ innerloopdata = (void*)op;
}
- /* Determine if object arrays are involved */
- if (otype == PyArray_OBJECT || aar->descr->type_num == PyArray_OBJECT) {
- loop->obj = UFUNC_OBJ_ISOBJECT | UFUNC_OBJ_NEEDS_API;
+ /* Start with the floating-point exception flags cleared */
+ PyUFunc_clearfperr();
+
+ NPY_UF_DBG_PRINTF("Executing inner loop\n");
+
+ /* Do the ufunc loop */
+ retval = execute_ufunc_loop(self, trivial_loop_ok, op, dtype, order,
+ buffersize, arr_prep, arr_prep_args,
+ innerloop, innerloopdata);
+ if (retval < 0) {
+ goto fail;
}
- else if ((otype == PyArray_DATETIME)
- || (aar->descr->type_num == PyArray_DATETIME)
- || (otype == PyArray_TIMEDELTA)
- || (aar->descr->type_num == PyArray_TIMEDELTA))
- {
- loop->obj = UFUNC_OBJ_NEEDS_API;
+
+ /* Check whether any errors occurred during the loop */
+ if (PyErr_Occurred() || (errormask &&
+ PyUFunc_checkfperr(errormask, errobj, &first_error))) {
+ retval = -1;
+ goto fail;
}
- else {
- loop->obj = 0;
+
+ /* The caller takes ownership of all the references in op */
+ for (i = 0; i < niter; ++i) {
+ Py_XDECREF(dtype[i]);
+ Py_XDECREF(arr_prep[i]);
}
- if ((loop->meth == ZERO_EL_REDUCELOOP)
- || ((operation == UFUNC_REDUCEAT)
- && (loop->meth == BUFFER_UFUNCLOOP))) {
- idarr = _getidentity(self, otype, str);
- if (idarr == NULL) {
- goto fail;
+ Py_XDECREF(errobj);
+ Py_XDECREF(type_tup);
+ Py_XDECREF(arr_prep_args);
+
+ NPY_UF_DBG_PRINTF("Returning Success\n");
+
+ return 0;
+
+fail:
+ NPY_UF_DBG_PRINTF("Returning failure code %d\n", retval);
+ for (i = 0; i < niter; ++i) {
+ Py_XDECREF(op[i]);
+ op[i] = NULL;
+ Py_XDECREF(dtype[i]);
+ Py_XDECREF(arr_prep[i]);
+ }
+ Py_XDECREF(errobj);
+ Py_XDECREF(type_tup);
+ Py_XDECREF(arr_prep_args);
+
+ return retval;
+}
+
+/*
+ * Given the output type, finds the specified binary op. The
+ * ufunc must have nin==2 and nout==1. The function may modify
+ * otype if the given type isn't found.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+get_binary_op_function(PyUFuncObject *self, int *otype,
+ PyUFuncGenericFunction *out_innerloop,
+ void **out_innerloopdata)
+{
+ int i;
+ PyUFunc_Loop1d *funcdata;
+
+ NPY_UF_DBG_PRINTF("Getting binary op function for type number %d\n",
+ *otype);
+
+ /* If the type is custom and there are userloops, search for it here */
+ if (self->userloops != NULL && PyTypeNum_ISUSERDEF(*otype)) {
+ PyObject *key, *obj;
+ key = PyInt_FromLong(*otype);
+ if (key == NULL) {
+ return -1;
}
- if (idarr->descr->elsize > UFUNC_MAXIDENTITY) {
- PyErr_Format(PyExc_RuntimeError,
- "UFUNC_MAXIDENTITY (%d) is too small"\
- "(needs to be at least %d)",
- UFUNC_MAXIDENTITY, idarr->descr->elsize);
- Py_DECREF(idarr);
- goto fail;
+ obj = PyDict_GetItem(self->userloops, key);
+ Py_DECREF(key);
+ if (obj != NULL) {
+ funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj);
+ while (funcdata != NULL) {
+ int *types = funcdata->arg_types;
+
+ if (types[0] == *otype && types[1] == *otype &&
+ types[2] == *otype) {
+ *out_innerloop = funcdata->func;
+ *out_innerloopdata = funcdata->data;
+ return 0;
+ }
+
+ funcdata = funcdata->next;
+ }
}
- memcpy(loop->idptr, idarr->data, idarr->descr->elsize);
- Py_DECREF(idarr);
}
- /* Construct return array */
- flags = NPY_CARRAY | NPY_UPDATEIFCOPY | NPY_FORCECAST;
- switch(operation) {
- case UFUNC_REDUCE:
- for (j = 0, i = 0; i < nd; i++) {
- if (i != axis) {
- loop_i[j++] = (aar)->dimensions[i];
+ /* Search for a function with compatible inputs */
+ for (i = 0; i < self->ntypes; ++i) {
+ char *types = self->types + i*self->nargs;
+
+ NPY_UF_DBG_PRINTF("Trying loop with signature %d %d -> %d\n",
+ types[0], types[1], types[2]);
+
+ if (PyArray_CanCastSafely(*otype, types[0]) &&
+ types[0] == types[1] &&
+ (*otype == NPY_OBJECT || types[0] != NPY_OBJECT)) {
+ /* If the signature is "xx->x", we found the loop */
+ if (types[2] == types[0]) {
+ *out_innerloop = self->functions[i];
+ *out_innerloopdata = self->data[i];
+ *otype = types[0];
+ return 0;
+ }
+ /*
+ * Otherwise, we found the natural type of the reduction,
+ * replace otype and search again
+ */
+ else {
+ *otype = types[2];
+ break;
}
}
- if (out == NULL) {
- loop->ret = (PyArrayObject *)
- PyArray_New(Py_TYPE(aar), aar->nd-1, loop_i,
- otype, NULL, NULL, 0, 0,
- (PyObject *)aar);
- }
- else {
- outsize = PyArray_MultiplyList(loop_i, aar->nd - 1);
- }
- break;
- case UFUNC_ACCUMULATE:
- if (out == NULL) {
- loop->ret = (PyArrayObject *)
- PyArray_New(Py_TYPE(aar), aar->nd, aar->dimensions,
- otype, NULL, NULL, 0, 0, (PyObject *)aar);
- }
- else {
- outsize = PyArray_MultiplyList(aar->dimensions, aar->nd);
- }
- break;
- case UFUNC_REDUCEAT:
- memcpy(loop_i, aar->dimensions, nd*sizeof(intp));
- /* Index is 1-d array */
- loop_i[axis] = ind_size;
- if (out == NULL) {
- loop->ret = (PyArrayObject *)
- PyArray_New(Py_TYPE(aar), aar->nd, loop_i, otype,
- NULL, NULL, 0, 0, (PyObject *)aar);
- }
- else {
- outsize = PyArray_MultiplyList(loop_i, aar->nd);
- }
- if (ind_size == 0) {
- loop->meth = ZERO_EL_REDUCELOOP;
- return loop;
- }
- if (loop->meth == ONE_EL_REDUCELOOP) {
- loop->meth = NOBUFFER_REDUCELOOP;
- }
- break;
}
- if (out) {
- if (PyArray_SIZE(out) != outsize) {
- PyErr_SetString(PyExc_ValueError,
- "wrong shape for output");
- goto fail;
- }
- loop->ret = (PyArrayObject *)
- PyArray_FromArray(out, PyArray_DescrFromType(otype), flags);
- if (loop->ret && loop->ret != out) {
- loop->retbase = 1;
+
+ /* Search for the exact function */
+ for (i = 0; i < self->ntypes; ++i) {
+ char *types = self->types + i*self->nargs;
+
+ if (PyArray_CanCastSafely(*otype, types[0]) &&
+ types[0] == types[1] &&
+ types[1] == types[2] &&
+ (*otype == NPY_OBJECT || types[0] != NPY_OBJECT)) {
+ /* Since the signature is "xx->x", we found the loop */
+ *out_innerloop = self->functions[i];
+ *out_innerloopdata = self->data[i];
+ *otype = types[0];
+ return 0;
}
}
- if (loop->ret == NULL) {
- goto fail;
- }
- loop->insize = aar->descr->elsize;
- loop->outsize = loop->ret->descr->elsize;
- loop->bufptr[0] = loop->ret->data;
- if (loop->meth == ZERO_EL_REDUCELOOP) {
- loop->size = PyArray_SIZE(loop->ret);
- return loop;
- }
+ return -1;
+}
+
+/*
+ * The implementation of the reduction operators with the new iterator
+ * turned into a bit of a long function here, but I think the design
+ * of this part needs to be changed to be more like einsum, so it may
+ * not be worth refactoring it too much. Consider this timing:
+ *
+ * >>> a = arange(10000)
+ *
+ * >>> timeit sum(a)
+ * 10000 loops, best of 3: 17 us per loop
+ *
+ * >>> timeit einsum("i->",a)
+ * 100000 loops, best of 3: 13.5 us per loop
+ *
+ */
+static PyObject *
+PyUFunc_ReductionOp(PyUFuncObject *self, PyArrayObject *arr,
+ PyArrayObject *out,
+ int axis, int otype, int operation, char *opname)
+{
+ PyArrayObject *op[2];
+ PyArray_Descr *op_dtypes[2] = {NULL, NULL};
+ npy_intp op_axes_arrays[2][NPY_MAXDIMS];
+ npy_intp *op_axes[2] = {op_axes_arrays[0], op_axes_arrays[1]};
+ npy_uint32 op_flags[2];
+ int i, idim, ndim, otype_final;
+ int needs_api, need_outer_iterator;
+ NPY_BEGIN_THREADS_DEF;
+
+ NpyIter *iter = NULL, *iter_inner = NULL;
+
+ /* The selected inner loop */
+ PyUFuncGenericFunction innerloop = NULL;
+ void *innerloopdata = NULL;
+
+ char *ufunc_name = self->name ? self->name : "(unknown)";
+
+ /* These parameters come from extobj= or from a TLS global */
+ int buffersize = 0, errormask = 0;
+ PyObject *errobj = NULL;
+
+ NPY_UF_DBG_PRINTF("\nEvaluating ufunc %s.%s\n", ufunc_name, opname);
+
+#if 0
+ printf("Doing %s.%s on array with dtype : ", ufunc_name, opname);
+ PyObject_Print((PyObject *)PyArray_DESCR(arr), stdout, 0);
+ printf("\n");
+#endif
- loop->it = (PyArrayIterObject *)PyArray_IterNew((PyObject *)aar);
- if (loop->it == NULL) {
+ if (PyUFunc_GetPyValues(opname, &buffersize, &errormask, &errobj) < 0) {
return NULL;
}
- if (loop->meth == ONE_EL_REDUCELOOP) {
- loop->size = loop->it->size;
- return loop;
+
+ /* Take a reference to out for later returning */
+ Py_XINCREF(out);
+
+ otype_final = otype;
+ if (get_binary_op_function(self, &otype_final,
+ &innerloop, &innerloopdata) < 0) {
+ PyArray_Descr *dtype = PyArray_DescrFromType(otype);
+ PyErr_Format(PyExc_ValueError,
+ "could not find a matching type for %s.%s, "
+ "requested type has type code '%c'",
+ ufunc_name, opname, dtype ? dtype->type : '-');
+ Py_XDECREF(dtype);
+ goto fail;
}
- /*
- * Fix iterator to loop over correct dimension
- * Set size in axis dimension to 1
- */
- loop->it->contiguous = 0;
- loop->it->size /= (loop->it->dims_m1[axis]+1);
- loop->it->dims_m1[axis] = 0;
- loop->it->backstrides[axis] = 0;
- loop->size = loop->it->size;
+ ndim = PyArray_NDIM(arr);
+
+ /* Set up the output data type */
+ op_dtypes[0] = PyArray_DescrFromType(otype_final);
+ if (op_dtypes[0] == NULL) {
+ goto fail;
+ }
+
+#if NPY_UF_DBG_TRACING
+ printf("Found %s.%s inner loop with dtype : ", ufunc_name, opname);
+ PyObject_Print((PyObject *)op_dtypes[0], stdout, 0);
+ printf("\n");
+#endif
+
+ /* Set up the op_axes for the outer loop */
if (operation == UFUNC_REDUCE) {
- loop->steps[0] = 0;
+ for (i = 0, idim = 0; idim < ndim; ++idim) {
+ if (idim != axis) {
+ op_axes_arrays[0][i] = i;
+ op_axes_arrays[1][i] = idim;
+ i++;
+ }
+ }
+ }
+ else if (operation == UFUNC_ACCUMULATE) {
+ for (idim = 0; idim < ndim; ++idim) {
+ op_axes_arrays[0][idim] = idim;
+ op_axes_arrays[1][idim] = idim;
+ }
}
else {
- loop->rit = (PyArrayIterObject *) \
- PyArray_IterNew((PyObject *)(loop->ret));
- if (loop->rit == NULL) {
- return NULL;
+ PyErr_Format(PyExc_RuntimeError,
+ "invalid reduction operation %s.%s", ufunc_name, opname);
+ goto fail;
+ }
+
+ /* The per-operand flags for the outer loop */
+ op_flags[0] = NPY_ITER_READWRITE|
+ NPY_ITER_NO_BROADCAST|
+ NPY_ITER_ALLOCATE|
+ NPY_ITER_NO_SUBTYPE;
+ op_flags[1] = NPY_ITER_READONLY;
+
+ op[0] = out;
+ op[1] = arr;
+
+ need_outer_iterator = (ndim > 1);
+ if (operation == UFUNC_ACCUMULATE) {
+ /* This is because we can't buffer, so must do UPDATEIFCOPY */
+ if (!PyArray_ISALIGNED(arr) || (out && !PyArray_ISALIGNED(out)) ||
+ !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(arr)) ||
+ (out &&
+ !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(out)))) {
+ need_outer_iterator = 1;
}
- /*
- * Fix iterator to loop over correct dimension
- * Set size in axis dimension to 1
- */
- loop->rit->contiguous = 0;
- loop->rit->size /= (loop->rit->dims_m1[axis] + 1);
- loop->rit->dims_m1[axis] = 0;
- loop->rit->backstrides[axis] = 0;
+ }
- if (operation == UFUNC_ACCUMULATE) {
- loop->steps[0] = loop->ret->strides[axis];
+ if (need_outer_iterator) {
+ npy_intp ndim_iter = 0;
+ npy_uint32 flags = NPY_ITER_ZEROSIZE_OK|
+ NPY_ITER_REFS_OK;
+ PyArray_Descr **op_dtypes_param = NULL;
+
+ if (operation == UFUNC_REDUCE) {
+ ndim_iter = ndim - 1;
+ if (out == NULL) {
+ op_dtypes_param = op_dtypes;
+ }
}
- else {
- loop->steps[0] = 0;
+ else if (operation == UFUNC_ACCUMULATE) {
+ /*
+ * The way accumulate is set up, we can't do buffering,
+ * so make a copy instead when necessary.
+ */
+ ndim_iter = ndim;
+ flags |= NPY_ITER_COORDS;
+ /* Add some more flags */
+ op_flags[0] |= NPY_ITER_UPDATEIFCOPY|NPY_ITER_ALIGNED;
+ op_flags[1] |= NPY_ITER_COPY|NPY_ITER_ALIGNED;
+ op_dtypes_param = op_dtypes;
+ op_dtypes[1] = op_dtypes[0];
+ }
+ NPY_UF_DBG_PRINTF("Allocating outer iterator\n");
+ iter = NpyIter_MultiNew(2, op, flags,
+ NPY_KEEPORDER, NPY_UNSAFE_CASTING,
+ op_flags,
+ op_dtypes_param,
+ ndim_iter, op_axes, 0);
+ if (iter == NULL) {
+ goto fail;
}
- }
- loop->steps[2] = loop->steps[0];
- loop->bufptr[2] = loop->bufptr[0] + loop->steps[2];
- if (loop->meth == BUFFER_UFUNCLOOP) {
- int _size;
- loop->steps[1] = loop->outsize;
- if (otype != aar->descr->type_num) {
- _size=loop->bufsize*(loop->outsize + aar->descr->elsize);
- loop->buffer = PyDataMem_NEW(_size);
- if (loop->buffer == NULL) {
+ if (operation == UFUNC_ACCUMULATE) {
+ if (NpyIter_RemoveAxis(iter, axis) != NPY_SUCCEED) {
goto fail;
}
- if (loop->obj & UFUNC_OBJ_ISOBJECT) {
- memset(loop->buffer, 0, _size);
- }
- loop->castbuf = loop->buffer + loop->bufsize*aar->descr->elsize;
- loop->bufptr[1] = loop->castbuf;
- loop->cast = PyArray_GetCastFunc(aar->descr, otype);
- if (loop->cast == NULL) {
+ if (NpyIter_RemoveCoords(iter) != NPY_SUCCEED) {
goto fail;
}
+
+ /* In case COPY or UPDATEIFCOPY occurred */
+ op[0] = NpyIter_GetOperandArray(iter)[0];
+ op[1] = NpyIter_GetOperandArray(iter)[1];
+ }
+ }
+
+ /* Get the output */
+ if (!out) {
+ if (iter) {
+ op[0] = out = NpyIter_GetOperandArray(iter)[0];
+ Py_INCREF(out);
}
else {
- _size = loop->bufsize * loop->outsize;
- loop->buffer = PyDataMem_NEW(_size);
- if (loop->buffer == NULL) {
+ PyArray_Descr *dtype = op_dtypes[0];
+ Py_INCREF(dtype);
+ if (operation == UFUNC_REDUCE) {
+ op[0] = out = (PyArrayObject *)PyArray_NewFromDescr(
+ &PyArray_Type, dtype,
+ 0, NULL, NULL, NULL,
+ 0, NULL);
+ }
+ else if (operation == UFUNC_ACCUMULATE) {
+ op[0] = out = (PyArrayObject *)PyArray_NewFromDescr(
+ &PyArray_Type, dtype,
+ ndim, PyArray_DIMS(op[1]), NULL, NULL,
+ 0, NULL);
+ }
+ if (out == NULL) {
+ goto fail;
+ }
+ }
+ }
+
+ /*
+     * If the axis being reduced has size zero, either return the
+     * ufunc's identity element for UFUNC_REDUCE, or return the
+     * zero-sized output array for UFUNC_ACCUMULATE.
+ */
+ if (operation == UFUNC_REDUCE && PyArray_DIM(op[1], axis) == 0) {
+ if (self->identity == PyUFunc_None) {
+ PyErr_Format(PyExc_ValueError,
+ "zero-size array to %s.%s "
+ "without identity", ufunc_name, opname);
+ goto fail;
+ }
+ if (self->identity == PyUFunc_One) {
+ PyObject *obj = PyInt_FromLong((long) 1);
+ if (obj == NULL) {
goto fail;
}
- if (loop->obj & UFUNC_OBJ_ISOBJECT) {
- memset(loop->buffer, 0, _size);
+ PyArray_FillWithScalar(op[0], obj);
+ Py_DECREF(obj);
+ } else {
+ PyObject *obj = PyInt_FromLong((long) 0);
+ if (obj == NULL) {
+ goto fail;
}
- loop->bufptr[1] = loop->buffer;
+ PyArray_FillWithScalar(op[0], obj);
+ Py_DECREF(obj);
}
+
+ goto finish;
+ }
+ else if (PyArray_SIZE(op[0]) == 0) {
+ goto finish;
}
- PyUFunc_clearfperr();
- return loop;
- fail:
- ufuncreduce_dealloc(loop);
- return NULL;
-}
+ /* Only allocate an inner iterator if it's necessary */
+ if (!PyArray_ISALIGNED(op[1]) || !PyArray_ISALIGNED(op[0]) ||
+ !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(op[1])) ||
+ !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(op[0]))) {
+ /* Also set the dtype for buffering arr */
+ op_dtypes[1] = op_dtypes[0];
+ NPY_UF_DBG_PRINTF("Allocating inner iterator\n");
+ if (operation == UFUNC_REDUCE) {
+ /* The per-operand flags for the inner loop */
+ op_flags[0] = NPY_ITER_READWRITE|
+ NPY_ITER_ALIGNED;
+ op_flags[1] = NPY_ITER_READONLY|
+ NPY_ITER_ALIGNED;
-/*
- * We have two basic kinds of loops. One is used when arr is not-swapped
- * and aligned and output type is the same as input type. The other uses
- * buffers when one of these is not satisfied.
- *
- * Zero-length and one-length axes-to-be-reduced are handled separately.
- */
-static PyObject *
-PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
- int axis, int otype)
-{
- PyArrayObject *ret = NULL;
- PyUFuncReduceObject *loop;
- intp i, n;
- char *dptr;
- NPY_BEGIN_THREADS_DEF;
+ op_axes[0][0] = -1;
+ op_axes[1][0] = axis;
- /* Construct loop object */
- loop = construct_reduce(self, &arr, out, axis, otype, UFUNC_REDUCE, 0,
- "reduce");
- if (!loop) {
- return NULL;
+ iter_inner = NpyIter_MultiNew(2, op, NPY_ITER_NO_INNER_ITERATION|
+ NPY_ITER_BUFFERED|
+ NPY_ITER_DELAY_BUFALLOC|
+ NPY_ITER_GROWINNER|
+ NPY_ITER_REDUCE_OK|
+ NPY_ITER_REFS_OK,
+ NPY_CORDER, NPY_UNSAFE_CASTING,
+ op_flags, op_dtypes,
+ 1, op_axes, buffersize);
+ }
+ /* Should never get an inner iterator for ACCUMULATE */
+ else {
+ PyErr_SetString(PyExc_RuntimeError,
+ "internal ufunc reduce error, should not need inner iterator");
+ goto fail;
+ }
+ if (iter_inner == NULL) {
+ goto fail;
+ }
}
- NPY_LOOP_BEGIN_THREADS;
- switch(loop->meth) {
- case ZERO_EL_REDUCELOOP:
- /* fprintf(stderr, "ZERO..%d\n", loop->size); */
- for (i = 0; i < loop->size; i++) {
- if (loop->obj & UFUNC_OBJ_ISOBJECT) {
- Py_INCREF(*((PyObject **)loop->idptr));
- }
- memmove(loop->bufptr[0], loop->idptr, loop->outsize);
- loop->bufptr[0] += loop->outsize;
+ if (iter && NpyIter_GetIterSize(iter) != 0) {
+ char *dataptr_copy[3];
+ npy_intp stride_copy[3];
+
+ NpyIter_IterNext_Fn iternext;
+ char **dataptr;
+ npy_intp *stride;
+ npy_intp *count_ptr;
+
+ int itemsize = op_dtypes[0]->elsize;
+
+ /* Get the variables needed for the loop */
+ iternext = NpyIter_GetIterNext(iter, NULL);
+ if (iternext == NULL) {
+ goto fail;
}
- break;
- case ONE_EL_REDUCELOOP:
- /*fprintf(stderr, "ONEDIM..%d\n", loop->size); */
- while (loop->index < loop->size) {
- if (loop->obj & UFUNC_OBJ_ISOBJECT) {
- Py_INCREF(*((PyObject **)loop->it->dataptr));
+ dataptr = NpyIter_GetDataPtrArray(iter);
+ stride = NpyIter_GetInnerStrideArray(iter);
+ count_ptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+
+ /* Execute the loop with two nested iterators */
+ if (iter_inner) {
+ /* Only UFUNC_REDUCE uses iter_inner */
+ NpyIter_IterNext_Fn iternext_inner;
+ char **dataptr_inner;
+ npy_intp *stride_inner;
+ npy_intp count, *count_ptr_inner;
+
+ NPY_UF_DBG_PRINTF("UFunc: Reduce loop with two nested iterators\n");
+ iternext_inner = NpyIter_GetIterNext(iter_inner, NULL);
+ if (iternext_inner == NULL) {
+ goto fail;
}
- memmove(loop->bufptr[0], loop->it->dataptr, loop->outsize);
- PyArray_ITER_NEXT(loop->it);
- loop->bufptr[0] += loop->outsize;
- loop->index++;
- }
- break;
- case NOBUFFER_UFUNCLOOP:
- /*fprintf(stderr, "NOBUFFER..%d\n", loop->size); */
- while (loop->index < loop->size) {
- /* Copy first element to output */
- if (loop->obj & UFUNC_OBJ_ISOBJECT) {
- Py_INCREF(*((PyObject **)loop->it->dataptr));
+ dataptr_inner = NpyIter_GetDataPtrArray(iter_inner);
+ stride_inner = NpyIter_GetInnerStrideArray(iter_inner);
+ count_ptr_inner = NpyIter_GetInnerLoopSizePtr(iter_inner);
+
+ needs_api = NpyIter_IterationNeedsAPI(iter) ||
+ NpyIter_IterationNeedsAPI(iter_inner);
+
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
}
- memmove(loop->bufptr[0], loop->it->dataptr, loop->outsize);
- /* Adjust input pointer */
- loop->bufptr[1] = loop->it->dataptr+loop->steps[1];
- loop->function((char **)loop->bufptr, &(loop->N),
- loop->steps, loop->funcdata);
- UFUNC_CHECK_ERROR(loop);
- PyArray_ITER_NEXT(loop->it);
- loop->bufptr[0] += loop->outsize;
- loop->bufptr[2] = loop->bufptr[0];
- loop->index++;
- }
- break;
- case BUFFER_UFUNCLOOP:
- /*
- * use buffer for arr
- *
- * For each row to reduce
- * 1. copy first item over to output (casting if necessary)
- * 2. Fill inner buffer
- * 3. When buffer is filled or end of row
- * a. Cast input buffers if needed
- * b. Call inner function.
- * 4. Repeat 2 until row is done.
- */
- /* fprintf(stderr, "BUFFERED..%d %d\n", loop->size, loop->swap); */
- while(loop->index < loop->size) {
- loop->inptr = loop->it->dataptr;
- /* Copy (cast) First term over to output */
- if (loop->cast) {
- /* A little tricky because we need to cast it first */
- arr->descr->f->copyswap(loop->buffer, loop->inptr,
- loop->swap, NULL);
- loop->cast(loop->buffer, loop->castbuf, 1, NULL, NULL);
- if (loop->obj & UFUNC_OBJ_ISOBJECT) {
- Py_XINCREF(*((PyObject **)loop->castbuf));
+
+ do {
+ int first = 1;
+
+ /* Reset the inner iterator to the outer's data */
+ if (NpyIter_ResetBasePointers(iter_inner, dataptr, NULL)
+ != NPY_SUCCEED) {
+ goto fail;
}
- memcpy(loop->bufptr[0], loop->castbuf, loop->outsize);
- }
- else {
- /* Simple copy */
- arr->descr->f->copyswap(loop->bufptr[0], loop->inptr,
- loop->swap, NULL);
- }
- loop->inptr += loop->instrides;
- n = 1;
- while(n < loop->N) {
- /* Copy up to loop->bufsize elements to buffer */
- dptr = loop->buffer;
- for (i = 0; i < loop->bufsize; i++, n++) {
- if (n == loop->N) {
- break;
- }
- arr->descr->f->copyswap(dptr, loop->inptr,
- loop->swap, NULL);
- loop->inptr += loop->instrides;
- dptr += loop->insize;
+
+ /* Copy the first element to start the reduction */
+ if (otype == NPY_OBJECT) {
+ Py_XDECREF(*(PyObject **)dataptr_inner[0]);
+ *(PyObject **)dataptr_inner[0] =
+ *(PyObject **)dataptr_inner[1];
+ Py_XINCREF(*(PyObject **)dataptr_inner[0]);
}
- if (loop->cast) {
- loop->cast(loop->buffer, loop->castbuf, i, NULL, NULL);
+ else {
+ memcpy(dataptr_inner[0], dataptr_inner[1], itemsize);
}
- loop->function((char **)loop->bufptr, &i,
- loop->steps, loop->funcdata);
- loop->bufptr[0] += loop->steps[0]*i;
- loop->bufptr[2] += loop->steps[2]*i;
- UFUNC_CHECK_ERROR(loop);
+
+ stride_copy[0] = 0;
+ stride_copy[2] = 0;
+ do {
+ count = *count_ptr_inner;
+ /* Turn the two items into three for the inner loop */
+ dataptr_copy[0] = dataptr_inner[0];
+ dataptr_copy[1] = dataptr_inner[1];
+ dataptr_copy[2] = dataptr_inner[0];
+ if (first) {
+ --count;
+ dataptr_copy[1] += stride_inner[1];
+ first = 0;
+ }
+ stride_copy[1] = stride_inner[1];
+ NPY_UF_DBG_PRINTF("iterator loop count %d\n", (int)count);
+ innerloop(dataptr_copy, &count,
+ stride_copy, innerloopdata);
+ } while(iternext_inner(iter_inner));
+ } while (iternext(iter));
+
+ if (!needs_api) {
+ NPY_END_THREADS;
}
- PyArray_ITER_NEXT(loop->it);
- loop->bufptr[0] += loop->outsize;
- loop->bufptr[2] = loop->bufptr[0];
- loop->index++;
}
+ /* Execute the loop with just the outer iterator */
+ else {
+ npy_intp count_m1 = PyArray_DIM(op[1], axis)-1;
+ npy_intp stride0 = 0, stride1 = PyArray_STRIDE(op[1], axis);
- /*
- * DECREF left-over objects if buffering was used.
- * It is needed when casting created new objects in
- * castbuf. Intermediate copying into castbuf (via
- * loop->function) decref'd what was already there.
+ NPY_UF_DBG_PRINTF("UFunc: Reduce loop with just outer iterator\n");
- * It's the final copy into the castbuf that needs a DECREF.
- */
+ if (operation == UFUNC_ACCUMULATE) {
+ stride0 = PyArray_STRIDE(op[0], axis);
+ }
- /* Only when casting needed and it is from a non-object array */
- if ((loop->obj & UFUNC_OBJ_ISOBJECT) && loop->cast &&
- (!PyArray_ISOBJECT(arr))) {
- for (i=0; i<loop->bufsize; i++) {
- Py_CLEAR(((PyObject **)loop->castbuf)[i]);
+ stride_copy[0] = stride0;
+ stride_copy[1] = stride1;
+ stride_copy[2] = stride0;
+
+ needs_api = NpyIter_IterationNeedsAPI(iter);
+
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
}
- }
- }
- NPY_LOOP_END_THREADS;
- /* Hang on to this reference -- will be decref'd with loop */
- if (loop->retbase) {
- ret = (PyArrayObject *)loop->ret->base;
- }
- else {
- ret = loop->ret;
- }
- Py_INCREF(ret);
- ufuncreduce_dealloc(loop);
- return (PyObject *)ret;
+ do {
+
+ dataptr_copy[0] = dataptr[0];
+ dataptr_copy[1] = dataptr[1];
+ dataptr_copy[2] = dataptr[0];
+
+ /* Copy the first element to start the reduction */
+ if (otype == NPY_OBJECT) {
+ Py_XDECREF(*(PyObject **)dataptr_copy[0]);
+ *(PyObject **)dataptr_copy[0] =
+ *(PyObject **)dataptr_copy[1];
+ Py_XINCREF(*(PyObject **)dataptr_copy[0]);
+ }
+ else {
+ memcpy(dataptr_copy[0], dataptr_copy[1], itemsize);
+ }
-fail:
- NPY_LOOP_END_THREADS;
- if (loop) {
- ufuncreduce_dealloc(loop);
+ if (count_m1 > 0) {
+ /* Turn the two items into three for the inner loop */
+ if (operation == UFUNC_REDUCE) {
+ dataptr_copy[1] += stride1;
+ }
+ else if (operation == UFUNC_ACCUMULATE) {
+ dataptr_copy[1] += stride1;
+ dataptr_copy[2] += stride0;
+ }
+ NPY_UF_DBG_PRINTF("iterator loop count %d\n",
+ (int)count_m1);
+ innerloop(dataptr_copy, &count_m1,
+ stride_copy, innerloopdata);
+ }
+ } while (iternext(iter));
+
+ if (!needs_api) {
+ NPY_END_THREADS;
+ }
+ }
}
- return NULL;
-}
+ else if (iter == NULL) {
+ char *dataptr_copy[3];
+ npy_intp stride_copy[3];
+ int itemsize = op_dtypes[0]->elsize;
-static PyObject *
-PyUFunc_Accumulate(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
- int axis, int otype)
-{
- PyArrayObject *ret = NULL;
- PyUFuncReduceObject *loop;
- intp i, n;
- char *dptr;
- NPY_BEGIN_THREADS_DEF;
+ /* Execute the loop with just the inner iterator */
+ if (iter_inner) {
+ /* Only UFUNC_REDUCE uses iter_inner */
+ NpyIter_IterNext_Fn iternext_inner;
+ char **dataptr_inner;
+ npy_intp *stride_inner;
+ npy_intp count, *count_ptr_inner;
+ int first = 1;
- /* Construct loop object */
- loop = construct_reduce(self, &arr, out, axis, otype,
- UFUNC_ACCUMULATE, 0, "accumulate");
- if (!loop) {
- return NULL;
- }
+ NPY_UF_DBG_PRINTF("UFunc: Reduce loop with just inner iterator\n");
- NPY_LOOP_BEGIN_THREADS;
- switch(loop->meth) {
- case ZERO_EL_REDUCELOOP:
- /* Accumulate */
- /* fprintf(stderr, "ZERO..%d\n", loop->size); */
- for (i = 0; i < loop->size; i++) {
- if (loop->obj & UFUNC_OBJ_ISOBJECT) {
- Py_INCREF(*((PyObject **)loop->idptr));
+ iternext_inner = NpyIter_GetIterNext(iter_inner, NULL);
+ if (iternext_inner == NULL) {
+ goto fail;
}
- memcpy(loop->bufptr[0], loop->idptr, loop->outsize);
- loop->bufptr[0] += loop->outsize;
- }
- break;
- case ONE_EL_REDUCELOOP:
- /* Accumulate */
- /* fprintf(stderr, "ONEDIM..%d\n", loop->size); */
- while (loop->index < loop->size) {
- if (loop->obj & UFUNC_OBJ_ISOBJECT) {
- Py_INCREF(*((PyObject **)loop->it->dataptr));
+ dataptr_inner = NpyIter_GetDataPtrArray(iter_inner);
+ stride_inner = NpyIter_GetInnerStrideArray(iter_inner);
+ count_ptr_inner = NpyIter_GetInnerLoopSizePtr(iter_inner);
+
+ /* Reset the inner iterator to prepare the buffers */
+ if (NpyIter_Reset(iter_inner, NULL) != NPY_SUCCEED) {
+ goto fail;
}
- memmove(loop->bufptr[0], loop->it->dataptr, loop->outsize);
- PyArray_ITER_NEXT(loop->it);
- loop->bufptr[0] += loop->outsize;
- loop->index++;
- }
- break;
- case NOBUFFER_UFUNCLOOP:
- /* Accumulate */
- /* fprintf(stderr, "NOBUFFER..%d\n", loop->size); */
- while (loop->index < loop->size) {
- /* Copy first element to output */
- if (loop->obj & UFUNC_OBJ_ISOBJECT) {
- Py_INCREF(*((PyObject **)loop->it->dataptr));
+
+ needs_api = NpyIter_IterationNeedsAPI(iter_inner);
+
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
+ }
+
+ /* Copy the first element to start the reduction */
+ if (otype == NPY_OBJECT) {
+ Py_XDECREF(*(PyObject **)dataptr_inner[0]);
+ *(PyObject **)dataptr_inner[0] =
+ *(PyObject **)dataptr_inner[1];
+ Py_XINCREF(*(PyObject **)dataptr_inner[0]);
+ }
+ else {
+ memcpy(dataptr_inner[0], dataptr_inner[1], itemsize);
+ }
+
+ stride_copy[0] = 0;
+ stride_copy[2] = 0;
+ do {
+ count = *count_ptr_inner;
+ /* Turn the two items into three for the inner loop */
+ dataptr_copy[0] = dataptr_inner[0];
+ dataptr_copy[1] = dataptr_inner[1];
+ dataptr_copy[2] = dataptr_inner[0];
+ if (first) {
+ --count;
+ dataptr_copy[1] += stride_inner[1];
+ first = 0;
+ }
+ stride_copy[1] = stride_inner[1];
+ NPY_UF_DBG_PRINTF("iterator loop count %d\n", (int)count);
+ innerloop(dataptr_copy, &count,
+ stride_copy, innerloopdata);
+ } while(iternext_inner(iter_inner));
+
+ if (!needs_api) {
+ NPY_END_THREADS;
}
- memmove(loop->bufptr[0], loop->it->dataptr, loop->outsize);
- /* Adjust input pointer */
- loop->bufptr[1] = loop->it->dataptr + loop->steps[1];
- loop->function((char **)loop->bufptr, &(loop->N),
- loop->steps, loop->funcdata);
- UFUNC_CHECK_ERROR(loop);
- PyArray_ITER_NEXT(loop->it);
- PyArray_ITER_NEXT(loop->rit);
- loop->bufptr[0] = loop->rit->dataptr;
- loop->bufptr[2] = loop->bufptr[0] + loop->steps[0];
- loop->index++;
}
- break;
- case BUFFER_UFUNCLOOP:
- /* Accumulate
- *
- * use buffer for arr
- *
- * For each row to reduce
- * 1. copy identity over to output (casting if necessary)
- * 2. Fill inner buffer
- * 3. When buffer is filled or end of row
- * a. Cast input buffers if needed
- * b. Call inner function.
- * 4. Repeat 2 until row is done.
- */
- /* fprintf(stderr, "BUFFERED..%d %p\n", loop->size, loop->cast); */
- while (loop->index < loop->size) {
- loop->inptr = loop->it->dataptr;
- /* Copy (cast) First term over to output */
- if (loop->cast) {
- /* A little tricky because we need to
- cast it first */
- arr->descr->f->copyswap(loop->buffer, loop->inptr,
- loop->swap, NULL);
- loop->cast(loop->buffer, loop->castbuf, 1, NULL, NULL);
- if (loop->obj & UFUNC_OBJ_ISOBJECT) {
- Py_XINCREF(*((PyObject **)loop->castbuf));
+ /* Execute the loop with no iterators */
+ else {
+ npy_intp count = PyArray_DIM(op[1], axis);
+ npy_intp stride0 = 0, stride1 = PyArray_STRIDE(op[1], axis);
+
+ NPY_UF_DBG_PRINTF("UFunc: Reduce loop with no iterators\n");
+
+ if (operation == UFUNC_REDUCE) {
+ if (PyArray_NDIM(op[0]) != 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "provided out is the wrong size "
+ "for the reduction");
+ goto fail;
}
- memcpy(loop->bufptr[0], loop->castbuf, loop->outsize);
+ }
+ else if (operation == UFUNC_ACCUMULATE) {
+ if (PyArray_NDIM(op[0]) != PyArray_NDIM(op[1]) ||
+ !PyArray_CompareLists(PyArray_DIMS(op[0]),
+ PyArray_DIMS(op[1]),
+ PyArray_NDIM(op[0]))) {
+ PyErr_SetString(PyExc_ValueError,
+ "provided out is the wrong size "
+ "for the reduction");
+ goto fail;
+ }
+ stride0 = PyArray_STRIDE(op[0], axis);
+ }
+
+ stride_copy[0] = stride0;
+ stride_copy[1] = stride1;
+ stride_copy[2] = stride0;
+
+ /* Turn the two items into three for the inner loop */
+ dataptr_copy[0] = PyArray_BYTES(op[0]);
+ dataptr_copy[1] = PyArray_BYTES(op[1]);
+ dataptr_copy[2] = PyArray_BYTES(op[0]);
+
+ /* Copy the first element to start the reduction */
+ if (otype == NPY_OBJECT) {
+ Py_XDECREF(*(PyObject **)dataptr_copy[0]);
+ *(PyObject **)dataptr_copy[0] =
+ *(PyObject **)dataptr_copy[1];
+ Py_XINCREF(*(PyObject **)dataptr_copy[0]);
}
else {
- /* Simple copy */
- arr->descr->f->copyswap(loop->bufptr[0], loop->inptr,
- loop->swap, NULL);
+ memcpy(dataptr_copy[0], dataptr_copy[1], itemsize);
}
- loop->inptr += loop->instrides;
- n = 1;
- while (n < loop->N) {
- /* Copy up to loop->bufsize elements to buffer */
- dptr = loop->buffer;
- for (i = 0; i < loop->bufsize; i++, n++) {
- if (n == loop->N) {
- break;
- }
- arr->descr->f->copyswap(dptr, loop->inptr,
- loop->swap, NULL);
- loop->inptr += loop->instrides;
- dptr += loop->insize;
+
+ if (count > 1) {
+ --count;
+ if (operation == UFUNC_REDUCE) {
+ dataptr_copy[1] += stride1;
}
- if (loop->cast) {
- loop->cast(loop->buffer, loop->castbuf, i, NULL, NULL);
+ else if (operation == UFUNC_ACCUMULATE) {
+ dataptr_copy[1] += stride1;
+ dataptr_copy[2] += stride0;
}
- loop->function((char **)loop->bufptr, &i,
- loop->steps, loop->funcdata);
- loop->bufptr[0] += loop->steps[0]*i;
- loop->bufptr[2] += loop->steps[2]*i;
- UFUNC_CHECK_ERROR(loop);
- }
- PyArray_ITER_NEXT(loop->it);
- PyArray_ITER_NEXT(loop->rit);
- loop->bufptr[0] = loop->rit->dataptr;
- loop->bufptr[2] = loop->bufptr[0] + loop->steps[0];
- loop->index++;
- }
- /*
- * DECREF left-over objects if buffering was used.
- * It is needed when casting created new objects in
- * castbuf. Intermediate copying into castbuf (via
- * loop->function) decref'd what was already there.
+ NPY_UF_DBG_PRINTF("iterator loop count %d\n", (int)count);
- * It's the final copy into the castbuf that needs a DECREF.
- */
+ needs_api = PyDataType_REFCHK(op_dtypes[0]);
- /* Only when casting needed and it is from a non-object array */
- if ((loop->obj & UFUNC_OBJ_ISOBJECT) && loop->cast &&
- (!PyArray_ISOBJECT(arr))) {
- for (i=0; i<loop->bufsize; i++) {
- Py_CLEAR(((PyObject **)loop->castbuf)[i]);
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
+ }
+
+ innerloop(dataptr_copy, &count,
+ stride_copy, innerloopdata);
+
+ if (!needs_api) {
+ NPY_END_THREADS;
+ }
}
}
-
}
- NPY_LOOP_END_THREADS;
- /* Hang on to this reference -- will be decref'd with loop */
- if (loop->retbase) {
- ret = (PyArrayObject *)loop->ret->base;
+
+finish:
+ Py_XDECREF(op_dtypes[0]);
+ if (iter != NULL) {
+ NpyIter_Deallocate(iter);
}
- else {
- ret = loop->ret;
+ if (iter_inner != NULL) {
+ NpyIter_Deallocate(iter_inner);
}
- Py_INCREF(ret);
- ufuncreduce_dealloc(loop);
- return (PyObject *)ret;
+ return (PyObject *)out;
- fail:
- NPY_LOOP_END_THREADS;
- if (loop) {
- ufuncreduce_dealloc(loop);
+fail:
+ Py_XDECREF(out);
+ Py_XDECREF(op_dtypes[0]);
+
+ if (iter != NULL) {
+ NpyIter_Deallocate(iter);
+ }
+ if (iter_inner != NULL) {
+ NpyIter_Deallocate(iter_inner);
}
+
+ Py_XDECREF(errobj);
+
return NULL;
}
/*
+ * We have two basic kinds of loops. One is used when arr is not-swapped
+ * and aligned and output type is the same as input type. The other uses
+ * buffers when one of these is not satisfied.
+ *
+ * Zero-length and one-length axes-to-be-reduced are handled separately.
+ */
+static PyObject *
+PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
+ int axis, int otype)
+{
+ return PyUFunc_ReductionOp(self, arr, out, axis, otype,
+ UFUNC_REDUCE, "reduce");
+}
+
+
+static PyObject *
+PyUFunc_Accumulate(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
+ int axis, int otype)
+{
+ return PyUFunc_ReductionOp(self, arr, out, axis, otype,
+ UFUNC_ACCUMULATE, "accumulate");
+}
+
+/*
* Reduceat performs a reduce over an axis using the indices as a guide
*
* op.reduceat(array,indices) computes
@@ -3048,156 +3372,337 @@ static PyObject *
PyUFunc_Reduceat(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *ind,
PyArrayObject *out, int axis, int otype)
{
- PyArrayObject *ret;
- PyUFuncReduceObject *loop;
- intp *ptr = (intp *)ind->data;
- intp nn = ind->dimensions[0];
- intp mm = arr->dimensions[axis] - 1;
- intp n, i, j;
- char *dptr;
+ PyArrayObject *op[3];
+ PyArray_Descr *op_dtypes[3] = {NULL, NULL, NULL};
+ npy_intp op_axes_arrays[3][NPY_MAXDIMS];
+ npy_intp *op_axes[3] = {op_axes_arrays[0], op_axes_arrays[1],
+ op_axes_arrays[2]};
+ npy_uint32 op_flags[3];
+ int i, idim, ndim, otype_final;
+ int needs_api, need_outer_iterator;
NPY_BEGIN_THREADS_DEF;
+ NpyIter *iter = NULL;
+
+ /* The reduceat indices - ind must be validated outside this call */
+ npy_intp *reduceat_ind;
+ npy_intp ind_size, red_axis_size;
+ /* The selected inner loop */
+ PyUFuncGenericFunction innerloop = NULL;
+ void *innerloopdata = NULL;
+
+ char *ufunc_name = self->name ? self->name : "(unknown)";
+ char *opname = "reduceat";
+
+ /* These parameters come from extobj= or from a TLS global */
+ int buffersize = 0, errormask = 0;
+ PyObject *errobj = NULL;
+
+ reduceat_ind = (npy_intp *)PyArray_DATA(ind);
+ ind_size = PyArray_DIM(ind, 0);
+ red_axis_size = PyArray_DIM(arr, axis);
+
/* Check for out-of-bounds values in indices array */
- for (i = 0; i<nn; i++) {
- if ((*ptr < 0) || (*ptr > mm)) {
+ for (i = 0; i < ind_size; ++i) {
+ if (reduceat_ind[i] < 0 || reduceat_ind[i] >= red_axis_size) {
PyErr_Format(PyExc_IndexError,
- "index out-of-bounds (0, %d)", (int) mm);
+ "index %d out-of-bounds in %s.%s [0, %d)",
+ (int)reduceat_ind[i], ufunc_name, opname, (int)red_axis_size);
return NULL;
}
- ptr++;
}
- ptr = (intp *)ind->data;
- /* Construct loop object */
- loop = construct_reduce(self, &arr, out, axis, otype,
- UFUNC_REDUCEAT, nn, "reduceat");
- if (!loop) {
+ NPY_UF_DBG_PRINTF("\nEvaluating ufunc %s.%s\n", ufunc_name, opname);
+
+#if 0
+ printf("Doing %s.%s on array with dtype : ", ufunc_name, opname);
+ PyObject_Print((PyObject *)PyArray_DESCR(arr), stdout, 0);
+ printf("\n");
+ printf("Index size is %d\n", (int)ind_size);
+#endif
+
+ if (PyUFunc_GetPyValues(opname, &buffersize, &errormask, &errobj) < 0) {
return NULL;
}
- NPY_LOOP_BEGIN_THREADS;
- switch(loop->meth) {
- case ZERO_EL_REDUCELOOP:
- /* zero-length index -- return array immediately */
- /* fprintf(stderr, "ZERO..\n"); */
- break;
- case NOBUFFER_UFUNCLOOP:
- /* Reduceat
- * NOBUFFER -- behaved array and same type
- */
- /* fprintf(stderr, "NOBUFFER..%d\n", loop->size); */
- while (loop->index < loop->size) {
- ptr = (intp *)ind->data;
- for (i = 0; i < nn; i++) {
- loop->bufptr[1] = loop->it->dataptr + (*ptr)*loop->steps[1];
- if (loop->obj & UFUNC_OBJ_ISOBJECT) {
- Py_XINCREF(*((PyObject **)loop->bufptr[1]));
- }
- memcpy(loop->bufptr[0], loop->bufptr[1], loop->outsize);
- mm = (i == nn - 1 ? arr->dimensions[axis] - *ptr :
- *(ptr + 1) - *ptr) - 1;
- if (mm > 0) {
- loop->bufptr[1] += loop->steps[1];
- loop->bufptr[2] = loop->bufptr[0];
- loop->function((char **)loop->bufptr, &mm,
- loop->steps, loop->funcdata);
- UFUNC_CHECK_ERROR(loop);
- }
- loop->bufptr[0] += loop->ret->strides[axis];
- ptr++;
- }
- PyArray_ITER_NEXT(loop->it);
- PyArray_ITER_NEXT(loop->rit);
- loop->bufptr[0] = loop->rit->dataptr;
- loop->index++;
+ /* Take a reference to out for later returning */
+ Py_XINCREF(out);
+
+ otype_final = otype;
+ if (get_binary_op_function(self, &otype_final,
+ &innerloop, &innerloopdata) < 0) {
+ PyArray_Descr *dtype = PyArray_DescrFromType(otype);
+ PyErr_Format(PyExc_ValueError,
+ "could not find a matching type for %s.%s, "
+ "requested type has type code '%c'",
+ ufunc_name, opname, dtype ? dtype->type : '-');
+ Py_XDECREF(dtype);
+ goto fail;
+ }
+
+ ndim = PyArray_NDIM(arr);
+
+ /* Set up the output data type */
+ op_dtypes[0] = PyArray_DescrFromType(otype_final);
+ if (op_dtypes[0] == NULL) {
+ goto fail;
+ }
+
+#if NPY_UF_DBG_TRACING
+ printf("Found %s.%s inner loop with dtype : ", ufunc_name, opname);
+ PyObject_Print((PyObject *)op_dtypes[0], stdout, 0);
+ printf("\n");
+#endif
+
+ /* Set up the op_axes for the outer loop */
+ for (i = 0, idim = 0; idim < ndim; ++idim) {
+ /* Use the i-th iteration dimension to match up ind */
+ if (idim == axis) {
+ op_axes_arrays[0][idim] = axis;
+ op_axes_arrays[1][idim] = -1;
+ op_axes_arrays[2][idim] = 0;
}
- break;
+ else {
+ op_axes_arrays[0][idim] = idim;
+ op_axes_arrays[1][idim] = idim;
+ op_axes_arrays[2][idim] = -1;
+ }
+ }
+
+ op[0] = out;
+ op[1] = arr;
+ op[2] = ind;
+
+ /* Likewise with accumulate, must do UPDATEIFCOPY */
+ if (out != NULL || ndim > 1 || !PyArray_ISALIGNED(arr) ||
+ !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(arr))) {
+ need_outer_iterator = 1;
+ }
- case BUFFER_UFUNCLOOP:
- /* Reduceat
- * BUFFER -- misbehaved array or different types
+ if (need_outer_iterator) {
+ npy_uint32 flags = NPY_ITER_ZEROSIZE_OK|
+ NPY_ITER_REFS_OK|
+ NPY_ITER_COORDS;
+
+ /*
+ * The way reduceat is set up, we can't do buffering,
+ * so make a copy instead when necessary.
*/
- /* fprintf(stderr, "BUFFERED..%d\n", loop->size); */
- while (loop->index < loop->size) {
- ptr = (intp *)ind->data;
- for (i = 0; i < nn; i++) {
- if (loop->obj & UFUNC_OBJ_ISOBJECT) {
- Py_XINCREF(*((PyObject **)loop->idptr));
+
+ /* The per-operand flags for the outer loop */
+ op_flags[0] = NPY_ITER_READWRITE|
+ NPY_ITER_NO_BROADCAST|
+ NPY_ITER_ALLOCATE|
+ NPY_ITER_NO_SUBTYPE|
+ NPY_ITER_UPDATEIFCOPY|
+ NPY_ITER_ALIGNED;
+ op_flags[1] = NPY_ITER_READONLY|
+ NPY_ITER_COPY|
+ NPY_ITER_ALIGNED;
+ op_flags[2] = NPY_ITER_READONLY;
+
+ op_dtypes[1] = op_dtypes[0];
+
+ NPY_UF_DBG_PRINTF("Allocating outer iterator\n");
+ iter = NpyIter_MultiNew(3, op, flags,
+ NPY_KEEPORDER, NPY_UNSAFE_CASTING,
+ op_flags,
+ op_dtypes,
+ ndim, op_axes, 0);
+ if (iter == NULL) {
+ goto fail;
+ }
+
+ /* Remove the inner loop axis from the outer iterator */
+ if (NpyIter_RemoveAxis(iter, axis) != NPY_SUCCEED) {
+ goto fail;
+ }
+ if (NpyIter_RemoveCoords(iter) != NPY_SUCCEED) {
+ goto fail;
+ }
+
+ /* In case COPY or UPDATEIFCOPY occurred */
+ op[0] = NpyIter_GetOperandArray(iter)[0];
+ op[1] = NpyIter_GetOperandArray(iter)[1];
+
+ if (out == NULL) {
+ out = op[0];
+ Py_INCREF(out);
+ }
+ }
+ /* Allocate the output for when there's no outer iterator */
+ else if (out == NULL) {
+ Py_INCREF(op_dtypes[0]);
+ op[0] = out = (PyArrayObject *)PyArray_NewFromDescr(
+ &PyArray_Type, op_dtypes[0],
+ 1, &ind_size, NULL, NULL,
+ 0, NULL);
+ if (out == NULL) {
+ goto fail;
+ }
+ }
+
+ /*
+ * If the output has zero elements, return now.
+ */
+ if (PyArray_SIZE(op[0]) == 0) {
+ goto finish;
+ }
+
+ if (iter && NpyIter_GetIterSize(iter) != 0) {
+ char *dataptr_copy[3];
+ npy_intp stride_copy[3];
+
+ NpyIter_IterNext_Fn iternext;
+ char **dataptr;
+ npy_intp *stride;
+ npy_intp *count_ptr;
+ npy_intp stride0_ind = PyArray_STRIDE(op[0], axis);
+
+ int itemsize = op_dtypes[0]->elsize;
+
+ /* Get the variables needed for the loop */
+ iternext = NpyIter_GetIterNext(iter, NULL);
+ if (iternext == NULL) {
+ goto fail;
+ }
+ dataptr = NpyIter_GetDataPtrArray(iter);
+ stride = NpyIter_GetInnerStrideArray(iter);
+ count_ptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+
+ /* Execute the loop with just the outer iterator */
+ npy_intp count_m1 = PyArray_DIM(op[1], axis)-1;
+ npy_intp stride0 = 0, stride1 = PyArray_STRIDE(op[1], axis);
+
+ NPY_UF_DBG_PRINTF("UFunc: Reduce loop with just outer iterator\n");
+
+ stride_copy[0] = stride0;
+ stride_copy[1] = stride1;
+ stride_copy[2] = stride0;
+
+ needs_api = NpyIter_IterationNeedsAPI(iter);
+
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
+ }
+
+ do {
+
+ for (i = 0; i < ind_size; ++i) {
+ npy_intp start = reduceat_ind[i],
+ end = (i == ind_size-1) ? count_m1+1 :
+ reduceat_ind[i+1];
+ npy_intp count = end - start;
+
+ dataptr_copy[0] = dataptr[0] + stride0_ind*i;
+ dataptr_copy[1] = dataptr[1] + stride1*start;
+ dataptr_copy[2] = dataptr[0] + stride0_ind*i;
+
+ /* Copy the first element to start the reduction */
+ if (otype == NPY_OBJECT) {
+ Py_XDECREF(*(PyObject **)dataptr_copy[0]);
+ *(PyObject **)dataptr_copy[0] =
+ *(PyObject **)dataptr_copy[1];
+ Py_XINCREF(*(PyObject **)dataptr_copy[0]);
}
- memcpy(loop->bufptr[0], loop->idptr, loop->outsize);
- n = 0;
- mm = (i == nn - 1 ? arr->dimensions[axis] - *ptr :
- *(ptr + 1) - *ptr);
- if (mm < 1) {
- mm = 1;
+ else {
+ memcpy(dataptr_copy[0], dataptr_copy[1], itemsize);
}
- loop->inptr = loop->it->dataptr + (*ptr)*loop->instrides;
- while (n < mm) {
- /* Copy up to loop->bufsize elements to buffer */
- dptr = loop->buffer;
- for (j = 0; j < loop->bufsize; j++, n++) {
- if (n == mm) {
- break;
- }
- arr->descr->f->copyswap(dptr, loop->inptr,
- loop->swap, NULL);
- loop->inptr += loop->instrides;
- dptr += loop->insize;
- }
- if (loop->cast) {
- loop->cast(loop->buffer, loop->castbuf, j, NULL, NULL);
- }
- loop->bufptr[2] = loop->bufptr[0];
- loop->function((char **)loop->bufptr, &j,
- loop->steps, loop->funcdata);
- UFUNC_CHECK_ERROR(loop);
- loop->bufptr[0] += j*loop->steps[0];
+
+ if (count > 1) {
+ /* Inner loop like REDUCE */
+ --count;
+ dataptr_copy[1] += stride1;
+ NPY_UF_DBG_PRINTF("iterator loop count %d\n",
+ (int)count);
+ innerloop(dataptr_copy, &count,
+ stride_copy, innerloopdata);
}
- loop->bufptr[0] += loop->ret->strides[axis];
- ptr++;
}
- PyArray_ITER_NEXT(loop->it);
- PyArray_ITER_NEXT(loop->rit);
- loop->bufptr[0] = loop->rit->dataptr;
- loop->index++;
+ } while (iternext(iter));
+
+ if (!needs_api) {
+ NPY_END_THREADS;
}
+ }
+ else if (iter == NULL) {
+ char *dataptr_copy[3];
+ npy_intp stride_copy[3];
- /*
- * DECREF left-over objects if buffering was used.
- * It is needed when casting created new objects in
- * castbuf. Intermediate copying into castbuf (via
- * loop->function) decref'd what was already there.
+ int itemsize = op_dtypes[0]->elsize;
- * It's the final copy into the castbuf that needs a DECREF.
- */
+ npy_intp stride0_ind = PyArray_STRIDE(op[0], axis);
+
+ /* Execute the loop with no iterators */
+ npy_intp stride0 = 0, stride1 = PyArray_STRIDE(op[1], axis);
+
+ needs_api = PyDataType_REFCHK(op_dtypes[0]);
+
+ NPY_UF_DBG_PRINTF("UFunc: Reduce loop with no iterators\n");
+
+ stride_copy[0] = stride0;
+ stride_copy[1] = stride1;
+ stride_copy[2] = stride0;
+
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
+ }
+
+ for (i = 0; i < ind_size; ++i) {
+ npy_intp start = reduceat_ind[i],
+ end = (i == ind_size-1) ? PyArray_DIM(arr,axis) :
+ reduceat_ind[i+1];
+ npy_intp count = end - start;
+
+ dataptr_copy[0] = PyArray_BYTES(op[0]) + stride0_ind*i;
+ dataptr_copy[1] = PyArray_BYTES(op[1]) + stride1*start;
+ dataptr_copy[2] = PyArray_BYTES(op[0]) + stride0_ind*i;
+
+ /* Copy the first element to start the reduction */
+ if (otype == NPY_OBJECT) {
+ Py_XDECREF(*(PyObject **)dataptr_copy[0]);
+ *(PyObject **)dataptr_copy[0] =
+ *(PyObject **)dataptr_copy[1];
+ Py_XINCREF(*(PyObject **)dataptr_copy[0]);
+ }
+ else {
+ memcpy(dataptr_copy[0], dataptr_copy[1], itemsize);
+ }
- /* Only when casting needed and it is from a non-object array */
- if ((loop->obj & UFUNC_OBJ_ISOBJECT) && loop->cast &&
- (!PyArray_ISOBJECT(arr))) {
- for (i=0; i<loop->bufsize; i++) {
- Py_CLEAR(((PyObject **)loop->castbuf)[i]);
+ if (count > 1) {
+ /* Inner loop like REDUCE */
+ --count;
+ dataptr_copy[1] += stride1;
+ NPY_UF_DBG_PRINTF("iterator loop count %d\n",
+ (int)count);
+ innerloop(dataptr_copy, &count,
+ stride_copy, innerloopdata);
}
}
- break;
- }
- NPY_LOOP_END_THREADS;
- /* Hang on to this reference -- will be decref'd with loop */
- if (loop->retbase) {
- ret = (PyArrayObject *)loop->ret->base;
+ if (!needs_api) {
+ NPY_END_THREADS;
+ }
}
- else {
- ret = loop->ret;
+
+finish:
+ Py_XDECREF(op_dtypes[0]);
+ if (iter != NULL) {
+ NpyIter_Deallocate(iter);
}
- Py_INCREF(ret);
- ufuncreduce_dealloc(loop);
- return (PyObject *)ret;
+ return (PyObject *)out;
fail:
- NPY_LOOP_END_THREADS;
- if (loop) {
- ufuncreduce_dealloc(loop);
+ Py_XDECREF(out);
+ Py_XDECREF(op_dtypes[0]);
+
+ if (iter != NULL) {
+ NpyIter_Deallocate(iter);
}
+
+ Py_XDECREF(errobj);
+
return NULL;
}
@@ -3250,10 +3755,8 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args,
indtype = PyArray_DescrFromType(PyArray_INTP);
if(!PyArg_ParseTupleAndKeywords(args, kwds, "OO|iO&O&", kwlist2,
&op, &obj_ind, &axis,
- PyArray_DescrConverter2,
- &otype,
- PyArray_OutputConverter,
- &out)) {
+ PyArray_DescrConverter2, &otype,
+ PyArray_OutputConverter, &out)) {
Py_XDECREF(otype);
return NULL;
}
@@ -3267,10 +3770,8 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args,
else {
if(!PyArg_ParseTupleAndKeywords(args, kwds, "O|iO&O&", kwlist1,
&op, &axis,
- PyArray_DescrConverter2,
- &otype,
- PyArray_OutputConverter,
- &out)) {
+ PyArray_DescrConverter2, &otype,
+ PyArray_OutputConverter, &out)) {
Py_XDECREF(otype);
return NULL;
}
@@ -3329,7 +3830,7 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args,
* is used for add and multiply reduction to avoid overflow
*/
int typenum = PyArray_TYPE(mp);
- if ((typenum < NPY_HALF)
+ if ((PyTypeNum_ISBOOL(typenum) || PyTypeNum_ISINTEGER(typenum))
&& ((strcmp(self->name,"add") == 0)
|| (strcmp(self->name,"multiply") == 0))) {
if (PyTypeNum_ISBOOL(typenum)) {
@@ -3401,7 +3902,8 @@ PyUFunc_GenericReduction(PyUFuncObject *self, PyObject *args,
* should just have PyArray_Return called.
*/
static void
-_find_array_wrap(PyObject *args, PyObject **output_wrap, int nin, int nout)
+_find_array_wrap(PyObject *args, PyObject *kwds,
+ PyObject **output_wrap, int nin, int nout)
{
Py_ssize_t nargs;
int i;
@@ -3471,12 +3973,22 @@ _find_array_wrap(PyObject *args, PyObject **output_wrap, int nin, int nout)
int j = nin + i;
int incref = 1;
output_wrap[i] = wrap;
+ obj = NULL;
if (j < nargs) {
obj = PyTuple_GET_ITEM(args, j);
- if (obj == Py_None) {
- continue;
+ /* Output argument one may also be in a keyword argument */
+ if (i == 0 && obj == Py_None && kwds != NULL) {
+ obj = PyDict_GetItemString(kwds, "out");
}
+ }
+ /* Output argument one may also be in a keyword argument */
+ else if (i == 0 && kwds != NULL) {
+ obj = PyDict_GetItemString(kwds, "out");
+ }
+
+ if (obj != Py_None && obj != NULL) {
if (PyArray_CheckExact(obj)) {
+ /* None signals to not call any wrapping */
output_wrap[i] = Py_None;
}
else {
@@ -3491,6 +4003,7 @@ _find_array_wrap(PyObject *args, PyObject **output_wrap, int nin, int nout)
output_wrap[i] = owrap;
}
}
+
if (incref) {
Py_XINCREF(output_wrap[i]);
}
@@ -3499,6 +4012,7 @@ _find_array_wrap(PyObject *args, PyObject **output_wrap, int nin, int nout)
return;
}
+
static PyObject *
ufunc_generic_call(PyUFuncObject *self, PyObject *args, PyObject *kwds)
{
@@ -3531,13 +4045,17 @@ ufunc_generic_call(PyUFuncObject *self, PyObject *args, PyObject *kwds)
return Py_NotImplemented;
}
else {
- PyErr_SetString(PyExc_NotImplementedError, "Not implemented for this type");
+ PyErr_SetString(PyExc_NotImplementedError,
+ "Not implemented for this type");
return NULL;
}
}
+
+ /* Free the input references */
for (i = 0; i < self->nin; i++) {
Py_DECREF(mps[i]);
}
+
/*
* Use __array_wrap__ on all outputs
* if present on one of the input arguments.
@@ -3555,25 +4073,13 @@ ufunc_generic_call(PyUFuncObject *self, PyObject *args, PyObject *kwds)
* None --- array-object passed in don't call PyArray_Return
* method --- the __array_wrap__ method to call.
*/
- _find_array_wrap(args, wraparr, self->nin, self->nout);
+ _find_array_wrap(args, kwds, wraparr, self->nin, self->nout);
/* wrap outputs */
for (i = 0; i < self->nout; i++) {
int j = self->nin+i;
- PyObject *wrap;
- /*
- * check to see if any UPDATEIFCOPY flags are set
- * which meant that a temporary output was generated
- */
- if (mps[j]->flags & UPDATEIFCOPY) {
- PyObject *old = mps[j]->base;
- /* we want to hang on to this */
- Py_INCREF(old);
- /* should trigger the copyback into old */
- Py_DECREF(mps[j]);
- mps[j] = (PyArrayObject *)old;
- }
- wrap = wraparr[i];
+ PyObject *wrap = wraparr[i];
+
if (wrap != NULL) {
if (wrap == Py_None) {
Py_DECREF(wrap);
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 03779f83c..0fff50a39 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -238,7 +238,54 @@ class TestScalarIndexing(TestCase):
def subscript(x, i): x[i]
self.assertRaises(IndexError, subscript, a, (newaxis, 0))
self.assertRaises(IndexError, subscript, a, (newaxis,)*50)
-
+
+ def test_overlapping_assignment(self):
+ # With positive strides
+ a = np.arange(4)
+ a[:-1] = a[1:]
+ assert_equal(a, [1,2,3,3])
+
+ a = np.arange(4)
+ a[1:] = a[:-1]
+ assert_equal(a, [0,0,1,2])
+
+ # With positive and negative strides
+ a = np.arange(4)
+ a[:] = a[::-1]
+ assert_equal(a, [3,2,1,0])
+
+ a = np.arange(6).reshape(2,3)
+ a[::-1,:] = a[:,::-1]
+ assert_equal(a, [[5,4,3],[2,1,0]])
+
+ a = np.arange(6).reshape(2,3)
+ a[::-1,::-1] = a[:,::-1]
+ assert_equal(a, [[3,4,5],[0,1,2]])
+
+ # With just one element overlapping
+ a = np.arange(5)
+ a[:3] = a[2:]
+ assert_equal(a, [2,3,4,3,4])
+
+ a = np.arange(5)
+ a[2:] = a[:3]
+ assert_equal(a, [0,1,0,1,2])
+
+ a = np.arange(5)
+ a[2::-1] = a[2:]
+ assert_equal(a, [4,3,2,3,4])
+
+ a = np.arange(5)
+ a[2:] = a[2::-1]
+ assert_equal(a, [0,1,2,1,0])
+
+ a = np.arange(5)
+ a[2::-1] = a[:1:-1]
+ assert_equal(a, [2,3,4,3,4])
+
+ a = np.arange(5)
+ a[:1:-1] = a[2::-1]
+ assert_equal(a, [0,1,0,1,2])
class TestCreation(TestCase):
def test_from_attribute(self):
@@ -628,18 +675,51 @@ class TestMethods(TestCase):
def test_ravel(self):
a = np.array([[0,1],[2,3]])
assert_equal(a.ravel(), [0,1,2,3])
+ assert_(not a.ravel().flags.owndata)
assert_equal(a.ravel('F'), [0,2,1,3])
assert_equal(a.ravel(order='C'), [0,1,2,3])
assert_equal(a.ravel(order='F'), [0,2,1,3])
assert_equal(a.ravel(order='A'), [0,1,2,3])
+ assert_(not a.ravel(order='A').flags.owndata)
+ assert_equal(a.ravel(order='K'), [0,1,2,3])
+ assert_(not a.ravel(order='K').flags.owndata)
assert_equal(a.ravel(), a.reshape(-1))
a = np.array([[0,1],[2,3]], order='F')
assert_equal(a.ravel(), [0,1,2,3])
assert_equal(a.ravel(order='A'), [0,2,1,3])
+ assert_equal(a.ravel(order='K'), [0,2,1,3])
+ assert_(not a.ravel(order='A').flags.owndata)
+ assert_(not a.ravel(order='K').flags.owndata)
assert_equal(a.ravel(), a.reshape(-1))
assert_equal(a.ravel(order='A'), a.reshape(-1, order='A'))
+ a = np.array([[0,1],[2,3]])[::-1,:]
+ assert_equal(a.ravel(), [2,3,0,1])
+ assert_equal(a.ravel(order='C'), [2,3,0,1])
+ assert_equal(a.ravel(order='F'), [2,0,3,1])
+ assert_equal(a.ravel(order='A'), [2,3,0,1])
+ # 'K' doesn't reverse the axes of negative strides
+ assert_equal(a.ravel(order='K'), [2,3,0,1])
+ assert_(a.ravel(order='K').flags.owndata)
+
+ def test_setasflat(self):
+ # In this case, setasflat can treat a as a flat array,
+ # and must treat b in chunks of 3
+ a = np.arange(3*3*4).reshape(3,3,4)
+ b = np.arange(3*4*3, dtype='f4').reshape(3,4,3).T
+
+ assert_(not np.all(a.ravel() == b.ravel()))
+ a.setasflat(b)
+ assert_equal(a.ravel(), b.ravel())
+
+ # A case where the strides of neither a nor b can be collapsed
+ a = np.arange(3*2*4).reshape(3,2,4)[:,:,:-1]
+ b = np.arange(3*3*3, dtype='f4').reshape(3,3,3).T[:,:,:-1]
+
+ assert_(not np.all(a.ravel() == b.ravel()))
+ a.setasflat(b)
+ assert_equal(a.ravel(), b.ravel())
class TestSubscripting(TestCase):
def test_test_zero_rank(self):
diff --git a/numpy/core/tests/test_new_iterator.py b/numpy/core/tests/test_new_iterator.py
new file mode 100644
index 000000000..895ae4e78
--- /dev/null
+++ b/numpy/core/tests/test_new_iterator.py
@@ -0,0 +1,2174 @@
+import numpy as np
+from numpy import array, arange, newiter
+from numpy.testing import *
+import sys, warnings
+
+
+
+def iter_coords(i):
+ ret = []
+ while not i.finished:
+ ret.append(i.coords)
+ i.iternext()
+ return ret
+
+def iter_indices(i):
+ ret = []
+ while not i.finished:
+ ret.append(i.index)
+ i.iternext()
+ return ret
+
+def iter_iterindices(i):
+ ret = []
+ while not i.finished:
+ ret.append(i.iterindex)
+ i.iternext()
+ return ret
+
+def test_iter_refcount():
+ # Make sure the iterator doesn't leak
+
+ # Basic
+ a = arange(6)
+ dt = np.dtype('f4').newbyteorder()
+ rc_a = sys.getrefcount(a)
+ rc_dt = sys.getrefcount(dt)
+ it = newiter(a, [],
+ [['readwrite','updateifcopy']],
+ casting='unsafe',
+ op_dtypes=[dt])
+ assert_(not it.iterationneedsapi)
+ assert_(sys.getrefcount(a) > rc_a)
+ assert_(sys.getrefcount(dt) > rc_dt)
+ it = None
+ assert_equal(sys.getrefcount(a), rc_a)
+ assert_equal(sys.getrefcount(dt), rc_dt)
+
+ # With a copy
+ a = arange(6, dtype='f4')
+ dt = np.dtype('f4')
+ rc_a = sys.getrefcount(a)
+ rc_dt = sys.getrefcount(dt)
+ it = newiter(a, [],
+ [['readwrite']],
+ op_dtypes=[dt])
+ rc2_a = sys.getrefcount(a)
+ rc2_dt = sys.getrefcount(dt)
+ it2 = it.copy()
+ assert_(sys.getrefcount(a) > rc2_a)
+ assert_(sys.getrefcount(dt) > rc2_dt)
+ it = None
+ assert_equal(sys.getrefcount(a), rc2_a)
+ assert_equal(sys.getrefcount(dt), rc2_dt)
+ it2 = None
+ assert_equal(sys.getrefcount(a), rc_a)
+ assert_equal(sys.getrefcount(dt), rc_dt)
+
+def test_iter_best_order():
+ # The iterator should always find the iteration order
+ # with increasing memory addresses
+
+ # Test the ordering for 1-D to 5-D shapes
+ for shape in [(5,), (3,4), (2,3,4), (2,3,4,3), (2,3,2,2,3)]:
+ a = arange(np.prod(shape))
+ # Test each combination of positive and negative strides
+ for dirs in range(2**len(shape)):
+ dirs_index = [slice(None)]*len(shape)
+ for bit in range(len(shape)):
+ if ((2**bit)&dirs):
+ dirs_index[bit] = slice(None,None,-1)
+ dirs_index = tuple(dirs_index)
+
+ aview = a.reshape(shape)[dirs_index]
+ # C-order
+ i = newiter(aview, [], [['readonly']])
+ assert_equal([x for x in i], a)
+ # Fortran-order
+ i = newiter(aview.T, [], [['readonly']])
+ assert_equal([x for x in i], a)
+ # Other order
+ if len(shape) > 2:
+ i = newiter(aview.swapaxes(0,1), [], [['readonly']])
+ assert_equal([x for x in i], a)
+
+def test_iter_c_order():
+ # Test forcing C order
+
+ # Test the ordering for 1-D to 5-D shapes
+ for shape in [(5,), (3,4), (2,3,4), (2,3,4,3), (2,3,2,2,3)]:
+ a = arange(np.prod(shape))
+ # Test each combination of positive and negative strides
+ for dirs in range(2**len(shape)):
+ dirs_index = [slice(None)]*len(shape)
+ for bit in range(len(shape)):
+ if ((2**bit)&dirs):
+ dirs_index[bit] = slice(None,None,-1)
+ dirs_index = tuple(dirs_index)
+
+ aview = a.reshape(shape)[dirs_index]
+ # C-order
+ i = newiter(aview, order='C')
+ assert_equal([x for x in i], aview.ravel(order='C'))
+ # Fortran-order
+ i = newiter(aview.T, order='C')
+ assert_equal([x for x in i], aview.T.ravel(order='C'))
+ # Other order
+ if len(shape) > 2:
+ i = newiter(aview.swapaxes(0,1), order='C')
+ assert_equal([x for x in i],
+ aview.swapaxes(0,1).ravel(order='C'))
+
+def test_iter_f_order():
+ # Test forcing F order
+
+ # Test the ordering for 1-D to 5-D shapes
+ for shape in [(5,), (3,4), (2,3,4), (2,3,4,3), (2,3,2,2,3)]:
+ a = arange(np.prod(shape))
+ # Test each combination of positive and negative strides
+ for dirs in range(2**len(shape)):
+ dirs_index = [slice(None)]*len(shape)
+ for bit in range(len(shape)):
+ if ((2**bit)&dirs):
+ dirs_index[bit] = slice(None,None,-1)
+ dirs_index = tuple(dirs_index)
+
+ aview = a.reshape(shape)[dirs_index]
+ # C-order
+ i = newiter(aview, order='F')
+ assert_equal([x for x in i], aview.ravel(order='F'))
+ # Fortran-order
+ i = newiter(aview.T, order='F')
+ assert_equal([x for x in i], aview.T.ravel(order='F'))
+ # Other order
+ if len(shape) > 2:
+ i = newiter(aview.swapaxes(0,1), order='F')
+ assert_equal([x for x in i],
+ aview.swapaxes(0,1).ravel(order='F'))
+
+def test_iter_c_or_f_order():
+ # Test forcing any contiguous (C or F) order
+
+ # Test the ordering for 1-D to 5-D shapes
+ for shape in [(5,), (3,4), (2,3,4), (2,3,4,3), (2,3,2,2,3)]:
+ a = arange(np.prod(shape))
+ # Test each combination of positive and negative strides
+ for dirs in range(2**len(shape)):
+ dirs_index = [slice(None)]*len(shape)
+ for bit in range(len(shape)):
+ if ((2**bit)&dirs):
+ dirs_index[bit] = slice(None,None,-1)
+ dirs_index = tuple(dirs_index)
+
+ aview = a.reshape(shape)[dirs_index]
+ # C-order
+ i = newiter(aview, order='A')
+ assert_equal([x for x in i], aview.ravel(order='A'))
+ # Fortran-order
+ i = newiter(aview.T, order='A')
+ assert_equal([x for x in i], aview.T.ravel(order='A'))
+ # Other order
+ if len(shape) > 2:
+ i = newiter(aview.swapaxes(0,1), order='A')
+ assert_equal([x for x in i],
+ aview.swapaxes(0,1).ravel(order='A'))
+
+def test_iter_best_order_coords_1d():
+ # The coords should be correct with any reordering
+
+ a = arange(4)
+ # 1D order
+ i = newiter(a,['coords'],[['readonly']])
+ assert_equal(iter_coords(i), [(0,),(1,),(2,),(3,)])
+ # 1D reversed order
+ i = newiter(a[::-1],['coords'],[['readonly']])
+ assert_equal(iter_coords(i), [(3,),(2,),(1,),(0,)])
+
+def test_iter_best_order_coords_2d():
+ # The coords should be correct with any reordering
+
+ a = arange(6)
+ # 2D C-order
+ i = newiter(a.reshape(2,3),['coords'],[['readonly']])
+ assert_equal(iter_coords(i), [(0,0),(0,1),(0,2),(1,0),(1,1),(1,2)])
+ # 2D Fortran-order
+ i = newiter(a.reshape(2,3).copy(order='F'),['coords'],[['readonly']])
+ assert_equal(iter_coords(i), [(0,0),(1,0),(0,1),(1,1),(0,2),(1,2)])
+ # 2D reversed C-order
+ i = newiter(a.reshape(2,3)[::-1],['coords'],[['readonly']])
+ assert_equal(iter_coords(i), [(1,0),(1,1),(1,2),(0,0),(0,1),(0,2)])
+ i = newiter(a.reshape(2,3)[:,::-1],['coords'],[['readonly']])
+ assert_equal(iter_coords(i), [(0,2),(0,1),(0,0),(1,2),(1,1),(1,0)])
+ i = newiter(a.reshape(2,3)[::-1,::-1],['coords'],[['readonly']])
+ assert_equal(iter_coords(i), [(1,2),(1,1),(1,0),(0,2),(0,1),(0,0)])
+ # 2D reversed Fortran-order
+ i = newiter(a.reshape(2,3).copy(order='F')[::-1],['coords'],[['readonly']])
+ assert_equal(iter_coords(i), [(1,0),(0,0),(1,1),(0,1),(1,2),(0,2)])
+ i = newiter(a.reshape(2,3).copy(order='F')[:,::-1],
+ ['coords'],[['readonly']])
+ assert_equal(iter_coords(i), [(0,2),(1,2),(0,1),(1,1),(0,0),(1,0)])
+ i = newiter(a.reshape(2,3).copy(order='F')[::-1,::-1],
+ ['coords'],[['readonly']])
+ assert_equal(iter_coords(i), [(1,2),(0,2),(1,1),(0,1),(1,0),(0,0)])
+
+def test_iter_best_order_coords_3d():
+ # The coords should be correct with any reordering
+
+ a = arange(12)
+ # 3D C-order
+ i = newiter(a.reshape(2,3,2),['coords'],[['readonly']])
+ assert_equal(iter_coords(i),
+ [(0,0,0),(0,0,1),(0,1,0),(0,1,1),(0,2,0),(0,2,1),
+ (1,0,0),(1,0,1),(1,1,0),(1,1,1),(1,2,0),(1,2,1)])
+ # 3D Fortran-order
+ i = newiter(a.reshape(2,3,2).copy(order='F'),['coords'],[['readonly']])
+ assert_equal(iter_coords(i),
+ [(0,0,0),(1,0,0),(0,1,0),(1,1,0),(0,2,0),(1,2,0),
+ (0,0,1),(1,0,1),(0,1,1),(1,1,1),(0,2,1),(1,2,1)])
+ # 3D reversed C-order
+ i = newiter(a.reshape(2,3,2)[::-1],['coords'],[['readonly']])
+ assert_equal(iter_coords(i),
+ [(1,0,0),(1,0,1),(1,1,0),(1,1,1),(1,2,0),(1,2,1),
+ (0,0,0),(0,0,1),(0,1,0),(0,1,1),(0,2,0),(0,2,1)])
+ i = newiter(a.reshape(2,3,2)[:,::-1],['coords'],[['readonly']])
+ assert_equal(iter_coords(i),
+ [(0,2,0),(0,2,1),(0,1,0),(0,1,1),(0,0,0),(0,0,1),
+ (1,2,0),(1,2,1),(1,1,0),(1,1,1),(1,0,0),(1,0,1)])
+ i = newiter(a.reshape(2,3,2)[:,:,::-1],['coords'],[['readonly']])
+ assert_equal(iter_coords(i),
+ [(0,0,1),(0,0,0),(0,1,1),(0,1,0),(0,2,1),(0,2,0),
+ (1,0,1),(1,0,0),(1,1,1),(1,1,0),(1,2,1),(1,2,0)])
+ # 3D reversed Fortran-order
+ i = newiter(a.reshape(2,3,2).copy(order='F')[::-1],
+ ['coords'],[['readonly']])
+ assert_equal(iter_coords(i),
+ [(1,0,0),(0,0,0),(1,1,0),(0,1,0),(1,2,0),(0,2,0),
+ (1,0,1),(0,0,1),(1,1,1),(0,1,1),(1,2,1),(0,2,1)])
+ i = newiter(a.reshape(2,3,2).copy(order='F')[:,::-1],
+ ['coords'],[['readonly']])
+ assert_equal(iter_coords(i),
+ [(0,2,0),(1,2,0),(0,1,0),(1,1,0),(0,0,0),(1,0,0),
+ (0,2,1),(1,2,1),(0,1,1),(1,1,1),(0,0,1),(1,0,1)])
+ i = newiter(a.reshape(2,3,2).copy(order='F')[:,:,::-1],
+ ['coords'],[['readonly']])
+ assert_equal(iter_coords(i),
+ [(0,0,1),(1,0,1),(0,1,1),(1,1,1),(0,2,1),(1,2,1),
+ (0,0,0),(1,0,0),(0,1,0),(1,1,0),(0,2,0),(1,2,0)])
+
+def test_iter_best_order_c_index_1d():
+ # The C index should be correct with any reordering
+
+ a = arange(4)
+ # 1D order
+ i = newiter(a,['c_index'],[['readonly']])
+ assert_equal(iter_indices(i), [0,1,2,3])
+ # 1D reversed order
+ i = newiter(a[::-1],['c_index'],[['readonly']])
+ assert_equal(iter_indices(i), [3,2,1,0])
+
+def test_iter_best_order_c_index_2d():
+ # The C index should be correct with any reordering
+
+ a = arange(6)
+ # 2D C-order
+ i = newiter(a.reshape(2,3),['c_index'],[['readonly']])
+ assert_equal(iter_indices(i), [0,1,2,3,4,5])
+ # 2D Fortran-order
+ i = newiter(a.reshape(2,3).copy(order='F'),
+ ['c_index'],[['readonly']])
+ assert_equal(iter_indices(i), [0,3,1,4,2,5])
+ # 2D reversed C-order
+ i = newiter(a.reshape(2,3)[::-1],['c_index'],[['readonly']])
+ assert_equal(iter_indices(i), [3,4,5,0,1,2])
+ i = newiter(a.reshape(2,3)[:,::-1],['c_index'],[['readonly']])
+ assert_equal(iter_indices(i), [2,1,0,5,4,3])
+ i = newiter(a.reshape(2,3)[::-1,::-1],['c_index'],[['readonly']])
+ assert_equal(iter_indices(i), [5,4,3,2,1,0])
+ # 2D reversed Fortran-order
+ i = newiter(a.reshape(2,3).copy(order='F')[::-1],
+ ['c_index'],[['readonly']])
+ assert_equal(iter_indices(i), [3,0,4,1,5,2])
+ i = newiter(a.reshape(2,3).copy(order='F')[:,::-1],
+ ['c_index'],[['readonly']])
+ assert_equal(iter_indices(i), [2,5,1,4,0,3])
+ i = newiter(a.reshape(2,3).copy(order='F')[::-1,::-1],
+ ['c_index'],[['readonly']])
+ assert_equal(iter_indices(i), [5,2,4,1,3,0])
+
+def test_iter_best_order_c_index_3d():
+ # The C index should be correct with any reordering
+
+ a = arange(12)
+ # 3D C-order
+ i = newiter(a.reshape(2,3,2),['c_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [0,1,2,3,4,5,6,7,8,9,10,11])
+ # 3D Fortran-order
+ i = newiter(a.reshape(2,3,2).copy(order='F'),
+ ['c_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [0,6,2,8,4,10,1,7,3,9,5,11])
+ # 3D reversed C-order
+ i = newiter(a.reshape(2,3,2)[::-1],['c_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [6,7,8,9,10,11,0,1,2,3,4,5])
+ i = newiter(a.reshape(2,3,2)[:,::-1],['c_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [4,5,2,3,0,1,10,11,8,9,6,7])
+ i = newiter(a.reshape(2,3,2)[:,:,::-1],['c_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [1,0,3,2,5,4,7,6,9,8,11,10])
+ # 3D reversed Fortran-order
+ i = newiter(a.reshape(2,3,2).copy(order='F')[::-1],
+ ['c_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [6,0,8,2,10,4,7,1,9,3,11,5])
+ i = newiter(a.reshape(2,3,2).copy(order='F')[:,::-1],
+ ['c_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [4,10,2,8,0,6,5,11,3,9,1,7])
+ i = newiter(a.reshape(2,3,2).copy(order='F')[:,:,::-1],
+ ['c_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [1,7,3,9,5,11,0,6,2,8,4,10])
+
+def test_iter_best_order_f_index_1d():
+ # The Fortran index should be correct with any reordering
+
+ a = arange(4)
+ # 1D order
+ i = newiter(a,['f_index'],[['readonly']])
+ assert_equal(iter_indices(i), [0,1,2,3])
+ # 1D reversed order
+ i = newiter(a[::-1],['f_index'],[['readonly']])
+ assert_equal(iter_indices(i), [3,2,1,0])
+
+def test_iter_best_order_f_index_2d():
+ # The Fortran index should be correct with any reordering
+
+ a = arange(6)
+ # 2D C-order
+ i = newiter(a.reshape(2,3),['f_index'],[['readonly']])
+ assert_equal(iter_indices(i), [0,2,4,1,3,5])
+ # 2D Fortran-order
+ i = newiter(a.reshape(2,3).copy(order='F'),
+ ['f_index'],[['readonly']])
+ assert_equal(iter_indices(i), [0,1,2,3,4,5])
+ # 2D reversed C-order
+ i = newiter(a.reshape(2,3)[::-1],['f_index'],[['readonly']])
+ assert_equal(iter_indices(i), [1,3,5,0,2,4])
+ i = newiter(a.reshape(2,3)[:,::-1],['f_index'],[['readonly']])
+ assert_equal(iter_indices(i), [4,2,0,5,3,1])
+ i = newiter(a.reshape(2,3)[::-1,::-1],['f_index'],[['readonly']])
+ assert_equal(iter_indices(i), [5,3,1,4,2,0])
+ # 2D reversed Fortran-order
+ i = newiter(a.reshape(2,3).copy(order='F')[::-1],
+ ['f_index'],[['readonly']])
+ assert_equal(iter_indices(i), [1,0,3,2,5,4])
+ i = newiter(a.reshape(2,3).copy(order='F')[:,::-1],
+ ['f_index'],[['readonly']])
+ assert_equal(iter_indices(i), [4,5,2,3,0,1])
+ i = newiter(a.reshape(2,3).copy(order='F')[::-1,::-1],
+ ['f_index'],[['readonly']])
+ assert_equal(iter_indices(i), [5,4,3,2,1,0])
+
+def test_iter_best_order_f_index_3d():
+ # The Fortran index should be correct with any reordering
+
+ a = arange(12)
+ # 3D C-order
+ i = newiter(a.reshape(2,3,2),['f_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [0,6,2,8,4,10,1,7,3,9,5,11])
+ # 3D Fortran-order
+ i = newiter(a.reshape(2,3,2).copy(order='F'),
+ ['f_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [0,1,2,3,4,5,6,7,8,9,10,11])
+ # 3D reversed C-order
+ i = newiter(a.reshape(2,3,2)[::-1],['f_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [1,7,3,9,5,11,0,6,2,8,4,10])
+ i = newiter(a.reshape(2,3,2)[:,::-1],['f_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [4,10,2,8,0,6,5,11,3,9,1,7])
+ i = newiter(a.reshape(2,3,2)[:,:,::-1],['f_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [6,0,8,2,10,4,7,1,9,3,11,5])
+ # 3D reversed Fortran-order
+ i = newiter(a.reshape(2,3,2).copy(order='F')[::-1],
+ ['f_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [1,0,3,2,5,4,7,6,9,8,11,10])
+ i = newiter(a.reshape(2,3,2).copy(order='F')[:,::-1],
+ ['f_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [4,5,2,3,0,1,10,11,8,9,6,7])
+ i = newiter(a.reshape(2,3,2).copy(order='F')[:,:,::-1],
+ ['f_index'],[['readonly']])
+ assert_equal(iter_indices(i),
+ [6,7,8,9,10,11,0,1,2,3,4,5])
+
+def test_iter_no_inner_full_coalesce():
+ # Check no_inner iterators which coalesce into a single inner loop
+
+ for shape in [(5,), (3,4), (2,3,4), (2,3,4,3), (2,3,2,2,3)]:
+ size = np.prod(shape)
+ a = arange(size)
+ # Test each combination of forward and backwards indexing
+ for dirs in range(2**len(shape)):
+ dirs_index = [slice(None)]*len(shape)
+ for bit in range(len(shape)):
+ if ((2**bit)&dirs):
+ dirs_index[bit] = slice(None,None,-1)
+ dirs_index = tuple(dirs_index)
+
+ aview = a.reshape(shape)[dirs_index]
+ # C-order
+ i = newiter(aview, ['no_inner_iteration'], [['readonly']])
+ assert_equal(i.ndim, 1)
+ assert_equal(i[0].shape, (size,))
+ # Fortran-order
+ i = newiter(aview.T, ['no_inner_iteration'], [['readonly']])
+ assert_equal(i.ndim, 1)
+ assert_equal(i[0].shape, (size,))
+ # Other order
+ if len(shape) > 2:
+ i = newiter(aview.swapaxes(0,1),
+ ['no_inner_iteration'], [['readonly']])
+ assert_equal(i.ndim, 1)
+ assert_equal(i[0].shape, (size,))
+
+def test_iter_no_inner_dim_coalescing():
+ # Check no_inner iterators whose dimensions may not coalesce completely
+
+ # Skipping the last element in a dimension prevents coalescing
+ # with the next-bigger dimension
+ a = arange(24).reshape(2,3,4)[:,:,:-1]
+ i = newiter(a, ['no_inner_iteration'], [['readonly']])
+ assert_equal(i.ndim, 2)
+ assert_equal(i[0].shape, (3,))
+ a = arange(24).reshape(2,3,4)[:,:-1,:]
+ i = newiter(a, ['no_inner_iteration'], [['readonly']])
+ assert_equal(i.ndim, 2)
+ assert_equal(i[0].shape, (8,))
+ a = arange(24).reshape(2,3,4)[:-1,:,:]
+ i = newiter(a, ['no_inner_iteration'], [['readonly']])
+ assert_equal(i.ndim, 1)
+ assert_equal(i[0].shape, (12,))
+
+ # Even with lots of 1-sized dimensions, should still coalesce
+ a = arange(24).reshape(1,1,2,1,1,3,1,1,4,1,1)
+ i = newiter(a, ['no_inner_iteration'], [['readonly']])
+ assert_equal(i.ndim, 1)
+ assert_equal(i[0].shape, (24,))
+
+def test_iter_dim_coalescing():
+ # Check that the correct number of dimensions are coalesced
+
+ # Tracking coordinates disables coalescing
+ a = arange(24).reshape(2,3,4)
+ i = newiter(a, ['coords'], [['readonly']])
+ assert_equal(i.ndim, 3)
+
+ # A tracked index can allow coalescing if it's compatible with the array
+ a3d = arange(24).reshape(2,3,4)
+ i = newiter(a3d, ['c_index'], [['readonly']])
+ assert_equal(i.ndim, 1)
+ i = newiter(a3d.swapaxes(0,1), ['c_index'], [['readonly']])
+ assert_equal(i.ndim, 3)
+ i = newiter(a3d.T, ['c_index'], [['readonly']])
+ assert_equal(i.ndim, 3)
+ i = newiter(a3d.T, ['f_index'], [['readonly']])
+ assert_equal(i.ndim, 1)
+ i = newiter(a3d.T.swapaxes(0,1), ['f_index'], [['readonly']])
+ assert_equal(i.ndim, 3)
+
+ # When C or F order is forced, coalescing may still occur
+ a3d = arange(24).reshape(2,3,4)
+ i = newiter(a3d, order='C')
+ assert_equal(i.ndim, 1)
+ i = newiter(a3d.T, order='C')
+ assert_equal(i.ndim, 3)
+ i = newiter(a3d, order='F')
+ assert_equal(i.ndim, 3)
+ i = newiter(a3d.T, order='F')
+ assert_equal(i.ndim, 1)
+ i = newiter(a3d, order='A')
+ assert_equal(i.ndim, 1)
+ i = newiter(a3d.T, order='A')
+ assert_equal(i.ndim, 1)
+
+def test_iter_broadcasting():
+ # Standard NumPy broadcasting rules
+
+ # 1D with scalar
+ i = newiter([arange(6), np.int32(2)], ['coords'], [['readonly']]*2)
+ assert_equal(i.itersize, 6)
+ assert_equal(i.shape, (6,))
+
+ # 2D with scalar
+ i = newiter([arange(6).reshape(2,3), np.int32(2)],
+ ['coords'], [['readonly']]*2)
+ assert_equal(i.itersize, 6)
+ assert_equal(i.shape, (2,3))
+ # 2D with 1D
+ i = newiter([arange(6).reshape(2,3), arange(3)],
+ ['coords'], [['readonly']]*2)
+ assert_equal(i.itersize, 6)
+ assert_equal(i.shape, (2,3))
+ i = newiter([arange(2).reshape(2,1), arange(3)],
+ ['coords'], [['readonly']]*2)
+ assert_equal(i.itersize, 6)
+ assert_equal(i.shape, (2,3))
+ # 2D with 2D
+ i = newiter([arange(2).reshape(2,1), arange(3).reshape(1,3)],
+ ['coords'], [['readonly']]*2)
+ assert_equal(i.itersize, 6)
+ assert_equal(i.shape, (2,3))
+
+ # 3D with scalar
+ i = newiter([np.int32(2), arange(24).reshape(4,2,3)],
+ ['coords'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4,2,3))
+ # 3D with 1D
+ i = newiter([arange(3), arange(24).reshape(4,2,3)],
+ ['coords'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4,2,3))
+ i = newiter([arange(3), arange(8).reshape(4,2,1)],
+ ['coords'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4,2,3))
+ # 3D with 2D
+ i = newiter([arange(6).reshape(2,3), arange(24).reshape(4,2,3)],
+ ['coords'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4,2,3))
+ i = newiter([arange(2).reshape(2,1), arange(24).reshape(4,2,3)],
+ ['coords'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4,2,3))
+ i = newiter([arange(3).reshape(1,3), arange(8).reshape(4,2,1)],
+ ['coords'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4,2,3))
+ # 3D with 3D
+ i = newiter([arange(2).reshape(1,2,1), arange(3).reshape(1,1,3),
+ arange(4).reshape(4,1,1)],
+ ['coords'], [['readonly']]*3)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4,2,3))
+ i = newiter([arange(6).reshape(1,2,3), arange(4).reshape(4,1,1)],
+ ['coords'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4,2,3))
+ i = newiter([arange(24).reshape(4,2,3), arange(12).reshape(4,1,3)],
+ ['coords'], [['readonly']]*2)
+ assert_equal(i.itersize, 24)
+ assert_equal(i.shape, (4,2,3))
+
+def test_iter_broadcasting_errors():
+ # Check that errors are thrown for bad broadcasting shapes
+
+ # 1D with 1D
+ assert_raises(ValueError, newiter, [arange(2), arange(3)],
+ [], [['readonly']]*2)
+ # 2D with 1D
+ assert_raises(ValueError, newiter,
+ [arange(6).reshape(2,3), arange(2)],
+ [], [['readonly']]*2)
+ # 2D with 2D
+ assert_raises(ValueError, newiter,
+ [arange(6).reshape(2,3), arange(9).reshape(3,3)],
+ [], [['readonly']]*2)
+ assert_raises(ValueError, newiter,
+ [arange(6).reshape(2,3), arange(4).reshape(2,2)],
+ [], [['readonly']]*2)
+ # 3D with 3D
+ assert_raises(ValueError, newiter,
+ [arange(36).reshape(3,3,4), arange(24).reshape(2,3,4)],
+ [], [['readonly']]*2)
+ assert_raises(ValueError, newiter,
+ [arange(8).reshape(2,4,1), arange(24).reshape(2,3,4)],
+ [], [['readonly']]*2)
+
+
+def test_iter_flags_errors():
+ # Check that bad combinations of flags produce errors
+
+ a = arange(6)
+
+ # Not enough operands
+ assert_raises(ValueError, newiter, [], [], [])
+ # Too many operands
+ assert_raises(ValueError, newiter, [a]*100, [], [['readonly']]*100)
+ # Bad global flag
+ assert_raises(ValueError, newiter, [a], ['bad flag'], [['readonly']])
+ # Bad op flag
+ assert_raises(ValueError, newiter, [a], [], [['readonly','bad flag']])
+ # Bad order parameter
+ assert_raises(ValueError, newiter, [a], [], [['readonly']], order='G')
+ # Bad casting parameter
+ assert_raises(ValueError, newiter, [a], [], [['readonly']], casting='noon')
+ # op_flags must match ops
+ assert_raises(ValueError, newiter, [a]*3, [], [['readonly']]*2)
+ # Cannot track both a C and an F index
+ assert_raises(ValueError, newiter, a,
+ ['c_index','f_index'], [['readonly']])
+ # Inner iteration and coords/indices are incompatible
+ assert_raises(ValueError, newiter, a,
+ ['no_inner_iteration','coords'], [['readonly']])
+ assert_raises(ValueError, newiter, a,
+ ['no_inner_iteration','c_index'], [['readonly']])
+ assert_raises(ValueError, newiter, a,
+ ['no_inner_iteration','f_index'], [['readonly']])
+ # Must specify exactly one of readwrite/readonly/writeonly per operand
+ assert_raises(ValueError, newiter, a, [], [[]])
+ assert_raises(ValueError, newiter, a, [], [['readonly','writeonly']])
+ assert_raises(ValueError, newiter, a, [], [['readonly','readwrite']])
+ assert_raises(ValueError, newiter, a, [], [['writeonly','readwrite']])
+ assert_raises(ValueError, newiter, a,
+ [], [['readonly','writeonly','readwrite']])
+ # Python scalars are always readonly
+ assert_raises(TypeError, newiter, 1.5, [], [['writeonly']])
+ assert_raises(TypeError, newiter, 1.5, [], [['readwrite']])
+ # Array scalars are always readonly
+ assert_raises(TypeError, newiter, np.int32(1), [], [['writeonly']])
+ assert_raises(TypeError, newiter, np.int32(1), [], [['readwrite']])
+ # Check readonly array
+ a.flags.writeable = False
+ assert_raises(ValueError, newiter, a, [], [['writeonly']])
+ assert_raises(ValueError, newiter, a, [], [['readwrite']])
+ a.flags.writeable = True
+ # Coords and shape available only with the coords flag
+ i = newiter(arange(6), [], [['readonly']])
+ assert_raises(ValueError, lambda i:i.coords, i)
+ assert_raises(ValueError, lambda i:i.shape, i)
+ # Index available only with an index flag
+ assert_raises(ValueError, lambda i:i.index, i)
+ # GotoCoords and GotoIndex incompatible with buffering or no_inner
+ def assign_coords(i):
+ i.coords = (0,)
+ def assign_index(i):
+ i.index = 0
+    def assign_iterindex(i):
+        i.iterindex = 0
+    def assign_iterrange(i):
+        i.iterrange = (0,1)
+ i = newiter(arange(6), ['no_inner_iteration'])
+ assert_raises(ValueError, assign_coords, i)
+ assert_raises(ValueError, assign_index, i)
+ assert_raises(ValueError, assign_iterindex, i)
+ assert_raises(ValueError, assign_iterrange, i)
+ i = newiter(arange(6), ['buffered'])
+ assert_raises(ValueError, assign_coords, i)
+ assert_raises(ValueError, assign_index, i)
+ assert_raises(ValueError, assign_iterrange, i)
+ # Can't iterate if size is zero
+ assert_raises(ValueError, newiter, np.array([]))
+
def test_iter_nbo_align_contig():
    """Check that byte order, alignment, and contig changes work.

    Each case requests a property the operand lacks and verifies that the
    iterator either exposes the array unchanged or makes a copy which is
    written back (UPDATEIFCOPY) when the iterator is released, so the
    ``i = None`` statements are load-bearing.
    """
    # Byte order change by requesting a specific dtype
    a = np.arange(6, dtype='f4')
    au = a.byteswap().newbyteorder()
    assert_(a.dtype.byteorder != au.dtype.byteorder)
    i = newiter(au, [], [['readwrite','updateifcopy']],
                casting='equiv',
                op_dtypes=[np.dtype('f4')])
    assert_equal(i.dtypes[0].byteorder, a.dtype.byteorder)
    assert_equal(i.operands[0].dtype.byteorder, a.dtype.byteorder)
    assert_equal(i.operands[0], a)
    i.operands[0][:] = 2
    # Releasing the iterator triggers the UPDATEIFCOPY write-back
    i = None
    assert_equal(au, [2]*6)

    # Byte order change by requesting NBO
    a = np.arange(6, dtype='f4')
    au = a.byteswap().newbyteorder()
    assert_(a.dtype.byteorder != au.dtype.byteorder)
    i = newiter(au, [], [['readwrite','updateifcopy','nbo']], casting='equiv')
    assert_equal(i.dtypes[0].byteorder, a.dtype.byteorder)
    assert_equal(i.operands[0].dtype.byteorder, a.dtype.byteorder)
    assert_equal(i.operands[0], a)
    i.operands[0][:] = 2
    i = None
    assert_equal(au, [2]*6)

    # Unaligned input, constructed by offsetting into a byte buffer
    a = np.zeros((6*4+1,), dtype='i1')[1:]
    a.dtype = 'f4'
    a[:] = np.arange(6, dtype='f4')
    assert_(not a.flags.aligned)
    # Without 'aligned', shouldn't copy
    i = newiter(a, [], [['readonly']])
    assert_(not i.operands[0].flags.aligned)
    assert_equal(i.operands[0], a)
    # With 'aligned', should make a copy
    i = newiter(a, [], [['readwrite','updateifcopy','aligned']])
    assert_(i.operands[0].flags.aligned)
    assert_equal(i.operands[0], a)
    i.operands[0][:] = 3
    i = None
    assert_equal(a, [3]*6)

    # Discontiguous input
    a = arange(12)
    # If it is contiguous, shouldn't copy
    i = newiter(a[:6], [], [['readonly']])
    assert_(i.operands[0].flags.contiguous)
    assert_equal(i.operands[0], a[:6])
    # If it isn't contiguous, should buffer
    i = newiter(a[::2], ['buffered','no_inner_iteration'],
                [['readonly','contig']],
                buffersize=10)
    assert_(i[0].flags.contiguous)
    assert_equal(i[0], a[::2])
+
def test_iter_array_cast():
    """Check that arrays are cast as requested.

    NOTE(review): the readwrite/writeonly cases rely on UPDATEIFCOPY --
    the cast temporary is written back only when the iterator is released
    (``i = None``), so statement order here is significant.
    """
    # No cast 'f4' -> 'f4'
    a = np.arange(6, dtype='f4').reshape(2,3)
    i = newiter(a, [], [['readwrite']], op_dtypes=[np.dtype('f4')])
    assert_equal(i.operands[0], a)
    assert_equal(i.operands[0].dtype, np.dtype('f4'))

    # Byte-order cast '<f4' -> '>f4'
    a = np.arange(6, dtype='<f4').reshape(2,3)
    i = newiter(a, [], [['readwrite','updateifcopy']],
                casting='equiv',
                op_dtypes=[np.dtype('>f4')])
    assert_equal(i.operands[0], a)
    assert_equal(i.operands[0].dtype, np.dtype('>f4'))

    # Safe case 'f4' -> 'f8'
    a = np.arange(24, dtype='f4').reshape(2,3,4).swapaxes(1,2)
    i = newiter(a, [], [['readonly','copy']],
                casting='safe',
                op_dtypes=[np.dtype('f8')])
    assert_equal(i.operands[0], a)
    assert_equal(i.operands[0].dtype, np.dtype('f8'))
    # The memory layout of the temporary should match a (a is (48,4,16))
    assert_equal(i.operands[0].strides, (96,8,32))
    # Negative strides should also be preserved in the copy
    a = a[::-1,:,::-1]
    i = newiter(a, [], [['readonly','copy']],
                casting='safe',
                op_dtypes=[np.dtype('f8')])
    assert_equal(i.operands[0], a)
    assert_equal(i.operands[0].dtype, np.dtype('f8'))
    assert_equal(i.operands[0].strides, (-96,8,-32))

    # Same-kind cast 'f8' -> 'f4' -> 'f8'
    a = np.arange(24, dtype='f8').reshape(2,3,4).T
    i = newiter(a, [],
                [['readwrite','updateifcopy']],
                casting='same_kind',
                op_dtypes=[np.dtype('f4')])
    assert_equal(i.operands[0], a)
    assert_equal(i.operands[0].dtype, np.dtype('f4'))
    assert_equal(i.operands[0].strides, (4, 16, 48))
    # Check that UPDATEIFCOPY is activated
    i.operands[0][2,1,1] = -12.5
    assert_(a[2,1,1] != -12.5)
    i = None
    assert_equal(a[2,1,1], -12.5)

    # Unsafe cast 'f4' -> 'i4' (the unsafe direction is the write-back)
    a = np.arange(6, dtype='i4')[::-2]
    i = newiter(a, [],
                [['writeonly','updateifcopy']],
                casting='unsafe',
                op_dtypes=[np.dtype('f4')])
    assert_equal(i.operands[0].dtype, np.dtype('f4'))
    assert_equal(i.operands[0].strides, (-4,))
    i.operands[0][:] = 1
    i = None
    assert_equal(a, [1,1,1])
+
def test_iter_array_cast_errors():
    """Check that invalid casts are caught.

    Casting only happens when 'copy'/'updateifcopy' is enabled, and each
    casting rule must reject conversions beyond its level.
    """
    # Need to enable copying for casts to occur
    assert_raises(TypeError, newiter, arange(2,dtype='f4'), [],
                  [['readonly']], op_dtypes=[np.dtype('f8')])
    # Also need to allow casting for casts to occur; neither 'no' nor
    # 'equiv' permits an 'f4' <-> 'f8' conversion in either direction
    for rule in ('no', 'equiv'):
        assert_raises(TypeError, newiter, arange(2,dtype='f4'), [],
                      [['readonly','copy']], casting=rule,
                      op_dtypes=[np.dtype('f8')])
        assert_raises(TypeError, newiter, arange(2,dtype='f8'), [],
                      [['writeonly','updateifcopy']],
                      casting=rule,
                      op_dtypes=[np.dtype('f4')])
    # '<f4' -> '>f4' should not work with casting='no'
    assert_raises(TypeError, newiter, arange(2,dtype='<f4'), [],
                  [['readonly','copy']], casting='no',
                  op_dtypes=[np.dtype('>f4')])
    # 'f4' -> 'f8' is a safe cast, but 'f8' -> 'f4' isn't; readwrite
    # needs both directions to be safe, so both of these must fail
    for src, dst in (('f4','f8'), ('f8','f4')):
        assert_raises(TypeError, newiter, arange(2,dtype=src), [],
                      [['readwrite','updateifcopy']],
                      casting='safe',
                      op_dtypes=[np.dtype(dst)])
    # 'f4' -> 'i4' is neither a safe nor a same-kind cast
    assert_raises(TypeError, newiter, arange(2,dtype='f4'), [],
                  [['readonly','copy']],
                  casting='same_kind',
                  op_dtypes=[np.dtype('i4')])
    assert_raises(TypeError, newiter, arange(2,dtype='i4'), [],
                  [['writeonly','updateifcopy']],
                  casting='same_kind',
                  op_dtypes=[np.dtype('f4')])
+
def test_iter_scalar_cast():
    """Check that scalars are cast as requested."""
    # (scalar, op flags, casting rule or None, dtype string, expected value)
    cases = [
        # No cast 'f4' -> 'f4'
        (np.float32(2.5), ['readonly'], None, 'f4', 2.5),
        # Safe cast 'f4' -> 'f8'
        (np.float32(2.5), ['readonly','copy'], 'safe', 'f8', 2.5),
        # Same-kind cast 'f8' -> 'f4'
        (np.float64(2.5), ['readonly','copy'], 'same_kind', 'f4', 2.5),
        # Unsafe cast 'f8' -> 'i4'
        (np.float64(3.0), ['readonly','copy'], 'unsafe', 'i4', 3),
    ]
    for scalar, opflags, rule, typestr, expected in cases:
        requested = np.dtype(typestr)
        if rule is None:
            it = newiter(scalar, [], [opflags], op_dtypes=[requested])
        else:
            it = newiter(scalar, [], [opflags], casting=rule,
                         op_dtypes=[requested])
        assert_equal(it.dtypes[0], requested)
        assert_equal(it.value.dtype, requested)
        assert_equal(it.value, expected)
+
def test_iter_scalar_cast_errors():
    """Check that invalid scalar casts are caught."""
    # Need to allow casting for casts to occur
    assert_raises(TypeError, newiter, np.float32(2), [],
                  [['readonly']], op_dtypes=[np.dtype('f8')])
    assert_raises(TypeError, newiter, 2.5, [],
                  [['readonly']], op_dtypes=[np.dtype('f4')])
    # 'f8' -> 'f4' isn't a safe cast; 'f4' -> 'i4' isn't even same-kind
    for scalar, rule, typestr in ((np.float64(2), 'safe', 'f4'),
                                  (np.float32(2), 'same_kind', 'i4')):
        assert_raises(TypeError, newiter, scalar, [],
                      [['readonly']],
                      casting=rule,
                      op_dtypes=[np.dtype(typestr)])
+
def test_iter_object_arrays():
    """Check that object arrays work.

    NOTE(review): reference counts are verified with sys.getrefcount,
    so object lifetimes -- including the explicit
    ``vals, i, x = [None]*3`` releases -- are significant; do not
    reorder statements here.
    """
    obj = {'a':3,'b':'d'}
    a = np.array([[1,2,3], None, obj, None], dtype='O')
    rc = sys.getrefcount(obj)

    # Need to allow references for object arrays
    assert_raises(TypeError, newiter, a)
    assert_equal(sys.getrefcount(obj), rc)

    i = newiter(a, ['refs_ok'], ['readonly'])
    vals = [x[()] for x in i]
    assert_equal(np.array(vals, dtype='O'), a)
    vals, i, x = [None]*3
    assert_equal(sys.getrefcount(obj), rc)

    # Buffered iteration of object arrays requires the Python API
    i = newiter(a.reshape(2,2).T, ['refs_ok','buffered'],
                ['readonly'], order='C')
    assert_(i.iterationneedsapi)
    vals = [x[()] for x in i]
    assert_equal(np.array(vals, dtype='O'), a.reshape(2,2).ravel(order='F'))
    vals, i, x = [None]*3
    assert_equal(sys.getrefcount(obj), rc)

    i = newiter(a.reshape(2,2).T, ['refs_ok','buffered'],
                ['readwrite'], order='C')
    for x in i:
        x[()] = None
    vals, i, x = [None]*3
    # Overwriting the array element dropped one reference to obj
    assert_equal(sys.getrefcount(obj), rc-1)
    assert_equal(a, np.array([None]*4, dtype='O'))

    # Conversions to/from objects
    a = np.arange(6, dtype='O')
    i = newiter(a, ['refs_ok','buffered'], ['readwrite'],
                casting='unsafe', op_dtypes='i4')
    for x in i:
        x[()] += 1
    assert_equal(a, np.arange(6)+1)

    a = np.arange(6, dtype='i4')
    i = newiter(a, ['refs_ok','buffered'], ['readwrite'],
                casting='unsafe', op_dtypes='O')
    for x in i:
        x[()] += 1
    assert_equal(a, np.arange(6)+1)

    # Non-contiguous object array (a field view of a structured array)
    a = np.zeros((6,), dtype=[('p','i1'),('a','O')])
    a = a['a']
    a[:] = np.arange(6)
    i = newiter(a, ['refs_ok','buffered'], ['readwrite'],
                casting='unsafe', op_dtypes='i4')
    for x in i:
        x[()] += 1
    assert_equal(a, np.arange(6)+1)

    # Non-contiguous value array
    a = np.zeros((6,), dtype=[('p','i1'),('a','i4')])
    a = a['a']
    a[:] = np.arange(6) + 98172488
    i = newiter(a, ['refs_ok','buffered'], ['readwrite'],
                casting='unsafe', op_dtypes='O')
    ob = i[0][()]
    rc = sys.getrefcount(ob)
    for x in i:
        x[()] += 1
    assert_equal(sys.getrefcount(ob), rc-1)
    assert_equal(a, np.arange(6)+98172489)
+
def test_iter_common_dtype():
    """Check that the iterator finds a common data type correctly."""
    i = newiter([array([3],dtype='f4'),array([0],dtype='f8')],
                ['common_dtype'],
                [['readonly','copy']]*2,
                casting='safe')
    assert_equal(i.dtypes[0], np.dtype('f8'))
    assert_equal(i.dtypes[1], np.dtype('f8'))
    i = newiter([array([3],dtype='i4'),array([0],dtype='f4')],
                ['common_dtype'],
                [['readonly','copy']]*2,
                casting='safe')
    assert_equal(i.dtypes[0], np.dtype('f8'))
    assert_equal(i.dtypes[1], np.dtype('f8'))
    # A zero-dimensional operand only bumps the kind, not the size
    i = newiter([array([3],dtype='f4'),array(0,dtype='f8')],
                ['common_dtype'],
                [['readonly','copy']]*2,
                casting='same_kind')
    assert_equal(i.dtypes[0], np.dtype('f4'))
    assert_equal(i.dtypes[1], np.dtype('f4'))
    i = newiter([array([3],dtype='u4'),array(0,dtype='i4')],
                ['common_dtype'],
                [['readonly','copy']]*2,
                casting='safe')
    assert_equal(i.dtypes[0], np.dtype('u4'))
    assert_equal(i.dtypes[1], np.dtype('u4'))
    # A negative scalar forces a signed type wide enough to hold both
    i = newiter([array([3],dtype='u4'),array(-12,dtype='i4')],
                ['common_dtype'],
                [['readonly','copy']]*2,
                casting='safe')
    assert_equal(i.dtypes[0], np.dtype('i8'))
    assert_equal(i.dtypes[1], np.dtype('i8'))
    i = newiter([array([3],dtype='u4'),array(-12,dtype='i4'),
                 array([2j],dtype='c8'),array([9],dtype='f8')],
                ['common_dtype'],
                [['readonly','copy']]*4,
                casting='safe')
    assert_equal(i.dtypes[0], np.dtype('c16'))
    assert_equal(i.dtypes[1], np.dtype('c16'))
    assert_equal(i.dtypes[2], np.dtype('c16'))
    assert_equal(i.dtypes[3], np.dtype('c16'))
    assert_equal(i.value, (3,-12,2j,9))

    # When allocating outputs, other outputs aren't factored in
    i = newiter([array([3],dtype='i4'),None,array([2j],dtype='c16')], [],
                [['readonly','copy'],
                 ['writeonly','allocate'],
                 ['writeonly']],
                casting='safe')
    assert_equal(i.dtypes[0], np.dtype('i4'))
    assert_equal(i.dtypes[1], np.dtype('i4'))
    assert_equal(i.dtypes[2], np.dtype('c16'))
    # But, if common data types are requested, they are
    i = newiter([array([3],dtype='i4'),None,array([2j],dtype='c16')],
                ['common_dtype'],
                [['readonly','copy'],
                 ['writeonly','allocate'],
                 ['writeonly']],
                casting='safe')
    assert_equal(i.dtypes[0], np.dtype('c16'))
    assert_equal(i.dtypes[1], np.dtype('c16'))
    assert_equal(i.dtypes[2], np.dtype('c16'))
+
def test_iter_op_axes():
    """Check that custom axes work."""
    # Reverse the axes
    arr = arange(6).reshape(2,3)
    it = newiter([arr, arr.T], [], [['readonly']]*2, op_axes=[[0,1],[1,0]])
    assert_(all(x == y for (x, y) in it))
    arr = arange(24).reshape(2,3,4)
    it = newiter([arr.T, arr], [], [['readonly']]*2, op_axes=[[2,1,0],None])
    assert_(all(x == y for (x, y) in it))

    # Broadcast 1D to any dimension
    arr = arange(1,31).reshape(2,3,5)
    for stop, axes, shape in ((3, [0,-1,-1], (2,1,1)),
                              (4, [-1,0,-1], (1,3,1)),
                              (6, [np.newaxis,np.newaxis,0], (1,1,5))):
        vec = arange(1, stop)
        it = newiter([arr, vec], [], [['readonly']]*2, op_axes=[None, axes])
        assert_equal([x*y for (x, y) in it],
                     (arr*vec.reshape(shape)).ravel())

    # Inner product-style broadcasting
    arr = arange(24).reshape(2,3,4)
    brr = arange(40).reshape(5,2,4)
    it = newiter([arr, brr], ['coords'], [['readonly']]*2,
                 op_axes=[[0,1,-1,-1],[-1,-1,0,1]])
    assert_equal(it.shape, (2,3,5,2))

    # Matrix product-style broadcasting
    arr = arange(12).reshape(3,4)
    brr = arange(20).reshape(4,5)
    it = newiter([arr, brr], ['coords'], [['readonly']]*2,
                 op_axes=[[0,-1],[-1,1]])
    assert_equal(it.shape, (3,5))
+
def test_iter_op_axes_errors():
    """Check that custom axes throw errors for bad inputs."""
    a = arange(6).reshape(2,3)
    bad_axes = [
        [[0],[1],[0]],     # wrong number of items in op_axes
        [[2,1],[0,1]],     # out of bounds items in op_axes
        [[0,1],[2,-1]],
        [[0,0],[0,1]],     # duplicate items in op_axes
        [[0,1],[1,1]],
        [[0,1],[0,1,0]],   # different sized arrays in op_axes
        [[0,1],[1,0]],     # non-broadcastable dimensions in the result
    ]
    for axes in bad_axes:
        assert_raises(ValueError, newiter, [a,a], [], [['readonly']]*2,
                      op_axes=axes)
+
def test_iter_copy():
    """Check that copying the iterator works correctly.

    Each copy must reproduce the original's remaining iteration state.
    NOTE(review): uses the Python 2 ``i.next()`` iterator protocol,
    consistent with the rest of this file.
    """
    a = arange(24).reshape(2,3,4)

    # Simple iterator
    i = newiter(a)
    j = i.copy()
    assert_equal([x[()] for x in i], [x[()] for x in j])

    # Copy after seeking partway through
    i.iterindex = 3
    j = i.copy()
    assert_equal([x[()] for x in i], [x[()] for x in j])

    # Buffered iterator
    i = newiter(a, ['buffered','ranged'], order='F', buffersize=3)
    j = i.copy()
    assert_equal([x[()] for x in i], [x[()] for x in j])

    i.iterindex = 3
    j = i.copy()
    assert_equal([x[()] for x in i], [x[()] for x in j])

    i.iterrange = (3,9)
    j = i.copy()
    assert_equal([x[()] for x in i], [x[()] for x in j])

    i.iterrange = (2,18)
    i.next(); i.next()
    j = i.copy()
    assert_equal([x[()] for x in i], [x[()] for x in j])

    # Casting iterator
    i = newiter(a, ['buffered'], order='F', casting='unsafe',
                op_dtypes='f8', buffersize=5)
    j = i.copy()
    # The copy must stay valid after the original is released
    i = None
    assert_equal([x[()] for x in j], a.ravel(order='F'))

    a = arange(24, dtype='<i4').reshape(2,3,4)
    i = newiter(a, ['buffered'], order='F', casting='unsafe',
                op_dtypes='>f8', buffersize=5)
    j = i.copy()
    i = None
    assert_equal([x[()] for x in j], a.ravel(order='F'))
+
def test_iter_allocate_output_simple():
    """The iterator allocates a missing output with the right shape/dtype."""
    arr = arange(6)
    it = newiter([arr, None], [], [['readonly'], ['writeonly','allocate']],
                 op_dtypes=[None, np.dtype('f4')])
    out = it.operands[1]
    assert_equal(out.shape, arr.shape)
    assert_equal(out.dtype, np.dtype('f4'))
+
def test_iter_allocate_output_buffered_readwrite():
    """Allocated output combined with buffering + delay_bufalloc.

    delay_bufalloc lets the allocated operand be initialized before the
    buffers are filled; reset() then makes the iterator usable.
    """
    a = arange(6)
    i = newiter([a,None], ['buffered','delay_bufalloc'],
                [['readonly'],['allocate','readwrite']])
    # Initialize the allocated operand before buffering begins
    i.operands[1][:] = 1
    i.reset()
    for x in i:
        x[1][()] += x[0][()]
    assert_equal(i.operands[1], a+1)
+
def test_iter_allocate_output_itorder():
    """Allocated outputs should match the iteration order of the input."""
    # C-order input, best iteration order
    arr = arange(6, dtype='i4').reshape(2,3)
    it = newiter([arr, None], [], [['readonly'], ['writeonly','allocate']],
                 op_dtypes=[None, np.dtype('f4')])
    out = it.operands[1]
    assert_equal(out.shape, arr.shape)
    assert_equal(out.strides, arr.strides)
    assert_equal(out.dtype, np.dtype('f4'))

    # F-order input, best iteration order
    arr = arange(24, dtype='i4').reshape(2,3,4).T
    it = newiter([arr, None], [], [['readonly'], ['writeonly','allocate']],
                 op_dtypes=[None, np.dtype('f4')])
    out = it.operands[1]
    assert_equal(out.shape, arr.shape)
    assert_equal(out.strides, arr.strides)
    assert_equal(out.dtype, np.dtype('f4'))

    # Non-contiguous input, forced C iteration order
    arr = arange(24, dtype='i4').reshape(2,3,4).swapaxes(0,1)
    it = newiter([arr, None], [],
                 [['readonly'], ['writeonly','allocate']],
                 order='C',
                 op_dtypes=[None, np.dtype('f4')])
    out = it.operands[1]
    assert_equal(out.shape, arr.shape)
    assert_equal(out.strides, (32,16,4))
    assert_equal(out.dtype, np.dtype('f4'))
+
def test_iter_allocate_output_opaxes():
    """Specifying op_axes for an allocated output should work."""
    a = arange(24, dtype='i4').reshape(2,3,4)
    # The output's axes are a permutation [1,2,0] of the iteration axes
    i = newiter([None,a], [], [['writeonly','allocate'],['readonly']],
                op_dtypes=[np.dtype('u4'),None],
                op_axes=[[1,2,0],None])
    assert_equal(i.operands[0].shape, (4,2,3))
    assert_equal(i.operands[0].strides, (4,48,16))
    assert_equal(i.operands[0].dtype, np.dtype('u4'))
+
def test_iter_allocate_output_types_promotion():
    """Check type promotion of automatically allocated outputs."""
    i = newiter([array([3],dtype='f4'),array([0],dtype='f8'),None], [],
                [['readonly']]*2+[['writeonly','allocate']])
    assert_equal(i.dtypes[2], np.dtype('f8'))
    i = newiter([array([3],dtype='i4'),array([0],dtype='f4'),None], [],
                [['readonly']]*2+[['writeonly','allocate']])
    assert_equal(i.dtypes[2], np.dtype('f8'))
    # A zero-dimensional operand only bumps the kind, not the size
    i = newiter([array([3],dtype='f4'),array(0,dtype='f8'),None], [],
                [['readonly']]*2+[['writeonly','allocate']])
    assert_equal(i.dtypes[2], np.dtype('f4'))
    i = newiter([array([3],dtype='u4'),array(0,dtype='i4'),None], [],
                [['readonly']]*2+[['writeonly','allocate']])
    assert_equal(i.dtypes[2], np.dtype('u4'))
    # A negative scalar forces a signed type wide enough to hold both
    i = newiter([array([3],dtype='u4'),array(-12,dtype='i4'),None], [],
                [['readonly']]*2+[['writeonly','allocate']])
    assert_equal(i.dtypes[2], np.dtype('i8'))
+
def test_iter_allocate_output_types_byte_order():
    """Verify the byte-order rules for allocated outputs."""
    # When there's just one input, the output type exactly matches
    a = array([3],dtype='u4').newbyteorder()
    i = newiter([a,None], [],
                [['readonly'],['writeonly','allocate']])
    assert_equal(i.dtypes[0], i.dtypes[1])
    # With two or more inputs, the output type is in native byte order
    i = newiter([a,a,None], [],
                [['readonly'],['readonly'],['writeonly','allocate']])
    assert_(i.dtypes[0] != i.dtypes[2])
    assert_equal(i.dtypes[0].newbyteorder('='), i.dtypes[2])
+
def test_iter_allocate_output_types_scalar():
    """All-scalar inputs should produce a 0-d (scalar) allocated output."""
    it = newiter([None, 1, 2.3, np.float32(12), np.complex128(3)], [],
                 [['writeonly','allocate']] + [['readonly']]*4)
    out = it.operands[0]
    assert_equal(out.dtype, np.dtype('complex128'))
    assert_equal(out.ndim, 0)
+
def test_iter_allocate_output_subtype():
    """Make sure that the subtype with priority wins when allocating.

    NOTE(review): relies on np.matrix, whose results are forced to 2-D.
    """
    # matrix vs ndarray
    a = np.matrix([[1,2], [3,4]])
    b = np.arange(4).reshape(2,2).T
    i = newiter([a,b,None], [],
                [['readonly'],['readonly'],['writeonly','allocate']])
    assert_equal(type(a), type(i.operands[2]))
    assert_(type(b) != type(i.operands[2]))
    assert_equal(i.operands[2].shape, (2,2))

    # matrix always wants things to be 2D
    b = np.arange(4).reshape(1,2,2)
    assert_raises(RuntimeError, newiter, [a,b,None], [],
                  [['readonly'],['readonly'],['writeonly','allocate']])
    # but if subtypes are disabled, the result can still work
    i = newiter([a,b,None], [],
                [['readonly'],['readonly'],['writeonly','allocate','no_subtype']])
    assert_equal(type(b), type(i.operands[2]))
    assert_(type(a) != type(i.operands[2]))
    assert_equal(i.operands[2].shape, (1,2,2))
+
def test_iter_allocate_output_errors():
    """Check that the iterator throws errors for bad output allocations."""
    # Need an input if no output data type is specified
    a = arange(6)
    assert_raises(TypeError, newiter, [a,None], [],
                  [['writeonly'],['writeonly','allocate']])
    # Allocated output should be flagged for writing
    assert_raises(ValueError, newiter, [a,None], [],
                  [['readonly'],['allocate','readonly']])
    # Allocated output can't have buffering without delayed bufalloc
    assert_raises(ValueError, newiter, [a,None], ['buffered'],
                  ['allocate','readwrite'])
    # Must specify at least one input
    assert_raises(ValueError, newiter, [None,None], [],
                  [['writeonly','allocate'],
                   ['writeonly','allocate']],
                  op_dtypes=[np.dtype('f4'),np.dtype('f4')])
    # If using op_axes, must specify all the axes
    a = arange(24, dtype='i4').reshape(2,3,4)
    assert_raises(ValueError, newiter, [a,None], [],
                  [['readonly'],['writeonly','allocate']],
                  op_dtypes=[None,np.dtype('f4')],
                  op_axes=[None,[0,np.newaxis,1]])
    # If using op_axes, the axes must be within bounds
    assert_raises(ValueError, newiter, [a,None], [],
                  [['readonly'],['writeonly','allocate']],
                  op_dtypes=[None,np.dtype('f4')],
                  op_axes=[None,[0,3,1]])
    # If using op_axes, there can't be duplicates
    assert_raises(ValueError, newiter, [a,None], [],
                  [['readonly'],['writeonly','allocate']],
                  op_dtypes=[None,np.dtype('f4')],
                  op_axes=[None,[0,2,1,0]])
+
def test_iter_remove_axis():
    """remove_axis drops one dimension from the iteration."""
    arr = arange(24).reshape(2,3,4)

    it = newiter(arr, ['coords'])
    it.remove_axis(1)
    assert_equal(list(it), arr[:,0,:].ravel())

    # With a reversed leading axis, removing it pins the first slice
    arr = arr[::-1,:,:]
    it = newiter(arr, ['coords'])
    it.remove_axis(0)
    assert_equal(list(it), arr[0,:,:].ravel())
+
def test_iter_remove_coords_inner_loop():
    """Check that removing coords support works.

    The calls mutate iterator state in sequence, so order matters.
    """
    a = arange(24).reshape(2,3,4)

    i = newiter(a,['coords'])
    assert_equal(i.ndim, 3)
    assert_equal(i.shape, (2,3,4))
    assert_equal(i.itviews[0].shape, (2,3,4))

    # Removing coords causes all dimensions to coalesce
    before = [x for x in i]
    i.remove_coords()
    after = [x for x in i]

    assert_equal(before, after)
    assert_equal(i.ndim, 1)
    # The shape is no longer available once the dimensions coalesce
    assert_raises(ValueError, lambda i:i.shape, i)
    assert_equal(i.itviews[0].shape, (24,))

    # Removing the inner loop means there's just one iteration
    i.reset()
    assert_equal(i.itersize, 24)
    assert_equal(i[0].shape, tuple())
    i.remove_inner_loop()
    assert_equal(i.itersize, 24)
    assert_equal(i[0].shape, (24,))
    assert_equal(i.value, arange(24))
+
def test_iter_iterindex():
    """Make sure iterindex seeking works, buffered and unbuffered.

    NOTE(review): compares iter_iterindices(i) against ``range(...)``
    directly, relying on Python 2 semantics where range returns a list.
    """
    buffersize = 5
    a = arange(24).reshape(4,3,2)
    # Exercise every layout with and without buffering
    for flags in ([], ['buffered']):
        i = newiter(a, flags, buffersize=buffersize)
        assert_equal(iter_iterindices(i), range(24))
        i.iterindex = 2
        assert_equal(iter_iterindices(i), range(2,24))

        i = newiter(a, flags, order='F', buffersize=buffersize)
        assert_equal(iter_iterindices(i), range(24))
        i.iterindex = 5
        assert_equal(iter_iterindices(i), range(5,24))

        i = newiter(a[::-1], flags, order='F', buffersize=buffersize)
        assert_equal(iter_iterindices(i), range(24))
        i.iterindex = 9
        assert_equal(iter_iterindices(i), range(9,24))

        i = newiter(a[::-1,::-1], flags, order='C', buffersize=buffersize)
        assert_equal(iter_iterindices(i), range(24))
        i.iterindex = 13
        assert_equal(iter_iterindices(i), range(13,24))

        i = newiter(a[::1,::-1], flags, buffersize=buffersize)
        assert_equal(iter_iterindices(i), range(24))
        i.iterindex = 23
        assert_equal(iter_iterindices(i), range(23,24))
        # Seeking must also work after a reset
        i.reset()
        i.iterindex = 2
        assert_equal(iter_iterindices(i), range(2,24))
+
def test_iter_iterrange():
    """Make sure getting and resetting the iterrange works."""
    buffersize = 5
    a = arange(24, dtype='i4').reshape(4,3,2)
    a_fort = a.ravel(order='F')

    i = newiter(a, ['ranged'], ['readonly'], order='F',
                buffersize=buffersize)
    assert_equal(i.iterrange, (0,24))
    assert_equal([x[()] for x in i], a_fort)
    # Includes an empty range (5,5) and single-element range (23,24)
    for r in [(0,24), (1,2), (3,24), (5,5), (0,20), (23,24)]:
        i.iterrange = r
        assert_equal(i.iterrange, r)
        assert_equal([x[()] for x in i], a_fort[r[0]:r[1]])

    i = newiter(a, ['ranged','buffered'], ['readonly'], order='F',
                op_dtypes='f8', buffersize=buffersize)
    assert_equal(i.iterrange, (0,24))
    assert_equal([x[()] for x in i], a_fort)
    for r in [(0,24), (1,2), (3,24), (5,5), (0,20), (23,24)]:
        i.iterrange = r
        assert_equal(i.iterrange, r)
        assert_equal([x[()] for x in i], a_fort[r[0]:r[1]])

    def get_array(i):
        # Gather the whole-buffer chunks yielded without inner iteration
        val = np.array([], dtype='f8')
        for x in i:
            val = np.concatenate((val, x))
        return val

    i = newiter(a, ['ranged','buffered','no_inner_iteration'],
                ['readonly'], order='F',
                op_dtypes='f8', buffersize=buffersize)
    assert_equal(i.iterrange, (0,24))
    assert_equal(get_array(i), a_fort)
    for r in [(0,24), (1,2), (3,24), (5,5), (0,20), (23,24)]:
        i.iterrange = r
        assert_equal(i.iterrange, r)
        assert_equal(get_array(i), a_fort[r[0]:r[1]])
+
def test_iter_buffering():
    """Test buffering with several buffer sizes and array layouts."""
    arrays = []
    # F-order swapped array
    arrays.append(np.arange(24,
                  dtype='c16').reshape(2,3,4).T.newbyteorder().byteswap())
    # Contiguous 1-dimensional array
    arrays.append(np.arange(10, dtype='f4'))
    # Unaligned array (offset by one byte into an i1 buffer)
    a = np.zeros((4*16+1,), dtype='i1')[1:]
    a.dtype = 'i4'
    a[:] = np.arange(16,dtype='i4')
    arrays.append(a)
    # 4-D F-order array
    arrays.append(np.arange(120,dtype='i4').reshape(5,3,2,4).T)
    for a in arrays:
        for buffersize in (1,2,3,5,8,11,16,1024):
            vals = []
            i = newiter(a, ['buffered','no_inner_iteration'],
                        [['readonly','nbo','aligned']],
                        order='C',
                        casting='equiv',
                        buffersize=buffersize)
            while not i.finished:
                # Each chunk is at most one buffer in size
                assert_(i[0].size <= buffersize)
                vals.append(i[0].copy())
                i.iternext()
            assert_equal(np.concatenate(vals), a.ravel(order='C'))
+
def test_iter_write_buffering():
    """Writes made through a buffered iterator must reach the array."""
    # F-order swapped array
    arr = np.arange(24).reshape(2,3,4).T.newbyteorder().byteswap()
    it = newiter(arr, ['buffered'],
                 [['readwrite','nbo','aligned']],
                 casting='equiv',
                 order='C',
                 buffersize=16)
    count = 0
    while not it.finished:
        it[0] = count
        count += 1
        it.iternext()
    assert_equal(arr.ravel(order='C'), np.arange(24))
+
def test_iter_buffering_delayed_alloc():
    """Test that delaying buffer allocation works.

    Before reset() the buffers don't exist, so any access needing them
    must raise; after reset() everything works.
    NOTE(review): compares against ``zip(...)`` directly, relying on
    Python 2 semantics where zip returns a list.
    """
    a = np.arange(6)
    b = np.arange(1, dtype='f4')
    i = newiter([a,b], ['buffered','delay_bufalloc','coords','reduce_ok'],
                ['readwrite'],
                casting='unsafe',
                op_dtypes='f4')
    assert_(i.hasdelayedbufalloc)
    assert_raises(ValueError, lambda i:i.coords, i)
    assert_raises(ValueError, lambda i:i[0], i)
    assert_raises(ValueError, lambda i:i[0:2], i)
    def assign_iter(i):
        i[0] = 0
    assert_raises(ValueError, assign_iter, i)

    i.reset()
    assert_(not i.hasdelayedbufalloc)
    assert_equal(i.coords, (0,))
    assert_equal(i[0], 0)
    i[1] = 1
    assert_equal(i[0:2], [0,1])
    assert_equal([[x[0][()],x[1][()]] for x in i], zip(range(6), [1]*6))
+
def test_iter_buffered_cast_simple():
    """Buffering handles a simple same-kind cast on readwrite data."""
    arr = np.arange(10, dtype='f4')
    it = newiter(arr, ['buffered','no_inner_iteration'],
                 [['readwrite','nbo','aligned']],
                 casting='same_kind',
                 op_dtypes=[np.dtype('f8')],
                 buffersize=3)
    for item in it:
        item[()] *= 2

    assert_equal(arr, 2*np.arange(10, dtype='f4'))
+
def test_iter_buffered_cast_byteswapped():
    """Test that buffering can handle a cast requiring swap->cast->swap."""
    a = np.arange(10, dtype='f4').newbyteorder().byteswap()
    i = newiter(a, ['buffered','no_inner_iteration'],
                [['readwrite','nbo','aligned']],
                casting='same_kind',
                op_dtypes=[np.dtype('f8').newbyteorder()],
                buffersize=3)
    for v in i:
        v[()] *= 2

    assert_equal(a, 2*np.arange(10, dtype='f4'))

    try:
        # The complex -> float write-back would warn; suppress it here
        warnings.simplefilter("ignore", np.ComplexWarning)

        a = np.arange(10, dtype='f8').newbyteorder().byteswap()
        i = newiter(a, ['buffered','no_inner_iteration'],
                    [['readwrite','nbo','aligned']],
                    casting='unsafe',
                    op_dtypes=[np.dtype('c8').newbyteorder()],
                    buffersize=3)
        for v in i:
            v[()] *= 2

        assert_equal(a, 2*np.arange(10, dtype='f8'))
    finally:
        # Restore the default warning behavior for ComplexWarning
        warnings.simplefilter("default", np.ComplexWarning)
+
def test_iter_buffered_cast_byteswapped_complex():
    """Test that buffering can handle casts requiring swap->cast->copy."""
    a = np.arange(10, dtype='c8').newbyteorder().byteswap()
    a += 2j
    i = newiter(a, ['buffered','no_inner_iteration'],
                [['readwrite','nbo','aligned']],
                casting='same_kind',
                op_dtypes=[np.dtype('c16')],
                buffersize=3)
    for v in i:
        v[()] *= 2
    assert_equal(a, 2*np.arange(10, dtype='c8') + 4j)

    # Native-order input, byteswapped buffer dtype
    a = np.arange(10, dtype='c8')
    a += 2j
    i = newiter(a, ['buffered','no_inner_iteration'],
                [['readwrite','nbo','aligned']],
                casting='same_kind',
                op_dtypes=[np.dtype('c16').newbyteorder()],
                buffersize=3)
    for v in i:
        v[()] *= 2
    assert_equal(a, 2*np.arange(10, dtype='c8') + 4j)

    # Extended-precision complex input
    a = np.arange(10, dtype=np.clongdouble).newbyteorder().byteswap()
    a += 2j
    i = newiter(a, ['buffered','no_inner_iteration'],
                [['readwrite','nbo','aligned']],
                casting='same_kind',
                op_dtypes=[np.dtype('c16')],
                buffersize=3)
    for v in i:
        v[()] *= 2
    assert_equal(a, 2*np.arange(10, dtype=np.clongdouble) + 4j)

    # Extended-precision real input
    a = np.arange(10, dtype=np.longdouble).newbyteorder().byteswap()
    i = newiter(a, ['buffered','no_inner_iteration'],
                [['readwrite','nbo','aligned']],
                casting='same_kind',
                op_dtypes=[np.dtype('f4')],
                buffersize=7)
    for v in i:
        v[()] *= 2
    assert_equal(a, 2*np.arange(10, dtype=np.longdouble))
+
def test_iter_buffered_cast_structured_type():
    """Tests buffering of structured (record) type casts.

    NOTE(review): reference counts of object fields are verified with
    sys.getrefcount, so object lifetimes (including the explicit
    ``vals, i, x = [None]*3`` release) are significant -- do not
    reorder statements.
    """
    # simple -> struct type (duplicates the value)
    sdt = [('a', 'f4'), ('b', 'i8'), ('c', 'c8', (2,3)), ('d', 'O')]
    a = np.arange(3, dtype='f4') + 0.5
    i = newiter(a, ['buffered','refs_ok'], ['readonly'],
                casting='unsafe',
                op_dtypes=sdt)
    vals = [np.array(x) for x in i]
    assert_equal(vals[0]['a'], 0.5)
    assert_equal(vals[0]['b'], 0)
    assert_equal(vals[0]['c'], [[(0.5)]*3]*2)
    assert_equal(vals[0]['d'], 0.5)
    assert_equal(vals[1]['a'], 1.5)
    assert_equal(vals[1]['b'], 1)
    assert_equal(vals[1]['c'], [[(1.5)]*3]*2)
    assert_equal(vals[1]['d'], 1.5)
    assert_equal(vals[0].dtype, np.dtype(sdt))

    # object -> struct type
    sdt = [('a', 'f4'), ('b', 'i8'), ('c', 'c8', (2,3)), ('d', 'O')]
    a = np.arange(3, dtype='O') + 0.5
    rc = sys.getrefcount(a[0])
    i = newiter(a, ['buffered','refs_ok'], ['readonly'],
                casting='unsafe',
                op_dtypes=sdt)
    vals = [np.array(x) for x in i]
    assert_equal(vals[0]['a'], 0.5)
    assert_equal(vals[0]['b'], 0)
    assert_equal(vals[0]['c'], [[(0.5)]*3]*2)
    assert_equal(vals[0]['d'], 0.5)
    assert_equal(vals[1]['a'], 1.5)
    assert_equal(vals[1]['b'], 1)
    assert_equal(vals[1]['c'], [[(1.5)]*3]*2)
    assert_equal(vals[1]['d'], 1.5)
    assert_equal(vals[0].dtype, np.dtype(sdt))
    vals, i, x = [None]*3
    assert_equal(sys.getrefcount(a[0]), rc)

    # struct type -> simple (takes the first value)
    sdt = [('a', 'f4'), ('b', 'i8'), ('d', 'O')]
    a = np.array([(5.5,7,'test'),(8,10,11)], dtype=sdt)
    i = newiter(a, ['buffered','refs_ok'], ['readonly'],
                casting='unsafe',
                op_dtypes='i4')
    assert_equal([x[()] for x in i], [5, 8])

    # struct type -> struct type (field-wise copy)
    sdt1 = [('a', 'f4'), ('b', 'i8'), ('d', 'O')]
    sdt2 = [('d', 'u2'), ('a', 'O'), ('b', 'f8')]
    a = np.array([(1,2,3),(4,5,6)], dtype=sdt1)
    i = newiter(a, ['buffered','refs_ok'], ['readonly'],
                casting='unsafe',
                op_dtypes=sdt2)
    assert_equal(i[0].dtype, np.dtype(sdt2))
    assert_equal([np.array(x) for x in i],
                 [np.array((3,1,2), dtype=sdt2),
                  np.array((6,4,5), dtype=sdt2)])

    # struct type -> struct type (field gets discarded)
    sdt1 = [('a', 'f4'), ('b', 'i8'), ('d', 'O')]
    sdt2 = [('b', 'O'), ('a', 'f8')]
    a = np.array([(1,2,3),(4,5,6)], dtype=sdt1)
    i = newiter(a, ['buffered','refs_ok'], ['readwrite'],
                casting='unsafe',
                op_dtypes=sdt2)
    assert_equal(i[0].dtype, np.dtype(sdt2))
    vals = []
    for x in i:
        vals.append(np.array(x))
        x['a'] = x['b']+3
    assert_equal(vals, [np.array((2,1), dtype=sdt2),
                        np.array((5,4), dtype=sdt2)])
    assert_equal(a, np.array([(5,2,None),(8,5,None)], dtype=sdt1))

    # struct type -> struct type (structured field gets discarded)
    sdt1 = [('a', 'f4'), ('b', 'i8'), ('d', [('a', 'i2'),('b','i4')])]
    sdt2 = [('b', 'O'), ('a', 'f8')]
    a = np.array([(1,2,(0,9)),(4,5,(20,21))], dtype=sdt1)
    i = newiter(a, ['buffered','refs_ok'], ['readwrite'],
                casting='unsafe',
                op_dtypes=sdt2)
    assert_equal(i[0].dtype, np.dtype(sdt2))
    vals = []
    for x in i:
        vals.append(np.array(x))
        x['a'] = x['b']+3
    assert_equal(vals, [np.array((2,1), dtype=sdt2),
                        np.array((5,4), dtype=sdt2)])
    assert_equal(a, np.array([(5,2,(0,0)),(8,5,(0,0))], dtype=sdt1))

    # struct type -> struct type (structured field w/ ref gets discarded)
    sdt1 = [('a', 'f4'), ('b', 'i8'), ('d', [('a', 'i2'),('b','O')])]
    sdt2 = [('b', 'O'), ('a', 'f8')]
    a = np.array([(1,2,(0,9)),(4,5,(20,21))], dtype=sdt1)
    i = newiter(a, ['buffered','refs_ok'], ['readwrite'],
                casting='unsafe',
                op_dtypes=sdt2)
    assert_equal(i[0].dtype, np.dtype(sdt2))
    vals = []
    for x in i:
        vals.append(np.array(x))
        x['a'] = x['b']+3
    assert_equal(vals, [np.array((2,1), dtype=sdt2),
                        np.array((5,4), dtype=sdt2)])
    assert_equal(a, np.array([(5,2,(0,None)),(8,5,(0,None))], dtype=sdt1))

    # struct type -> struct type back (structured field w/ ref gets discarded)
    sdt1 = [('b', 'O'), ('a', 'f8')]
    sdt2 = [('a', 'f4'), ('b', 'i8'), ('d', [('a', 'i2'),('b','O')])]
    a = np.array([(1,2),(4,5)], dtype=sdt1)
    i = newiter(a, ['buffered','refs_ok'], ['readwrite'],
                casting='unsafe',
                op_dtypes=sdt2)
    assert_equal(i[0].dtype, np.dtype(sdt2))
    vals = []
    for x in i:
        vals.append(np.array(x))
        assert_equal(x['d'], np.array((0, None), dtype=[('a','i2'),('b','O')]))
        x['a'] = x['b']+3
    assert_equal(vals, [np.array((2,1,(0,None)), dtype=sdt2),
                        np.array((5,4,(0,None)), dtype=sdt2)])
    assert_equal(a, np.array([(1,4),(4,7)], dtype=sdt1))
+
+def test_iter_buffered_cast_subarray():
+ # Tests buffering of subarrays
+
+ # one element -> many (copies it to all)
+ sdt1 = [('a', 'f4')]
+ sdt2 = [('a', 'f8', (3,2,2))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'] = np.arange(6)
+ i = newiter(a, ['buffered','refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ for x, count in zip(i, range(6)):
+ assert_(np.all(x['a'] == count))
+
+ # one element -> many -> back (copies it to all)
+ sdt1 = [('a', 'O', (1,1))]
+ sdt2 = [('a', 'O', (3,2,2))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'][:,0,0] = np.arange(6)
+ i = newiter(a, ['buffered','refs_ok'], ['readwrite'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_(np.all(x['a'] == count))
+ x['a'][0] += 2
+ count += 1
+ assert_equal(a['a'], np.arange(6).reshape(6,1,1)+2)
+
+ # many -> one element -> back (copies just element 0)
+ sdt1 = [('a', 'O', (3,2,2))]
+ sdt2 = [('a', 'O', (1,))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'][:,0,0,0] = np.arange(6)
+ i = newiter(a, ['buffered','refs_ok'], ['readwrite'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'], count)
+ x['a'] += 2
+ count += 1
+ assert_equal(a['a'], np.arange(6).reshape(6,1,1,1)*np.ones((1,3,2,2))+2)
+
+ # many -> one element -> back (copies just element 0)
+ sdt1 = [('a', 'f8', (3,2,2))]
+ sdt2 = [('a', 'O', (1,))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'][:,0,0,0] = np.arange(6)
+ i = newiter(a, ['buffered','refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'], count)
+ count += 1
+
+ # many -> one element (copies just element 0)
+ sdt1 = [('a', 'O', (3,2,2))]
+ sdt2 = [('a', 'f4', (1,))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'][:,0,0,0] = np.arange(6)
+ i = newiter(a, ['buffered','refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'], count)
+ count += 1
+
+ # many -> matching shape (straightforward copy)
+ sdt1 = [('a', 'O', (3,2,2))]
+ sdt2 = [('a', 'f4', (3,2,2))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'] = np.arange(6*3*2*2).reshape(6,3,2,2)
+ i = newiter(a, ['buffered','refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'], a[count]['a'])
+ count += 1
+
+ # vector -> smaller vector (truncates)
+ sdt1 = [('a', 'f8', (6,))]
+ sdt2 = [('a', 'f4', (2,))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'] = np.arange(6*6).reshape(6,6)
+ i = newiter(a, ['buffered','refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'], a[count]['a'][:2])
+ count += 1
+
+ # vector -> bigger vector (pads with zeros)
+ sdt1 = [('a', 'f8', (2,))]
+ sdt2 = [('a', 'f4', (6,))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'] = np.arange(6*2).reshape(6,2)
+ i = newiter(a, ['buffered','refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'][:2], a[count]['a'])
+ assert_equal(x['a'][2:], [0,0,0,0])
+ count += 1
+
+ # vector -> matrix (broadcasts)
+ sdt1 = [('a', 'f8', (2,))]
+ sdt2 = [('a', 'f4', (2,2))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'] = np.arange(6*2).reshape(6,2)
+ i = newiter(a, ['buffered','refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'][0], a[count]['a'])
+ assert_equal(x['a'][1], a[count]['a'])
+ count += 1
+
+ # vector -> matrix (broadcasts and zero-pads)
+ sdt1 = [('a', 'f8', (2,1))]
+ sdt2 = [('a', 'f4', (3,2))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'] = np.arange(6*2).reshape(6,2,1)
+ i = newiter(a, ['buffered','refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'][:2,0], a[count]['a'][:,0])
+ assert_equal(x['a'][:2,1], a[count]['a'][:,0])
+ assert_equal(x['a'][2,:], [0,0])
+ count += 1
+
+ # matrix -> matrix (truncates and zero-pads)
+ sdt1 = [('a', 'f8', (2,3))]
+ sdt2 = [('a', 'f4', (3,2))]
+ a = np.zeros((6,), dtype=sdt1)
+ a['a'] = np.arange(6*2*3).reshape(6,2,3)
+ i = newiter(a, ['buffered','refs_ok'], ['readonly'],
+ casting='unsafe',
+ op_dtypes=sdt2)
+ assert_equal(i[0].dtype, np.dtype(sdt2))
+ count = 0
+ for x in i:
+ assert_equal(x['a'][:2,0], a[count]['a'][:,0])
+ assert_equal(x['a'][:2,1], a[count]['a'][:,1])
+ assert_equal(x['a'][2,:], [0,0])
+ count += 1
+
+def test_iter_buffering_badwriteback():
+ # Writing back from a buffer cannot combine elements
+
+    # a needs write buffering, but has a broadcast dimension
+ a = np.arange(6).reshape(2,3,1)
+ b = np.arange(12).reshape(2,3,2)
+ assert_raises(ValueError,newiter,[a,b],
+ ['buffered','no_inner_iteration'],
+ [['readwrite'],['writeonly']],
+ order='C')
+
+ # But if a is readonly, it's fine
+ i = newiter([a,b],['buffered','no_inner_iteration'],
+ [['readonly'],['writeonly']],
+ order='C')
+
+ # If a has just one element, it's fine too (constant 0 stride, a reduction)
+ a = np.arange(1).reshape(1,1,1)
+ i = newiter([a,b],['buffered','no_inner_iteration','reduce_ok'],
+ [['readwrite'],['writeonly']],
+ order='C')
+
+ # check that it fails on other dimensions too
+ a = np.arange(6).reshape(1,3,2)
+ assert_raises(ValueError,newiter,[a,b],
+ ['buffered','no_inner_iteration'],
+ [['readwrite'],['writeonly']],
+ order='C')
+ a = np.arange(4).reshape(2,1,2)
+ assert_raises(ValueError,newiter,[a,b],
+ ['buffered','no_inner_iteration'],
+ [['readwrite'],['writeonly']],
+ order='C')
+
+def test_iter_buffering_string():
+ # Safe casting disallows shrinking strings
+ a = np.array(['abc', 'a', 'abcd'], dtype=np.str)
+ assert_equal(a.dtype, np.dtype('S4'));
+ assert_raises(TypeError,newiter,a,['buffered'],['readonly'],
+ op_dtypes='S2')
+ i = newiter(a, ['buffered'], ['readonly'], op_dtypes='S6')
+ assert_equal(i[0], 'abc')
+ assert_equal(i[0].dtype, np.dtype('S6'))
+
+ a = np.array(['abc', 'a', 'abcd'], dtype=np.unicode)
+ assert_equal(a.dtype, np.dtype('U4'));
+ assert_raises(TypeError,newiter,a,['buffered'],['readonly'],
+ op_dtypes='U2')
+ i = newiter(a, ['buffered'], ['readonly'], op_dtypes='U6')
+ assert_equal(i[0], u'abc')
+ assert_equal(i[0].dtype, np.dtype('U6'))
+
+def test_iter_buffering_growinner():
+ # Test that the inner loop grows when no buffering is needed
+ a = np.arange(30)
+ i = newiter(a, ['buffered','growinner','no_inner_iteration'],
+ buffersize=5)
+ # Should end up with just one inner loop here
+ assert_equal(i[0].size, a.size)
+
+def test_iter_no_broadcast():
+ # Test that the no_broadcast flag works
+ a = np.arange(24).reshape(2,3,4)
+ b = np.arange(6).reshape(2,3,1)
+ c = np.arange(12).reshape(3,4)
+
+ i = newiter([a,b,c], [],
+ [['readonly','no_broadcast'],['readonly'],['readonly']])
+ assert_raises(ValueError, newiter, [a,b,c], [],
+ [['readonly'],['readonly','no_broadcast'],['readonly']])
+ assert_raises(ValueError, newiter, [a,b,c], [],
+ [['readonly'],['readonly'],['readonly','no_broadcast']])
+
+def test_iter_nested_iters_basic():
+ # Test nested iteration basic usage
+ a = arange(12).reshape(2,3,2)
+
+ i, j = np.nested_iters(a, [[0],[1,2]])
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[0,1,2,3,4,5],[6,7,8,9,10,11]])
+
+ i, j = np.nested_iters(a, [[0,1],[2]])
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11]])
+
+ i, j = np.nested_iters(a, [[0,2],[1]])
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[0,2,4],[1,3,5],[6,8,10],[7,9,11]])
+
+def test_iter_nested_iters_reorder():
+    # Test that nested iteration reorders axes in 'K' order but not in 'C' order
+ a = arange(12).reshape(2,3,2)
+
+ # In 'K' order (default), it gets reordered
+ i, j = np.nested_iters(a, [[0],[2,1]])
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[0,1,2,3,4,5],[6,7,8,9,10,11]])
+
+ i, j = np.nested_iters(a, [[1,0],[2]])
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11]])
+
+ i, j = np.nested_iters(a, [[2,0],[1]])
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[0,2,4],[1,3,5],[6,8,10],[7,9,11]])
+
+ # In 'C' order, it doesn't
+ i, j = np.nested_iters(a, [[0],[2,1]], order='C')
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[0,2,4,1,3,5],[6,8,10,7,9,11]])
+
+ i, j = np.nested_iters(a, [[1,0],[2]], order='C')
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[0,1],[6,7],[2,3],[8,9],[4,5],[10,11]])
+
+ i, j = np.nested_iters(a, [[2,0],[1]], order='C')
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[0,2,4],[6,8,10],[1,3,5],[7,9,11]])
+
+def test_iter_nested_iters_flip_axes():
+ # Test nested iteration with negative axes
+ a = arange(12).reshape(2,3,2)[::-1,::-1,::-1]
+
+ # In 'K' order (default), the axes all get flipped
+ i, j = np.nested_iters(a, [[0],[1,2]])
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[0,1,2,3,4,5],[6,7,8,9,10,11]])
+
+ i, j = np.nested_iters(a, [[0,1],[2]])
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11]])
+
+ i, j = np.nested_iters(a, [[0,2],[1]])
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[0,2,4],[1,3,5],[6,8,10],[7,9,11]])
+
+ # In 'C' order, flipping axes is disabled
+ i, j = np.nested_iters(a, [[0],[1,2]], order='C')
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[11,10,9,8,7,6],[5,4,3,2,1,0]])
+
+ i, j = np.nested_iters(a, [[0,1],[2]], order='C')
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[11,10],[9,8],[7,6],[5,4],[3,2],[1,0]])
+
+ i, j = np.nested_iters(a, [[0,2],[1]], order='C')
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[11,9,7],[10,8,6],[5,3,1],[4,2,0]])
+
+def test_iter_nested_iters_broadcast():
+ # Test nested iteration with broadcasting
+ a = arange(2).reshape(2,1)
+ b = arange(3).reshape(1,3)
+
+ i, j = np.nested_iters([a,b], [[0],[1]])
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[[0,0],[0,1],[0,2]],[[1,0],[1,1],[1,2]]])
+
+ i, j = np.nested_iters([a,b], [[1],[0]])
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[[0,0],[1,0]],[[0,1],[1,1]],[[0,2],[1,2]]])
+
+def test_iter_nested_iters_dtype_copy():
+ # Test nested iteration with a copy to change dtype
+
+ # copy
+ a = arange(6, dtype='i4').reshape(2,3)
+ i, j = np.nested_iters(a, [[0],[1]],
+ op_flags=['readonly','copy'],
+ op_dtypes='f8')
+ assert_equal(j[0].dtype, np.dtype('f8'))
+ vals = []
+ for x in i:
+ vals.append([y for y in j])
+ assert_equal(vals, [[0,1,2],[3,4,5]])
+ vals = None
+
+ # updateifcopy
+ a = arange(6, dtype='f4').reshape(2,3)
+ i, j = np.nested_iters(a, [[0],[1]],
+ op_flags=['readwrite','updateifcopy'],
+ casting='same_kind',
+ op_dtypes='f8')
+ assert_equal(j[0].dtype, np.dtype('f8'))
+ for x in i:
+ for y in j:
+ y[()] += 1
+ assert_equal(a, [[0,1,2],[3,4,5]])
+ i, j, x, y = (None,)*4 # force the updateifcopy
+ assert_equal(a, [[1,2,3],[4,5,6]])
+
+def test_iter_nested_iters_dtype_buffered():
+ # Test nested iteration with buffering to change dtype
+
+ a = arange(6, dtype='f4').reshape(2,3)
+ i, j = np.nested_iters(a, [[0],[1]],
+ flags=['buffered'],
+ op_flags=['readwrite'],
+ casting='same_kind',
+ op_dtypes='f8')
+ assert_equal(j[0].dtype, np.dtype('f8'))
+ for x in i:
+ for y in j:
+ y[()] += 1
+ assert_equal(a, [[1,2,3],[4,5,6]])
+
+def test_iter_reduction_error():
+
+ a = np.arange(6)
+ assert_raises(ValueError, newiter, [a,None], [],
+ [['readonly'], ['readwrite','allocate']],
+ op_axes=[[0],[-1]])
+
+ a = np.arange(6).reshape(2,3)
+ assert_raises(ValueError, newiter, [a,None], ['no_inner_iteration'],
+ [['readonly'], ['readwrite','allocate']],
+ op_axes=[[0,1],[-1,-1]])
+
+def test_iter_reduction():
+ # Test doing reductions with the iterator
+
+ a = np.arange(6)
+ i = newiter([a,None], ['reduce_ok'],
+ [['readonly'], ['readwrite','allocate']],
+ op_axes=[[0],[-1]])
+ # Need to initialize the output operand to the addition unit
+ i.operands[1][()] = 0
+ # Do the reduction
+ for x, y in i:
+ y[()] += x
+ # Since no axes were specified, should have allocated a scalar
+ assert_equal(i.operands[1].ndim, 0)
+ assert_equal(i.operands[1], np.sum(a))
+
+ a = np.arange(6).reshape(2,3)
+ i = newiter([a,None], ['reduce_ok','no_inner_iteration'],
+ [['readonly'], ['readwrite','allocate']],
+ op_axes=[[0,1],[-1,-1]])
+ # Need to initialize the output operand to the addition unit
+ i.operands[1][()] = 0
+ # Reduction shape/strides for the output
+ assert_equal(i[1].shape, (6,))
+ assert_equal(i[1].strides, (0,))
+ # Do the reduction
+ for x, y in i:
+ y[()] += x
+    # The output should still be a scalar after the reduction
+ assert_equal(i.operands[1].ndim, 0)
+ assert_equal(i.operands[1], np.sum(a))
+
+
+def test_iter_buffering_reduction():
+ # Test doing buffered reductions with the iterator
+
+ a = np.arange(6)
+ b = np.array(0., dtype='f8').byteswap().newbyteorder()
+ i = newiter([a,b], ['reduce_ok', 'buffered'],
+ [['readonly'], ['readwrite','nbo']],
+ op_axes=[[0],[-1]])
+ assert_equal(i[1].dtype, np.dtype('f8'))
+ assert_(i[1].dtype != b.dtype)
+ # Do the reduction
+ for x, y in i:
+ y[()] += x
+ # Since no axes were specified, should have allocated a scalar
+ assert_equal(b, np.sum(a))
+
+ a = np.arange(6).reshape(2,3)
+ b = np.array([0,0], dtype='f8').byteswap().newbyteorder()
+ i = newiter([a,b], ['reduce_ok','no_inner_iteration', 'buffered'],
+ [['readonly'], ['readwrite','nbo']],
+ op_axes=[[0,1],[0,-1]])
+ # Reduction shape/strides for the output
+ assert_equal(i[1].shape, (3,))
+ assert_equal(i[1].strides, (0,))
+ # Do the reduction
+ for x, y in i:
+ y[()] += x
+ assert_equal(b, np.sum(a, axis=1))
+
+if __name__ == "__main__":
+ run_module_suite()
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index f3ca44404..f1cf7c2c4 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -5,7 +5,9 @@ import numpy as np
from numpy.core import *
from numpy.random import rand, randint, randn
from numpy.testing import *
+from numpy.testing.utils import WarningManager
from numpy.core.multiarray import dot as dot_
+import warnings
class Vec:
def __init__(self,sequence=None):
@@ -151,6 +153,330 @@ class TestResize(TestCase):
Ar = resize(A, (0,))
assert_equal(Ar, array([]))
+class TestEinSum(TestCase):
+ def test_einsum_errors(self):
+ # Need enough arguments
+ assert_raises(ValueError, np.einsum)
+ assert_raises(ValueError, np.einsum, "")
+
+ # subscripts must be a string
+ assert_raises(TypeError, np.einsum, 0, 0)
+
+ # out parameter must be an array
+ assert_raises(TypeError, np.einsum, "", 0, out='test')
+
+ # order parameter must be a valid order
+ assert_raises(TypeError, np.einsum, "", 0, order='W')
+
+ # casting parameter must be a valid casting
+ assert_raises(ValueError, np.einsum, "", 0, casting='blah')
+
+ # dtype parameter must be a valid dtype
+ assert_raises(TypeError, np.einsum, "", 0, dtype='bad_data_type')
+
+ # other keyword arguments are rejected
+ assert_raises(TypeError, np.einsum, "", 0, bad_arg=0)
+
+ # number of operands must match count in subscripts string
+ assert_raises(ValueError, np.einsum, "", 0, 0)
+ assert_raises(ValueError, np.einsum, ",", 0, [0], [0])
+ assert_raises(ValueError, np.einsum, ",", [0])
+
+ # can't have more subscripts than dimensions in the operand
+ assert_raises(ValueError, np.einsum, "i", 0)
+ assert_raises(ValueError, np.einsum, "ij", [0,0])
+ assert_raises(ValueError, np.einsum, "...i", 0)
+ assert_raises(ValueError, np.einsum, "i...j", [0,0])
+ assert_raises(ValueError, np.einsum, "i...", 0)
+ assert_raises(ValueError, np.einsum, "ij...", [0,0])
+
+ # invalid ellipsis
+ assert_raises(ValueError, np.einsum, "i..", [0,0])
+ assert_raises(ValueError, np.einsum, ".i...", [0,0])
+ assert_raises(ValueError, np.einsum, "j->..j", [0,0])
+ assert_raises(ValueError, np.einsum, "j->.j...", [0,0])
+
+ # invalid subscript character
+ assert_raises(ValueError, np.einsum, "i%...", [0,0])
+ assert_raises(ValueError, np.einsum, "...j$", [0,0])
+ assert_raises(ValueError, np.einsum, "i->&", [0,0])
+
+ # output subscripts must appear in input
+ assert_raises(ValueError, np.einsum, "i->ij", [0,0])
+
+ # output subscripts may only be specified once
+ assert_raises(ValueError, np.einsum, "ij->jij", [[0,0],[0,0]])
+
+    # dimensions must match when being collapsed
+ assert_raises(ValueError, np.einsum, "ii", np.arange(6).reshape(2,3))
+ assert_raises(ValueError, np.einsum, "ii->i", np.arange(6).reshape(2,3))
+
+ def test_einsum_views(self):
+ # pass-through
+ a = np.arange(6).reshape(2,3)
+
+ b = np.einsum("", a)
+ assert_(b.base is a)
+
+ b = np.einsum("ij", a)
+ assert_(b.base is a)
+ assert_equal(b, a)
+
+ # transpose
+ a = np.arange(6).reshape(2,3)
+
+ b = np.einsum("ji", a)
+ assert_(b.base is a)
+ assert_equal(b, a.T)
+
+ # diagonal
+ a = np.arange(9).reshape(3,3)
+
+ b = np.einsum("ii->i", a)
+ assert_(b.base is a)
+ assert_equal(b, [a[i,i] for i in range(3)])
+
+ # diagonal with various ways of broadcasting an additional dimension
+ a = np.arange(27).reshape(3,3,3)
+
+ b = np.einsum("ii->i", a)
+ assert_(b.base is a)
+ assert_equal(b, [[x[i,i] for i in range(3)] for x in a])
+
+ b = np.einsum("ii...->i", a)
+ assert_(b.base is a)
+ assert_equal(b, [[x[i,i] for i in range(3)]
+ for x in a.transpose(2,0,1)])
+
+ b = np.einsum("ii->i...", a)
+ assert_(b.base is a)
+ assert_equal(b, [a[:,i,i] for i in range(3)])
+
+ b = np.einsum("jii->ij", a)
+ assert_(b.base is a)
+ assert_equal(b, [a[:,i,i] for i in range(3)])
+
+ b = np.einsum("ii...->i...", a)
+ assert_(b.base is a)
+ assert_equal(b, [a.transpose(2,0,1)[:,i,i] for i in range(3)])
+
+ b = np.einsum("i...i->i...", a)
+ assert_(b.base is a)
+ assert_equal(b, [a.transpose(1,0,2)[:,i,i] for i in range(3)])
+
+ b = np.einsum("i...i->i", a)
+ assert_(b.base is a)
+ assert_equal(b, [[x[i,i] for i in range(3)]
+ for x in a.transpose(1,0,2)])
+
+ # triple diagonal
+ a = np.arange(27).reshape(3,3,3)
+
+ b = np.einsum("iii->i", a)
+ assert_(b.base is a)
+ assert_equal(b, [a[i,i,i] for i in range(3)])
+
+ # swap axes
+ a = np.arange(24).reshape(2,3,4)
+
+ b = np.einsum("ijk->jik", a)
+ assert_(b.base is a)
+ assert_equal(b, a.swapaxes(0,1))
+
+ def check_einsum_sums(self, dtype):
+ # sum(a, axis=-1)
+ a = np.arange(10, dtype=dtype)
+ assert_equal(np.einsum("i->", a), np.sum(a, axis=-1))
+
+ a = np.arange(24, dtype=dtype).reshape(2,3,4)
+ assert_equal(np.einsum("i->", a), np.sum(a, axis=-1))
+
+ # sum(a, axis=0)
+ a = np.arange(10, dtype=dtype)
+ assert_equal(np.einsum("i...->", a), np.sum(a, axis=0))
+
+ a = np.arange(24, dtype=dtype).reshape(2,3,4)
+ assert_equal(np.einsum("i...->", a), np.sum(a, axis=0))
+
+ # trace(a)
+ a = np.arange(25, dtype=dtype).reshape(5,5)
+ assert_equal(np.einsum("ii", a), np.trace(a))
+
+ # multiply(a, b)
+ a = np.arange(12, dtype=dtype).reshape(3,4)
+ b = np.arange(24, dtype=dtype).reshape(2,3,4)
+ assert_equal(np.einsum(",", a, b), np.multiply(a, b))
+
+ # inner(a,b)
+ a = np.arange(24, dtype=dtype).reshape(2,3,4)
+ b = np.arange(4, dtype=dtype)
+ assert_equal(np.einsum("i,i", a, b), np.inner(a, b))
+
+ a = np.arange(24, dtype=dtype).reshape(2,3,4)
+ b = np.arange(2, dtype=dtype)
+ assert_equal(np.einsum("i...,i...", a, b), np.inner(a.T, b.T).T)
+
+ # outer(a,b)
+ a = np.arange(3, dtype=dtype)+1
+ b = np.arange(4, dtype=dtype)+1
+ assert_equal(np.einsum("i,j", a, b), np.outer(a, b))
+
+ # Suppress the complex warnings for the 'as f8' tests
+ ctx = WarningManager()
+ ctx.__enter__()
+ try:
+ warnings.simplefilter('ignore', np.ComplexWarning)
+
+ # matvec(a,b) / a.dot(b) where a is matrix, b is vector
+ a = np.arange(20, dtype=dtype).reshape(4,5)
+ b = np.arange(5, dtype=dtype)
+ assert_equal(np.einsum("ij,j", a, b), np.dot(a, b))
+
+ a = np.arange(20, dtype=dtype).reshape(4,5)
+ b = np.arange(5, dtype=dtype)
+ c = np.arange(4, dtype=dtype)
+ np.einsum("ij,j", a, b, out=c,
+ dtype='f8', casting='unsafe')
+ assert_equal(c,
+ np.dot(a.astype('f8'), b.astype('f8')).astype(dtype))
+
+ a = np.arange(20, dtype=dtype).reshape(4,5)
+ b = np.arange(5, dtype=dtype)
+ assert_equal(np.einsum("ji,j", a.T, b.T), np.dot(b.T, a.T))
+
+ a = np.arange(20, dtype=dtype).reshape(4,5)
+ b = np.arange(5, dtype=dtype)
+ c = np.arange(4, dtype=dtype)
+ np.einsum("ji,j", a.T, b.T, out=c, dtype='f8', casting='unsafe')
+ assert_equal(c,
+ np.dot(b.T.astype('f8'), a.T.astype('f8')).astype(dtype))
+
+ # matmat(a,b) / a.dot(b) where a is matrix, b is matrix
+ a = np.arange(20, dtype=dtype).reshape(4,5)
+ b = np.arange(30, dtype=dtype).reshape(5,6)
+ assert_equal(np.einsum("ij,jk", a, b), np.dot(a, b))
+
+ a = np.arange(20, dtype=dtype).reshape(4,5)
+ b = np.arange(30, dtype=dtype).reshape(5,6)
+ c = np.arange(24, dtype=dtype).reshape(4,6)
+ np.einsum("ij,jk", a, b, out=c, dtype='f8', casting='unsafe')
+ assert_equal(c,
+ np.dot(a.astype('f8'), b.astype('f8')).astype(dtype))
+
+ # matrix triple product (note this is not an efficient
+ # way to multiply 3 matrices)
+ a = np.arange(12, dtype=dtype).reshape(3,4)
+ b = np.arange(20, dtype=dtype).reshape(4,5)
+ c = np.arange(30, dtype=dtype).reshape(5,6)
+ if dtype != 'f2':
+ assert_equal(np.einsum("ij,jk,kl", a, b, c),
+ a.dot(b).dot(c))
+
+ a = np.arange(12, dtype=dtype).reshape(3,4)
+ b = np.arange(20, dtype=dtype).reshape(4,5)
+ c = np.arange(30, dtype=dtype).reshape(5,6)
+ d = np.arange(18, dtype=dtype).reshape(3,6)
+ np.einsum("ij,jk,kl", a, b, c, out=d,
+ dtype='f8', casting='unsafe')
+ assert_equal(d, a.astype('f8').dot(b.astype('f8')
+ ).dot(c.astype('f8')).astype(dtype))
+
+ # tensordot(a, b)
+ if np.dtype(dtype) != np.dtype('f2'):
+ a = np.arange(60, dtype=dtype).reshape(3,4,5)
+ b = np.arange(24, dtype=dtype).reshape(4,3,2)
+ assert_equal(np.einsum("ijk,jil->kl", a, b),
+ np.tensordot(a,b, axes=([1,0],[0,1])))
+
+ a = np.arange(60, dtype=dtype).reshape(3,4,5)
+ b = np.arange(24, dtype=dtype).reshape(4,3,2)
+ c = np.arange(10, dtype=dtype).reshape(5,2)
+ np.einsum("ijk,jil->kl", a, b, out=c,
+ dtype='f8', casting='unsafe')
+ assert_equal(c, np.tensordot(a.astype('f8'), b.astype('f8'),
+ axes=([1,0],[0,1])).astype(dtype))
+ finally:
+ ctx.__exit__()
+
+ # logical_and(logical_and(a!=0, b!=0), c!=0)
+ a = np.array([1, 3, -2, 0, 12, 13, 0, 1], dtype=dtype)
+ b = np.array([0, 3.5, 0., -2, 0, 1, 3, 12], dtype=dtype)
+ c = np.array([True,True,False,True,True,False,True,True])
+ assert_equal(np.einsum("i,i,i->i", a, b, c,
+ dtype='?', casting='unsafe'),
+ logical_and(logical_and(a!=0, b!=0), c!=0))
+
+ a = np.arange(9, dtype=dtype)
+ assert_equal(np.einsum(",i->", 3, a), 3*np.sum(a))
+ assert_equal(np.einsum("i,->", a, 3), 3*np.sum(a))
+
+ # Various stride0, contiguous, and SSE aligned variants
+ a = np.arange(64, dtype=dtype)
+ if np.dtype(dtype).itemsize > 1:
+ assert_equal(np.einsum(",",a,a), np.multiply(a,a))
+ assert_equal(np.einsum("i,i", a, a), np.dot(a,a))
+ assert_equal(np.einsum("i,->i", a, 2), 2*a)
+ assert_equal(np.einsum(",i->i", 2, a), 2*a)
+ assert_equal(np.einsum("i,->", a, 2), 2*np.sum(a))
+ assert_equal(np.einsum(",i->", 2, a), 2*np.sum(a))
+
+ assert_equal(np.einsum(",",a[1:],a[:-1]), np.multiply(a[1:],a[:-1]))
+ assert_equal(np.einsum("i,i", a[1:], a[:-1]), np.dot(a[1:],a[:-1]))
+ assert_equal(np.einsum("i,->i", a[1:], 2), 2*a[1:])
+ assert_equal(np.einsum(",i->i", 2, a[1:]), 2*a[1:])
+ assert_equal(np.einsum("i,->", a[1:], 2), 2*np.sum(a[1:]))
+ assert_equal(np.einsum(",i->", 2, a[1:]), 2*np.sum(a[1:]))
+
+ # An object array, summed as the data type
+ a = np.arange(9, dtype=object)
+ b = np.einsum("i->", a, dtype=dtype, casting='unsafe')
+ assert_equal(b, np.sum(a))
+ assert_equal(b.dtype, np.dtype(dtype))
+
+ def test_einsum_sums_int8(self):
+ self.check_einsum_sums('i1');
+
+ def test_einsum_sums_uint8(self):
+ self.check_einsum_sums('u1');
+
+ def test_einsum_sums_int16(self):
+ self.check_einsum_sums('i2');
+
+ def test_einsum_sums_uint16(self):
+ self.check_einsum_sums('u2');
+
+ def test_einsum_sums_int32(self):
+ self.check_einsum_sums('i4');
+
+ def test_einsum_sums_uint32(self):
+ self.check_einsum_sums('u4');
+
+ def test_einsum_sums_int64(self):
+ self.check_einsum_sums('i8');
+
+ def test_einsum_sums_uint64(self):
+ self.check_einsum_sums('u8');
+
+ def test_einsum_sums_float16(self):
+ self.check_einsum_sums('f2');
+
+ def test_einsum_sums_float32(self):
+ self.check_einsum_sums('f4');
+
+ def test_einsum_sums_float64(self):
+ self.check_einsum_sums('f8');
+
+ def test_einsum_sums_longdouble(self):
+ self.check_einsum_sums(np.longdouble);
+
+ def test_einsum_sums_cfloat64(self):
+ self.check_einsum_sums('c8');
+
+ def test_einsum_sums_cfloat128(self):
+ self.check_einsum_sums('c16');
+
+ def test_einsum_sums_clongdouble(self):
+ self.check_einsum_sums(np.clongdouble);
class TestNonarrayArgs(TestCase):
# check that non-array arguments to functions wrap them in arrays
@@ -332,8 +658,8 @@ class TestFloatExceptions(TestCase):
finally:
np.seterr(**oldsettings)
-class TestCoercion(TestCase):
- def test_coercion(self):
+class TestTypes(TestCase):
+ def check_promotion_cases(self, promote_func):
"""Tests that the scalars get coerced correctly."""
i8, i16, i32, i64 = int8(0), int16(0), int32(0), int64(0)
u8, u16, u32, u64 = uint8(0), uint16(0), uint32(0), uint64(0)
@@ -341,40 +667,106 @@ class TestCoercion(TestCase):
c64, c128, cld = complex64(0), complex128(0), clongdouble(0)
# coercion within the same type
- assert_equal(np.add(i8,i16).dtype, int16)
- assert_equal(np.add(i32,i8).dtype, int32)
- assert_equal(np.add(i16,i64).dtype, int64)
- assert_equal(np.add(u8,u32).dtype, uint32)
- assert_equal(np.add(f32,f64).dtype, float64)
- assert_equal(np.add(fld,f32).dtype, longdouble)
- assert_equal(np.add(f64,fld).dtype, longdouble)
- assert_equal(np.add(c128,c64).dtype, complex128)
- assert_equal(np.add(cld,c128).dtype, clongdouble)
- assert_equal(np.add(c64,fld).dtype, clongdouble)
+ assert_equal(promote_func(i8,i16), np.dtype(int16))
+ assert_equal(promote_func(i32,i8), np.dtype(int32))
+ assert_equal(promote_func(i16,i64), np.dtype(int64))
+ assert_equal(promote_func(u8,u32), np.dtype(uint32))
+ assert_equal(promote_func(f32,f64), np.dtype(float64))
+ assert_equal(promote_func(fld,f32), np.dtype(longdouble))
+ assert_equal(promote_func(f64,fld), np.dtype(longdouble))
+ assert_equal(promote_func(c128,c64), np.dtype(complex128))
+ assert_equal(promote_func(cld,c128), np.dtype(clongdouble))
+ assert_equal(promote_func(c64,fld), np.dtype(clongdouble))
# coercion between types
- assert_equal(np.add(i8,u8).dtype, int16)
- assert_equal(np.add(u8,i32).dtype, int32)
- assert_equal(np.add(i64,u32).dtype, int64)
- assert_equal(np.add(u64,i32).dtype, float64)
- assert_equal(np.add(i32,f32).dtype, float64)
- assert_equal(np.add(i64,f32).dtype, float64)
- assert_equal(np.add(f32,i16).dtype, float32)
- assert_equal(np.add(f32,u32).dtype, float64)
- assert_equal(np.add(f32,c64).dtype, complex64)
- assert_equal(np.add(c128,f32).dtype, complex128)
- assert_equal(np.add(cld,f64).dtype, clongdouble)
+ assert_equal(promote_func(i8,u8), np.dtype(int16))
+ assert_equal(promote_func(u8,i32), np.dtype(int32))
+ assert_equal(promote_func(i64,u32), np.dtype(int64))
+ assert_equal(promote_func(u64,i32), np.dtype(float64))
+ assert_equal(promote_func(i32,f32), np.dtype(float64))
+ assert_equal(promote_func(i64,f32), np.dtype(float64))
+ assert_equal(promote_func(f32,i16), np.dtype(float32))
+ assert_equal(promote_func(f32,u32), np.dtype(float64))
+ assert_equal(promote_func(f32,c64), np.dtype(complex64))
+ assert_equal(promote_func(c128,f32), np.dtype(complex128))
+ assert_equal(promote_func(cld,f64), np.dtype(clongdouble))
# coercion between scalars and 1-D arrays
- assert_equal(np.add(array([i8]),i64).dtype, int8)
- assert_equal(np.add(u64,array([i32])).dtype, int32)
- assert_equal(np.add(i64,array([u32])).dtype, uint32)
- assert_equal(np.add(int32(-1),array([u64])).dtype, float64)
- assert_equal(np.add(f64,array([f32])).dtype, float32)
- assert_equal(np.add(fld,array([f32])).dtype, float32)
- assert_equal(np.add(array([f64]),fld).dtype, float64)
- assert_equal(np.add(fld,array([c64])).dtype, complex64)
- assert_equal(np.add(c64,array([f64])).dtype, complex128)
+ assert_equal(promote_func(array([i8]),i64), np.dtype(int8))
+ assert_equal(promote_func(u64,array([i32])), np.dtype(int32))
+ assert_equal(promote_func(i64,array([u32])), np.dtype(uint32))
+ assert_equal(promote_func(int32(-1),array([u64])), np.dtype(float64))
+ assert_equal(promote_func(f64,array([f32])), np.dtype(float32))
+ assert_equal(promote_func(fld,array([f32])), np.dtype(float32))
+ assert_equal(promote_func(array([f64]),fld), np.dtype(float64))
+ assert_equal(promote_func(fld,array([c64])), np.dtype(complex64))
+
+ def test_coercion(self):
+ def res_type(a, b):
+ return np.add(a, b).dtype
+
+ ctx = WarningManager()
+ ctx.__enter__()
+ warnings.simplefilter('ignore', np.ComplexWarning)
+
+ self.check_promotion_cases(res_type)
+
+ f64 = float64(0)
+ c64 = complex64(0)
+ ## Scalars do not coerce to complex if the value is real
+ #assert_equal(res_type(c64,array([f64])), np.dtype(float64))
+ # But they do if the value is complex
+ assert_equal(res_type(complex64(3j),array([f64])),
+ np.dtype(complex128))
+
+ # Scalars do coerce to complex even if the value is real
+ # This is so "a+0j" can be reliably used to make something complex.
+ assert_equal(res_type(c64,array([f64])), np.dtype(complex128))
+
+ ctx.__exit__()
+
+
+ def test_result_type(self):
+ self.check_promotion_cases(np.result_type)
+
+ f64 = float64(0)
+ c64 = complex64(0)
+ ## Scalars do not coerce to complex if the value is real
+ #assert_equal(np.result_type(c64,array([f64])), np.dtype(float64))
+ # But they do if the value is complex
+ assert_equal(np.result_type(complex64(3j),array([f64])),
+ np.dtype(complex128))
+
+ # Scalars do coerce to complex even if the value is real
+ # This is so "a+0j" can be reliably used to make something complex.
+ assert_equal(np.result_type(c64,array([f64])), np.dtype(complex128))
+
+
+ def can_cast(self):
+ assert_(np.can_cast(np.int32, np.int64))
+ assert_(np.can_cast(np.float64, np.complex))
+ assert_(not np.can_cast(np.complex, np.float))
+
+ assert_(np.can_cast('i8', 'f8'))
+ assert_(not np.can_cast('i8', 'f4'))
+ assert_(np.can_cast('i4', 'S4'))
+
+ assert_(np.can_cast('i8', 'i8', 'no'))
+ assert_(not np.can_cast('<i8', '>i8', 'no'))
+
+ assert_(np.can_cast('<i8', '>i8', 'equiv'))
+ assert_(not np.can_cast('<i4', '>i8', 'equiv'))
+
+ assert_(np.can_cast('<i4', '>i8', 'safe'))
+ assert_(not np.can_cast('<i8', '>i4', 'safe'))
+
+ assert_(np.can_cast('<i8', '>i4', 'same_kind'))
+ assert_(not np.can_cast('<i8', '>u4', 'same_kind'))
+
+ assert_(np.can_cast('<i8', '>u4', 'unsafe'))
+
+ assert_raises(TypeError, np.can_cast, 'i4', None)
+ assert_raises(TypeError, np.can_cast, None, 'i4')
class TestFromiter(TestCase):
def makegen(self):
@@ -409,6 +801,49 @@ class TestFromiter(TestCase):
self.assertTrue(alltrue(a == expected,axis=0))
self.assertTrue(alltrue(a20 == expected[:20],axis=0))
+class TestNonzero(TestCase):
+ def test_nonzero_trivial(self):
+ assert_equal(np.count_nonzero(array([])), 0)
+ assert_equal(np.nonzero(array([])), ([],))
+
+ assert_equal(np.count_nonzero(array(0)), 0)
+ assert_equal(np.nonzero(array(0)), ([],))
+ assert_equal(np.count_nonzero(array(1)), 1)
+ assert_equal(np.nonzero(array(1)), ([0],))
+
+ def test_nonzero_onedim(self):
+ x = array([1,0,2,-1,0,0,8])
+ assert_equal(np.count_nonzero(x), 4)
+ assert_equal(np.nonzero(x), ([0, 2, 3, 6],))
+
+ x = array([(1,2),(0,0),(1,1),(-1,3),(0,7)],
+ dtype=[('a','i4'),('b','i2')])
+ assert_equal(np.count_nonzero(x['a']), 3)
+ assert_equal(np.count_nonzero(x['b']), 4)
+ assert_equal(np.nonzero(x['a']), ([0,2,3],))
+ assert_equal(np.nonzero(x['b']), ([0,2,3,4],))
+
+ def test_nonzero_twodim(self):
+ x = array([[0,1,0],[2,0,3]])
+ assert_equal(np.count_nonzero(x), 3)
+ assert_equal(np.nonzero(x), ([0,1,1],[1,0,2]))
+
+ x = np.eye(3)
+ assert_equal(np.count_nonzero(x), 3)
+ assert_equal(np.nonzero(x), ([0,1,2],[0,1,2]))
+
+ x = array([[(0,1),(0,0),(1,11)],
+ [(1,1),(1,0),(0,0)],
+ [(0,0),(1,5),(0,1)]], dtype=[('a','f4'),('b','u1')])
+ assert_equal(np.count_nonzero(x['a']), 4)
+ assert_equal(np.count_nonzero(x['b']), 5)
+ assert_equal(np.nonzero(x['a']), ([0,1,1,2],[2,0,1,1]))
+ assert_equal(np.nonzero(x['b']), ([0,0,1,2,2],[0,2,0,1,2]))
+
+ assert_equal(np.count_nonzero(x['a'].T), 4)
+ assert_equal(np.count_nonzero(x['b'].T), 5)
+ assert_equal(np.nonzero(x['a'].T), ([0,1,1,2],[1,1,2,0]))
+ assert_equal(np.nonzero(x['b'].T), ([0,0,1,2,2],[0,1,2,0,2]))
class TestIndex(TestCase):
def test_boolean(self):
@@ -657,7 +1092,7 @@ class TestClip(TestCase):
assert_array_strict_equal(ac, act)
def test_simple_int64_inout(self):
- """Test native in32 input with double array min/max and int32 out."""
+ """Test native int32 input with double array min/max and int32 out."""
a = self._generate_int32_data(self.nr, self.nc)
m = zeros(a.shape, float64)
M = float64(1)
@@ -1008,25 +1443,91 @@ class TestStdVarComplex(TestCase):
class TestLikeFuncs(TestCase):
- '''Test zeros_like and empty_like'''
+ '''Test ones_like, zeros_like, and empty_like'''
def setUp(self):
- self.data = [(array([[1,2,3],[4,5,6]],dtype=int32), (2,3), int32),
- (array([[1,2,3],[4,5,6]],dtype=float32), (2,3), float32),
+ self.data = [
+ # Array scalars
+ (array(3.), None),
+ (array(3), 'f8'),
+ # 1D arrays
+ (arange(6, dtype='f4'), None),
+ (arange(6), 'c16'),
+ # 2D C-layout arrays
+ (arange(6).reshape(2,3), None),
+ (arange(6).reshape(3,2), 'i1'),
+ # 2D F-layout arrays
+ (arange(6).reshape((2,3), order='F'), None),
+ (arange(6).reshape((3,2), order='F'), 'i1'),
+ # 3D C-layout arrays
+ (arange(24).reshape(2,3,4), None),
+ (arange(24).reshape(4,3,2), 'f4'),
+ # 3D F-layout arrays
+ (arange(24).reshape((2,3,4), order='F'), None),
+ (arange(24).reshape((4,3,2), order='F'), 'f4'),
+ # 3D non-C/F-layout arrays
+ (arange(24).reshape(2,3,4).swapaxes(0,1), None),
+ (arange(24).reshape(4,3,2).swapaxes(0,1), '?'),
]
+ def check_like_function(self, like_function, value):
+ for d, dtype in self.data:
+ # default (K) order, dtype
+ dz = like_function(d, dtype=dtype)
+ assert_equal(dz.shape, d.shape)
+ assert_equal(array(dz.strides)*d.dtype.itemsize,
+ array(d.strides)*dz.dtype.itemsize)
+ if dtype is None:
+ assert_equal(dz.dtype, d.dtype)
+ else:
+ assert_equal(dz.dtype, np.dtype(dtype))
+ if not value is None:
+ assert_(all(dz == value))
+
+ # C order, default dtype
+ dz = like_function(d, order='C', dtype=dtype)
+ assert_equal(dz.shape, d.shape)
+ assert_(dz.flags.c_contiguous)
+ if dtype is None:
+ assert_equal(dz.dtype, d.dtype)
+ else:
+ assert_equal(dz.dtype, np.dtype(dtype))
+ if not value is None:
+ assert_(all(dz == value))
+
+ # F order, default dtype
+ dz = like_function(d, order='F', dtype=dtype)
+ assert_equal(dz.shape, d.shape)
+ assert_(dz.flags.f_contiguous)
+ if dtype is None:
+ assert_equal(dz.dtype, d.dtype)
+ else:
+ assert_equal(dz.dtype, np.dtype(dtype))
+ if not value is None:
+ assert_(all(dz == value))
+
+ # A order
+ dz = like_function(d, order='A', dtype=dtype)
+ assert_equal(dz.shape, d.shape)
+ if d.flags.f_contiguous:
+ assert_(dz.flags.f_contiguous)
+ else:
+ assert_(dz.flags.c_contiguous)
+ if dtype is None:
+ assert_equal(dz.dtype, d.dtype)
+ else:
+ assert_equal(dz.dtype, np.dtype(dtype))
+ if not value is None:
+ assert_(all(dz == value))
+
+ def test_ones_like(self):
+ self.check_like_function(np.ones_like, 1)
+
def test_zeros_like(self):
- for d, dshape, dtype in self.data:
- dz = zeros_like(d)
- assert dz.shape == dshape
- assert dz.dtype.type == dtype
- assert all(abs(dz) == 0)
+ self.check_like_function(np.zeros_like, 0)
def test_empty_like(self):
- for d, dshape, dtype in self.data:
- dz = zeros_like(d)
- assert dz.shape == dshape
- assert dz.dtype.type == dtype
+ self.check_like_function(np.empty_like, None)
class _TestCorrelate(TestCase):
def _setup(self, dt):
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index c10656e7f..279b69e77 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -608,6 +608,9 @@ class TestRegression(TestCase):
assert_equal(np.dot(x,z),np.dot(x,y2))
def test_object_casting(self, level=rlevel):
+ # This used to trigger the object-type version of
+ # the bitwise_or operation, because float64 -> object
+ # casting succeeds
def rs():
x = np.ones([484,286])
y = np.zeros([484,286])
@@ -912,7 +915,6 @@ class TestRegression(TestCase):
def test_attributes(self, level=rlevel):
"""Ticket #791
"""
- import numpy as np
class TestArray(np.ndarray):
def __new__(cls, data, info):
result = np.array(data)
@@ -922,63 +924,63 @@ class TestRegression(TestCase):
def __array_finalize__(self, obj):
self.info = getattr(obj, 'info', '')
dat = TestArray([[1,2,3,4],[5,6,7,8]],'jubba')
- assert dat.info == 'jubba'
+ assert_(dat.info == 'jubba')
dat.resize((4,2))
- assert dat.info == 'jubba'
+ assert_(dat.info == 'jubba')
dat.sort()
- assert dat.info == 'jubba'
+ assert_(dat.info == 'jubba')
dat.fill(2)
- assert dat.info == 'jubba'
+ assert_(dat.info == 'jubba')
dat.put([2,3,4],[6,3,4])
- assert dat.info == 'jubba'
+ assert_(dat.info == 'jubba')
dat.setfield(4, np.int32,0)
- assert dat.info == 'jubba'
+ assert_(dat.info == 'jubba')
dat.setflags()
- assert dat.info == 'jubba'
- assert dat.all(1).info == 'jubba'
- assert dat.any(1).info == 'jubba'
- assert dat.argmax(1).info == 'jubba'
- assert dat.argmin(1).info == 'jubba'
- assert dat.argsort(1).info == 'jubba'
- assert dat.astype(TestArray).info == 'jubba'
- assert dat.byteswap().info == 'jubba'
- assert dat.clip(2,7).info == 'jubba'
- assert dat.compress([0,1,1]).info == 'jubba'
- assert dat.conj().info == 'jubba'
- assert dat.conjugate().info == 'jubba'
- assert dat.copy().info == 'jubba'
+ assert_(dat.info == 'jubba')
+ assert_(dat.all(1).info == 'jubba')
+ assert_(dat.any(1).info == 'jubba')
+ assert_(dat.argmax(1).info == 'jubba')
+ assert_(dat.argmin(1).info == 'jubba')
+ assert_(dat.argsort(1).info == 'jubba')
+ assert_(dat.astype(TestArray).info == 'jubba')
+ assert_(dat.byteswap().info == 'jubba')
+ assert_(dat.clip(2,7).info == 'jubba')
+ assert_(dat.compress([0,1,1]).info == 'jubba')
+ assert_(dat.conj().info == 'jubba')
+ assert_(dat.conjugate().info == 'jubba')
+ assert_(dat.copy().info == 'jubba')
dat2 = TestArray([2, 3, 1, 0],'jubba')
choices = [[0, 1, 2, 3], [10, 11, 12, 13],
[20, 21, 22, 23], [30, 31, 32, 33]]
- assert dat2.choose(choices).info == 'jubba'
- assert dat.cumprod(1).info == 'jubba'
- assert dat.cumsum(1).info == 'jubba'
- assert dat.diagonal().info == 'jubba'
- assert dat.flatten().info == 'jubba'
- assert dat.getfield(np.int32,0).info == 'jubba'
- assert dat.imag.info == 'jubba'
- assert dat.max(1).info == 'jubba'
- assert dat.mean(1).info == 'jubba'
- assert dat.min(1).info == 'jubba'
- assert dat.newbyteorder().info == 'jubba'
- assert dat.nonzero()[0].info == 'jubba'
- assert dat.nonzero()[1].info == 'jubba'
- assert dat.prod(1).info == 'jubba'
- assert dat.ptp(1).info == 'jubba'
- assert dat.ravel().info == 'jubba'
- assert dat.real.info == 'jubba'
- assert dat.repeat(2).info == 'jubba'
- assert dat.reshape((2,4)).info == 'jubba'
- assert dat.round().info == 'jubba'
- assert dat.squeeze().info == 'jubba'
- assert dat.std(1).info == 'jubba'
- assert dat.sum(1).info == 'jubba'
- assert dat.swapaxes(0,1).info == 'jubba'
- assert dat.take([2,3,5]).info == 'jubba'
- assert dat.transpose().info == 'jubba'
- assert dat.T.info == 'jubba'
- assert dat.var(1).info == 'jubba'
- assert dat.view(TestArray).info == 'jubba'
+ assert_(dat2.choose(choices).info == 'jubba')
+ assert_(dat.cumprod(1).info == 'jubba')
+ assert_(dat.cumsum(1).info == 'jubba')
+ assert_(dat.diagonal().info == 'jubba')
+ assert_(dat.flatten().info == 'jubba')
+ assert_(dat.getfield(np.int32,0).info == 'jubba')
+ assert_(dat.imag.info == 'jubba')
+ assert_(dat.max(1).info == 'jubba')
+ assert_(dat.mean(1).info == 'jubba')
+ assert_(dat.min(1).info == 'jubba')
+ assert_(dat.newbyteorder().info == 'jubba')
+ assert_(dat.nonzero()[0].info == 'jubba')
+ assert_(dat.nonzero()[1].info == 'jubba')
+ assert_(dat.prod(1).info == 'jubba')
+ assert_(dat.ptp(1).info == 'jubba')
+ assert_(dat.ravel().info == 'jubba')
+ assert_(dat.real.info == 'jubba')
+ assert_(dat.repeat(2).info == 'jubba')
+ assert_(dat.reshape((2,4)).info == 'jubba')
+ assert_(dat.round().info == 'jubba')
+ assert_(dat.squeeze().info == 'jubba')
+ assert_(dat.std(1).info == 'jubba')
+ assert_(dat.sum(1).info == 'jubba')
+ assert_(dat.swapaxes(0,1).info == 'jubba')
+ assert_(dat.take([2,3,5]).info == 'jubba')
+ assert_(dat.transpose().info == 'jubba')
+ assert_(dat.T.info == 'jubba')
+ assert_(dat.var(1).info == 'jubba')
+ assert_(dat.view(TestArray).info == 'jubba')
def test_recarray_tolist(self, level=rlevel):
"""Ticket #793, changeset r5215
@@ -1140,7 +1142,8 @@ class TestRegression(TestCase):
def test_array_from_sequence_scalar_array2(self):
"""Ticket #1081: weird array with strange input..."""
t = np.array([np.array([]), np.array(0, object)])
- assert_raises(ValueError, lambda: np.array(t))
+ assert_equal(t.shape, (2,))
+ assert_equal(t.dtype, np.dtype(object))
def test_array_too_big(self):
"""Ticket #1080."""
@@ -1445,5 +1448,46 @@ class TestRegression(TestCase):
# Ticket #1695
assert_(np.find_common_type([],['?','?']) == '?')
+ def test_empty_mul(self):
+ a = np.array([1.])
+ a[1:1] *= 2
+ assert_equal(a, [1.])
+
+ def test_array_side_effect(self):
+ assert_equal(np.dtype('S10').itemsize, 10)
+
+ A = np.array([['abc', 2], ['long ', '0123456789']], dtype=np.string_)
+
+ # This was throwing an exception because in ctors.c,
+ # discover_itemsize was calling PyObject_Length without checking
+ # the return code. This failed to get the length of the number 2,
+ # and the exception hung around until something checked
+ # PyErr_Occurred() and returned an error.
+ assert_equal(np.dtype('S10').itemsize, 10)
+
+ def test_any_float(self):
+ # all and any for floats
+ a = np.array([0.1, 0.9])
+ assert_(np.any(a))
+ assert_(np.all(a))
+
+ def test_large_float_sum(self):
+ a = np.arange(10000, dtype='f')
+ assert_equal(a.sum(dtype='d'), a.astype('d').sum())
+
+ def test_ufunc_casting_out(self):
+ a = np.array(1.0, dtype=np.float32)
+ b = np.array(1.0, dtype=np.float64)
+ c = np.array(1.0, dtype=np.float32)
+ np.add(a, b, out=c)
+ assert_equal(c, 2.0)
+
+ def test_array_scalar_contiguous(self):
+ # Array scalars are both C and Fortran contiguous
+ assert_(np.array(1.0).flags.c_contiguous)
+ assert_(np.array(1.0).flags.f_contiguous)
+ assert_(np.array(np.float32(1.0)).flags.c_contiguous)
+ assert_(np.array(np.float32(1.0)).flags.f_contiguous)
+
if __name__ == "__main__":
run_module_suite()
diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py
index 520b6eb17..a2b3a232b 100644
--- a/numpy/core/tests/test_scalarmath.py
+++ b/numpy/core/tests/test_scalarmath.py
@@ -18,16 +18,22 @@ class TestTypes(TestCase):
def test_type_add(self, level=1):
# list of types
for k, atype in enumerate(types):
- vala = atype(3)
- val1 = np.array([3],dtype=atype)
+ a_scalar = atype(3)
+ a_array = np.array([3],dtype=atype)
for l, btype in enumerate(types):
- valb = btype(1)
- val2 = np.array([1],dtype=btype)
- val = vala+valb
- valo = val1 + val2
- assert val.dtype.num == valo.dtype.num and \
- val.dtype.char == valo.dtype.char, \
- "error with (%d,%d)" % (k,l)
+ b_scalar = btype(1)
+ b_array = np.array([1],dtype=btype)
+ c_scalar = a_scalar + b_scalar
+ c_array = a_array + b_array
+ # It was comparing the type numbers, but the new ufunc
+ # function-finding mechanism finds the lowest function
+ # to which both inputs can be cast - which produces 'l'
+ # when you do 'q' + 'b'. The old function finding mechanism
+ # skipped ahead based on the first argument, but that
+ # does not produce properly symmetric results...
+ assert_equal(c_scalar.dtype, c_array.dtype,
+ "error with types (%d/'%c' + %d/'%c')" %
+ (k,np.dtype(atype).char,l,np.dtype(btype).char))
def test_type_create(self, level=1):
for k, atype in enumerate(types):
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index 84e64bf45..a7a41dfe2 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -275,6 +275,32 @@ class TestUfunc(TestCase):
def test_get_signature(self):
assert_equal(umt.inner1d.signature, "(i),(i)->()")
+ def test_forced_sig(self):
+ a = 0.5*np.arange(3,dtype='f8')
+ assert_equal(np.add(a,0.5), [0.5, 1, 1.5])
+ assert_equal(np.add(a,0.5,sig='i',casting='unsafe'), [0, 0, 1])
+ assert_equal(np.add(a,0.5,sig='ii->i',casting='unsafe'), [0, 0, 1])
+ assert_equal(np.add(a,0.5,sig=('i4',),casting='unsafe'), [0, 0, 1])
+ assert_equal(np.add(a,0.5,sig=('i4','i4','i4'),
+ casting='unsafe'), [0, 0, 1])
+
+ b = np.zeros((3,),dtype='f8')
+ np.add(a,0.5,out=b)
+ assert_equal(b, [0.5, 1, 1.5])
+ b[:] = 0
+ np.add(a,0.5,sig='i',out=b, casting='unsafe')
+ assert_equal(b, [0, 0, 1])
+ b[:] = 0
+ np.add(a,0.5,sig='ii->i',out=b, casting='unsafe')
+ assert_equal(b, [0, 0, 1])
+ b[:] = 0
+ np.add(a,0.5,sig=('i4',),out=b, casting='unsafe')
+ assert_equal(b, [0, 0, 1])
+ b[:] = 0
+ np.add(a,0.5,sig=('i4','i4','i4'),out=b, casting='unsafe')
+ assert_equal(b, [0, 0, 1])
+
+
def test_inner1d(self):
a = np.arange(6).reshape((2,3))
assert_array_equal(umt.inner1d(a,a), np.sum(a*a,axis=-1))
@@ -344,14 +370,25 @@ class TestUfunc(TestCase):
c = np.zeros((2,3),dtype='int')
umt.inner1d(a,b,c)
assert_array_equal(c, np.sum(a*b,axis=-1), err_msg=msg)
+ c[:] = -1
+ umt.inner1d(a,b,out=c)
+ assert_array_equal(c, np.sum(a*b,axis=-1), err_msg=msg)
+
msg = "output argument with type cast"
c = np.zeros((2,3),dtype='int16')
umt.inner1d(a,b,c)
assert_array_equal(c, np.sum(a*b,axis=-1), err_msg=msg)
+ c[:] = -1
+ umt.inner1d(a,b,out=c)
+ assert_array_equal(c, np.sum(a*b,axis=-1), err_msg=msg)
+
msg = "output argument with incontiguous layout"
c = np.zeros((2,3,4),dtype='int16')
umt.inner1d(a,b,c[...,0])
assert_array_equal(c[...,0], np.sum(a*b,axis=-1), err_msg=msg)
+ c[:] = -1
+ umt.inner1d(a,b,out=c[...,0])
+ assert_array_equal(c[...,0], np.sum(a*b,axis=-1), err_msg=msg)
def test_innerwt(self):
a = np.arange(6).reshape((2,3))
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 34bbd1469..3f4db4593 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -683,19 +683,44 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
X = []
def flatten_dtype(dt):
- """Unpack a structured data-type."""
+ """Unpack a structured data-type, and produce re-packing info."""
if dt.names is None:
# If the dtype is flattened, return.
# If the dtype has a shape, the dtype occurs
# in the list more than once.
- return [dt.base] * int(np.prod(dt.shape))
+ shape = dt.shape
+ if len(shape) == 0:
+ return ([dt.base], None)
+ else:
+ packing = [(shape[-1], tuple)]
+ if len(shape) > 1:
+ for dim in dt.shape[-2:0:-1]:
+ packing = [(dim*packing[0][0],packing*dim)]
+ packing = packing*shape[0]
+ return ([dt.base] * int(np.prod(dt.shape)), packing)
else:
types = []
+ packing = []
for field in dt.names:
tp, bytes = dt.fields[field]
- flat_dt = flatten_dtype(tp)
+ flat_dt, flat_packing = flatten_dtype(tp)
types.extend(flat_dt)
- return types
+ packing.append((len(flat_dt),flat_packing))
+ return (types, packing)
+
+ def pack_items(items, packing):
+ """Pack items into nested lists based on re-packing info."""
+        if packing is None:
+ return items[0]
+ elif packing is tuple:
+ return tuple(items)
+ else:
+ start = 0
+ ret = []
+ for length, subpacking in packing:
+ ret.append(pack_items(items[start:start+length], subpacking))
+ start += length
+ return tuple(ret)
def split_line(line):
"""Chop off comments, strip, and split at delimiter."""
@@ -724,7 +749,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
first_vals = split_line(first_line)
N = len(usecols or first_vals)
- dtype_types = flatten_dtype(dtype)
+ dtype_types, packing = flatten_dtype(dtype)
if len(dtype_types) > 1:
# We're dealing with a structured array, each field of
# the dtype matches a column
@@ -732,6 +757,8 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
else:
# All fields have the same dtype
converters = [defconv for i in xrange(N)]
+ if N > 1:
+ packing = [(N, tuple)]
# By preference, use the converters specified by the user
for i, conv in (user_converters or {}).iteritems():
@@ -753,27 +780,16 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
vals = [vals[i] for i in usecols]
# Convert each value according to its column and store
- X.append(tuple([conv(val) for (conv, val) in zip(converters, vals)]))
+ items = [conv(val) for (conv, val) in zip(converters, vals)]
+ # Then pack it according to the dtype's nesting
+ items = pack_items(items, packing)
+
+ X.append(items)
finally:
if own_fh:
fh.close()
- if len(dtype_types) > 1:
- # We're dealing with a structured array, with a dtype such as
- # [('x', int), ('y', [('s', int), ('t', float)])]
- #
- # First, create the array using a flattened dtype:
- # [('x', int), ('s', int), ('t', float)]
- #
- # Then, view the array using the specified dtype.
- try:
- X = np.array(X, dtype=np.dtype([('', t) for t in dtype_types]))
- X = X.view(dtype)
- except TypeError:
- # In the case we have an object dtype
- X = np.array(X, dtype=dtype)
- else:
- X = np.array(X, dtype)
+ X = np.array(X, dtype)
X = np.squeeze(X)
if unpack:
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index a85b01909..04497dee8 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -381,6 +381,15 @@ class TestLoadTxt(TestCase):
dtype=dt)
assert_array_equal(x, a)
+ def test_3d_shaped_dtype(self):
+ c = StringIO("aaaa 1.0 8.0 1 2 3 4 5 6 7 8 9 10 11 12")
+ dt = np.dtype([('name', 'S4'), ('x', float), ('y', float),
+ ('block', int, (2, 2, 3))])
+ x = np.loadtxt(c, dtype=dt)
+ a = np.array([('aaaa', 1.0, 8.0, [[[1, 2, 3], [4, 5, 6]],[[7, 8, 9], [10, 11, 12]]])],
+ dtype=dt)
+ assert_array_equal(x, a)
+
def test_empty_file(self):
c = StringIO()
assert_raises(IOError, np.loadtxt, c)
@@ -884,7 +893,6 @@ M 33 21.99
dtype=dt)
assert_array_equal(x, a)
-
def test_withmissing(self):
data = StringIO('A,B\n0,1\n2,N/A')
kwargs = dict(delimiter=",", missing_values="N/A", names=True)
diff --git a/numpy/lib/tests/test_recfunctions.py b/numpy/lib/tests/test_recfunctions.py
index 0a22e2a34..14af43a59 100644
--- a/numpy/lib/tests/test_recfunctions.py
+++ b/numpy/lib/tests/test_recfunctions.py
@@ -540,13 +540,17 @@ class TestStackArrays(TestCase):
class TestJoinBy(TestCase):
+ def setUp(self):
+ self.a = np.array(zip(np.arange(10), np.arange(50, 60),
+ np.arange(100, 110)),
+ dtype=[('a', int), ('b', int), ('c', int)])
+ self.b = np.array(zip(np.arange(5, 15), np.arange(65, 75),
+ np.arange(100, 110)),
+ dtype=[('a', int), ('b', int), ('d', int)])
#
- def test_base(self):
+ def test_inner_join(self):
"Basic test of join_by"
- a = np.array(zip(np.arange(10), np.arange(50, 60), np.arange(100, 110)),
- dtype=[('a', int), ('b', int), ('c', int)])
- b = np.array(zip(np.arange(5, 15), np.arange(65, 75), np.arange(100, 110)),
- dtype=[('a', int), ('b', int), ('d', int)])
+ a, b = self.a, self.b
#
test = join_by('a', a, b, jointype='inner')
control = np.array([(5, 55, 65, 105, 100), (6, 56, 66, 106, 101),
@@ -555,6 +559,9 @@ class TestJoinBy(TestCase):
dtype=[('a', int), ('b1', int), ('b2', int),
('c', int), ('d', int)])
assert_equal(test, control)
+
+ def test_join(self):
+ a, b = self.a, self.b
#
test = join_by(('a', 'b'), a, b)
control = np.array([(5, 55, 105, 100), (6, 56, 106, 101),
@@ -562,6 +569,9 @@ class TestJoinBy(TestCase):
(9, 59, 109, 104)],
dtype=[('a', int), ('b', int),
('c', int), ('d', int)])
+
+ def test_outer_join(self):
+ a, b = self.a, self.b
#
test = join_by(('a', 'b'), a, b, 'outer')
control = ma.array([(0, 50, 100, -1), (1, 51, 101, -1),
@@ -587,6 +597,9 @@ class TestJoinBy(TestCase):
dtype=[('a', int), ('b', int),
('c', int), ('d', int)])
assert_equal(test, control)
+
+ def test_leftouter_join(self):
+ a, b = self.a, self.b
#
test = join_by(('a', 'b'), a, b, 'leftouter')
control = ma.array([(0, 50, 100, -1), (1, 51, 101, -1),
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index c044176cf..c23338cd8 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -205,7 +205,7 @@ def tensorsolve(a, b, axes=None):
See Also
--------
- tensordot, tensorinv
+ tensordot, tensorinv, einsum
Examples
--------
diff --git a/numpy/ma/core.py b/numpy/ma/core.py
index a945789df..673cfb1ab 100644
--- a/numpy/ma/core.py
+++ b/numpy/ma/core.py
@@ -940,7 +940,9 @@ class _MaskedBinaryOperation:
# Revert result to da where masked
if m.any():
np.putmask(result, m, 0)
- result += m * da
+ # This only makes sense if the operation preserved the dtype
+ if result.dtype == da.dtype:
+ result += m * da
# Transforms to a (subclass of) MaskedArray
result = result.view(get_masked_subclass(a, b))
result._mask = m
@@ -4314,6 +4316,8 @@ class MaskedArray(ndarray):
array.
ndarray.nonzero :
Equivalent ndarray method.
+ count_nonzero :
+ Counts the number of non-zero elements in the input array.
Examples
--------
@@ -5550,8 +5554,8 @@ class mvoid(MaskedArray):
def __getitem__(self, indx):
"Get the index..."
- _mask = self._mask.astype(np.void)
- if _mask is not nomask and _mask[indx]:
+ m = self._mask
+ if m is not nomask and m[indx]:
return masked
return self._data[indx]
diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py
index e3b7b99ef..c55559001 100644
--- a/numpy/ma/tests/test_core.py
+++ b/numpy/ma/tests/test_core.py
@@ -1366,12 +1366,18 @@ class TestFillingValues(TestCase):
fval = _check_fill_value(fill_val, ndtype)
self.assertTrue(isinstance(fval, ndarray))
assert_equal(fval.item(), [-999, -12345678.9, asbytes("???")])
+
#.....Using a flexible type w/ a different type shouldn't matter
- fill_val = np.array((-999, -12345678.9, "???"),
- dtype=[("A", int), ("B", float), ("C", "|S3")])
+ # BEHAVIOR in 1.5 and earlier: match structured types by position
+ #fill_val = np.array((-999, -12345678.9, "???"),
+ # dtype=[("A", int), ("B", float), ("C", "|S3")])
+ # BEHAVIOR in 1.6 and later: match structured types by name
+ fill_val = np.array(("???", -999, -12345678.9),
+ dtype=[("c", "|S3"), ("a", int), ("b", float), ])
fval = _check_fill_value(fill_val, ndtype)
self.assertTrue(isinstance(fval, ndarray))
assert_equal(fval.item(), [-999, -12345678.9, asbytes("???")])
+
#.....Using an object-array shouldn't matter either
fill_value = np.array((-999, -12345678.9, "???"), dtype=object)
fval = _check_fill_value(fill_val, ndtype)
diff --git a/numpy/ma/testutils.py b/numpy/ma/testutils.py
index 2b69cc4ad..5cfc9f2ab 100644
--- a/numpy/ma/testutils.py
+++ b/numpy/ma/testutils.py
@@ -77,8 +77,7 @@ def assert_equal_records(a, b):
def assert_equal(actual, desired, err_msg=''):
- """Asserts that two items are equal.
- """
+ "Asserts that two items are equal."
# Case #1: dictionary .....
if isinstance(desired, dict):
if not isinstance(actual, dict):
diff --git a/numpy/matrixlib/tests/test_defmatrix.py b/numpy/matrixlib/tests/test_defmatrix.py
index 65d79df0b..ccb68f0e7 100644
--- a/numpy/matrixlib/tests/test_defmatrix.py
+++ b/numpy/matrixlib/tests/test_defmatrix.py
@@ -263,7 +263,7 @@ class TestMatrixReturn(TestCase):
'searchsorted', 'setflags', 'setfield', 'sort', 'take',
'tofile', 'tolist', 'tostring', 'all', 'any', 'sum',
'argmax', 'argmin', 'min', 'max', 'mean', 'var', 'ptp',
- 'prod', 'std', 'ctypes', 'itemset'
+ 'prod', 'std', 'ctypes', 'itemset', 'setasflat'
]
for attrib in dir(a):
if attrib.startswith('_') or attrib in excluded_methods:
diff --git a/numpy/testing/print_coercion_tables.py b/numpy/testing/print_coercion_tables.py
index 0c8a87d9a..7b5320d7e 100755
--- a/numpy/testing/print_coercion_tables.py
+++ b/numpy/testing/print_coercion_tables.py
@@ -14,6 +14,8 @@ class GenericObject:
def __radd__(self, other):
return self
+ dtype = np.dtype('O')
+
def print_cancast_table(ntypes):
print 'X',
for char in ntypes: print char,
@@ -24,7 +26,7 @@ def print_cancast_table(ntypes):
print int(np.can_cast(row, col)),
print
-def print_coercion_table(ntypes, inputfirstvalue, inputsecondvalue, firstarray):
+def print_coercion_table(ntypes, inputfirstvalue, inputsecondvalue, firstarray, use_promote_types=False):
print '+',
for char in ntypes: print char,
print
@@ -46,11 +48,14 @@ def print_coercion_table(ntypes, inputfirstvalue, inputsecondvalue, firstarray):
else:
rowvalue = rowtype(inputfirstvalue)
colvalue = coltype(inputsecondvalue)
- value = np.add(rowvalue,colvalue)
- if isinstance(value, np.ndarray):
- char = value.dtype.char
+ if use_promote_types:
+ char = np.promote_types(rowvalue.dtype, colvalue.dtype).char
else:
- char = np.dtype(type(value)).char
+ value = np.add(rowvalue,colvalue)
+ if isinstance(value, np.ndarray):
+ char = value.dtype.char
+ else:
+ char = np.dtype(type(value)).char
except ValueError:
char = '!'
except OverflowError:
@@ -76,4 +81,6 @@ print_coercion_table(np.typecodes['All'], 0, 0, True)
print
print "array + neg scalar"
print_coercion_table(np.typecodes['All'], 0, -1, True)
-
+print
+print "promote_types"
+print_coercion_table(np.typecodes['All'], 0, 0, False, True)
diff --git a/numpy/testing/tests/test_utils.py b/numpy/testing/tests/test_utils.py
index 5106c1184..9798a25be 100644
--- a/numpy/testing/tests/test_utils.py
+++ b/numpy/testing/tests/test_utils.py
@@ -413,7 +413,7 @@ class TestULP(unittest.TestCase):
def test_double(self):
# Generate 1 + small deviation, check that adding eps gives a few UNL
- x = np.ones(10).astype(np.float32)
+ x = np.ones(10).astype(np.float64)
x += 0.01 * np.random.randn(10).astype(np.float64)
eps = np.finfo(np.float64).eps
assert_array_max_ulp(x, x+eps, maxulp=200)
diff --git a/numpy/testing/utils.py b/numpy/testing/utils.py
index 6f18b5468..1a03f98ea 100644
--- a/numpy/testing/utils.py
+++ b/numpy/testing/utils.py
@@ -421,7 +421,7 @@ def assert_almost_equal(actual,desired,decimal=7,err_msg='',verbose=True):
usecomplex = False
msg = build_err_msg([actual, desired], err_msg, verbose=verbose,
- header='Arrays are not almost equal')
+ header=('Arrays are not almost equal to %d decimals' % decimal))
if usecomplex:
if iscomplexobj(actual):
@@ -616,7 +616,8 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True,
names=('x', 'y'))
if not cond :
raise AssertionError(msg)
- except ValueError:
+ except ValueError as e:
+ header = 'error during assertion:\n%s\n\n%s' % (e, header)
msg = build_err_msg([x, y], err_msg, verbose=verbose, header=header,
names=('x', 'y'))
raise ValueError(msg)
@@ -771,7 +772,7 @@ def assert_array_almost_equal(x, y, decimal=6, err_msg='', verbose=True):
z = z.astype(float_) # handle object arrays
return around(z, decimal) <= 10.0**(-decimal)
assert_array_compare(compare, x, y, err_msg=err_msg, verbose=verbose,
- header='Arrays are not almost equal')
+ header=('Arrays are not almost equal to %d decimals' % decimal))
def assert_array_less(x, y, err_msg='', verbose=True):
"""