Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/src/multiarray/new_iterator.c.src  | 795
1 file changed, 430 insertions, 365 deletions
diff --git a/numpy/core/src/multiarray/new_iterator.c.src b/numpy/core/src/multiarray/new_iterator.c.src
index 06daed3ba..ee1d0b439 100644
--- a/numpy/core/src/multiarray/new_iterator.c.src
+++ b/numpy/core/src/multiarray/new_iterator.c.src
@@ -31,7 +31,7 @@
#define NPY_ITFLAG_NOINNER 0x020
/* The iterator is buffered */
#define NPY_ITFLAG_BUFFER 0x040
-/* The iterator is buffered */
+/* The iterator should grow the buffered inner loop when possible */
#define NPY_ITFLAG_GROWINNER 0x080
/* Internal iterator per-operand iterator flags */
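Note for readers of the flag table above: NPY_ITFLAG_BUFFER and NPY_ITFLAG_GROWINNER are independent bits, which the corrected comment now reflects. A minimal sketch of how such itflags bits are tested, using only the macros defined in this hunk (the surrounding iterator state is assumed):

    npy_uint32 itflags = NPY_ITFLAG_BUFFER | NPY_ITFLAG_GROWINNER;

    if (itflags & NPY_ITFLAG_BUFFER) {
        /* buffering is enabled for this iterator */
        if (itflags & NPY_ITFLAG_GROWINNER) {
            /* the buffered inner loop may be grown to cover more of the
             * iteration whenever the data layout allows it */
        }
    }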
@@ -51,10 +51,6 @@
/* The operand is aligned */
#define NPY_OP_ITFLAG_ALIGNED 0x40
-/* Internal flag, for the type of operands */
-#define NPY_ITER_OP_ARRAY 0
-#define NPY_ITER_OP_NULL 1
-
/*
* The data layout of the iterator is fully specified by
* a triple (itflags, ndim, niter). These three variables
@@ -212,12 +208,19 @@ struct NpyIter_AD {
/* Internal helper functions */
static int
-pyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags);
+npyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags);
+static int
+npyiter_check_op_axes(npy_intp niter, npy_intp oa_ndim, npy_intp **op_axes);
static int
npyiter_check_per_op_flags(npy_uint32 flags, char *op_itflags);
static int
-pyiter_prepare_operand(PyArrayObject **op, PyArray_Descr *op_request_dtype,
- PyArray_Descr** op_dtype, int* op_type,
+npyiter_prepare_operands(npy_intp niter, npy_intp *ndim, PyArrayObject **op_in,
+ PyArrayObject **op, PyArray_Descr **op_request_dtypes,
+ PyArray_Descr **op_dtype,
+ npy_intp *op_ndim, npy_uint32 *op_flags, char *op_itflags);
+static int
+npyiter_prepare_one_operand(PyArrayObject **op, PyArray_Descr *op_request_dtype,
+ PyArray_Descr** op_dtype,
npy_intp* op_ndim,
npy_uint32 op_flags, char *op_itflags);
static int
@@ -263,6 +266,15 @@ static PyArrayObject *
npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype,
npy_intp op_ndim, npy_intp *shape,
PyArray_Descr *op_dtype, npy_intp *op_axes);
+static int
+npyiter_allocate_arrays(NpyIter *iter, PyArrayObject **op,
+ PyArray_Descr **op_dtype, PyTypeObject *subtype,
+ npy_uint32 *op_flags, char *op_itflags,
+ npy_intp *op_ndim, npy_intp **op_axes);
+static void
+npyiter_get_priority_subtype(PyArrayObject **op, char *op_itflags,
+ npy_intp niter, double *subtype_priority,
+ PyTypeObject **subtype);
static int
npyiter_allocate_buffers(NpyIter *iter);
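The forward declarations above (together with those in the previous hunk) define the helpers that NpyIter_MultiNew now delegates to. A condensed sketch of the resulting call order, assembled from the later hunks of this patch (locals and error handling as in the constructor, heavily abbreviated):

    if (!npyiter_check_op_axes(niter, oa_ndim, op_axes) ||
        !npyiter_check_global_flags(flags, &itflags)) {
        return NULL;
    }
    /* Fills op[], op_dtype[], op_ndim[], op_itflags[] and the overall ndim */
    if (!npyiter_prepare_operands(niter, &ndim, op_in, op,
                                  op_request_dtypes, op_dtype,
                                  op_ndim, op_flags, op_itflags)) {
        return NULL;
    }
    /* Only needed when an output must be allocated and subtypes are allowed */
    if (need_subtype) {
        npyiter_get_priority_subtype(op, op_itflags, niter,
                                     &subtype_priority, &subtype);
    }
    /* Allocates missing outputs and temporary copies, or flags buffering */
    if (!npyiter_allocate_arrays(iter, op, op_dtype, subtype, op_flags,
                                 op_itflags, op_ndim, op_axes)) {
        NpyIter_Deallocate(iter);
        return NULL;
    }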
@@ -291,16 +303,15 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
/* Per-operand values */
PyArrayObject *op[NPY_MAXARGS];
PyArray_Descr *op_dtype[NPY_MAXARGS];
- int op_type[NPY_MAXARGS];
char op_itflags[NPY_MAXARGS];
npy_intp op_ndim[NPY_MAXARGS];
char **op_dataptr;
npy_intp *perm;
NpyIter_BufferData *bufferdata = NULL;
- char axes_dupcheck[NPY_MAXDIMS];
- int any_allocate_if_null = 0, any_missing_dtypes = 0,
- allocate_output_scalars = 0;
+ int any_allocate = 0, any_missing_dtypes = 0,
+ allocate_output_scalars = 0, need_subtype = 0;
+
/* The subtype for automatically allocated outputs */
double subtype_priority = NPY_PRIORITY;
PyTypeObject *subtype = &PyArray_Type;
@@ -313,59 +324,12 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
}
/* Error check 'oa_ndim' and 'op_axes', which must be used together */
- if (oa_ndim == 0 && op_axes != NULL) {
- PyErr_Format(PyExc_ValueError,
- "If 'op_axes' is not NULL in the iterator constructor, "
- "'oa_ndim' must be greater than zero");
+ if (!npyiter_check_op_axes(niter, oa_ndim, op_axes)) {
return NULL;
}
- else if (oa_ndim > 0) {
- if (oa_ndim > NPY_MAXDIMS) {
- PyErr_Format(PyExc_ValueError,
- "Cannot construct an iterator with more than %d dimensions "
- "(%d were requested for op_axes)",
- (int)NPY_MAXDIMS, (int)oa_ndim);
- return NULL;
- }
- else if (op_axes == NULL) {
- PyErr_Format(PyExc_ValueError,
- "If 'oa_ndim' is greater than zero in the iterator "
- "constructor, then op_axes cannot be NULL");
- return NULL;
- }
- /* Check that there are no duplicates in op_axes */
- for (iiter = 0; iiter < niter; ++iiter) {
- npy_intp *axes = op_axes[iiter];
- if (axes != NULL) {
- memset(axes_dupcheck, 0, NPY_MAXDIMS);
- for (idim = 0; idim < oa_ndim; ++idim) {
- npy_intp i = axes[idim];
- if (i >= 0) {
- if (i >= NPY_MAXDIMS) {
- PyErr_Format(PyExc_ValueError,
- "The 'op_axes' provided to the iterator "
- "constructor contained invalid "
- "values %d", (int)i);
- return NULL;
- } else if(axes_dupcheck[i] == 1) {
- PyErr_Format(PyExc_ValueError,
- "The 'op_axes' provided to the iterator "
- "constructor contained duplicate "
- "value %d", (int)i);
- return NULL;
- }
- else {
- axes_dupcheck[i] = 1;
- }
- }
- }
- }
- }
- }
-
- /* Checks the global iterator flags */
- if (!pyiter_check_global_flags(flags, &itflags)) {
+ /* Check the global iterator flags */
+ if (!npyiter_check_global_flags(flags, &itflags)) {
return NULL;
}
@@ -374,76 +338,15 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
* chosen to be big enough to get some amortization benefits, but
* small enough to be cache-friendly.
*/
- if (itflags&NPY_ITFLAG_BUFFER && buffersize <= 0) {
+ if ((itflags&NPY_ITFLAG_BUFFER) && buffersize <= 0) {
buffersize = 1 << 12;
}
/* Prepare all the operands */
- for (iiter = 0; iiter < niter; ++iiter) {
- /*
- * Make a copy of the input operands so we can substitute
- * new values in place when necessary without affecting
- * the caller's array.
- */
- op[iiter] = op_in[iiter];
- Py_XINCREF(op[iiter]);
- op_dtype[iiter] = NULL;
-
- /* Check the readonly/writeonly flags, and fill in op_itflags */
- if (!npyiter_check_per_op_flags(op_flags[iiter], &op_itflags[iiter])) {
- npy_intp i;
-
- for (i = 0; i <= iiter; ++i) {
- Py_XDECREF(op[i]);
- Py_XDECREF(op_dtype[i]);
- }
- return NULL;
- }
-
- /*
- * Prepare the operand. This produces an op_dtype[iiter] reference
- * on success.
- */
- if (!pyiter_prepare_operand(&op[iiter],
- op_request_dtypes ? op_request_dtypes[iiter] : NULL,
- &op_dtype[iiter], &op_type[iiter],
- &op_ndim[iiter],
- op_flags[iiter], &op_itflags[iiter])) {
- npy_intp i;
-
- for (i = 0; i <= iiter; ++i) {
- Py_XDECREF(op[i]);
- Py_XDECREF(op_dtype[i]);
- }
- return NULL;
- }
- /* The iterator dimensions is the maximum of all the inputs */
- if (op_ndim[iiter] > ndim) {
- ndim = op_ndim[iiter];
- }
- }
-
-
- /* If all the operands were NULL, it's an error */
- if (op_type[0] == NPY_ITER_OP_NULL) {
- int all_null = 1;
- for (iiter = 1; iiter < niter; ++iiter) {
- if (op_type[iiter] != NPY_ITER_OP_NULL) {
- all_null = 0;
- break;
- }
- }
- if (all_null) {
- npy_intp i;
-
- for (i = 0; i < niter; ++i) {
- Py_XDECREF(op[i]);
- Py_XDECREF(op_dtype[i]);
- }
- PyErr_SetString(PyExc_ValueError,
- "At least one iterator input must be non-NULL");
- return NULL;
- }
+ if (!npyiter_prepare_operands(niter, &ndim, op_in, op,
+ op_request_dtypes, op_dtype,
+ op_ndim, op_flags, op_itflags)) {
+ return NULL;
}
/* If 'op_axes' is being used, force 'ndim' */
@@ -474,36 +377,37 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
NIT_BASEOFFSETS(iter)[iiter] = 0;
/* Get the data pointer for this operand */
- switch (op_type[iiter]) {
- case NPY_ITER_OP_ARRAY:
- /*
- * Array casting/copying is handled later, once the
- * iteration order is finalized. Here, we
- * optimistically assume the array will be used
- * as is.
- */
- op_dataptr[iiter] = PyArray_DATA(op[iiter]);
- break;
-
- case NPY_ITER_OP_NULL:
- op_dataptr[iiter] = NULL;
- /* Now that ndim is fixed, outputs get the full ndim */
- if (allocate_output_scalars) {
- op_ndim[iiter] = 0;
- }
- else {
- op_ndim[iiter] = ndim;
- }
- /* Flag this so later we can avoid flipping axes */
- any_allocate_if_null = 1;
- /*
- * If the data type wasn't provided, will need to
- * calculate it later.
- */
- if (op_dtype[iiter] == NULL) {
- any_missing_dtypes = 1;
- }
- break;
+ if (op[iiter] != NULL) {
+ /*
+ * Array casting/copying is handled later, once the
+ * iteration order is finalized. Here, we
+ * optimistically assume the array will be used
+ * as is.
+ */
+ op_dataptr[iiter] = PyArray_DATA(op[iiter]);
+ }
+ else {
+ op_dataptr[iiter] = NULL;
+ /* Now that ndim is fixed, outputs get the full ndim */
+ if (allocate_output_scalars) {
+ op_ndim[iiter] = 0;
+ }
+ else {
+ op_ndim[iiter] = ndim;
+ }
+ /* Flag this so later we can avoid flipping axes */
+ any_allocate = 1;
+ /* If a subtype may be used, indicate so */
+ if (!(op_flags[iiter]&NPY_ITER_NO_SUBTYPE)) {
+ need_subtype = 1;
+ }
+ /*
+ * If the data type wasn't provided, it will need to
+ * be calculated later.
+ */
+ if (op_dtype[iiter] == NULL) {
+ any_missing_dtypes = 1;
+ }
}
}
/* Set resetindex to zero as well (it's just after the resetdataptr) */
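With the NPY_ITER_OP_ARRAY/NPY_ITER_OP_NULL constants removed, a NULL entry in op[] itself now means "allocate this operand". A small illustrative setup (the name input_array and the two-operand shape are hypothetical) showing what the rewritten branch above derives from it:

    PyArrayObject *op[2];

    op[0] = input_array;  /* existing array: its data pointer is used as-is */
    op[1] = NULL;         /* output to allocate: sets any_allocate, sets
                           * need_subtype unless NPY_ITER_NO_SUBTYPE is given,
                           * and sets any_missing_dtypes if no dtype was
                           * requested for it */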
@@ -569,28 +473,15 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
* If there's an output being allocated, we must not negate
* any strides.
*/
- if (!any_allocate_if_null) {
+ if (!any_allocate) {
npyiter_flip_negative_strides(iter);
}
itflags = NIT_ITFLAGS(iter);
}
- if (any_allocate_if_null) {
- /*
- * The __array_priority__ attribute of the inputs determines
- * the subtype of any output arrays. Take the subtype
- * with highest priority.
- */
- for (iiter = 0; iiter < niter; ++iiter) {
- if (op_itflags[iiter]&NPY_OP_ITFLAG_READ) {
- double priority =
- PyArray_GetPriority((PyObject *)op[iiter], 0.0);
- if (priority > subtype_priority) {
- subtype_priority = priority;
- subtype = Py_TYPE(op[iiter]);
- }
- }
- }
+ if (need_subtype) {
+ npyiter_get_priority_subtype(op, op_itflags, niter,
+ &subtype_priority, &subtype);
}
/*
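A sketch of the effect of the npyiter_get_priority_subtype call above (the helper itself is added near the end of this patch): starting from the defaults set earlier in the constructor, the subtype of the readable input with the highest __array_priority__ wins.

    double subtype_priority = NPY_PRIORITY;   /* default priority */
    PyTypeObject *subtype = &PyArray_Type;    /* default output subtype */

    npyiter_get_priority_subtype(op, op_itflags, niter,
                                 &subtype_priority, &subtype);
    /* subtype now names the type used for allocated outputs, e.g. an
     * ndarray subclass if a readable input of that type had a higher
     * __array_priority__ than the plain ndarray default */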
@@ -646,189 +537,10 @@ NpyIter_MultiNew(npy_intp niter, PyArrayObject **op_in, npy_uint32 flags,
* copying due to casting/byte order/alignment can be
* done now using a memory layout matching the iterator.
*/
- for (iiter = 0; iiter < niter; ++iiter) {
- if (op_type[iiter] == NPY_ITER_OP_NULL) {
- PyArrayObject *out;
- PyTypeObject *op_subtype;
-
- /* Check whether the subtype was disabled */
- if (op_flags[iiter]&NPY_ITER_NO_SUBTYPE) {
- op_subtype = &PyArray_Type;
- }
- else {
- op_subtype = subtype;
- }
-
- /* Allocate the output array, if possible */
- out = npyiter_new_temp_array(iter, op_subtype,
- op_ndim[iiter], NULL,
- op_dtype[iiter],
- op_axes ? op_axes[iiter] : NULL);
- if (out == NULL) {
- NpyIter_Deallocate(iter);
- return NULL;
- }
-
- op[iiter] = out;
- NIT_OBJECTS(iter)[iiter] = out;
-
- /*
- * Now we need to replace the pointers and strides with values
- * from the new array.
- */
- npyiter_replace_axisdata(iter, iiter, op[iiter], op_ndim[iiter],
- PyArray_DATA(op[iiter]), op_axes ? op_axes[iiter] : NULL);
-
- /* New arrays are aligned and need no swapping or casting */
- op_itflags[iiter] |= NPY_OP_ITFLAG_ALIGNED;
- op_itflags[iiter] &= ~(NPY_OP_ITFLAG_COPYSWAP|NPY_OP_ITFLAG_CAST);
- }
- else if ((op_itflags[iiter]&
- (NPY_OP_ITFLAG_CAST|NPY_OP_ITFLAG_COPYSWAP)) &&
- (op_itflags[iiter]&NPY_OP_ITFLAG_COPY)) {
- PyArrayObject *temp;
-
- /* Allocate the temporary array, if possible */
- temp = npyiter_new_temp_array(iter, &PyArray_Type,
- PyArray_NDIM(op[iiter]),
- PyArray_DIMS(op[iiter]),
- op_dtype[iiter],
- op_axes ? op_axes[iiter] : NULL);
- if (temp == NULL) {
- NpyIter_Deallocate(iter);
- return NULL;
- }
-
- /* If the data will be read, copy it into temp */
- if (op_itflags[iiter]&NPY_OP_ITFLAG_READ) {
- if (PyArray_CopyInto(temp, op[iiter]) != 0) {
- Py_DECREF(temp);
- NpyIter_Deallocate(iter);
- return NULL;
- }
- }
- /* If the data will be written to, set UPDATEIFCOPY */
- if (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) {
- PyArray_FLAGS(temp) |= NPY_UPDATEIFCOPY;
- PyArray_FLAGS(op[iiter]) &= ~NPY_WRITEABLE;
- Py_INCREF(op[iiter]);
- temp->base = (PyObject *)op[iiter];
- }
-
- Py_DECREF(op[iiter]);
- op[iiter] = temp;
- NIT_OBJECTS(iter)[iiter] = temp;
-
- /*
- * Now we need to replace the pointers and strides with values
- * from the temporary array.
- */
- npyiter_replace_axisdata(iter, iiter, op[iiter], op_ndim[iiter],
- PyArray_DATA(op[iiter]), op_axes ? op_axes[iiter] : NULL);
-
- /* Now it is aligned, and no longer needs a swap or cast */
- op_itflags[iiter] |= NPY_OP_ITFLAG_ALIGNED;
- op_itflags[iiter] &= ~(NPY_OP_ITFLAG_COPYSWAP|NPY_OP_ITFLAG_CAST);
- }
- else {
- /*
- * Buffering must be enabled for casting/conversion if copy
- * wasn't specified.
- */
- if (op_itflags[iiter]&
- (NPY_OP_ITFLAG_CAST|NPY_OP_ITFLAG_COPYSWAP) &&
- !(itflags&NPY_ITFLAG_BUFFER)) {
- PyErr_SetString(PyExc_TypeError,
- "Iterator input required copying or buffering, "
- "but neither copying nor buffering was enabled");
- NpyIter_Deallocate(iter);
- return NULL;
- }
-
- /*
- * If the operand is aligned, any buffering can use aligned
- * optimizations.
- */
- if (PyArray_ISALIGNED(op[iiter])) {
- op_itflags[iiter] |= NPY_OP_ITFLAG_ALIGNED;
- }
- }
-
- /*
- * If no alignment, byte swap, or casting is needed, and
- * the inner stride of this operand works for the whole
- * array, we can set NPY_OP_ITFLAG_BUFNEVER.
- * But, if buffering is enabled, write-buffering must be
- * one-to-one, because the buffering write back won't combine
- * values correctly. This test doesn't catch everything, but it will
- * catch the most common case of a broadcasting a write-buffered
- * dimension.
- */
- if ((itflags&NPY_ITFLAG_BUFFER) &&
- (!(op_itflags[iiter]&(NPY_OP_ITFLAG_CAST|
- NPY_OP_ITFLAG_COPYSWAP)) ||
- (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE))) {
- int is_one_to_one = 1;
- npy_intp stride, shape, innerstride = 0, innershape;
- NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
- npy_intp sizeof_axisdata =
- NIT_SIZEOF_AXISDATA(itflags, ndim, niter);
- /* Find stride of the first non-empty shape */
- for (idim = 0; idim < ndim; ++idim) {
- innershape = NAD_SHAPE(axisdata);
- if (innershape != 1) {
- innerstride = NAD_STRIDES(axisdata)[iiter];
- if (innerstride == 0) {
- is_one_to_one = 0;
- }
- break;
- }
- NIT_ADVANCE_AXISDATA(axisdata, 1);
- }
- ++idim;
- NIT_ADVANCE_AXISDATA(axisdata, 1);
- /* Check that everything could have coalesced together */
- for (; idim < ndim; ++idim) {
- stride = NAD_STRIDES(axisdata)[iiter];
- shape = NAD_SHAPE(axisdata);
- if (shape != 1) {
- if (stride == 0) {
- is_one_to_one = 0;
- }
- /*
- * If N times the inner stride doesn't equal this
- * stride, the multi-dimensionality is needed.
- */
- if (innerstride*innershape != stride) {
- break;
- }
- else {
- innershape *= shape;
- }
- }
- NIT_ADVANCE_AXISDATA(axisdata, 1);
- }
- /*
- * If we looped all the way to the end, one stride works.
- * Set that stride, because it may not belong to the first
- * dimension.
- */
- if (idim == ndim &&
- !(op_itflags[iiter]&(NPY_OP_ITFLAG_CAST|
- NPY_OP_ITFLAG_COPYSWAP))) {
- op_itflags[iiter] |= NPY_OP_ITFLAG_BUFNEVER;
- NBF_STRIDES(bufferdata)[iiter] = innerstride;
- }
- else if (!is_one_to_one &&
- (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE)) {
- PyErr_SetString(PyExc_ValueError,
- "Iterator operand requires write buffering, "
- "but has dimensions which have been broadcasted "
- "and would be combined incorrectly");
- NpyIter_Deallocate(iter);
- return NULL;
- }
- }
+ if (!npyiter_allocate_arrays(iter, op, op_dtype, subtype, op_flags,
+ op_itflags, op_ndim, op_axes)) {
+ NpyIter_Deallocate(iter);
+ return NULL;
}
/*
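The allocation, temporary-copy, and stride-coalescing logic removed above reappears essentially unchanged inside the new npyiter_allocate_arrays later in this patch. As a concrete check of the coalescing test it contains, consider a hypothetical C-contiguous array of doubles with shape (3, 4), viewed from the innermost iterator axis outward:

    /* itemsize 8: axisdata[0] has shape 4, stride 8 (innermost axis),
     * axisdata[1] has shape 3, stride 32 */
    npy_intp innershape = 4, innerstride = 8;
    npy_intp shape = 3, stride = 32;

    if (innerstride * innershape == stride) {
        /* 8 * 4 == 32, so the two axes coalesce into one stride; if this
         * holds all the way out and no cast or copy/swap is needed, the
         * operand gets NPY_OP_ITFLAG_BUFNEVER and skips buffering */
        innershape *= shape;    /* now 12, covering the whole array */
    }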
@@ -1895,7 +1607,7 @@ npy_intp* NpyIter_GetInnerLoopSizePtr(NpyIter *iter)
* Returns 1 on success, 0 on error.
*/
static int
-pyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags)
+npyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags)
{
if ((flags&NPY_ITER_PER_OP_FLAGS) != 0) {
PyErr_SetString(PyExc_ValueError,
@@ -1944,6 +1656,68 @@ pyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags)
return 1;
}
+static int
+npyiter_check_op_axes(npy_intp niter, npy_intp oa_ndim, npy_intp **op_axes)
+{
+ char axes_dupcheck[NPY_MAXDIMS];
+ npy_intp iiter, idim;
+
+ if (oa_ndim == 0 && op_axes != NULL) {
+ PyErr_Format(PyExc_ValueError,
+ "If 'op_axes' is not NULL in the iterator constructor, "
+ "'oa_ndim' must be greater than zero");
+ return 0;
+ }
+ else if (oa_ndim > 0) {
+ if (oa_ndim > NPY_MAXDIMS) {
+ PyErr_Format(PyExc_ValueError,
+ "Cannot construct an iterator with more than %d dimensions "
+ "(%d were requested for op_axes)",
+ (int)NPY_MAXDIMS, (int)oa_ndim);
+ return 0;
+ }
+ else if (op_axes == NULL) {
+ PyErr_Format(PyExc_ValueError,
+ "If 'oa_ndim' is greater than zero in the iterator "
+ "constructor, then op_axes cannot be NULL");
+ return 0;
+ }
+
+ /* Check that there are no duplicates in op_axes */
+ for (iiter = 0; iiter < niter; ++iiter) {
+ npy_intp *axes = op_axes[iiter];
+ if (axes != NULL) {
+ memset(axes_dupcheck, 0, NPY_MAXDIMS);
+ for (idim = 0; idim < oa_ndim; ++idim) {
+ npy_intp i = axes[idim];
+ if (i >= 0) {
+ if (i >= NPY_MAXDIMS) {
+ PyErr_Format(PyExc_ValueError,
+ "The 'op_axes' provided to the iterator "
+ "constructor for operand %d "
+ "contained invalid "
+ "values %d", (int)iiter, (int)i);
+ return 0;
+ } else if(axes_dupcheck[i] == 1) {
+ PyErr_Format(PyExc_ValueError,
+ "The 'op_axes' provided to the iterator "
+ "constructor for operand %d "
+ "contained duplicate "
+ "value %d", (int)iiter, (int)i);
+ return 0;
+ }
+ else {
+ axes_dupcheck[i] = 1;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return 1;
+}
+
/*
* Checks the per-operand input flags, and fills in op_itflags.
*
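A usage sketch for the new npyiter_check_op_axes above, with hypothetical axis mappings for a single operand and oa_ndim == 3 (negative entries are simply skipped by the range and duplicate checks):

    npy_intp ok_axes[3]  = {1, 0, -1};   /* in range, no duplicates */
    npy_intp bad_axes[3] = {0, 1, 1};    /* duplicate value 1 */
    npy_intp *op_axes_ok[1]  = {ok_axes};
    npy_intp *op_axes_bad[1] = {bad_axes};

    npyiter_check_op_axes(1, 3, op_axes_ok);   /* returns 1 */
    npyiter_check_op_axes(1, 3, op_axes_bad);  /* returns 0 and sets a
                                                * ValueError naming operand 0
                                                * and the duplicate value 1 */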
@@ -2011,6 +1785,85 @@ npyiter_check_per_op_flags(npy_uint32 op_flags, char *op_itflags)
}
/*
+ * Process all the operands, copying new references so further processing
+ * can replace the arrays if copying is necessary. Fill in the iterator's
+ * natural ndim.
+ */
+static int
+npyiter_prepare_operands(npy_intp niter, npy_intp *ndim, PyArrayObject **op_in,
+ PyArrayObject **op, PyArray_Descr **op_request_dtypes,
+ PyArray_Descr **op_dtype,
+ npy_intp *op_ndim, npy_uint32 *op_flags, char *op_itflags)
+{
+ npy_intp iiter;
+
+ *ndim = 0;
+
+ for (iiter = 0; iiter < niter; ++iiter) {
+ op[iiter] = op_in[iiter];
+ Py_XINCREF(op[iiter]);
+ op_dtype[iiter] = NULL;
+
+ /* Check the readonly/writeonly flags, and fill in op_itflags */
+ if (!npyiter_check_per_op_flags(op_flags[iiter], &op_itflags[iiter])) {
+ npy_intp i;
+
+ for (i = 0; i <= iiter; ++i) {
+ Py_XDECREF(op[i]);
+ Py_XDECREF(op_dtype[i]);
+ }
+ return 0;
+ }
+
+ /*
+ * Prepare the operand. This produces an op_dtype[iiter] reference
+ * on success.
+ */
+ if (!npyiter_prepare_one_operand(&op[iiter],
+ op_request_dtypes ? op_request_dtypes[iiter] : NULL,
+ &op_dtype[iiter], &op_ndim[iiter],
+ op_flags[iiter], &op_itflags[iiter])) {
+ npy_intp i;
+
+ for (i = 0; i <= iiter; ++i) {
+ Py_XDECREF(op[i]);
+ Py_XDECREF(op_dtype[i]);
+ }
+ return 0;
+ }
+ /* The iterator's number of dimensions is the maximum over all the inputs */
+ if (op_ndim[iiter] > *ndim) {
+ *ndim = op_ndim[iiter];
+ }
+ }
+
+
+ /* If all the operands were NULL, it's an error */
+ if (op[0] == NULL) {
+ int all_null = 1;
+ for (iiter = 1; iiter < niter; ++iiter) {
+ if (op[iiter] != NULL) {
+ all_null = 0;
+ break;
+ }
+ }
+ if (all_null) {
+ npy_intp i;
+
+ for (i = 0; i < niter; ++i) {
+ Py_XDECREF(op[i]);
+ Py_XDECREF(op_dtype[i]);
+ }
+ PyErr_SetString(PyExc_ValueError,
+ "At least one iterator input must be non-NULL");
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/*
* Returns 1 if the from -> to cast can be done, based on the casting
* flags provided in op_flags, and 0 otherwise.
*
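The new npyiter_prepare_operands above cleans up after itself: on any failure it drops every reference it already acquired before returning 0. A sketch of the contract its caller relies on (caller code abbreviated from the constructor):

    if (!npyiter_prepare_operands(niter, &ndim, op_in, op,
                                  op_request_dtypes, op_dtype,
                                  op_ndim, op_flags, op_itflags)) {
        return NULL;   /* nothing to release; the helper already did */
    }
    /* On success, every non-NULL op[i] and op_dtype[i] carries a reference
     * that the iterator (or its error paths) must later release, and ndim
     * holds the maximum operand dimensionality. */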
@@ -2086,14 +1939,14 @@ npyiter_can_cast(PyArray_Descr *from, PyArray_Descr *to, NPY_CASTING casting)
/*
* Prepares a constructor operand. Assumes a reference to 'op'
- * is owned, and that 'op' may be replaced. Fills in 'op_dtype',
- * 'op_type' and 'ndim'.
+ * is owned, and that 'op' may be replaced. Fills in 'op_dtype'
+ * and 'ndim'.
*
* Returns 1 on success, 0 on failure.
*/
static int
-pyiter_prepare_operand(PyArrayObject **op, PyArray_Descr *op_request_dtype,
- PyArray_Descr **op_dtype, int* op_type,
+npyiter_prepare_one_operand(PyArrayObject **op, PyArray_Descr *op_request_dtype,
+ PyArray_Descr **op_dtype,
npy_intp* op_ndim,
npy_uint32 op_flags, char *op_itflags)
{
@@ -2116,7 +1969,6 @@ pyiter_prepare_operand(PyArrayObject **op, PyArray_Descr *op_request_dtype,
/* If a requested dtype was provided, use it, otherwise NULL */
Py_XINCREF(op_request_dtype);
*op_dtype = op_request_dtype;
- *op_type = NPY_ITER_OP_NULL;
*op_ndim = 0;
/* No copying of NULL operands */
*op_itflags &= ~NPY_OP_ITFLAG_COPY;
@@ -2132,7 +1984,6 @@ pyiter_prepare_operand(PyArrayObject **op, PyArray_Descr *op_request_dtype,
return 0;
}
- *op_type = NPY_ITER_OP_ARRAY;
*op_ndim = PyArray_NDIM(*op);
/* PyArray_DESCR does not give us a reference */
*op_dtype = PyArray_DESCR(*op);
@@ -3267,6 +3118,220 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype,
return ret;
}
+static int
+npyiter_allocate_arrays(NpyIter *iter, PyArrayObject **op,
+ PyArray_Descr **op_dtype, PyTypeObject *subtype,
+ npy_uint32 *op_flags, char *op_itflags,
+ npy_intp *op_ndim, npy_intp **op_axes)
+{
+ npy_uint32 itflags = NIT_ITFLAGS(iter);
+ npy_intp idim, ndim = NIT_NDIM(iter);
+ npy_intp iiter, niter = NIT_NITER(iter);
+
+ NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+
+ for (iiter = 0; iiter < niter; ++iiter) {
+ if (op[iiter] == NULL) {
+ PyArrayObject *out;
+ PyTypeObject *op_subtype;
+
+ /* Check whether the subtype was disabled */
+ op_subtype = (op_flags[iiter]&NPY_ITER_NO_SUBTYPE) ?
+ &PyArray_Type : subtype;
+
+ /* Allocate the output array, if possible */
+ out = npyiter_new_temp_array(iter, op_subtype,
+ op_ndim[iiter], NULL,
+ op_dtype[iiter],
+ op_axes ? op_axes[iiter] : NULL);
+ if (out == NULL) {
+ return 0;
+ }
+
+ op[iiter] = out;
+ NIT_OBJECTS(iter)[iiter] = out;
+
+ /*
+ * Now we need to replace the pointers and strides with values
+ * from the new array.
+ */
+ npyiter_replace_axisdata(iter, iiter, op[iiter], op_ndim[iiter],
+ PyArray_DATA(op[iiter]), op_axes ? op_axes[iiter] : NULL);
+
+ /* New arrays are aligned and need no swapping or casting */
+ op_itflags[iiter] |= NPY_OP_ITFLAG_ALIGNED;
+ op_itflags[iiter] &= ~(NPY_OP_ITFLAG_COPYSWAP|NPY_OP_ITFLAG_CAST);
+ }
+ else if ((op_itflags[iiter]&
+ (NPY_OP_ITFLAG_CAST|NPY_OP_ITFLAG_COPYSWAP)) &&
+ (op_itflags[iiter]&NPY_OP_ITFLAG_COPY)) {
+ PyArrayObject *temp;
+
+ /* Allocate the temporary array, if possible */
+ temp = npyiter_new_temp_array(iter, &PyArray_Type,
+ PyArray_NDIM(op[iiter]),
+ PyArray_DIMS(op[iiter]),
+ op_dtype[iiter],
+ op_axes ? op_axes[iiter] : NULL);
+ if (temp == NULL) {
+ return 0;
+ }
+
+ /* If the data will be read, copy it into temp */
+ if (op_itflags[iiter]&NPY_OP_ITFLAG_READ) {
+ if (PyArray_CopyInto(temp, op[iiter]) != 0) {
+ Py_DECREF(temp);
+ return 0;
+ }
+ }
+ /* If the data will be written to, set UPDATEIFCOPY */
+ if (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) {
+ PyArray_FLAGS(temp) |= NPY_UPDATEIFCOPY;
+ PyArray_FLAGS(op[iiter]) &= ~NPY_WRITEABLE;
+ Py_INCREF(op[iiter]);
+ temp->base = (PyObject *)op[iiter];
+ }
+
+ Py_DECREF(op[iiter]);
+ op[iiter] = temp;
+ NIT_OBJECTS(iter)[iiter] = temp;
+
+ /*
+ * Now we need to replace the pointers and strides with values
+ * from the temporary array.
+ */
+ npyiter_replace_axisdata(iter, iiter, op[iiter], op_ndim[iiter],
+ PyArray_DATA(op[iiter]), op_axes ? op_axes[iiter] : NULL);
+
+ /* The temporary copy is aligned and needs no swap or cast */
+ op_itflags[iiter] |= NPY_OP_ITFLAG_ALIGNED;
+ op_itflags[iiter] &= ~(NPY_OP_ITFLAG_COPYSWAP|NPY_OP_ITFLAG_CAST);
+ }
+ else {
+ /*
+ * Buffering must be enabled for casting/conversion if copy
+ * wasn't specified.
+ */
+ if (op_itflags[iiter]&
+ (NPY_OP_ITFLAG_CAST|NPY_OP_ITFLAG_COPYSWAP) &&
+ !(itflags&NPY_ITFLAG_BUFFER)) {
+ PyErr_SetString(PyExc_TypeError,
+ "Iterator operand required copying or buffering, "
+ "but neither copying nor buffering was enabled");
+ return 0;
+ }
+
+ /*
+ * If the operand is aligned, any buffering can use aligned
+ * optimizations.
+ */
+ if (PyArray_ISALIGNED(op[iiter])) {
+ op_itflags[iiter] |= NPY_OP_ITFLAG_ALIGNED;
+ }
+ }
+
+ /*
+ * If no alignment, byte swap, or casting is needed, and
+ * the inner stride of this operand works for the whole
+ * array, we can set NPY_OP_ITFLAG_BUFNEVER.
+ * But, if buffering is enabled, write-buffering must be
+ * one-to-one, because the buffering write back won't combine
+ * values correctly. This test doesn't catch everything, but it will
+ * catch the most common case of broadcasting a write-buffered
+ * dimension.
+ */
+ if ((itflags&NPY_ITFLAG_BUFFER) &&
+ (!(op_itflags[iiter]&(NPY_OP_ITFLAG_CAST|
+ NPY_OP_ITFLAG_COPYSWAP)) ||
+ (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE))) {
+ int is_one_to_one = 1;
+ npy_intp stride, shape, innerstride = 0, innershape;
+ NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+ npy_intp sizeof_axisdata =
+ NIT_SIZEOF_AXISDATA(itflags, ndim, niter);
+ /* Find stride of the first non-empty shape */
+ for (idim = 0; idim < ndim; ++idim) {
+ innershape = NAD_SHAPE(axisdata);
+ if (innershape != 1) {
+ innerstride = NAD_STRIDES(axisdata)[iiter];
+ if (innerstride == 0) {
+ is_one_to_one = 0;
+ }
+ break;
+ }
+ NIT_ADVANCE_AXISDATA(axisdata, 1);
+ }
+ ++idim;
+ NIT_ADVANCE_AXISDATA(axisdata, 1);
+ /* Check that everything could have coalesced together */
+ for (; idim < ndim; ++idim) {
+ stride = NAD_STRIDES(axisdata)[iiter];
+ shape = NAD_SHAPE(axisdata);
+ if (shape != 1) {
+ if (stride == 0) {
+ is_one_to_one = 0;
+ }
+ /*
+ * If N times the inner stride doesn't equal this
+ * stride, the multi-dimensionality is needed.
+ */
+ if (innerstride*innershape != stride) {
+ break;
+ }
+ else {
+ innershape *= shape;
+ }
+ }
+ NIT_ADVANCE_AXISDATA(axisdata, 1);
+ }
+ /*
+ * If we looped all the way to the end, one stride works.
+ * Set that stride, because it may not belong to the first
+ * dimension.
+ */
+ if (idim == ndim &&
+ !(op_itflags[iiter]&(NPY_OP_ITFLAG_CAST|
+ NPY_OP_ITFLAG_COPYSWAP))) {
+ op_itflags[iiter] |= NPY_OP_ITFLAG_BUFNEVER;
+ NBF_STRIDES(bufferdata)[iiter] = innerstride;
+ }
+ else if (!is_one_to_one &&
+ (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE)) {
+ PyErr_SetString(PyExc_ValueError,
+ "Iterator operand requires write buffering, "
+ "but has dimensions which have been broadcasted "
+ "and would be combined incorrectly");
+ return 0;
+ }
+ }
+ }
+
+ return 1;
+}
+
+/*
+ * The __array_priority__ attribute of the inputs determines
+ * the subtype of any output arrays. This function finds the
+ * subtype of the input array with highest priority.
+ */
+static void
+npyiter_get_priority_subtype(PyArrayObject **op, char *op_itflags,
+ npy_intp niter, double *subtype_priority,
+ PyTypeObject **subtype)
+{
+ npy_intp iiter;
+
+ for (iiter = 0; iiter < niter; ++iiter) {
+ if (op_itflags[iiter]&NPY_OP_ITFLAG_READ) {
+ double priority = PyArray_GetPriority((PyObject *)op[iiter], 0.0);
+ if (priority > *subtype_priority) {
+ *subtype_priority = priority;
+ *subtype = Py_TYPE(op[iiter]);
+ }
+ }
+ }
+}
+
/*
* Calculates a dtype that all the types can be promoted to, using the
* ufunc rules. If only_inputs is 1, it leaves any operands that